From 64ecdc1cb168dddc06aa57a8bac5afb8abe05634 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 15 Feb 2022 09:57:54 -0500 Subject: [PATCH 001/748] [OpenMP] Pass AMDGPU math libraries into the linker wrapper This patch passes in the AMDPGU math libraries to the linker wrapper. The wrapper already handles linking OpenMP bitcode libraries via the `--target-library` option. This should be sufficient to link in math libraries for the accompanying architecture. Fixes #53526. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D119841 --- clang/lib/Driver/ToolChains/Clang.cpp | 23 +++++++++++++++++++++ clang/test/Driver/amdgpu-openmp-toolchain.c | 3 +++ 2 files changed, 26 insertions(+) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index de289683596be..a16175ebebbca 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8199,6 +8199,29 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, } } + // Get the AMDGPU math libraries. + // FIXME: This method is bad, remove once AMDGPU has a proper math library + // (see AMDGCN::OpenMPLinker::constructLLVMLinkCommand). + for (auto &I : llvm::make_range(OpenMPTCRange.first, OpenMPTCRange.second)) { + const ToolChain *TC = I.second; + + if (!TC->getTriple().isAMDGPU() || Args.hasArg(options::OPT_nogpulib)) + continue; + + const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP); + StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ); + const toolchains::ROCMToolChain RocmTC(TC->getDriver(), TC->getTriple(), + TCArgs); + + SmallVector BCLibs = + RocmTC.getCommonDeviceLibNames(TCArgs, Arch.str()); + + for (StringRef LibName : BCLibs) + CmdArgs.push_back( + Args.MakeArgString("-target-library=" + TC->getTripleString() + "-" + + Arch + "=" + LibName)); + } + if (D.isUsingLTO(/* IsOffload */ true)) { // Pass in target features for each toolchain. 
for (auto &I : diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c index 0d50411f24881..4d90c73034503 100644 --- a/clang/test/Driver/amdgpu-openmp-toolchain.c +++ b/clang/test/Driver/amdgpu-openmp-toolchain.c @@ -77,3 +77,6 @@ // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE // CHECK-LIB-DEVICE: {{.*}}llvm-link{{.*}}ocml.bc"{{.*}}ockl.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc" + +// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode -fopenmp-new-driver %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE-NEW +// CHECK-LIB-DEVICE-NEW: {{.*}}clang-linker-wrapper{{.*}}-target-library=amdgcn-amd-amdhsa-gfx803={{.*}}ocml.bc"{{.*}}ockl.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc" From b389fbd015955b96a88adeef75b6ef6af40461e5 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Wed, 9 Feb 2022 14:19:22 +0000 Subject: [PATCH 002/748] [flang] Add Win32 to the list of supported triples This patch adds Win32 to the list of supported triples in `fir::CodeGenSpecifics`. This change means that we can use the "native" triple, even when running tests on Windows. Currently this affects only 1 test, but it will change once we start adding more tests for lowering and code-generation. 
Differential Revision: https://reviews.llvm.org/D119332 --- flang/lib/Optimizer/CodeGen/Target.cpp | 7 ++++--- flang/test/Fir/basic-program.fir | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 26d21cfacdfba..2770c0b664fef 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -238,9 +238,7 @@ struct TargetPPC64le : public GenericTarget { } // namespace // Instantiate the overloaded target instance based on the triple value. -// Currently, the implementation only instantiates `i386-unknown-linux-gnu`, -// `x86_64-unknown-linux-gnu`, aarch64 and ppc64le like triples. Other targets -// should be added to this file as needed. +// TODO: Add other targets to this file as needed. std::unique_ptr fir::CodeGenSpecifics::get(mlir::MLIRContext *ctx, llvm::Triple &&trp, KindMapping &&kindMap) { @@ -253,6 +251,7 @@ fir::CodeGenSpecifics::get(mlir::MLIRContext *ctx, llvm::Triple &&trp, break; case llvm::Triple::OSType::Linux: case llvm::Triple::OSType::Darwin: + case llvm::Triple::OSType::Win32: return std::make_unique(ctx, std::move(trp), std::move(kindMap)); } @@ -263,6 +262,7 @@ fir::CodeGenSpecifics::get(mlir::MLIRContext *ctx, llvm::Triple &&trp, break; case llvm::Triple::OSType::Linux: case llvm::Triple::OSType::Darwin: + case llvm::Triple::OSType::Win32: return std::make_unique(ctx, std::move(trp), std::move(kindMap)); } @@ -273,6 +273,7 @@ fir::CodeGenSpecifics::get(mlir::MLIRContext *ctx, llvm::Triple &&trp, break; case llvm::Triple::OSType::Linux: case llvm::Triple::OSType::Darwin: + case llvm::Triple::OSType::Win32: return std::make_unique(ctx, std::move(trp), std::move(kindMap)); } diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 02463bef99496..0f22629d7675b 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -1,4 +1,4 @@ -// RUN: tco 
--target=x86_64-unknown-linux-gnu %s | FileCheck %s +// RUN: tco %s | FileCheck %s // Check that tco is working with a basic test. From d95961f214a9f2e7abc2844cee309e42eba5600c Mon Sep 17 00:00:00 2001 From: Med Ismail Bennani Date: Wed, 16 Feb 2022 13:49:48 -0800 Subject: [PATCH 003/748] [lldb/test] Disable scripted_crashlog_json.test on non darwin aarch64 systems This patch adds requirement for the `scripted_crashlog_json` test to make sure it only runs on apple silicon systems. This should fix the following green dragon failure: https://green.lab.llvm.org/green/view/LLDB/job/lldb-cmake/41454 Signed-off-by: Med Ismail Bennani --- .../Python/Crashlog/scripted_crashlog_json.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/scripted_crashlog_json.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/scripted_crashlog_json.test index a94a667eae545..c82c183378d3c 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/scripted_crashlog_json.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/scripted_crashlog_json.test @@ -1,3 +1,5 @@ +# REQUIRES: python, native && target-aarch64 && system-darwin + # RUN: %clang_host -g %S/Inputs/test.c -o %t.out # RUN: cp %S/Inputs/scripted_crashlog.ips %t.crash From f1cdeca4d7914eb82eeedbec5b2dc0f34bc96585 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 16 Feb 2022 16:47:26 -0500 Subject: [PATCH 004/748] try to fix check-llvm after c5fb05f663f llvm-config wants all libraries referenced in llvm/lib/CMakeLists.txt to exist on disk. But WindowsDriver is only referenced in clang and lld and hence wasn't built as a dependency of check-llvm. Add it as an explicit dependency to make llvm-config happy. 
--- llvm/test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index b12b7f2ede4d1..7eff058ce78fe 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -53,6 +53,7 @@ set(LLVM_TEST_DEPENDS BugpointPasses FileCheck LLVMHello + LLVMWindowsDriver UnitTests bugpoint count From 48a31c8f429022a07e2e35f3e62d5f495117f2e7 Mon Sep 17 00:00:00 2001 From: Marek Kurdej Date: Wed, 16 Feb 2022 22:55:09 +0100 Subject: [PATCH 005/748] [clang-format] Mark FormatToken::getPreviousNonComment() nodiscard. NFC. --- clang/lib/Format/FormatToken.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 6aaf66c7bb7e5..6b7d475232b0e 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -693,7 +693,7 @@ struct FormatToken { } /// Returns the previous token ignoring comments. - FormatToken *getPreviousNonComment() const { + LLVM_NODISCARD FormatToken *getPreviousNonComment() const { FormatToken *Tok = Previous; while (Tok && Tok->is(tok::comment)) Tok = Tok->Previous; From 668c5c688be7ab0af37739bbbe2d653be82d5c6f Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Fri, 4 Feb 2022 03:17:20 -0600 Subject: [PATCH 006/748] [Attributor][FIX] Use liveness information of the right function When we use liveness for edges during the `genericValueTraversal` we need to make sure to use the AAIsDead of the correct function. This patch adds the proper logic and some simple caching scheme. We also add an assertion to the `isEdgeDead` call to make sure future misuse is detected earlier. 
Fixes https://github.com/llvm/llvm-project/issues/53872 --- .../Transforms/IPO/AttributorAttributes.cpp | 43 ++++--- .../IPConstantProp/return-constant.ll | 4 +- .../IPConstantProp/return-constants.ll | 4 +- llvm/test/Transforms/Attributor/align.ll | 106 ++++++++++-------- .../read_write_returned_arguments_scc.ll | 16 +-- .../Transforms/Attributor/value-simplify.ll | 21 ++++ 6 files changed, 121 insertions(+), 73 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index a1cb936dc3763..8e90ac352791a 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -265,13 +265,18 @@ static bool genericValueTraversal( function_ref StripCB = nullptr, bool Intraprocedural = false) { - const AAIsDead *LivenessAA = nullptr; - if (IRP.getAnchorScope()) - LivenessAA = &A.getAAFor( - QueryingAA, - IRPosition::function(*IRP.getAnchorScope(), IRP.getCallBaseContext()), - DepClassTy::NONE); - bool AnyDead = false; + struct LivenessInfo { + const AAIsDead *LivenessAA = nullptr; + bool AnyDead = false; + }; + DenseMap LivenessAAs; + auto GetLivenessInfo = [&](const Function &F) -> LivenessInfo & { + LivenessInfo &LI = LivenessAAs[&F]; + if (!LI.LivenessAA) + LI.LivenessAA = &A.getAAFor(QueryingAA, IRPosition::function(F), + DepClassTy::NONE); + return LI; + }; Value *InitialV = &IRP.getAssociatedValue(); using Item = std::pair; @@ -341,13 +346,12 @@ static bool genericValueTraversal( // Look through phi nodes, visit all live operands. 
if (auto *PHI = dyn_cast(V)) { - assert(LivenessAA && - "Expected liveness in the presence of instructions!"); + LivenessInfo &LI = GetLivenessInfo(*PHI->getFunction()); for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) { BasicBlock *IncomingBB = PHI->getIncomingBlock(u); - if (LivenessAA->isEdgeDead(IncomingBB, PHI->getParent())) { - AnyDead = true; - UsedAssumedInformation |= !LivenessAA->isAtFixpoint(); + if (LI.LivenessAA->isEdgeDead(IncomingBB, PHI->getParent())) { + LI.AnyDead = true; + UsedAssumedInformation |= !LI.LivenessAA->isAtFixpoint(); continue; } Worklist.push_back( @@ -401,8 +405,10 @@ static bool genericValueTraversal( } while (!Worklist.empty()); // If we actually used liveness information so we have to record a dependence. - if (AnyDead) - A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + for (auto &It : LivenessAAs) + if (It.second.AnyDead) + A.recordDependence(*It.second.LivenessAA, QueryingAA, + DepClassTy::OPTIONAL); // All values have been visited. return true; @@ -1230,11 +1236,13 @@ struct AAPointerInfoImpl // Run the user callback on all writes we cannot skip and return if that // succeeded for all or not. 
unsigned NumInterferingWrites = InterferingWrites.size(); - for (auto &It : InterferingWrites) + for (auto &It : InterferingWrites) { if (!DT || NumInterferingWrites > MaxInterferingWrites || - !CanSkipAccess(*It.first, It.second)) + !CanSkipAccess(*It.first, It.second)) { if (!UserCB(*It.first, It.second)) return false; + } + } return true; } @@ -3821,6 +3829,9 @@ struct AAIsDeadFunction : public AAIsDead { ChangeStatus updateImpl(Attributor &A) override; bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const override { + assert(From->getParent() == getAnchorScope() && + To->getParent() == getAnchorScope() && + "Used AAIsDead of the wrong function"); return isValidState() && !AssumedLiveEdges.count(std::make_pair(From, To)); } diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll index 5eda27957e6b8..cdec3a09fdfb8 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=5 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=5 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs 
-attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll index 9129d25f6bbaf..954840c1207e0 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-constants.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=8 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=8 -S < %s | FileCheck %s 
--check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM diff --git a/llvm/test/Transforms/Attributor/align.ll b/llvm/test/Transforms/Attributor/align.ll index cbf38e511f48e..fafcf67315202 100644 --- a/llvm/test/Transforms/Attributor/align.ll +++ b/llvm/test/Transforms/Attributor/align.ll @@ -171,24 +171,30 @@ define internal i8* @f2(i8* readnone %0) local_unnamed_addr #0 { ; IS__CGSCC_OPM: Function Attrs: noinline nounwind uwtable ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f2 ; IS__CGSCC_OPM-SAME: (i8* readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { -; IS__CGSCC_OPM-NEXT: unreachable -; IS__CGSCC_OPM: 2: -; IS__CGSCC_OPM-NEXT: unreachable +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null +; IS__CGSCC_OPM-NEXT: br i1 [[TMP2]], label [[TMP4:%.*]], label [[TMP3:%.*]] ; IS__CGSCC_OPM: 3: -; IS__CGSCC_OPM-NEXT: unreachable +; 
IS__CGSCC_OPM-NEXT: br label [[TMP6:%.*]] ; IS__CGSCC_OPM: 4: -; IS__CGSCC_OPM-NEXT: unreachable +; IS__CGSCC_OPM-NEXT: [[TMP5:%.*]] = tail call i8* @f3(i8* nonnull @a2) +; IS__CGSCC_OPM-NEXT: br label [[TMP6]] +; IS__CGSCC_OPM: 6: +; IS__CGSCC_OPM-NEXT: [[TMP7:%.*]] = phi i8* [ undef, [[TMP3]] ], [ [[TMP5]], [[TMP4]] ] +; IS__CGSCC_OPM-NEXT: ret i8* [[TMP7]] ; -; IS__CGSCC_NPM: Function Attrs: noinline nounwind uwtable +; IS__CGSCC_NPM: Function Attrs: noinline norecurse nounwind uwtable ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f2 ; IS__CGSCC_NPM-SAME: (i8* readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -; IS__CGSCC_NPM-NEXT: unreachable -; IS__CGSCC_NPM: 2: -; IS__CGSCC_NPM-NEXT: unreachable +; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null +; IS__CGSCC_NPM-NEXT: br i1 [[TMP2]], label [[TMP4:%.*]], label [[TMP3:%.*]] ; IS__CGSCC_NPM: 3: -; IS__CGSCC_NPM-NEXT: unreachable +; IS__CGSCC_NPM-NEXT: br label [[TMP6:%.*]] ; IS__CGSCC_NPM: 4: -; IS__CGSCC_NPM-NEXT: unreachable +; IS__CGSCC_NPM-NEXT: [[TMP5:%.*]] = tail call i8* @f3() +; IS__CGSCC_NPM-NEXT: br label [[TMP6]] +; IS__CGSCC_NPM: 6: +; IS__CGSCC_NPM-NEXT: [[TMP7:%.*]] = phi i8* [ undef, [[TMP3]] ], [ @a1, [[TMP4]] ] +; IS__CGSCC_NPM-NEXT: ret i8* [[TMP7]] ; %2 = icmp eq i8* %0, null br i1 %2, label %5, label %3 @@ -211,21 +217,23 @@ define internal i8* @f2(i8* readnone %0) local_unnamed_addr #0 { define internal i8* @f3(i8* readnone %0) local_unnamed_addr #0 { ; IS__CGSCC_OPM: Function Attrs: noinline nounwind uwtable ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f3 -; IS__CGSCC_OPM-SAME: (i8* nonnull readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { -; IS__CGSCC_OPM-NEXT: br label [[TMP3:%.*]] -; IS__CGSCC_OPM: 2: -; IS__CGSCC_OPM-NEXT: unreachable +; IS__CGSCC_OPM-SAME: (i8* readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null +; IS__CGSCC_OPM-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] ; 
IS__CGSCC_OPM: 3: -; IS__CGSCC_OPM-NEXT: ret i8* @a1 +; IS__CGSCC_OPM-NEXT: br label [[TMP4]] +; IS__CGSCC_OPM: 4: +; IS__CGSCC_OPM-NEXT: [[TMP5:%.*]] = phi i8* [ @a2, [[TMP3]] ], [ @a1, [[TMP1:%.*]] ] +; IS__CGSCC_OPM-NEXT: ret i8* [[TMP5]] ; -; IS__CGSCC_NPM: Function Attrs: noinline nounwind uwtable +; IS__CGSCC_NPM: Function Attrs: nofree noinline norecurse nosync nounwind readnone willreturn uwtable ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f3 -; IS__CGSCC_NPM-SAME: (i8* nonnull readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] { -; IS__CGSCC_NPM-NEXT: br label [[TMP3:%.*]] -; IS__CGSCC_NPM: 2: +; IS__CGSCC_NPM-SAME: () local_unnamed_addr #[[ATTR0]] { +; IS__CGSCC_NPM-NEXT: br label [[TMP2:%.*]] +; IS__CGSCC_NPM: 1: ; IS__CGSCC_NPM-NEXT: unreachable -; IS__CGSCC_NPM: 3: -; IS__CGSCC_NPM-NEXT: ret i8* @a1 +; IS__CGSCC_NPM: 2: +; IS__CGSCC_NPM-NEXT: ret i8* undef ; %2 = icmp eq i8* %0, null br i1 %2, label %3, label %5 @@ -292,24 +300,30 @@ define internal i8* @f2b(i8* readnone %0) local_unnamed_addr #0 { ; IS__CGSCC_OPM: Function Attrs: noinline nounwind uwtable ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f2b ; IS__CGSCC_OPM-SAME: (i8* readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { -; IS__CGSCC_OPM-NEXT: unreachable -; IS__CGSCC_OPM: 2: -; IS__CGSCC_OPM-NEXT: unreachable +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null +; IS__CGSCC_OPM-NEXT: br i1 [[TMP2]], label [[TMP4:%.*]], label [[TMP3:%.*]] ; IS__CGSCC_OPM: 3: -; IS__CGSCC_OPM-NEXT: unreachable +; IS__CGSCC_OPM-NEXT: br label [[TMP6:%.*]] ; IS__CGSCC_OPM: 4: -; IS__CGSCC_OPM-NEXT: unreachable +; IS__CGSCC_OPM-NEXT: [[TMP5:%.*]] = tail call i8* @f3b(i8* nonnull @a2) +; IS__CGSCC_OPM-NEXT: br label [[TMP6]] +; IS__CGSCC_OPM: 6: +; IS__CGSCC_OPM-NEXT: [[TMP7:%.*]] = phi i8* [ undef, [[TMP3]] ], [ [[TMP5]], [[TMP4]] ] +; IS__CGSCC_OPM-NEXT: ret i8* [[TMP7]] ; -; IS__CGSCC_NPM: Function Attrs: noinline nounwind uwtable +; IS__CGSCC_NPM: Function Attrs: noinline norecurse nounwind uwtable ; 
IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f2b ; IS__CGSCC_NPM-SAME: (i8* readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] { -; IS__CGSCC_NPM-NEXT: unreachable -; IS__CGSCC_NPM: 2: -; IS__CGSCC_NPM-NEXT: unreachable +; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null +; IS__CGSCC_NPM-NEXT: br i1 [[TMP2]], label [[TMP4:%.*]], label [[TMP3:%.*]] ; IS__CGSCC_NPM: 3: -; IS__CGSCC_NPM-NEXT: unreachable +; IS__CGSCC_NPM-NEXT: br label [[TMP6:%.*]] ; IS__CGSCC_NPM: 4: -; IS__CGSCC_NPM-NEXT: unreachable +; IS__CGSCC_NPM-NEXT: [[TMP5:%.*]] = tail call i8* @f3b() +; IS__CGSCC_NPM-NEXT: br label [[TMP6]] +; IS__CGSCC_NPM: 6: +; IS__CGSCC_NPM-NEXT: [[TMP7:%.*]] = phi i8* [ undef, [[TMP3]] ], [ @a1, [[TMP4]] ] +; IS__CGSCC_NPM-NEXT: ret i8* [[TMP7]] ; %2 = icmp eq i8* %0, null br i1 %2, label %5, label %3 @@ -333,21 +347,23 @@ define internal i8* @f3b(i8* readnone %0) local_unnamed_addr #0 { ; ; IS__CGSCC_OPM: Function Attrs: noinline nounwind uwtable ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f3b -; IS__CGSCC_OPM-SAME: (i8* nonnull readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { -; IS__CGSCC_OPM-NEXT: br label [[TMP3:%.*]] -; IS__CGSCC_OPM: 2: -; IS__CGSCC_OPM-NEXT: unreachable +; IS__CGSCC_OPM-SAME: (i8* readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null +; IS__CGSCC_OPM-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] ; IS__CGSCC_OPM: 3: -; IS__CGSCC_OPM-NEXT: ret i8* @a1 +; IS__CGSCC_OPM-NEXT: br label [[TMP4]] +; IS__CGSCC_OPM: 4: +; IS__CGSCC_OPM-NEXT: [[TMP5:%.*]] = phi i8* [ @a2, [[TMP3]] ], [ @a1, [[TMP1:%.*]] ] +; IS__CGSCC_OPM-NEXT: ret i8* [[TMP5]] ; -; IS__CGSCC_NPM: Function Attrs: noinline nounwind uwtable +; IS__CGSCC_NPM: Function Attrs: nofree noinline norecurse nosync nounwind readnone willreturn uwtable ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f3b -; IS__CGSCC_NPM-SAME: (i8* nonnull readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] { -; IS__CGSCC_NPM-NEXT: br 
label [[TMP3:%.*]] -; IS__CGSCC_NPM: 2: +; IS__CGSCC_NPM-SAME: () local_unnamed_addr #[[ATTR0]] { +; IS__CGSCC_NPM-NEXT: br label [[TMP2:%.*]] +; IS__CGSCC_NPM: 1: ; IS__CGSCC_NPM-NEXT: unreachable -; IS__CGSCC_NPM: 3: -; IS__CGSCC_NPM-NEXT: ret i8* @a1 +; IS__CGSCC_NPM: 2: +; IS__CGSCC_NPM-NEXT: ret i8* undef ; %2 = icmp eq i8* %0, null br i1 %2, label %3, label %5 @@ -1131,7 +1147,7 @@ attributes #2 = { null_pointer_is_valid } ; IS__CGSCC_OPM: attributes #[[ATTR12]] = { readonly willreturn } ;. ; IS__CGSCC_NPM: attributes #[[ATTR0]] = { nofree noinline norecurse nosync nounwind readnone willreturn uwtable } -; IS__CGSCC_NPM: attributes #[[ATTR1]] = { noinline nounwind uwtable } +; IS__CGSCC_NPM: attributes #[[ATTR1]] = { noinline norecurse nounwind uwtable } ; IS__CGSCC_NPM: attributes #[[ATTR2]] = { nounwind } ; IS__CGSCC_NPM: attributes #[[ATTR3]] = { nofree nosync nounwind } ; IS__CGSCC_NPM: attributes #[[ATTR4]] = { argmemonly nofree norecurse nosync nounwind readonly willreturn } diff --git a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll index 2c8e29b007e51..f3439816628d7 100644 --- a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll +++ b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals -; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=10 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=10 -S < %s | FileCheck %s 
--check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM +; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=9 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=9 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM @@ -40,8 +40,8 @@ define i32* @external_ret2_nrw(i32* %n0, i32* %r0, i32* %w0) { ; IS__TUNIT____-SAME: (i32* nofree [[N0:%.*]], i32* nofree [[R0:%.*]], i32* nofree [[W0:%.*]]) #[[ATTR0:[0-9]+]] { ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32* @internal_ret0_nw(i32* nofree [[N0]], i32* nofree [[W0]]) #[[ATTR3:[0-9]+]] -; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32* @internal_ret1_rrw(i32* nofree align 4 [[R0]], i32* nofree [[R0]], i32* nofree [[W0]]) #[[ATTR3]] -; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree readonly align 4 [[R0]], i32* nofree writeonly "no-capture-maybe-returned" [[W0]]) #[[ATTR3]] +; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32* @internal_ret1_rrw(i32* nofree align 4 [[R0]], i32* nofree 
align 4 [[R0]], i32* nofree [[W0]]) #[[ATTR3]] +; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree readonly [[R0]], i32* nofree writeonly "no-capture-maybe-returned" [[W0]]) #[[ATTR3]] ; IS__TUNIT____-NEXT: [[CALL3:%.*]] = call i32* @internal_ret1_rw(i32* nofree align 4 [[R0]], i32* nofree [[W0]]) #[[ATTR3]] ; IS__TUNIT____-NEXT: ret i32* [[CALL3]] ; @@ -91,7 +91,7 @@ define internal i32* @internal_ret0_nw(i32* %n0, i32* %w0) { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree nosync nounwind ; IS__CGSCC____-LABEL: define {{[^@]+}}@internal_ret0_nw -; IS__CGSCC____-SAME: (i32* nofree returned [[N0:%.*]], i32* nofree [[W0:%.*]]) #[[ATTR0]] { +; IS__CGSCC____-SAME: (i32* nofree [[N0:%.*]], i32* nofree [[W0:%.*]]) #[[ATTR0]] { ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[R0:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: [[R1:%.*]] = alloca i32, align 4 @@ -112,7 +112,7 @@ define internal i32* @internal_ret0_nw(i32* %n0, i32* %w0) { ; IS__CGSCC____-NEXT: br label [[RETURN]] ; IS__CGSCC____: return: ; IS__CGSCC____-NEXT: [[RETVAL_0:%.*]] = phi i32* [ [[N0]], [[IF_END]] ], [ [[N0]], [[IF_THEN]] ] -; IS__CGSCC____-NEXT: ret i32* [[N0]] +; IS__CGSCC____-NEXT: ret i32* undef ; entry: %r0 = alloca i32, align 4 @@ -143,7 +143,7 @@ return: ; preds = %if.end, %if.then define internal i32* @internal_ret1_rrw(i32* %r0, i32* %r1, i32* %w0) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind ; IS__TUNIT____-LABEL: define {{[^@]+}}@internal_ret1_rrw -; IS__TUNIT____-SAME: (i32* nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], i32* nofree [[R1:%.*]], i32* nofree [[W0:%.*]]) #[[ATTR0]] { +; IS__TUNIT____-SAME: (i32* nofree noundef nonnull align 4 dereferenceable(4) [[R0:%.*]], i32* nofree align 4 [[R1:%.*]], i32* nofree [[W0:%.*]]) #[[ATTR0]] { ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: [[TMP0:%.*]] = load i32, i32* [[R0]], align 4 ; IS__TUNIT____-NEXT: [[TOBOOL:%.*]] = 
icmp ne i32 [[TMP0]], 0 @@ -166,7 +166,7 @@ define internal i32* @internal_ret1_rrw(i32* %r0, i32* %r1, i32* %w0) { ; IS__TUNIT____-NEXT: [[CALL8:%.*]] = call i32* @internal_ret0_nw(i32* nofree nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) #[[ATTR3]] ; IS__TUNIT____-NEXT: br label [[RETURN]] ; IS__TUNIT____: return: -; IS__TUNIT____-NEXT: [[RETVAL_0:%.*]] = phi i32* [ [[CALL8]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] +; IS__TUNIT____-NEXT: [[RETVAL_0:%.*]] = phi i32* [ [[R1]], [[IF_END]] ], [ [[R1]], [[IF_THEN]] ] ; IS__TUNIT____-NEXT: ret i32* undef ; ; IS__CGSCC____: Function Attrs: argmemonly nofree nosync nounwind diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll index 52a0146fc2caa..5500e5c4ab1dd 100644 --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -1099,6 +1099,27 @@ join: uselistorder label %join, { 1, 0 } } +define i1 @test_liveness(i1 %c) { +entry: + br i1 %c, label %t, label %f +t: + br label %f +f: + %p = phi i1 [true, %entry], [false, %t] + %rc1 = call i1 @ret(i1 %p) + ret i1 %rc1 +} + +define internal i1 @ret(i1 %c) { +entry: + br i1 %c, label %t, label %f +t: + br label %f +f: + %p = phi i1 [%c, %entry], [false, %t] + ret i1 %p +} + ;. ; IS__TUNIT_OPM: attributes #[[ATTR0]] = { nofree nosync nounwind willreturn } ; IS__TUNIT_OPM: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind readnone willreturn } From 8ad39fbaf23893b3384cafa0f179d35dcf3c672b Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Mon, 14 Feb 2022 18:32:00 -0600 Subject: [PATCH 007/748] [Attributor][FIX] Heap2Stack needs to use the alloca AS When we move an allocation from the heap to the stack we need to allocate it in the alloca AS and then cast the result. This also prevents us from inserting the alloca after the allocation call but rather right before. 
Fixes https://github.com/llvm/llvm-project/issues/53858 --- .../Transforms/IPO/AttributorAttributes.cpp | 15 ++++--- .../Attributor/heap_to_stack_gpu.ll | 44 +++++++++++++++++++ llvm/test/Transforms/OpenMP/spmdization.ll | 10 +++-- 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 8e90ac352791a..0ad64dbb45953 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -32,6 +32,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Assumptions.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -6031,13 +6032,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack { else A.emitRemark(AI.CB, "HeapToStack", Remark); + const DataLayout &DL = A.getInfoCache().getDL(); Value *Size; Optional SizeAPI = getSize(A, *this, AI); if (SizeAPI.hasValue()) { Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI); } else { LLVMContext &Ctx = AI.CB->getContext(); - auto &DL = A.getInfoCache().getDL(); ObjectSizeOpts Opts; ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts); SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB); @@ -6057,14 +6058,14 @@ struct AAHeapToStackFunction final : public AAHeapToStack { max(Alignment, MaybeAlign(AlignmentAPI.getValue().getZExtValue())); } - unsigned AS = cast(AI.CB->getType())->getAddressSpace(); - Instruction *Alloca = - new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment, - "", AI.CB->getNextNode()); + // TODO: Hoist the alloca towards the function entry. 
+ unsigned AS = DL.getAllocaAddrSpace(); + Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, + Size, Alignment, "", AI.CB); if (Alloca->getType() != AI.CB->getType()) - Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc", - Alloca->getNextNode()); + Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( + Alloca, AI.CB->getType(), "malloc_cast", AI.CB); auto *I8Ty = Type::getInt8Ty(F->getContext()); auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty); diff --git a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll index 0f207e4027599..5ee0a6892ac69 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll @@ -4,7 +4,12 @@ ; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM +; FIXME: amdgpu doesn't claim malloc is a thing, so the test is somewhat +; useless except the __kmpc_alloc_shared part which now also covers the important +; part this test was initially designed for, make sure the "is freed" check is +; not sufficient on a GPU. 
target triple = "amdgcn-amd-amdhsa" +target datalayout = "A5" declare noalias i8* @malloc(i64) @@ -20,6 +25,7 @@ declare void @no_sync_func(i8* nocapture %p) nofree nosync willreturn declare void @nofree_func(i8* nocapture %p) nofree nosync willreturn +declare void @usei8(i8* %p) declare void @foo(i32* %p) declare void @foo_nounw(i32* %p) nounwind nofree @@ -663,6 +669,43 @@ define void @test16d(i8 %v, i8** %P) { store i8* %1, i8** %P ret void } + +declare i8* @__kmpc_alloc_shared(i64) +declare void @__kmpc_free_shared(i8* nocapture, i64) + +define void @test17() { +; IS________OPM-LABEL: define {{[^@]+}}@test17() { +; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4) +; IS________OPM-NEXT: tail call void @usei8(i8* noalias nocapture nofree [[TMP1]]) #[[ATTR6:[0-9]+]] +; IS________OPM-NEXT: tail call void @__kmpc_free_shared(i8* noalias nocapture [[TMP1]], i64 noundef 4) +; IS________OPM-NEXT: ret void +; +; IS________NPM-LABEL: define {{[^@]+}}@test17() { +; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1, addrspace(5) +; IS________NPM-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP1]] to i8* +; IS________NPM-NEXT: tail call void @usei8(i8* noalias nocapture nofree [[MALLOC_CAST]]) #[[ATTR6:[0-9]+]] +; IS________NPM-NEXT: ret void +; + %1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4) + tail call void @usei8(i8* nocapture nofree %1) willreturn nounwind nosync + tail call void @__kmpc_free_shared(i8* %1, i64 4) + ret void +} + +define void @test17b() { +; CHECK-LABEL: define {{[^@]+}}@test17b() { +; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4) +; CHECK-NEXT: tail call void @usei8(i8* nofree [[TMP1]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: tail call void @__kmpc_free_shared(i8* nocapture [[TMP1]], i64 noundef 4) +; CHECK-NEXT: ret void +; + %1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4) + tail call void @usei8(i8* nofree %1) willreturn nounwind 
nosync + tail call void @__kmpc_free_shared(i8* %1, i64 4) + ret void +} + + ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nosync willreturn } @@ -670,4 +713,5 @@ define void @test16d(i8 %v, i8** %P) { ; CHECK: attributes #[[ATTR3]] = { noreturn } ; CHECK: attributes #[[ATTR4:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn } ; CHECK: attributes #[[ATTR5]] = { nounwind } +; CHECK: attributes #[[ATTR6]] = { nosync nounwind willreturn } ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll index 07ed024cb35bc..752ccff9354ad 100644 --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -678,8 +678,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 -; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* +; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5) +; AMDGPU-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8* +; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32* ; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]] ; AMDGPU-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU: for.cond: @@ -722,8 +723,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) 
[[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4 -; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32* +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5) +; AMDGPU-DISABLED-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8* +; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32* ; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]] ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED: for.cond: From ef39235cb94289281d610e4df52ad2a746d6af61 Mon Sep 17 00:00:00 2001 From: Marek Kurdej Date: Wed, 16 Feb 2022 23:05:34 +0100 Subject: [PATCH 008/748] [clang-format] Make checking for a record more robust and avoid a loop. --- clang/lib/Format/UnwrappedLineFormatter.cpp | 13 +++++++---- clang/unittests/Format/FormatTest.cpp | 24 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 16fa2e7b50f1c..dbf1e4cbbf6a3 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -312,10 +312,15 @@ class LineJoiner { break; // Check if the found line starts a record. - for (const FormatToken *RecordTok = (*J)->Last; RecordTok; - RecordTok = RecordTok->Previous) - if (RecordTok->is(tok::l_brace)) - return isRecordLBrace(*RecordTok); + const FormatToken *LastNonComment = (*J)->Last; + assert(LastNonComment); + if (LastNonComment->is(tok::comment)) { + LastNonComment = LastNonComment->getPreviousNonComment(); + // There must be another token (usually `{`), because we chose a + // line that has a smaller level. 
+ assert(LastNonComment); + } + return isRecordLBrace(*LastNonComment); } } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 6e5dd3284633c..73503696741a7 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -3809,6 +3809,18 @@ TEST_F(FormatTest, FormatsNamespaces) { " }\n" "} // namespace\n", ShortInlineFunctions); + verifyFormat("namespace { /* comment */\n" + " void f() {\n" + " return;\n" + " }\n" + "} // namespace\n", + ShortInlineFunctions); + verifyFormat("namespace { // comment\n" + " void f() {\n" + " return;\n" + " }\n" + "} // namespace\n", + ShortInlineFunctions); verifyFormat("namespace {\n" " int some_int;\n" " void f() {\n" @@ -3828,6 +3840,18 @@ TEST_F(FormatTest, FormatsNamespaces) { " };\n" "} // namespace\n", ShortInlineFunctions); + verifyFormat("namespace {\n" + " class X { /* comment */\n" + " void f() { return; }\n" + " };\n" + "} // namespace\n", + ShortInlineFunctions); + verifyFormat("namespace {\n" + " class X { // comment\n" + " void f() { return; }\n" + " };\n" + "} // namespace\n", + ShortInlineFunctions); verifyFormat("namespace {\n" " struct X {\n" " void f() { return; }\n" From dabbab6861512453aa8f864ef863d31c1d57aa3e Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Sun, 13 Feb 2022 08:00:42 -0800 Subject: [PATCH 009/748] [lld][WebAssembly] Apply global relocs before data relocs Since the code for apply data relocations can sometimes use the values stored in he globals, they need to be relocated before the data relocations can be run. 
Fixes: https://github.com/emscripten-core/emscripten/issues/13398 Differential Revision: https://reviews.llvm.org/D119666 --- lld/test/wasm/pie.ll | 4 ++-- lld/wasm/Writer.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lld/test/wasm/pie.ll b/lld/test/wasm/pie.ll index 40159e79cdcae..0ae1c9ed416cf 100644 --- a/lld/test/wasm/pie.ll +++ b/lld/test/wasm/pie.ll @@ -88,8 +88,8 @@ declare void @external_func() ; DISASSEM: <__wasm_start>: ; DISASSEM-EMPTY: -; DISASSEM-NEXT: call 2 ; DISASSEM-NEXT: call 3 +; DISASSEM-NEXT: call 2 ; DISASSEM-NEXT: end ; Run the same test with threading support. In this mode @@ -107,8 +107,8 @@ declare void @external_func() ; DISASSEM-SHMEM: <__wasm_start>: ; DISASSEM-SHMEM-EMPTY: -; DISASSEM-SHMEM-NEXT: call 3 ; DISASSEM-SHMEM-NEXT: call 5 +; DISASSEM-SHMEM-NEXT: call 3 ; DISASSEM-SHMEM-NEXT: end ; SHMEM: FunctionNames: diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index 95f6483e9e591..621e48e7f273a 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -1253,6 +1253,11 @@ void Writer::createStartFunction() { { raw_string_ostream os(bodyContent); writeUleb128(os, 0, "num locals"); + if (WasmSym::applyGlobalRelocs) { + writeU8(os, WASM_OPCODE_CALL, "CALL"); + writeUleb128(os, WasmSym::applyGlobalRelocs->getFunctionIndex(), + "function index"); + } if (WasmSym::initMemory) { writeU8(os, WASM_OPCODE_CALL, "CALL"); writeUleb128(os, WasmSym::initMemory->getFunctionIndex(), @@ -1264,11 +1269,6 @@ void Writer::createStartFunction() { writeUleb128(os, WasmSym::applyDataRelocs->getFunctionIndex(), "function index"); } - if (WasmSym::applyGlobalRelocs) { - writeU8(os, WASM_OPCODE_CALL, "CALL"); - writeUleb128(os, WasmSym::applyGlobalRelocs->getFunctionIndex(), - "function index"); - } writeU8(os, WASM_OPCODE_END, "END"); } createFunction(WasmSym::startFunction, bodyContent); From b171583ae7d187164b40e8dcd90b9aa2f34c06a2 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 16 Feb 2022 10:40:28 
-0800 Subject: [PATCH 010/748] [mlir] Async: create async.group inside the scf.if branch Reviewed By: cota Differential Revision: https://reviews.llvm.org/D119959 --- mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp | 10 +++++----- .../Async/async-parallel-for-async-dispatch.mlir | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp index fd98f31e09feb..cdd85e5c5b406 100644 --- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp +++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp @@ -615,13 +615,13 @@ static void doAsyncDispatch(ImplicitLocOpBuilder &b, PatternRewriter &rewriter, }; auto asyncDispatch = [&](OpBuilder &nestedBuilder, Location loc) { + ImplicitLocOpBuilder nb(loc, nestedBuilder); + // Create an async.group to wait on all async tokens from the concurrent // execution of multiple parallel compute function. First block will be // executed synchronously in the caller thread. - Value groupSize = b.create(blockCount, c1); - Value group = b.create(GroupType::get(ctx), groupSize); - - ImplicitLocOpBuilder nb(loc, nestedBuilder); + Value groupSize = nb.create(blockCount, c1); + Value group = nb.create(GroupType::get(ctx), groupSize); // Launch async dispatch function for [0, blockCount) range. SmallVector operands = {group, c0, blockCount, blockSize}; @@ -631,7 +631,7 @@ static void doAsyncDispatch(ImplicitLocOpBuilder &b, PatternRewriter &rewriter, asyncDispatchFunction.getCallableResults(), operands); // Wait for the completion of all parallel compute operations. 
- b.create(group); + nb.create(group); nb.create(); }; diff --git a/mlir/test/Dialect/Async/async-parallel-for-async-dispatch.mlir b/mlir/test/Dialect/Async/async-parallel-for-async-dispatch.mlir index bf2f7404abb1e..3fdbcf0ca153b 100644 --- a/mlir/test/Dialect/Async/async-parallel-for-async-dispatch.mlir +++ b/mlir/test/Dialect/Async/async-parallel-for-async-dispatch.mlir @@ -12,7 +12,7 @@ func @loop_1d(%arg0: index, %arg1: index, %arg2: index, %arg3: memref) { // CHECK: scf.if %[[IS_NOOP]] { // CHECK-NEXT: } else { - // CHECK: scf.if {{.*}} { + // CHECK: scf.if {{.*}} { // CHECK: call @parallel_compute_fn(%[[C0]] // CHECK: } else { // CHECK: %[[GROUP:.*]] = async.create_group From 4e24397805ebfef7b3098e36e568ea576473efa1 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 16 Feb 2022 14:26:58 -0800 Subject: [PATCH 011/748] [test][SLPVectorizer][OpaquePtr] Precommit test --- .../SLPVectorizer/X86/opaque-ptr-2.ll | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll new file mode 100644 index 0000000000000..a2c7f1d202999 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -opaque-pointers < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @test(ptr %arg, ptr %arg1, ptr %arg2) { +; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[ARG1:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[ARG2:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr 
inbounds ptr, ptr [[ARG:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[ARG]], i64 1 +; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[TMP4]], align 8 +; CHECK-NEXT: ret void +; + %tmp = getelementptr inbounds i8, ptr %arg2, i64 128 + %tmp3 = getelementptr inbounds ptr, ptr %arg1, i64 -1 + %tmp4 = getelementptr inbounds ptr, ptr %arg, i64 0 + %tmp5 = getelementptr inbounds ptr, ptr %arg, i64 1 + store ptr %tmp3, ptr %tmp4, align 8 + store ptr %tmp, ptr %tmp5, align 8 + ret void +} From 826fae51d2a445027d1fe6694bc92b84fe0ec9f0 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 16 Feb 2022 14:30:51 -0800 Subject: [PATCH 012/748] [SLPVectorizer][OpaquePtrs] Check GEP source element type Fixes a miscompile with opaque pointers. Reviewed By: #opaque-pointers, nikic Differential Revision: https://reviews.llvm.org/D119980 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++-- llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ca1922e80c2cf..f1e42ae9ddec6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4274,9 +4274,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. 
- Type *Ty0 = VL0->getOperand(0)->getType(); + Type *Ty0 = cast(VL0)->getSourceElementType(); for (Value *V : VL) { - Type *CurTy = cast(V)->getOperand(0)->getType(); + Type *CurTy = cast(V)->getSourceElementType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll index a2c7f1d202999..8390ebb7ff569 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr-2.ll @@ -6,12 +6,12 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(ptr %arg, ptr %arg1, ptr %arg2) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[ARG1:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[ARG2:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i8, ptr [[ARG2:%.*]], i64 128 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], i64 -1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[ARG]], i64 1 -; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[TMP4]], align 8 +; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP4]], align 8 +; CHECK-NEXT: store ptr [[TMP]], ptr [[TMP5]], align 8 ; CHECK-NEXT: ret void ; %tmp = getelementptr inbounds i8, ptr %arg2, i64 128 From 318507edee14976e5ca696366b3a745b7b988aa0 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 16 Feb 2022 18:55:43 +0100 Subject: [PATCH 013/748] [libc++] Remove a few unneeded _LIBCPP_CXX03_LANG ifdefs Reviewed By: Quuxplusone, ldionne, #libc Spies: libcxx-commits Differential Revision: https://reviews.llvm.org/D119896 --- libcxx/include/__debug | 24 +++------------------- libcxx/include/__iterator/reverse_access.h | 6 
+----- libcxx/include/__memory/unique_ptr.h | 2 -- libcxx/include/atomic | 17 +-------------- 4 files changed, 5 insertions(+), 44 deletions(-) diff --git a/libcxx/include/__debug b/libcxx/include/__debug index 207591cd4058f..b7677a22c859b 100644 --- a/libcxx/include/__debug +++ b/libcxx/include/__debug @@ -45,15 +45,9 @@ struct _LIBCPP_TYPE_VIS __i_node __i_node* __next_; __c_node* __c_; -#ifndef _LIBCPP_CXX03_LANG __i_node(const __i_node&) = delete; __i_node& operator=(const __i_node&) = delete; -#else -private: - __i_node(const __i_node&); - __i_node& operator=(const __i_node&); -public: -#endif + _LIBCPP_INLINE_VISIBILITY __i_node(void* __i, __i_node* __next, __c_node* __c) : __i_(__i), __next_(__next), __c_(__c) {} @@ -68,15 +62,9 @@ struct _LIBCPP_TYPE_VIS __c_node __i_node** end_; __i_node** cap_; -#ifndef _LIBCPP_CXX03_LANG __c_node(const __c_node&) = delete; __c_node& operator=(const __c_node&) = delete; -#else -private: - __c_node(const __c_node&); - __c_node& operator=(const __c_node&); -public: -#endif + _LIBCPP_INLINE_VISIBILITY __c_node(void* __c, __c_node* __next) : __c_(__c), __next_(__next), beg_(nullptr), end_(nullptr), cap_(nullptr) {} @@ -155,15 +143,9 @@ class _LIBCPP_TYPE_VIS __libcpp_db __libcpp_db(); public: -#ifndef _LIBCPP_CXX03_LANG __libcpp_db(const __libcpp_db&) = delete; __libcpp_db& operator=(const __libcpp_db&) = delete; -#else -private: - __libcpp_db(const __libcpp_db&); - __libcpp_db& operator=(const __libcpp_db&); -public: -#endif + ~__libcpp_db(); class __db_c_iterator; diff --git a/libcxx/include/__iterator/reverse_access.h b/libcxx/include/__iterator/reverse_access.h index 931ff582323db..40c266378d365 100644 --- a/libcxx/include/__iterator/reverse_access.h +++ b/libcxx/include/__iterator/reverse_access.h @@ -21,8 +21,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if !defined(_LIBCPP_CXX03_LANG) - #if _LIBCPP_STD_VER > 11 template @@ -95,9 +93,7 @@ auto crend(const _Cp& __c) -> decltype(_VSTD::rend(__c)) return _VSTD::rend(__c); } -#endif - 
-#endif // !defined(_LIBCPP_CXX03_LANG) +#endif // _LIBCPP_STD_VER > 11 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index 8b330508511af..348c90325e6ff 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -263,7 +263,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr { unique_ptr& operator=(unique_ptr const&) = delete; #endif - _LIBCPP_INLINE_VISIBILITY ~unique_ptr() { reset(); } @@ -485,7 +484,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS unique_ptr<_Tp[], _Dp> unique_ptr(unique_ptr const&) = delete; unique_ptr& operator=(unique_ptr const&) = delete; #endif - public: _LIBCPP_INLINE_VISIBILITY ~unique_ptr() { reset(); } diff --git a/libcxx/include/atomic b/libcxx/include/atomic index 7fed8713e03a7..4a5c4847dabd3 100644 --- a/libcxx/include/atomic +++ b/libcxx/include/atomic @@ -1651,13 +1651,7 @@ struct __atomic_base // false _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR __atomic_base(_Tp __d) _NOEXCEPT : __a_(__d) {} -#ifndef _LIBCPP_CXX03_LANG __atomic_base(const __atomic_base&) = delete; -#else -private: - _LIBCPP_INLINE_VISIBILITY - __atomic_base(const __atomic_base&); -#endif }; #if defined(__cpp_lib_atomic_is_always_lock_free) @@ -2439,19 +2433,10 @@ typedef struct atomic_flag _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR atomic_flag(bool __b) _NOEXCEPT : __a_(__b) {} // EXTENSION -#ifndef _LIBCPP_CXX03_LANG atomic_flag(const atomic_flag&) = delete; atomic_flag& operator=(const atomic_flag&) = delete; atomic_flag& operator=(const atomic_flag&) volatile = delete; -#else -private: - _LIBCPP_INLINE_VISIBILITY - atomic_flag(const atomic_flag&); - _LIBCPP_INLINE_VISIBILITY - atomic_flag& operator=(const atomic_flag&); - _LIBCPP_INLINE_VISIBILITY - atomic_flag& operator=(const atomic_flag&) volatile; -#endif + } atomic_flag; From 7470244475f85e9c8d7c46da87780935860d8026 Mon Sep 17 00:00:00 2001 From: Jacob Lambert 
Date: Tue, 1 Feb 2022 19:40:11 -0800 Subject: [PATCH 014/748] [AMDGPU] Add agpr_count to metadata and AsmParser gfx90a allows the number of ACC registers (AGPRs) to be set independently to the VGPR registers. For both HSA and PAL metadata, we now include an "agpr_count" key to report the number of AGPRs set for supported devices (gfx90a, gfx908, as determined by hasMAIInsts()). This is collected from SIProgramInfo.NumAccVGPR for both HSA and PAL. The AsmParser also now recognizes ".kernel.agpr_count" for supported devices. Differential Revision: https://reviews.llvm.org/D116140 --- llvm/docs/AMDGPUUsage.rst | 5 + llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 8 +- .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 6 ++ .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 35 +++++- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 + llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 + .../Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 5 + .../Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 4 + .../amdpal-metadata-agpr-register-count.ll | 78 ++++++++++++++ .../hsa-metadata-agpr-register-count.ll | 101 ++++++++++++++++++ .../CodeGen/AMDGPU/hsa-metadata-agpr-small.ll | 57 ++++++++++ llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s | 62 +++++++++++ 12 files changed, 363 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll create mode 100644 llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 4233edd899eea..5fae9129bc6a5 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -3185,6 +3185,10 @@ same *vendor-name*. if a higher numbered register is used explicitly. + ".agpr_count" integer Required Number of accumulator + registers required by + each work-item for + GFX90A, GFX908. 
".max_flat_workgroup_size" integer Required Maximum flat work-group size supported by the @@ -11431,6 +11435,7 @@ within a map that has been added by the same *vendor-name*. ".lds_size" integer Local Data Share size in bytes. ".perf_data_buffer_size" integer Performance data buffer size in bytes. ".vgpr_count" integer Number of VGPRs used. + ".agpr_count" integer Number of AGPRs used. ".sgpr_count" integer Number of SGPRs used. ".vgpr_limit" integer If non-zero, indicates the shader was compiled with a directive to instruct the compiler to limit the VGPR usage to diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 26c06d2155ca4..b7a16fde574bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1001,6 +1001,13 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, MD->setEntryPoint(CC, MF.getFunction().getName()); MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); + + // Only set AGPRs for supported devices + const GCNSubtarget &STM = MF.getSubtarget(); + if (STM.hasMAIInsts()) { + MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); + } + MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); if (AMDGPU::isCompute(CC)) { @@ -1017,7 +1024,6 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, MD->setSpiPsInputAddr(MFI->getPSInputAddr()); } - const GCNSubtarget &STM = MF.getSubtarget(); if (STM.isWave32()) MD->setWave32(MF.getFunction().getCallingConv()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 8cc5c1345b0f3..e1e3e6621ee58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -877,6 +877,12 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, 
Kern.getDocument()->getNode(STM.getWavefrontSize()); Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR); Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR); + + // Only add AGPR count to metadata for supported devices + if (STM.hasMAIInsts()) { + Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR); + } + Kern[".max_flat_workgroup_size"] = Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize()); Kern[".sgpr_spill_count"] = diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d20b4918bf09a..d348a4c7e9091 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1125,7 +1125,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { class KernelScopeInfo { int SgprIndexUnusedMin = -1; int VgprIndexUnusedMin = -1; + int AgprIndexUnusedMin = -1; MCContext *Ctx = nullptr; + MCSubtargetInfo const *MSTI = nullptr; void usesSgprAt(int i) { if (i >= SgprIndexUnusedMin) { @@ -1144,7 +1146,31 @@ class KernelScopeInfo { if (Ctx) { MCSymbol* const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); - Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx)); + int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin, + VgprIndexUnusedMin); + Sym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx)); + } + } + } + + void usesAgprAt(int i) { + // Instruction will error in AMDGPUAsmParser::MatchAndEmitInstruction + if (!hasMAIInsts(*MSTI)) + return; + + if (i >= AgprIndexUnusedMin) { + AgprIndexUnusedMin = ++i; + if (Ctx) { + MCSymbol* const Sym = + Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count")); + Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx)); + + // Also update vgpr_count (dependent on agpr_count for gfx908/gfx90a) + MCSymbol* const vSym = + 
Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count")); + int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin, + VgprIndexUnusedMin); + vSym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx)); } } } @@ -1154,14 +1180,19 @@ class KernelScopeInfo { void initialize(MCContext &Context) { Ctx = &Context; + MSTI = Ctx->getSubtargetInfo(); + usesSgprAt(SgprIndexUnusedMin = -1); usesVgprAt(VgprIndexUnusedMin = -1); + if (hasMAIInsts(*MSTI)) { + usesAgprAt(AgprIndexUnusedMin = -1); + } } void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) { switch (RegKind) { case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break; - case IS_AGPR: // fall through + case IS_AGPR: usesAgprAt(DwordRegIndex + RegWidth - 1); break; case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break; default: break; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 66c99fea052dc..fe345723e12ea 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1523,6 +1523,10 @@ bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; } +bool hasMAIInsts(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts]; +} + int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR) { if (has90AInsts && ArgNumAGPR) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 2086684e1255d..7df0eab964e62 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -775,6 +775,7 @@ bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI); bool isGFX90A(const MCSubtargetInfo &STI); bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); +bool hasMAIInsts(const 
MCSubtargetInfo &STI); int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); /// Is Reg - scalar register diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index f6b5975f19347..4ad93f7b0b682 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -209,6 +209,11 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) { getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val); } +// Set the number of used agprs in the metadata. +void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) { + getHwStage(CC)[".agpr_count"] = Val; +} + // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index 7fdd9a8429c15..a45a799e38a9b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -69,6 +69,10 @@ class AMDGPUPALMetadata { // the shader stage to determine the number of vgprs to allocate. void setNumUsedVgprs(unsigned CC, unsigned Val); + // Set the number of used agprs in the metadata. This is an optional advisory + // record for logging etc; + void setNumUsedAgprs(unsigned CC, unsigned Val); + // Set the number of used sgprs in the metadata. This is an optional advisory // record for logging etc; wave dispatch actually uses the rsrc1 register for // the shader stage to determine the number of sgprs to allocate. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll new file mode 100644 index 0000000000000..99a7ae37e0e78 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll @@ -0,0 +1,78 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s + +; COM: Adapted from agpr-register-count.ll +; COM: GFX900 and below should not have .agpr_count present in the metadata + + +; CHECK: .type kernel_32_agprs +; CHECK: NumAgprs: 32 +define amdgpu_kernel void @kernel_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; CHECK: .type kernel_0_agprs +; CHECK: NumAgprs: 0 +define amdgpu_kernel void @kernel_0_agprs() #0 { +bb: + call void asm sideeffect "", "~{v0}" () + ret void +} + +; CHECK: .type kernel_40_vgprs +; CHECK: NumAgprs: 16 +define amdgpu_kernel void @kernel_40_vgprs() #0 { +bb: + call void asm sideeffect "", "~{v39}" () + call void asm sideeffect "", "~{a15}" () + ret void +} + +; CHECK: .type kernel_max_gprs +; CHECK: NumAgprs: 256 +define amdgpu_kernel void @kernel_max_gprs() #0 { +bb: + call void asm sideeffect "", "~{v255}" () + call void asm sideeffect "", "~{a255}" () + ret void +} + +; CHECK: .type func_32_agprs +; CHECK: NumAgprs: 32 +define void @func_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; CHECK: .type kernel_call_func_32_agprs +; CHECK: NumAgprs: 32 +define amdgpu_kernel void @kernel_call_func_32_agprs() #0 { +bb: + call void @func_32_agprs() #0 + ret void +} + +declare void @undef_func() + +; CHECK: .type kernel_call_undef_func +; CHECK: NumAgprs: 32 +define amdgpu_kernel void @kernel_call_undef_func() #0 { +bb: + call void @undef_func() + ret void +} + 
+; CHECK: --- +; CHECK: amdpal.pipelines: +; GFX90A: agpr_count: 0x20 +; GFX90A: vgpr_count: 0x40 + +; GFX908: agpr_count: 0x20 +; GFX908: vgpr_count: 0x20 + +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll new file mode 100644 index 0000000000000..b6eff8846dc8c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll @@ -0,0 +1,101 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX908 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX801 %s + +; COM: Adapted from agpr-register-count.ll +; COM: GFX900 and below should not have .agpr_count present in the metadata + +; CHECK: --- +; CHECK: amdhsa.kernels: + +; GFX90A: - .agpr_count: 32 +; GFX908: - .agpr_count: 32 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_32_agprs +; GFX90A: .vgpr_count: 44 +; GFX908: .vgpr_count: 32 +; GFX801: .vgpr_count: 9 +define amdgpu_kernel void @kernel_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GFX90A: - .agpr_count: 0 +; GFX908: - .agpr_count: 0 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_0_agprs +; GFX90A: .vgpr_count: 1 +; GFX908: .vgpr_count: 1 +; GFX801: .vgpr_count: 1 +define amdgpu_kernel void @kernel_0_agprs() #0 { +bb: + call void asm sideeffect "", "~{v0}" () + ret void +} + +; GFX90A: - .agpr_count: 16 +; GFX908: - .agpr_count: 16 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_40_vgprs +; GFX90A: .vgpr_count: 56 +; GFX908: .vgpr_count: 40 +; GFX801: .vgpr_count: 40 +define amdgpu_kernel void @kernel_40_vgprs() #0 { +bb: + call void asm 
sideeffect "", "~{v39}" () + call void asm sideeffect "", "~{a15}" () + ret void +} + +; GFX90A: - .agpr_count: 256 +; GFX908: - .agpr_count: 256 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_max_gprs +; GFX90A: .vgpr_count: 512 +; GFX908: .vgpr_count: 256 +; GFX801: .vgpr_count: 256 +define amdgpu_kernel void @kernel_max_gprs() #0 { +bb: + call void asm sideeffect "", "~{v255}" () + call void asm sideeffect "", "~{a255}" () + ret void +} + +define void @func_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GFX90A: - .agpr_count: 32 +; GFX908: - .agpr_count: 32 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_call_func_32_agprs +; GFX90A: .vgpr_count: 44 +; GFX908: .vgpr_count: 32 +; GFX801: .vgpr_count: 9 +define amdgpu_kernel void @kernel_call_func_32_agprs() #0 { +bb: + call void @func_32_agprs() #0 + ret void +} + +declare void @undef_func() + +; GFX90A: - .agpr_count: 32 +; GFX908: - .agpr_count: 32 +; GFX801-NOT: - .agpr_count: +; CHECK: .name: kernel_call_undef_func +; GFX90A: .vgpr_count: 64 +; GFX908: .vgpr_count: 32 +; GFX801: .vgpr_count: 32 +define amdgpu_kernel void @kernel_call_undef_func() #0 { +bb: + call void @undef_func() + ret void +} + +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll new file mode 100644 index 0000000000000..5ec1502899edf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=CHECK,GFX801 %s + +; COM: Comments for each kernel +; CHECK: kernel_32_agprs +; GFX908: ; NumVgprs: 9 +; GFX908 ; 
NumAgprs: 32 +; GFX908 ; TotalNumVgprs: 32 + +; GFX90A: ; NumVgprs: 9 +; GFX90A ; NumAgprs: 32 +; GFX90A ; TotalNumVgprs: 44 + +; GFX801: ; NumVgprs: 9 + +; CHECK: kernel_40_vgprs +; GFX908: ; NumVgprs: 40 +; GFX908 ; NumAgprs: 16 +; GFX908 ; TotalNumVgprs: 40 + +; GFX90A: ; NumVgprs: 40 +; GFX90A ; NumAgprs: 16 +; GFX90A ; TotalNumVgprs: 56 + +; GFX801: ; NumVgprs: 40 + +; COM: Metadata +; GFX908: - .agpr_count: 32 +; GFX908: .vgpr_count: 32 + +; GFX90A: - .agpr_count: 32 +; GFX90A: .vgpr_count: 44 + +; GFX801: .vgpr_count: 9 +define amdgpu_kernel void @kernel_32_agprs() #0 { +bb: + call void asm sideeffect "", "~{v8}" () + call void asm sideeffect "", "~{a31}" () + ret void +} + +; GFX908: - .agpr_count: 16 +; GFX908: .vgpr_count: 40 + +; GFX90A: - .agpr_count: 16 +; GFX90A: .vgpr_count: 56 + +; GFX801: .vgpr_count: 40 +define amdgpu_kernel void @kernel_40_vgprs() #0 { +bb: + call void asm sideeffect "", "~{v39}" () + call void asm sideeffect "", "~{a15}" () + ret void +} + +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } diff --git a/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s new file mode 100644 index 0000000000000..ea065ea9ef5a9 --- /dev/null +++ b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s @@ -0,0 +1,62 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefixes=GFX90A %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck -check-prefixes=GFX908 %s +// Based on sym_kernel_scope.s + +.byte .kernel.agpr_count +// CHECK: .byte 0 +.byte .kernel.vgpr_count +// CHECK: .byte 0 + + v_accvgpr_write_b32 a0, v6 + v_accvgpr_read_b32 v3, a3 + s_endpgm +.byte .kernel.agpr_count +// GFX90A: .byte 4 +// GFX908: .byte 4 +.byte .kernel.vgpr_count +// GFX90A: .byte 12 +// GFX908: .byte 7 + +.amdgpu_hsa_kernel K1 +K1: +.byte .kernel.agpr_count +// CHECK: .byte 0 +.byte .kernel.vgpr_count +// CHECK: .byte 0 + v_accvgpr_write_b32 a44, v6 + s_endpgm +.byte .kernel.agpr_count 
+// GFX90A: .byte 45 +// GFX908: .byte 45 +.byte .kernel.vgpr_count +// GFX90A: .byte 53 +// GFX908: .byte 45 + +.amdgpu_hsa_kernel K2 +.byte .kernel.agpr_count +// CHECK: .byte 0 +.byte .kernel.vgpr_count +// CHECK: .byte 0 +K2: + v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3 + s_endpgm +.byte .kernel.agpr_count +// GFX90A: .byte 4 +// GFX908: .byte 4 +.byte .kernel.vgpr_count +// GFX90A: .byte 8 +// GFX908: .byte 4 + +.text +.amdgpu_hsa_kernel K3 +K3: + v_accvgpr_read_b32 v[0], a0 + v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] cbsz:1 abid:2 blgp:3 + s_endpgm + +.byte .kernel.agpr_count +// GFX90A: .byte 16 +// GFX908: .byte 16 +.byte .kernel.vgpr_count +// GFX90A: .byte 20 +// GFX908: .byte 16 From c195addb606be4bc1d8f2716215657148382dbf8 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 15 Feb 2022 12:02:35 -0800 Subject: [PATCH 015/748] [NFC] [MTE] [HWASan] Remove unnecessary member of AllocaInfo Reviewed By: eugenis Differential Revision: https://reviews.llvm.org/D119981 --- llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h | 4 +--- llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 8 ++++---- .../lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 3 +-- llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 7 +++---- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h index c00d2c6445a9c..7de9beb8887eb 100644 --- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h +++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h @@ -19,7 +19,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" -#include "llvm/IR/ValueHandle.h" namespace llvm { namespace memtag { @@ -75,7 +74,6 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst); struct AllocaInfo { AllocaInst *AI; - TrackingVH OldAI; // Track through RAUW to replace debug uses. 
SmallVector LifetimeStart; SmallVector LifetimeEnd; SmallVector DbgVariableIntrinsics; @@ -102,7 +100,7 @@ class StackInfoBuilder { }; uint64_t getAllocaSizeInBytes(const AllocaInst &AI); -bool alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Align); +void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Align); } // namespace memtag } // namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 22383432d1a8a..589074e16d9b0 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -48,6 +48,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -532,9 +533,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { for (auto &I : SInfo.AllocasToInstrument) { memtag::AllocaInfo &Info = I.second; assert(Info.AI && isInterestingAlloca(*Info.AI)); - auto *PrevAI = Info.AI; - if (memtag::alignAndPadAlloca(Info, kTagGranuleSize)) - PrevAI->eraseFromParent(); + TrackingVH OldAI = Info.AI; + memtag::alignAndPadAlloca(Info, kTagGranuleSize); AllocaInst *AI = Info.AI; int Tag = NextTag; NextTag = (NextTag + 1) % 16; @@ -590,7 +590,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { // Fixup debug intrinsics to point to the new alloca. 
for (auto DVI : Info.DbgVariableIntrinsics) - DVI->replaceVariableLocationOp(Info.OldAI, Info.AI); + DVI->replaceVariableLocationOp(OldAI, Info.AI); } // If we have instrumented at least one alloca, all unrecognized lifetime diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 0d599733980f4..350f9701d48d4 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1378,8 +1378,7 @@ bool HWAddressSanitizer::instrumentStack( II->eraseFromParent(); } } - if (memtag::alignAndPadAlloca(Info, Align(Mapping.getObjectAlignment()))) - AI->eraseFromParent(); + memtag::alignAndPadAlloca(Info, Align(Mapping.getObjectAlignment())); } for (auto &I : SInfo.UnrecognizedLifetimes) I->eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 602a2fe4ed3cd..641dd58456f6f 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -67,7 +67,6 @@ void StackInfoBuilder::visit(Instruction &Inst) { if (AllocaInst *AI = dyn_cast(&Inst)) { if (IsInterestingAlloca(*AI)) { Info.AllocasToInstrument[AI].AI = AI; - Info.AllocasToInstrument[AI].OldAI = AI; } return; } @@ -109,7 +108,7 @@ uint64_t getAllocaSizeInBytes(const AllocaInst &AI) { return AI.getAllocationSizeInBits(DL).getValue() / 8; } -bool alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) { +void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) { const Align NewAlignment = max(MaybeAlign(Info.AI->getAlign()), Alignment); Info.AI->setAlignment(NewAlignment); auto &Ctx = Info.AI->getFunction()->getContext(); @@ -117,7 +116,7 @@ bool alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) { uint64_t Size = getAllocaSizeInBytes(*Info.AI); uint64_t AlignedSize = alignTo(Size, 
Alignment); if (Size == AlignedSize) - return false; + return; // Add padding to the alloca. Type *AllocatedType = @@ -139,8 +138,8 @@ bool alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) { auto *NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI); Info.AI->replaceAllUsesWith(NewPtr); + Info.AI->eraseFromParent(); Info.AI = NewAI; - return true; } } // namespace memtag From 6e2cf33b24b323d35c77cb53bdba7c72bd2b42b9 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 16 Feb 2022 15:42:20 -0800 Subject: [PATCH 016/748] [mlir][doc] Add passes docs to Passes.md --- mlir/docs/Passes.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md index f114483965638..13ec3ba4680d5 100644 --- a/mlir/docs/Passes.md +++ b/mlir/docs/Passes.md @@ -8,6 +8,10 @@ This document describes the available MLIR passes and their contracts. [include "GeneralPasses.md"] +## Bufferization Passes + +[include "BufferizationPasses.md"] + ## Conversion Passes [include "ConversionPasses.md"] @@ -72,6 +76,10 @@ This document describes the available MLIR passes and their contracts. [include "TensorPasses.md"] +## `vector` Dialect Passes + +[include "VectorPasses.md"] + ## TOSA Dialect Passes [include "TosaPasses.md"] From 3884cb92359f68390816344f5e9bb40d78e492a5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 14 Feb 2022 17:22:55 -0500 Subject: [PATCH 017/748] AMDGPU: Always reserve VGPR for AGPR copies on gfx908 Just because there aren't AGPRs in the original program doesn't mean the register allocator can't choose to use them (unless we were to forcibly reserve all AGPRs if there weren't any uses). This happens in high pressure situations and introduces copies to avoid spills. In this test, the allocator ends up introducing a copy from SGPR to AGPR which requires an intermediate VGPR. 
I don't believe it would introduce a copy from AGPR to AGPR in this situation, since it would be trying to use an intermediate with a different class. Theoretically this is also broken on gfx90a, but I have been unable to come up with a testcase. --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 +- .../AMDGPU/agpr-copy-no-free-registers.ll | 405 +++++++++++++-- .../regalloc-introduces-copy-sgpr-to-agpr.mir | 471 ++++++++++++++++++ .../test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll | 12 +- 4 files changed, 857 insertions(+), 33 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 3eaa28a0afeb0..9ce83a65dd0d8 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -622,7 +622,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } else MaxNumAGPRs = 0; } - } else if (ST.hasMAIInsts() && MFI->usesAGPRs(MF)) { + } else if (ST.hasMAIInsts()) { // In order to guarantee copying between AGPRs, we need a scratch VGPR // available at all times. 
reserveRegisterTuples(Reserved, AMDGPU::VGPR32); diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 8ec375cf93b6d..6a1f13c5d3e1c 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -254,59 +254,59 @@ define amdgpu_kernel void @no_agpr_no_reserve(<32 x i32> addrspace(1)* %arg) #0 ; GFX908-LABEL: no_agpr_no_reserve: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX908-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 7, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 -; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] -; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:48 -; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:32 -; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 -; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 -; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 +; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:16 +; GFX908-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] +; GFX908-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:48 +; GFX908-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:32 +; GFX908-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:80 +; GFX908-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:64 +; GFX908-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:112 +; GFX908-NEXT: global_load_dwordx4 v[33:36], v4, s[0:1] offset:96 ; GFX908-NEXT: s_waitcnt vmcnt(7) ; GFX908-NEXT: v_add_u32_e32 v3, v3, v3 ; GFX908-NEXT: v_add_u32_e32 v2, v2, v2 ; GFX908-NEXT: v_add_u32_e32 v1, v1, v1 ; GFX908-NEXT: v_add_u32_e32 v0, v0, v0 ; GFX908-NEXT: s_waitcnt vmcnt(6) +; GFX908-NEXT: v_add_u32_e32 v8, v8, v8 ; 
GFX908-NEXT: v_add_u32_e32 v7, v7, v7 ; GFX908-NEXT: v_add_u32_e32 v6, v6, v6 -; GFX908-NEXT: v_add_u32_e32 v5, v5, v5 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v31, v31, v31 -; GFX908-NEXT: v_add_u32_e32 v30, v30, v30 -; GFX908-NEXT: v_add_u32_e32 v29, v29, v29 -; GFX908-NEXT: v_add_u32_e32 v28, v28, v28 -; GFX908-NEXT: v_add_u32_e32 v4, v4, v4 +; GFX908-NEXT: v_add_u32_e32 v36, v36, v36 +; GFX908-NEXT: v_add_u32_e32 v35, v35, v35 +; GFX908-NEXT: v_add_u32_e32 v34, v34, v34 +; GFX908-NEXT: v_add_u32_e32 v33, v33, v33 +; GFX908-NEXT: v_add_u32_e32 v5, v5, v5 +; GFX908-NEXT: v_add_u32_e32 v12, v12, v12 ; GFX908-NEXT: v_add_u32_e32 v11, v11, v11 ; GFX908-NEXT: v_add_u32_e32 v10, v10, v10 ; GFX908-NEXT: v_add_u32_e32 v9, v9, v9 -; GFX908-NEXT: v_add_u32_e32 v8, v8, v8 +; GFX908-NEXT: v_add_u32_e32 v16, v16, v16 ; GFX908-NEXT: v_add_u32_e32 v15, v15, v15 ; GFX908-NEXT: v_add_u32_e32 v14, v14, v14 ; GFX908-NEXT: v_add_u32_e32 v13, v13, v13 -; GFX908-NEXT: v_add_u32_e32 v12, v12, v12 +; GFX908-NEXT: v_add_u32_e32 v20, v20, v20 ; GFX908-NEXT: v_add_u32_e32 v19, v19, v19 ; GFX908-NEXT: v_add_u32_e32 v18, v18, v18 ; GFX908-NEXT: v_add_u32_e32 v17, v17, v17 -; GFX908-NEXT: v_add_u32_e32 v16, v16, v16 +; GFX908-NEXT: v_add_u32_e32 v24, v24, v24 ; GFX908-NEXT: v_add_u32_e32 v23, v23, v23 ; GFX908-NEXT: v_add_u32_e32 v22, v22, v22 ; GFX908-NEXT: v_add_u32_e32 v21, v21, v21 -; GFX908-NEXT: v_add_u32_e32 v20, v20, v20 +; GFX908-NEXT: v_add_u32_e32 v28, v28, v28 ; GFX908-NEXT: v_add_u32_e32 v27, v27, v27 ; GFX908-NEXT: v_add_u32_e32 v26, v26, v26 ; GFX908-NEXT: v_add_u32_e32 v25, v25, v25 -; GFX908-NEXT: v_add_u32_e32 v24, v24, v24 -; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:96 -; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:112 -; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:64 -; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:80 -; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], 
s[0:1] offset:32 -; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:48 -; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] -; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:96 +; GFX908-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1] +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: no_agpr_no_reserve: @@ -518,9 +518,358 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 { ret void } +define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 { +; GFX908-LABEL: introduced_copy_to_sgpr: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: global_load_ushort v0, v[0:1], off glc +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 +; GFX908-NEXT: s_load_dword s7, s[4:5], 0x18 +; GFX908-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX908-NEXT: s_mov_b32 s6, 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX908-NEXT: s_sub_i32 s4, 0, s1 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 +; GFX908-NEXT: s_or_b32 s10, s10, 28 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX908-NEXT: v_mov_b32_e32 v13, s10 +; GFX908-NEXT: s_lshr_b32 s12, s7, 16 +; GFX908-NEXT: v_mov_b32_e32 v32, s11 +; GFX908-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v28, s7 +; GFX908-NEXT: v_cvt_f32_f16_e32 v29, s12 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v13 +; GFX908-NEXT: v_mul_lo_u32 v1, s4, v2 
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v32 +; GFX908-NEXT: v_mov_b32_e32 v11, s3 +; GFX908-NEXT: s_lshl_b64 s[4:5], s[8:9], 5 +; GFX908-NEXT: v_mul_hi_u32 v3, v2, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mov_b32_e32 v10, s2 +; GFX908-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX908-NEXT: v_mul_hi_u32 v4, s0, v2 +; GFX908-NEXT: v_mul_lo_u32 v5, v4, s1 +; GFX908-NEXT: v_add_u32_e32 v6, 1, v4 +; GFX908-NEXT: v_sub_u32_e32 v5, s0, v5 +; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX908-NEXT: v_subrev_u32_e32 v6, s1, v5 +; GFX908-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX908-NEXT: v_add_u32_e32 v7, 1, v4 +; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v5 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_and_b32_e32 v30, 0xffff, v0 +; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc +; GFX908-NEXT: v_mul_lo_u32 v8, s9, v30 +; GFX908-NEXT: v_mul_hi_u32 v9, s8, v30 +; GFX908-NEXT: v_lshlrev_b64 v[2:3], 5, v[0:1] +; GFX908-NEXT: v_mul_lo_u32 v6, s8, v30 +; GFX908-NEXT: v_add_u32_e32 v7, v9, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_lshlrev_b64 v[6:7], 5, v[6:7] +; GFX908-NEXT: s_branch .LBB3_2 +; GFX908-NEXT: .LBB3_1: ; %bb12 +; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; GFX908-NEXT: v_add_co_u32_e32 v10, vcc, v10, v0 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a3 +; GFX908-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a2 +; GFX908-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX908-NEXT: .LBB3_2: ; %bb9 +; GFX908-NEXT: ; =>This Loop Header: Depth=1 +; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 +; GFX908-NEXT: s_cbranch_scc0 .LBB3_1 +; GFX908-NEXT: ; 
%bb.3: ; %bb14 +; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GFX908-NEXT: v_mov_b32_e32 v3, 0 +; GFX908-NEXT: global_load_dwordx2 v[12:13], v[2:3], off +; GFX908-NEXT: s_mov_b32 s7, s6 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[10:11] +; GFX908-NEXT: v_accvgpr_read_b32 v15, a1 +; GFX908-NEXT: v_mov_b32_e32 v17, s7 +; GFX908-NEXT: v_mov_b32_e32 v19, s7 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a0 +; GFX908-NEXT: v_mov_b32_e32 v16, s6 +; GFX908-NEXT: v_mov_b32_e32 v18, s6 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v22, vcc, 1, v12 +; GFX908-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v13, vcc +; GFX908-NEXT: v_mul_lo_u32 v23, s4, v20 +; GFX908-NEXT: v_mul_hi_u32 v24, s4, v22 +; GFX908-NEXT: v_mul_lo_u32 v25, s5, v22 +; GFX908-NEXT: v_mul_lo_u32 v31, s4, v22 +; GFX908-NEXT: v_mov_b32_e32 v21, s7 +; GFX908-NEXT: v_add_u32_e32 v22, v24, v23 +; GFX908-NEXT: v_add_u32_e32 v33, v22, v25 +; GFX908-NEXT: v_mov_b32_e32 v23, s7 +; GFX908-NEXT: v_mov_b32_e32 v20, s6 +; GFX908-NEXT: v_mov_b32_e32 v22, s6 +; GFX908-NEXT: s_branch .LBB3_5 +; GFX908-NEXT: .LBB3_4: ; %bb58 +; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: v_add_co_u32_e32 v12, vcc, v12, v30 +; GFX908-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13] +; GFX908-NEXT: v_add_co_u32_e64 v14, s[2:3], v14, v6 +; GFX908-NEXT: v_addc_co_u32_e64 v15, s[2:3], v15, v7, s[2:3] +; GFX908-NEXT: s_and_b64 vcc, exec, vcc +; GFX908-NEXT: s_cbranch_vccz .LBB3_1 +; GFX908-NEXT: .LBB3_5: ; %bb16 +; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 +; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX908-NEXT: v_add_co_u32_e32 v24, vcc, v14, v31 +; GFX908-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v33, vcc +; GFX908-NEXT: global_load_dword v35, v[24:25], off offset:-12 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_load_dword v34, v[24:25], off offset:-8 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: 
global_load_dword v26, v[24:25], off offset:-4 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_load_dword v24, v[24:25], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: ds_read_b64 v[24:25], v1 +; GFX908-NEXT: ds_read_b64 v[26:27], v0 +; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX908-NEXT: ; %bb.6: ; %bb51 +; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v8, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v9, v35 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GFX908-NEXT: v_add_f32_e32 v4, v28, v24 +; GFX908-NEXT: v_add_f32_e32 v5, v29, v25 +; GFX908-NEXT: v_add_f32_e32 v2, 0, v24 +; GFX908-NEXT: v_add_f32_e32 v3, 0, v25 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v27 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v26 +; GFX908-NEXT: v_add_f32_e32 v25, v35, v25 +; GFX908-NEXT: v_add_f32_e32 v24, v34, v24 +; GFX908-NEXT: v_add_f32_e32 v17, v17, v5 +; GFX908-NEXT: v_add_f32_e32 v16, v16, v4 +; GFX908-NEXT: v_add_f32_e32 v19, v19, v3 +; GFX908-NEXT: v_add_f32_e32 v18, v18, v2 +; GFX908-NEXT: v_add_f32_e32 v20, v20, v9 +; GFX908-NEXT: v_add_f32_e32 v21, v21, v8 +; GFX908-NEXT: v_add_f32_e32 v22, v22, v24 +; GFX908-NEXT: v_add_f32_e32 v23, v23, v25 +; GFX908-NEXT: s_branch .LBB3_4 +; +; GFX90A-LABEL: introduced_copy_to_sgpr: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: global_load_ushort v10, v[0:1], off glc +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x18 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: s_sub_i32 s5, 0, s7 +; GFX90A-NEXT: s_lshr_b32 s12, 
s2, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s12 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 +; GFX90A-NEXT: s_or_b32 s10, s10, 28 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1] +; GFX90A-NEXT: v_mul_lo_u32 v8, s5, v0 +; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8 +; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s7 +; GFX90A-NEXT: v_sub_u32_e32 v8, s6, v8 +; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v9, s7, v8 +; GFX90A-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v30, 0xffff, v10 +; GFX90A-NEXT: v_mul_lo_u32 v11, s1, v30 +; GFX90A-NEXT: v_mul_hi_u32 v12, s0, v30 +; GFX90A-NEXT: v_mul_lo_u32 v10, s0, v30 +; GFX90A-NEXT: v_add_u32_e32 v11, v12, v11 +; GFX90A-NEXT: v_lshlrev_b64 v[10:11], 5, v[10:11] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], 0, 0 +; GFX90A-NEXT: s_branch .LBB3_2 +; GFX90A-NEXT: .LBB3_1: ; %bb12 +; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v9, vcc +; GFX90A-NEXT: .LBB3_2: ; %bb9 +; GFX90A-NEXT: ; =>This Loop Header: Depth=1 +; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 +; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1 +; GFX90A-NEXT: ; %bb.3: ; 
%bb14 +; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[12:13], off +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[16:17], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[22:23], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, 1, v14 +; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v15, vcc +; GFX90A-NEXT: v_mul_lo_u32 v25, s2, v25 +; GFX90A-NEXT: v_mul_hi_u32 v26, s2, v24 +; GFX90A-NEXT: v_mul_lo_u32 v27, s3, v24 +; GFX90A-NEXT: v_mul_lo_u32 v31, s2, v24 +; GFX90A-NEXT: v_add_u32_e32 v24, v26, v25 +; GFX90A-NEXT: v_add_u32_e32 v32, v24, v27 +; GFX90A-NEXT: v_pk_mov_b32 v[24:25], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: s_branch .LBB3_5 +; GFX90A-NEXT: .LBB3_4: ; %bb58 +; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, v14, v30 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, v16, v10 +; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v11, vcc +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX90A-NEXT: s_and_b64 vcc, exec, vcc +; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 +; GFX90A-NEXT: .LBB3_5: ; %bb16 +; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 +; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX90A-NEXT: v_add_co_u32_e32 v26, vcc, v16, v31 +; GFX90A-NEXT: v_addc_co_u32_e32 v27, vcc, v17, v32, vcc +; GFX90A-NEXT: global_load_dword v34, v[26:27], off offset:-12 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v33, v[26:27], off offset:-8 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v28, v[26:27], off offset:-4 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v28, v[26:27], off glc +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ; kill: killed $vgpr26 killed $vgpr27 +; GFX90A-NEXT: ds_read_b64 v[26:27], v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ds_read_b64 v[28:29], v0 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4 +; GFX90A-NEXT: ; %bb.6: ; %bb51 +; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v37, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v36, v33 +; GFX90A-NEXT: v_pk_add_f32 v[38:39], v[2:3], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[40:41], v[26:27], 0 op_sel_hi:[1,0] +; GFX90A-NEXT: v_pk_add_f32 v[28:29], v[34:35], v[28:29] +; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[36:37], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[38:39] +; GFX90A-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[40:41] +; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[28:29] +; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[26:27] +; GFX90A-NEXT: s_branch .LBB3_4 +bb: + %i = load volatile i16, i16 addrspace(4)* undef, align 2 + %i6 = zext i16 %i to i64 + %i7 = udiv i32 %arg1, %arg2 + %i8 = zext i32 %i7 to i64 + br label %bb9 + +bb9: ; preds = %bb12, %bb + %i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ] + %i11 = icmp slt i64 %i10, 0 + br i1 undef, label %bb14, label %bb12 + +bb12: ; preds = %bb58, %bb9 + %i13 = add nuw nsw i64 %i10, %i8 + br label %bb9 + +bb14: ; preds = %bb9 + %i15 = load i64, i64 addrspace(1)* null, align 8 + br label %bb16 + +bb16: ; preds = %bb58, %bb14 + %i17 = phi i64 [ %i65, %bb58 ], [ %i15, %bb14 ] + %i18 = phi <2 x float> [ %i59, %bb58 ], [ zeroinitializer, %bb14 ] + %i19 = phi <2 x float> [ %i60, %bb58 ], [ zeroinitializer, %bb14 ] + %i20 = phi <2 x float> [ %i61, %bb58 ], [ zeroinitializer, %bb14 ] + %i21 = phi <2 x float> [ %i62, 
%bb58 ], [ zeroinitializer, %bb14 ] + %i22 = add nsw i64 %i17, 1 + %i23 = mul nsw i64 %i22, %arg + %i24 = add nsw i64 %i23, %i10 + %i25 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 8 + %i26 = bitcast half addrspace(1)* %i25 to <2 x half> addrspace(1)* + %i27 = load volatile <2 x half>, <2 x half> addrspace(1)* %i26, align 16 + %i28 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 10 + %i29 = bitcast half addrspace(1)* %i28 to <2 x half> addrspace(1)* + %i30 = load volatile <2 x half>, <2 x half> addrspace(1)* %i29, align 4 + %i31 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 12 + %i32 = bitcast half addrspace(1)* %i31 to <2 x half> addrspace(1)* + %i33 = load volatile <2 x half>, <2 x half> addrspace(1)* %i32, align 8 + %i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14 + %i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)* + %i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4 + %i37 = fpext <2 x half> %arg4 to <2 x float> + %i39 = fpext <2 x half> %i27 to <2 x float> + %i40 = fpext <2 x half> %i30 to <2 x float> + %i41 = fpext <2 x half> %i33 to <2 x float> + %i42 = fpext <2 x half> %i36 to <2 x float> + %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8 + %i44 = fadd contract <2 x float> %i37, %i43 + %i45 = fadd contract <2 x float> %i43, zeroinitializer + %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32 + %i47 = fadd contract <2 x float> %i39, %i46 + %i48 = fadd contract <2 x float> %i40, %i43 + %i49 = fadd contract <2 x float> %i41, zeroinitializer + %i50 = fadd contract <2 x float> %i42, zeroinitializer + fence syncscope("workgroup") acquire + br i1 %i11, label %bb58, label %bb51 + +bb51: ; preds = %bb16 + %i52 = fadd contract <2 x float> %i18, %i44 + %i53 = fadd contract <2 x float> %i19, %i45 + %i54 = fadd contract <2 x 
float> %i20, %i47 + %i55 = fadd contract <2 x float> %i21, %i48 + %i56 = fadd contract <2 x float> %i49, zeroinitializer + %i57 = fadd contract <2 x float> %i50, zeroinitializer + br label %bb58 + +bb58: ; preds = %bb51, %bb16 + %i59 = phi <2 x float> [ %i18, %bb16 ], [ %i52, %bb51 ] + %i60 = phi <2 x float> [ %i19, %bb16 ], [ %i53, %bb51 ] + %i61 = phi <2 x float> [ %i20, %bb16 ], [ %i54, %bb51 ] + %i62 = phi <2 x float> [ %i21, %bb16 ], [ %i55, %bb51 ] + %i63 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i56, %bb51 ] + %i64 = phi <2 x float> [ zeroinitializer, %bb16 ], [ %i57, %bb51 ] + %i65 = add nsw i64 %i17, %i6 + %i66 = icmp slt i64 %i65, 0 + br i1 %i66, label %bb16, label %bb12 +} + declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { "amdgpu-waves-per-eu"="6,6" } attributes #1 = { convergent nounwind readnone willreturn } attributes #2 = { nounwind readnone willreturn } +attributes #3 = { "amdgpu-waves-per-eu"="7,7" } diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir new file mode 100644 index 0000000000000..00936443ba9be --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir @@ -0,0 +1,471 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,0 -stop-after=postrapseudos -o - %s | FileCheck -check-prefix=GFX908 %s + +# This testcase has a long sequence of sgpr to vgpr copies with a lot of vgpr +# pressure, encouraging the allocator to displace some into AGPRs. This +# introduces SGPR to AGPR copies which require a reserved temporary VGPR to +# handle. 
+ +--- | + + define amdgpu_kernel void @regalloc_introduces_s_to_a_copy() #0 { + ret void + } + + attributes #0 = { "amdgpu-waves-per-eu"="7,7" } + +... +--- +name: regalloc_introduces_s_to_a_copy +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } + workGroupIDX: { reg: '$sgpr6' } + privateSegmentWaveByteOffset: { reg: '$sgpr7' } + workItemIDX: { reg: '$vgpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 + + ; GFX908-LABEL: name: regalloc_introduces_s_to_a_copy + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47, $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, $vgpr32_vgpr33_vgpr34_vgpr35, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr7, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, 
implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = 
GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec 
+ ; GFX908-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr34 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $vgpr34 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX908-NEXT: $agpr35 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr5 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr6 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr7 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr8 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr9 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr10 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 
0, 0 + ; GFX908-NEXT: renamable $sgpr11 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr12 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr13 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr14 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr16 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr18 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr19 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr20 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr21 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr22 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr23 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr24 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr25 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr26 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr27 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr28 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr29 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr30 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr31 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr34 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr35 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr36 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr37 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: 
renamable $sgpr38 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr39 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: renamable $sgpr40 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + ; GFX908-NEXT: $vgpr32 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr32, implicit $exec, implicit $exec + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr8, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr10, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec + ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr12, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr13, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr16, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr17, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr20, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr21, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr22, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr23, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, implicit $exec :: (store (s32) into %stack.19, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr24, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr25, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, implicit $exec :: (store (s32) into %stack.21, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr26, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, implicit $exec :: (store (s32) into %stack.22, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr27, implicit $exec, implicit $exec + ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, implicit $exec :: (store (s32) into %stack.23, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr28, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, implicit $exec :: (store (s32) into %stack.24, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr29, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, implicit $exec :: (store (s32) into %stack.25, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr30, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, implicit $exec :: (store (s32) into %stack.26, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr31, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, implicit $exec :: (store (s32) into %stack.27, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, implicit $exec :: (store (s32) into %stack.28, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr35, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, implicit $exec :: (store (s32) into %stack.29, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr36, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, implicit $exec :: (store (s32) into %stack.30, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr37, implicit $exec, implicit $exec + ; GFX908-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, implicit $exec :: (store (s32) into %stack.31, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr38, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, implicit $exec :: (store (s32) into %stack.32, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr39, implicit $exec, implicit $exec + ; GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr34, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, implicit $exec :: (store (s32) into %stack.33, addrspace 5) + ; GFX908-NEXT: $vgpr34 = V_MOV_B32_e32 killed $sgpr40, implicit $exec, implicit $exec + ; GFX908-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, implicit $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, implicit $vgpr35 + ; GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $exec + ; 
GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $exec + ; GFX908-NEXT: 
GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD 
undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr32, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr33, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr34, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, 
killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec, implicit $exec + ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; GFX908-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; GFX908-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; GFX908-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; GFX908-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) + ; GFX908-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) + ; GFX908-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) + ; GFX908-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5) + ; GFX908-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5) + ; GFX908-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 
0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) + ; GFX908-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) + ; GFX908-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5) + ; GFX908-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5) + ; GFX908-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5) + ; GFX908-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5) + ; GFX908-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5) + ; GFX908-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, implicit $exec :: (load (s32) from %stack.19, addrspace 5) + ; GFX908-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) + ; GFX908-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, implicit $exec :: (load (s32) from %stack.21, addrspace 5) + ; GFX908-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, implicit $exec :: (load (s32) from %stack.22, addrspace 5) + ; GFX908-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, implicit $exec :: (load (s32) from %stack.23, addrspace 5) + ; GFX908-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, implicit $exec :: (load (s32) from %stack.24, addrspace 5) + ; GFX908-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 
implicit $exec :: (load (s32) from %stack.25, addrspace 5) + ; GFX908-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, implicit $exec :: (load (s32) from %stack.26, addrspace 5) + ; GFX908-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, implicit $exec :: (load (s32) from %stack.27, addrspace 5) + ; GFX908-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, implicit $exec :: (load (s32) from %stack.28, addrspace 5) + ; GFX908-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, implicit $exec :: (load (s32) from %stack.29, addrspace 5) + ; GFX908-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, implicit $exec :: (load (s32) from %stack.30, addrspace 5) + ; GFX908-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, implicit $exec :: (load (s32) from %stack.31, addrspace 5) + ; GFX908-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, implicit $exec :: (load (s32) from %stack.32, addrspace 5) + ; GFX908-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, implicit $exec :: (load (s32) from %stack.33, addrspace 5) + ; GFX908-NEXT: S_NOP 0, implicit renamable $agpr0, implicit killed renamable $vgpr1, implicit killed renamable $vgpr2, implicit killed renamable $vgpr3, implicit killed renamable $vgpr4, implicit killed renamable $vgpr5, implicit killed renamable $vgpr6, implicit killed renamable $vgpr7, implicit killed renamable $vgpr8, implicit killed renamable $vgpr9, implicit killed renamable $vgpr10, implicit killed renamable $vgpr11, implicit killed renamable $vgpr12, implicit killed renamable $vgpr13, implicit killed renamable $vgpr14, implicit killed renamable $vgpr15, implicit killed renamable $vgpr16, implicit killed renamable $vgpr17, implicit killed renamable $vgpr18, implicit killed renamable $vgpr19, 
implicit killed renamable $vgpr20, implicit killed renamable $vgpr21, implicit killed renamable $vgpr22, implicit killed renamable $vgpr23, implicit killed renamable $vgpr24, implicit killed renamable $vgpr25, implicit killed renamable $vgpr26, implicit killed renamable $vgpr27, implicit killed renamable $vgpr28, implicit killed renamable $vgpr29, implicit killed renamable $vgpr30, implicit killed renamable $vgpr31, implicit killed renamable $vgpr33, implicit killed renamable $vgpr35, implicit killed renamable $vgpr34 + ; GFX908-NEXT: S_ENDPGM 0, implicit killed renamable $agpr0 + %v0:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v1:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v2:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v3:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v4:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v5:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v6:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v7:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v8:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v9:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v10:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v11:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v12:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v13:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v14:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v15:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v16:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v17:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v18:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 
0, 0, implicit $exec + %v19:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v20:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v21:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v22:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v23:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v24:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v25:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v26:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v27:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v28:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v29:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v30:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v31:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v32:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v33:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v34:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %v35:vgpr_32 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, implicit $exec + %s0:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s1:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s2:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s3:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s4:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s5:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s6:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s7:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s8:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s9:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s10:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s11:sgpr_32 = S_LOAD_DWORD_IMM undef 
$sgpr8_sgpr9, 0, 0 + %s12:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s13:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s14:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s15:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s16:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s17:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s18:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s19:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s20:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s21:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s22:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s23:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s24:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s25:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s26:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s27:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s28:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s29:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s30:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s31:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s32:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s33:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %s34:sgpr_32 = S_LOAD_DWORD_IMM undef $sgpr8_sgpr9, 0, 0 + %71:vgpr_32 = COPY %s0 + %72:vgpr_32 = COPY %s1 + %73:vgpr_32 = COPY %s2 + %74:vgpr_32 = COPY %s3 + %75:vgpr_32 = COPY %s4 + %76:vgpr_32 = COPY %s5 + %77:vgpr_32 = COPY %s6 + %78:vgpr_32 = COPY %s7 + %79:vgpr_32 = COPY %s8 + %80:vgpr_32 = COPY %s9 + %81:vgpr_32 = COPY %s10 + %82:vgpr_32 = COPY %s11 + %83:vgpr_32 = COPY %s12 + %84:vgpr_32 = COPY %s13 + %85:vgpr_32 = COPY %s14 + %86:vgpr_32 = COPY %s15 + %87:vgpr_32 = COPY %s16 + %88:vgpr_32 = COPY %s17 + %89:vgpr_32 = COPY %s18 + %90:vgpr_32 = COPY %s19 + %91:vgpr_32 = COPY %s20 + %92:vgpr_32 = COPY %s21 + %93:vgpr_32 = COPY %s22 + %94:vgpr_32 = COPY %s23 + 
%95:vgpr_32 = COPY %s24 + %96:vgpr_32 = COPY %s25 + %97:vgpr_32 = COPY %s26 + %98:vgpr_32 = COPY %s27 + %99:vgpr_32 = COPY %s28 + %100:vgpr_32 = COPY %s29 + %101:vgpr_32 = COPY %s30 + %102:vgpr_32 = COPY %s31 + %103:vgpr_32 = COPY %s32 + %104:vgpr_32 = COPY %s33 + %105:vgpr_32 = COPY %s34 + S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19, implicit $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, implicit $vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, implicit $vgpr35 + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v1, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v2, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v3, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v4, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v5, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v6, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v7, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v8, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v9, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v10, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v11, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v12, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v13, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v14, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v15, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v16, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v17, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v18, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v19, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v20, 0, 0, 
implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v21, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v22, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v23, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v24, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v25, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v26, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v27, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v28, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v29, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v30, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v31, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v32, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v33, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v34, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %v35, 0, 0, implicit $exec + S_NOP 0, implicit %71, implicit %72, implicit %73, implicit %74, implicit %75, implicit %76, implicit %77, implicit %78, implicit %79, implicit %80, implicit %81, implicit %82, implicit %83, implicit %84, implicit %85, implicit %86, implicit %87, implicit %88, implicit %89, implicit %90, implicit %91, implicit %92, implicit %93, implicit %94, implicit %95, implicit %96, implicit %97, implicit %98, implicit %99, implicit %100, implicit %101, implicit %102, implicit %103, implicit %104, implicit %105 + S_ENDPGM 0, implicit %71 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll index da95349c297b2..d0474fe47ebb1 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -194,11 +194,13 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) ; GFX908-NOT: buffer_ ; GFX908-DAG: v_accvgpr_read_b32 -; GCN: NumVgprs: 256 +; GFX900: NumVgprs: 256 ; GFX900: ScratchSize: 148 +; GFX908: NumVgprs: 255 ; GFX908: ScratchSize: 0 ; GCN: VGPRBlocks: 63 -; GCN: NumVGPRsForWavesPerEU: 256 +; GFX900: NumVGPRsForWavesPerEU: 256 +; GFX908: NumVGPRsForWavesPerEU: 255 define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid @@ -242,11 +244,13 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* % ; GFX908-NOT: buffer_ ; GFX908-DAG: v_accvgpr_read_b32 -; GCN: NumVgprs: 256 +; GFX900: NumVgprs: 256 +; GFX908: NumVgprs: 253 ; GFX900: ScratchSize: 2052 ; GFX908: ScratchSize: 0 ; GCN: VGPRBlocks: 63 -; GCN: NumVGPRsForWavesPerEU: 256 +; GFX900: NumVGPRsForWavesPerEU: 256 +; GFX908: NumVGPRsForWavesPerEU: 253 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid From c60d8229651c25ae869e9a3bfece3e74118a5ce0 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 14 Feb 2022 15:13:07 -0800 Subject: [PATCH 018/748] [WebAssembly] Make __wasm_lpad_context thread-local This makes `__wasm_lpad_context`, a struct that is used as a communication channel between compiler-generated code and personality function in libunwind, thread local. The library code will be changed to thread local in the emscripten side. 
Reviewed By: sbc100, tlively Differential Revision: https://reviews.llvm.org/D119803 --- llvm/lib/CodeGen/WasmEHPrepare.cpp | 14 +++++++++++++- llvm/test/CodeGen/WebAssembly/wasmehprepare.ll | 6 ++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index c04a7b28eff9d..6b7df758e4579 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -212,9 +212,21 @@ bool WasmEHPrepare::prepareEHPads(Function &F) { assert(F.hasPersonalityFn() && "Personality function not found"); - // __wasm_lpad_context global variable + // __wasm_lpad_context global variable. + // If the target supports TLS, make this thread-local. We can't just + // unconditionally make it thread-local and depend on + // CoalesceFeaturesAndStripAtomics to downgrade it, because stripping TLS has + // the side effect of disallowing the object from being linked into a + // shared-memory module, which we don't want to be responsible for. 
LPadContextGV = cast( M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy)); + Attribute FSAttr = F.getFnAttribute("target-features"); + if (FSAttr.isValid()) { + StringRef FS = FSAttr.getValueAsString(); + if (FS.contains("+atomics") && FS.contains("+bulk-memory")) + LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel); + } + LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0, "lpad_index_gep"); LSDAField = diff --git a/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll b/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll index 63bdf2c6bea08..081a9776fa9aa 100644 --- a/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll +++ b/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll @@ -1,9 +1,11 @@ -; RUN: opt < %s -winehprepare -demote-catchswitch-only -wasmehprepare -S | FileCheck %s +; RUN: opt < %s -winehprepare -demote-catchswitch-only -wasmehprepare -S | FileCheck %s --check-prefixes=CHECK,NO-TLS +; RUN: opt < %s -winehprepare -demote-catchswitch-only -wasmehprepare -S --mattr=+atomics,+bulk-memory | FileCheck %s --check-prefixes=CHECK,TLS target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" -; CHECK: @__wasm_lpad_context = external global { i32, i8*, i32 } +; NO-TLS: @__wasm_lpad_context = external global { i32, i8*, i32 } +; TLS: @__wasm_lpad_context = external thread_local global { i32, i8*, i32 } @_ZTIi = external constant i8* %struct.Temp = type { i8 } From a99989529eac2a96fec4b3b59cee6d2f7ab22a92 Mon Sep 17 00:00:00 2001 From: Daniil Suchkov Date: Tue, 15 Feb 2022 23:46:21 +0000 Subject: [PATCH 019/748] [RewriteStatepointsForGC] Add a test exposing an incorrect assertion --- .../phi-vector-bitcast.ll | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll 
b/llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll new file mode 100644 index 0000000000000..6f69234cc993e --- /dev/null +++ b/llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll @@ -0,0 +1,28 @@ +; XFAIL: * +; REQUIRES: asserts +; RUN: opt < %s -disable-output -passes=rewrite-statepoints-for-gc + +; We shouldn't crash when we encounter a vector phi with more than one input +; from the same predecessor. +define void @foo(<2 x i8 addrspace(1)*> %arg1, i32 %arg2, i1 %arg3, <2 x i64 addrspace(1)*> %arg4) gc "statepoint-example" personality i32* null { +bb: + %tmp = bitcast <2 x i8 addrspace(1)*> %arg1 to <2 x i64 addrspace(1)*> + switch i32 %arg2, label %bb2 [ + i32 1, label %bb4 + i32 2, label %bb4 + ] + +bb2: ; preds = %bb + br i1 %arg3, label %bb8, label %bb4 + +bb4: ; preds = %bb2, %bb, %bb + %tmp5 = phi <2 x i64 addrspace(1)*> [ %tmp, %bb ], [ %tmp, %bb ], [ %arg4, %bb2 ] + call void @bar() + %tmp6 = extractelement <2 x i64 addrspace(1)*> %tmp5, i32 1 + ret void + +bb8: ; preds = %bb2 + ret void +} + +declare void @bar() From 69297cf639044acf48dd5d9b39b95c54dd50561d Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Wed, 16 Feb 2022 20:03:57 -0500 Subject: [PATCH 020/748] [lld-macho] Don't include CommandFlags.h in CommonLinkerContext.h Main motivation: including `llvm/CodeGen/CommandFlags.h` in `CommonLinkerContext.h` means that the declaration of `llvm::Reloc` is visible in any file that includes `CommonLinkerContext.h`. Since our cpp files have both `using namespace llvm` and `using namespace lld::macho`, this results in conflicts with `lld::macho::Reloc`. I suppose we could put `llvm::Reloc` into a nested namespace, but in general, I think we should avoid transitively including too many header files in a very widely used header like `CommonLinkerContext.h`. 
RegisterCodeGenFlags' ctor initializes a bunch of function-`static` structures and does nothing else, so it should be fine to "initialize" it as a temporary stack variable rather than as a file static. Reviewed By: aganea Differential Revision: https://reviews.llvm.org/D119913 --- lld/Common/CommonLinkerContext.cpp | 8 +++++++- lld/ELF/Writer.cpp | 1 + lld/MachO/ICF.cpp | 1 + lld/include/lld/Common/CommonLinkerContext.h | 4 ---- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lld/Common/CommonLinkerContext.cpp b/lld/Common/CommonLinkerContext.cpp index 50ccbb37c7966..12f56bc10ec96 100644 --- a/lld/Common/CommonLinkerContext.cpp +++ b/lld/Common/CommonLinkerContext.cpp @@ -10,6 +10,8 @@ #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" +#include "llvm/CodeGen/CommandFlags.h" + using namespace llvm; using namespace lld; @@ -20,7 +22,11 @@ using namespace lld; // state. static CommonLinkerContext *lctx; -CommonLinkerContext::CommonLinkerContext() { lctx = this; } +CommonLinkerContext::CommonLinkerContext() { + lctx = this; + // Fire off the static initializations in CGF's constructor. 
+ codegen::RegisterCodeGenFlags CGF; +} CommonLinkerContext::~CommonLinkerContext() { assert(lctx); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 35d8b01308f74..0282d7d6b5a78 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -24,6 +24,7 @@ #include "lld/Common/Filesystem.h" #include "lld/Common/Strings.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Support/MD5.h" #include "llvm/Support/Parallel.h" #include "llvm/Support/RandomNumberGenerator.h" #include "llvm/Support/SHA1.h" diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index f9dea4b861ac3..fa018f4d3ce13 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -12,6 +12,7 @@ #include "Symbols.h" #include "UnwindInfoSection.h" +#include "lld/Common/CommonLinkerContext.h" #include "llvm/Support/Parallel.h" #include "llvm/Support/TimeProfiler.h" diff --git a/lld/include/lld/Common/CommonLinkerContext.h b/lld/include/lld/Common/CommonLinkerContext.h index 3954d38ded636..0627bbdc8bd87 100644 --- a/lld/include/lld/Common/CommonLinkerContext.h +++ b/lld/include/lld/Common/CommonLinkerContext.h @@ -21,7 +21,6 @@ #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" -#include "llvm/CodeGen/CommandFlags.h" #include "llvm/Support/StringSaver.h" namespace llvm { @@ -42,9 +41,6 @@ class CommonLinkerContext { llvm::DenseMap instances; ErrorHandler e; - -private: - llvm::codegen::RegisterCodeGenFlags cgf; }; // Retrieve the global state. Currently only one state can exist per process, From 21aaa1fb22db892f4deedac98af12c03fb870d85 Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Wed, 16 Feb 2022 16:28:05 -0800 Subject: [PATCH 021/748] [bazel] Add libc dependency. 
--- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index f4b5d3747ed35..632cf987441e8 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -400,7 +400,12 @@ libc_math_function(name = "frexpl") libc_math_function(name = "hypot") -libc_math_function(name = "hypotf") +libc_math_function( + name = "hypotf", + additional_deps = [ + ":__support_fputil_sqrt", + ], +) libc_math_function(name = "logb") From 3671bdbcd214c8ace66f6358f7c509e8cc28b28e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Feb 2022 11:54:38 -0800 Subject: [PATCH 022/748] [BPF] Fix a BTF type pruning bug In BPF backend, BTF type generation may skip some debuginfo types if they are the pointee type of a struct member. For example, struct task_struct { ... struct mm_struct *mm; ... }; BPF backend may generate a forward decl for 'struct mm_struct' instead of full type if there are no other usage of 'struct mm_struct'. The reason is to avoid bringing too much unneeded types in BTF. Alexei found a pruning bug where we may miss some full type generation. The following is an illustrating example: struct t1 { ... } struct t2 { struct t1 *p; }; struct t2 g; void foo(struct t1 *arg) { ... } In the above case, we will have partial debuginfo chain like below: struct t2 -> member p \ -> ptr -> struct t1 / foo -> argument arg During traversing struct t2 -> member p -> ptr -> struct t1 The corresponding BTF types are generated except 'struct t1' which will be in FixUp stage. Later, when traversing foo -> argument arg -> ptr -> struct t1 The 'ptr' BTF type has been generated and currently implementation ignores 'pointer' type hence 'struct t1' is not generated. 
This patch fixed the issue not just for the above case, but for general case with multiple derived types, e.g., struct t2 -> member p \ -> const -> ptr -> volatile -> struct t1 / foo -> argument arg Differential Revision: https://reviews.llvm.org/D119986 --- llvm/lib/Target/BPF/BTFDebug.cpp | 30 +++++-- .../BPF/BTF/pruning-multi-derived-type.ll | 87 +++++++++++++++++++ 2 files changed, 110 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/BPF/BTF/pruning-multi-derived-type.ll diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index ff89a897a325a..47b1064169eef 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -786,15 +786,31 @@ void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId, // already defined, we should keep moving to eventually // bring in types for "struct t". Otherwise, the "struct s2" // definition won't be correct. + // + // In the above, we have following debuginfo: + // {ptr, struct_member} -> typedef -> struct + // and BTF type for 'typedef' is generated while 'struct' may + // be in FixUp. But let us generalize the above to handle + // {different types} -> [various derived types]+ -> another type. + // For example, + // {func_param, struct_member} -> const -> ptr -> volatile -> struct + // We will traverse const/ptr/volatile which already have corresponding + // BTF types and generate type for 'struct' which might be in Fixup + // state. 
if (Ty && (!CheckPointer || !SeenPointer)) { if (const auto *DTy = dyn_cast(Ty)) { - unsigned Tag = DTy->getTag(); - if (Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type || - Tag == dwarf::DW_TAG_volatile_type || - Tag == dwarf::DW_TAG_restrict_type) { - uint32_t TmpTypeId; - visitTypeEntry(DTy->getBaseType(), TmpTypeId, CheckPointer, - SeenPointer); + while (DTy) { + const DIType *BaseTy = DTy->getBaseType(); + if (!BaseTy) + break; + + if (DIToIdMap.find(BaseTy) != DIToIdMap.end()) { + DTy = dyn_cast(BaseTy); + } else { + uint32_t TmpTypeId; + visitTypeEntry(BaseTy, TmpTypeId, CheckPointer, SeenPointer); + break; + } } } } diff --git a/llvm/test/CodeGen/BPF/BTF/pruning-multi-derived-type.ll b/llvm/test/CodeGen/BPF/BTF/pruning-multi-derived-type.ll new file mode 100644 index 0000000000000..63c864fd0e3a8 --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/pruning-multi-derived-type.ll @@ -0,0 +1,87 @@ +; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; Source: +; struct t1 { +; int a; +; }; +; struct t2 { +; const struct t1 * const a; +; }; +; int foo(struct t2 *arg) { return 0; } +; int bar(const struct t1 * const arg) { return 0; } +; Compilation flags: +; clang -target bpf -O2 -g -S -emit-llvm t.c + +%struct.t2 = type { %struct.t1* } +%struct.t1 = type { i32 } + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define dso_local i32 @foo(%struct.t2* nocapture noundef readnone %arg) local_unnamed_addr #0 !dbg !7 { +entry: + call void @llvm.dbg.value(metadata %struct.t2* %arg, metadata !22, metadata !DIExpression()), !dbg !23 + ret i32 0, !dbg !24 +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define dso_local i32 @bar(%struct.t1* nocapture noundef readnone %arg) local_unnamed_addr #0 !dbg !25 { +entry: + call void @llvm.dbg.value(metadata %struct.t1* %arg, 
metadata !29, metadata !DIExpression()), !dbg !30 + ret i32 0, !dbg !31 +} + +; CHECK: .long 10 # BTF_KIND_INT(id = 7) +; CHECK-NEXT: .long 16777216 # 0x1000000 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .long 16777248 # 0x1000020 + +; CHECK: .long 69 # BTF_KIND_STRUCT(id = 9) +; CHECK-NEXT: .long 67108865 # 0x4000001 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .long 7 + +; CHECK: .byte 97 # string offset=4 +; CHECK: .ascii "t1" # string offset=69 + +; Function Attrs: nofree nosync nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { nofree nosync nounwind readnone speculatable willreturn } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git c34c8afcb85ae9142d0f783bb899c464e8bd2356)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/work/tests/llvm/btf_ptr", checksumkind: CSK_MD5, checksum: "d43a0541e830263021772349589e47a5") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"frame-pointer", i32 2} +!6 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git c34c8afcb85ae9142d0f783bb899c464e8bd2356)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !21) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !11} +!10 = !DIBasicType(name: 
"int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) +!12 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t2", file: !1, line: 4, size: 64, elements: !13) +!13 = !{!14} +!14 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !12, file: !1, line: 5, baseType: !15, size: 64) +!15 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !16) +!16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 64) +!17 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !18) +!18 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", file: !1, line: 1, size: 32, elements: !19) +!19 = !{!20} +!20 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !18, file: !1, line: 2, baseType: !10, size: 32) +!21 = !{!22} +!22 = !DILocalVariable(name: "arg", arg: 1, scope: !7, file: !1, line: 7, type: !11) +!23 = !DILocation(line: 0, scope: !7) +!24 = !DILocation(line: 7, column: 27, scope: !7) +!25 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 8, type: !26, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !28) +!26 = !DISubroutineType(types: !27) +!27 = !{!10, !15} +!28 = !{!29} +!29 = !DILocalVariable(name: "arg", arg: 1, scope: !25, file: !1, line: 8, type: !15) +!30 = !DILocation(line: 0, scope: !25) +!31 = !DILocation(line: 8, column: 40, scope: !25) From c8b8c8e989e5aaf53494ea2f5021d238b6d77184 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Wed, 16 Feb 2022 15:06:07 -0800 Subject: [PATCH 023/748] [AArch64][GlobalISel] Implement support for clang.arc.attachedcall call operand bundles. 
Differential Revision: https://reviews.llvm.org/D119983 --- .../AArch64/GISel/AArch64CallLowering.cpp | 17 +++++++++- llvm/test/CodeGen/AArch64/call-rv-marker.ll | 33 ++++++++----------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 097b93e4fccae..3027b9a36a5c3 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -18,6 +18,7 @@ #include "AArch64Subtarget.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -1127,9 +1128,23 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. - unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); + + // Calls with operand bundle "clang.arc.attachedcall" are special. They should + // be expanded to the call, directly followed by a special marker sequence and + // a call to an ObjC library function. + unsigned Opc = 0; + if (Info.CB && objcarc::hasAttachedCallOpBundle(Info.CB)) + Opc = AArch64::BLR_RVMARKER; + else + Opc = getCallOpcode(MF, Info.Callee.isReg(), false); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + if (Opc == AArch64::BLR_RVMARKER) { + // Add a target global address for the retainRV/claimRV runtime function + // just before the call target. + Function *ARCFn = *objcarc::getAttachedARCFunction(Info.CB); + MIB.addGlobalAddress(ARCFn); + } MIB.add(Info.Callee); // Tell the call which registers are clobbered. 
diff --git a/llvm/test/CodeGen/AArch64/call-rv-marker.ll b/llvm/test/CodeGen/AArch64/call-rv-marker.ll index d6ba82c5c375d..cb040c6d6bfb0 100644 --- a/llvm/test/CodeGen/AArch64/call-rv-marker.ll +++ b/llvm/test/CodeGen/AArch64/call-rv-marker.ll @@ -29,9 +29,8 @@ define dso_local i8* @rv_marker_1_retain() { ; CHECK-LABEL: rv_marker_1_retain: ; CHECK: .cfi_offset w30, -16 ; CHECK-NEXT: bl foo1 -; SELDAG-NEXT: mov x29, x29 -; SELDAG-NEXT: bl objc_retainAutoreleasedReturnValue -; GISEL-NOT: mov x29, x29 +; CHECK-NEXT: mov x29, x29 +; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: %call = call i8* @foo1() [ "clang.arc.attachedcall"(i8* (i8*)* @objc_retainAutoreleasedReturnValue) ] @@ -42,9 +41,8 @@ define dso_local i8* @rv_marker_1_unsafeClaim() { ; CHECK-LABEL: rv_marker_1_unsafeClaim: ; CHECK: .cfi_offset w30, -16 ; CHECK-NEXT: bl foo1 -; SELDAG-NEXT: mov x29, x29 -; SELDAG-NEXT: bl objc_unsafeClaimAutoreleasedReturnValue -; GISEL-NOT: mov x29, x29 +; CHECK-NEXT: mov x29, x29 +; CHECK-NEXT: bl objc_unsafeClaimAutoreleasedReturnValue ; entry: %call = call i8* @foo1() [ "clang.arc.attachedcall"(i8* (i8*)* @objc_unsafeClaimAutoreleasedReturnValue) ] @@ -56,8 +54,8 @@ define dso_local void @rv_marker_2_select(i32 %c) { ; SELDAG: cinc w0, w8, eq ; GISEL: csinc w0, w8, wzr, eq ; CHECK-NEXT: bl foo0 -; SELDAG-NEXT: mov x29, x29 -; SELDAG-NEXT: bl objc_retainAutoreleasedReturnValue +; CHECK-NEXT: mov x29, x29 +; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; CHECK-NEXT: ldr x30, [sp], #16 ; CHECK-NEXT: b foo2 ; @@ -73,8 +71,8 @@ define dso_local void @rv_marker_3() personality i8* bitcast (i32 (...)* @__gxx_ ; CHECK-LABEL: rv_marker_3 ; CHECK: .cfi_offset w30, -32 ; CHECK-NEXT: bl foo1 -; SELDAG-NEXT: mov x29, x29 -; SELDAG-NEXT: bl objc_retainAutoreleasedReturnValue +; CHECK-NEXT: mov x29, x29 +; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: %call = call i8* @foo1() [ "clang.arc.attachedcall"(i8* (i8*)* @objc_retainAutoreleasedReturnValue) ] @@ 
-96,8 +94,8 @@ define dso_local void @rv_marker_4() personality i8* bitcast (i32 (...)* @__gxx_ ; CHECK-LABEL: rv_marker_4 ; CHECK: .Ltmp3: ; CHECK-NEXT: bl foo1 -; SELDAG-NEXT: mov x29, x29 -; SELDAG-NEXT: bl objc_retainAutoreleasedReturnValue +; CHECK-NEXT: mov x29, x29 +; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; CHECK-NEXT: .Ltmp4: ; entry: @@ -139,10 +137,8 @@ define dso_local i8* @rv_marker_5_indirect_call() { ; CHECK-LABEL: rv_marker_5_indirect_call ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blr [[ADDR]] -; SELDAG-NEXT: mov x29, x29 -; SELDAG-NEXT: bl objc_retainAutoreleasedReturnValue -; GISEL-NOT: mov x29, x29 -; +; CHECK-NEXT: mov x29, x29 +; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue entry: %0 = load i8* ()*, i8* ()** @fptr, align 8 %call = call i8* %0() [ "clang.arc.attachedcall"(i8* (i8*)* @objc_retainAutoreleasedReturnValue) ] @@ -158,9 +154,8 @@ define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) { ; CHECK-NEXT: mov x0, x2 ; CHECK-NEXT: mov x2, [[TMP]] ; CHECK-NEXT: bl foo -; SELDAG-NEXT: mov x29, x29 -; SELDAG-NEXT: bl objc_retainAutoreleasedReturnValue -; GISEL-NOT: mov x29, x29 +; CHECK-NEXT: mov x29, x29 +; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue call i8* @foo(i64 %c, i64 %b, i64 %a) [ "clang.arc.attachedcall"(i8* (i8*)* @objc_retainAutoreleasedReturnValue) ] ret void } From 34381a76c1a37ad316c650f290f5846f92cbd86c Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Wed, 16 Feb 2022 16:30:46 -0800 Subject: [PATCH 024/748] [mlir][sparse] avoid some code dup in sparsification transformation A very small refactoring, but a big impact on tests that expect an exact order. This revision fixes the tests, but also makes them less brittle for similar minor changes in the future!
Reviewed By: bixia Differential Revision: https://reviews.llvm.org/D119992 --- .../Transforms/Sparsification.cpp | 10 +- mlir/test/Dialect/SparseTensor/dense.mlir | 16 +- mlir/test/Dialect/SparseTensor/sparse_1d.mlir | 448 ++++++------- mlir/test/Dialect/SparseTensor/sparse_2d.mlir | 442 ++++++------- mlir/test/Dialect/SparseTensor/sparse_3d.mlir | 588 +++++++++--------- .../Dialect/SparseTensor/sparse_affine.mlir | 36 +- .../Dialect/SparseTensor/sparse_kernels.mlir | 48 +- .../Dialect/SparseTensor/sparse_lower.mlir | 32 +- .../SparseTensor/sparse_lower_col.mlir | 32 +- mlir/test/Dialect/SparseTensor/sparse_nd.mlir | 36 +- .../Dialect/SparseTensor/sparse_perm.mlir | 28 +- .../SparseTensor/sparse_perm_lower.mlir | 24 +- 12 files changed, 870 insertions(+), 870 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index 7a6dd312ef88a..427ee30795945 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -464,15 +464,15 @@ static Value genOutputBuffer(CodeGen &codegen, PatternRewriter &rewriter, // impact the running complexity of the sparse kernel. If the tensor // materializes into the computation, we need to preserve the zero // initialization assumption of all sparse output buffers. 
+ Value alloc = rewriter.create(loc, denseTp, args); if (isMaterializing(tensor)) { - Value alloc = rewriter.create(loc, denseTp, args); Value zero = constantZero(rewriter, loc, denseTp.getElementType()); rewriter.create(loc, zero, alloc); - return alloc; + } else { + Value init = + rewriter.create(loc, denseTp, tensor); + rewriter.create(loc, init, alloc); } - Value init = rewriter.create(loc, denseTp, tensor); - Value alloc = rewriter.create(loc, denseTp, args); - rewriter.create(loc, init, alloc); return alloc; } diff --git a/mlir/test/Dialect/SparseTensor/dense.mlir b/mlir/test/Dialect/SparseTensor/dense.mlir index 012f968f2cb78..25c60724f5bc7 100644 --- a/mlir/test/Dialect/SparseTensor/dense.mlir +++ b/mlir/test/Dialect/SparseTensor/dense.mlir @@ -35,14 +35,14 @@ // CHECK-LABEL: func @dense1( // CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32> {linalg.inplaceable = false}) -> tensor<32x16xf32> { -// CHECK: %[[VAL_2:.*]] = arith.constant 1.000000e+00 : f32 -// CHECK: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_9:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = 
bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_9:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_8]], %[[VAL_9]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir index d56ac7101202d..cf9ff82929154 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir @@ -14,15 +14,15 @@ } // CHECK-LABEL: func @add_d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: f32, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_8:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: f32, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// 
CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_8:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_7]], %[[VAL_8]] : memref<32xf32> to memref<32xf32> // CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_9]]] : memref @@ -44,8 +44,8 @@ func @add_d(%arga: tensor<32xf32, #DV>, %argb: f32, %argx: tensor<32xf32>) -> te } // CHECK-LABEL: func @add_d_init( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: f32) -> tensor<32xf32> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: f32) -> tensor<32xf32> { // CHECK: %[[VAL_2:.*]] = arith.constant 32 : index // CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[VAL_4:.*]] = arith.constant 0 : index @@ -74,15 +74,15 @@ func @add_d_init(%arga: tensor<32xf32, #DV>, %argb: f32) -> tensor<32xf32> { } // CHECK-LABEL: func @mul_d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: f32, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_8:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ 
dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: f32, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_8:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_7]], %[[VAL_8]] : memref<32xf32> to memref<32xf32> // CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_9]]] : memref @@ -104,18 +104,18 @@ func @mul_d(%arga: tensor<32xf32, #DV>, %argb: f32, %argx: tensor<32xf32>) -> te } // CHECK-LABEL: func @add_s( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: f32, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, 
#sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: f32, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref @@ -160,15 +160,15 @@ func @add_s(%arga: tensor<32xf32, #SV>, %argb: f32, %argx: tensor<32xf32>) -> te } // CHECK-LABEL: func @repeated_add_s( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 
}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_4:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_2]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_5:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_2]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> -// CHECK: %[[VAL_8:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_2]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_2]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> +// CHECK-DAG: %[[VAL_8:.*]] = memref.alloc() : memref<32xf32> // CHECK: 
memref.copy %[[VAL_7]], %[[VAL_8]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -200,16 +200,16 @@ func @repeated_add_s(%arga: tensor<32xf32, #SV>, %argx: tensor<32xf32>) -> tenso } // CHECK-LABEL: func @mul_s( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: f32, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_9:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: f32, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> 
to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_9:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_8]], %[[VAL_9]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -244,16 +244,16 @@ func @mul_s(%arga: tensor<32xf32, #SV>, %argb: f32, %argx: tensor<32xf32>) -> te } // CHECK-LABEL: func @add_dd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_9:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// 
CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_9:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_8]], %[[VAL_9]] : memref<32xf32> to memref<32xf32> // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref @@ -276,16 +276,16 @@ func @add_dd(%arga: tensor<32xf32, #DV>, %argb: tensor<32xf32>, %argx: tensor<32 } // CHECK-LABEL: func @mul_dd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_9:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: 
tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_9:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_8]], %[[VAL_9]] : memref<32xf32> to memref<32xf32> // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref @@ -308,19 +308,19 @@ func @mul_dd(%arga: tensor<32xf32, #DV>, %argb: tensor<32xf32>, %argx: tensor<32 } // CHECK-LABEL: func @add_ds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32xf32> -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = 
sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32xf32> +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref @@ -368,17 +368,17 @@ func @add_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tensor<32 } // CHECK-LABEL: func @mul_ds( -// 
CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32xf32> -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_10:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32xf32> +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], 
pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_10:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_9]], %[[VAL_10]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -404,19 +404,19 @@ func @mul_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tensor<32 } // CHECK-LABEL: func @add_sd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// 
CHECK: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref @@ -464,17 +464,17 @@ func @add_sd(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32>, %argx: tensor<32 } // CHECK-LABEL: func @mul_sd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> 
tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_10:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> +// 
CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_10:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_9]], %[[VAL_10]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -500,19 +500,19 @@ func @mul_sd(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32>, %argx: tensor<32 } // CHECK-LABEL: func @add_ss( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: 
%[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to 
memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -584,19 +584,19 @@ func @add_ss(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32, #SV>, %argx: tens } // CHECK-LABEL: func @mul_ss( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32xf32>) -> tensor<32xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// 
CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> +// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 
}>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -646,20 +646,20 @@ func @mul_ss(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32, #SV>, %argx: tens } // CHECK-LABEL: func @two_way_inv( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*2]]: f32, -// CHECK-SAME: %[[VAL_3:.*3]]: tensor<16xf32>) -> tensor<16xf32> { -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], 
pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<16xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<16xf32> +// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*2]]: f32, +// CHECK-SAME: %[[VAL_3:.*3]]: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, 
#sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<16xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<16xf32> to memref<16xf32> // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -740,20 +740,20 @@ func @two_way_inv(%arga: tensor<16xf32, #SV>, %argb: tensor<16xf32, #SV>, %argc: } // CHECK-LABEL: func @two_way_inv_alt( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*2]]: f32, -// CHECK-SAME: %[[VAL_3:.*3]]: tensor<16xf32>) -> tensor<16xf32> { -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices 
%[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<16xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<16xf32> +// CHECK-SAME: %[[VAL_0:.*0]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*2]]: f32, +// CHECK-SAME: %[[VAL_3:.*3]]: tensor<16xf32>) -> tensor<16xf32> { +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 
}>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<16xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<16xf32> to memref<16xf32> // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -846,10 +846,10 @@ func @two_way_inv_alt(%arga: tensor<16xf32, #SV>, // CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_4:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_2]] : tensor> to memref -// CHECK: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref -// CHECK: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_7:.*]] = memref.alloc() : memref +// CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_2]] : tensor> to memref +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_7:.*]] = memref.alloc() : memref // CHECK: memref.copy %[[VAL_6]], %[[VAL_7]] : memref to memref // CHECK-DAG: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref @@ -890,14 +890,14 @@ func @sum_reduction(%arga: tensor, %argx: tensor) -> tensor) -> tensor { // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" 
], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref -// CHECK: %[[VAL_12:.*]] = memref.alloc() : memref +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] 
= sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref to memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref @@ -999,15 +999,15 @@ func @sum_reduction_ss(%arga: tensor<16xf32, #SV>, // CHECK-SAME: %[[VAL_3:.*3]]: tensor) -> tensor { // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_2]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_2]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to 
memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_14:.*]] = memref.alloc() : memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_2]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_2]], %[[VAL_4]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = memref.alloc() : memref // CHECK: memref.copy %[[VAL_13]], %[[VAL_14]] : memref to memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_14]][] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref @@ -1110,25 +1110,25 @@ func @sum_reduction_inv(%arga: tensor<16xf32, 
#SV>, } // CHECK-LABEL: func @four_tensors_op( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor>, -// CHECK-SAME: %[[VAL_2:.*2]]: tensor, -// CHECK-SAME: %[[VAL_3:.*3]]: tensor>, -// CHECK-SAME: %[[VAL_4:.*]]: tensor) -> tensor { -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_5]] : tensor> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_5]] : tensor> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.pointers %[[VAL_3]], %[[VAL_5]] : tensor> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.indices %[[VAL_3]], %[[VAL_5]] : tensor> to memref -// CHECK: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor> to memref -// CHECK: %[[VAL_16:.*]] = tensor.dim %[[VAL_4]], %[[VAL_5]] : tensor -// CHECK: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_4]] : memref -// CHECK: %[[VAL_18:.*]] = memref.alloc(%[[VAL_16]]) : memref +// CHECK-SAME: %[[VAL_0:.*0]]: tensor, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor>, +// CHECK-SAME: %[[VAL_2:.*2]]: tensor, +// CHECK-SAME: %[[VAL_3:.*3]]: tensor>, +// CHECK-SAME: %[[VAL_4:.*]]: tensor) -> tensor { +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_5]] : tensor> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_5]] : tensor> to memref +// CHECK-DAG: %[[VAL_11:.*]] = 
sparse_tensor.values %[[VAL_1]] : tensor> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.pointers %[[VAL_3]], %[[VAL_5]] : tensor> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.indices %[[VAL_3]], %[[VAL_5]] : tensor> to memref +// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor> to memref +// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_4]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_4]] : memref +// CHECK-DAG: %[[VAL_18:.*]] = memref.alloc(%[[VAL_16]]) : memref // CHECK: memref.copy %[[VAL_17]], %[[VAL_18]] : memref to memref // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref @@ -1295,17 +1295,17 @@ func @four_tensors_op(%arga: tensor, // CHECK-SAME: %[[VAL_3:.*3]]: tensor) -> tensor { // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_2]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_2]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_16:.*]] = memref.alloc() : memref +// 
CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_2]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_2]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.alloc() : memref // CHECK: memref.copy %[[VAL_15]], %[[VAL_16]] : memref to memref // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_16]][] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir index d17601535188b..7c318be65abee 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir @@ -17,17 +17,17 @@ } // CHECK-LABEL: func @add_dd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = 
sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_10:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_9]], %[[VAL_10]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] { @@ -54,17 +54,17 @@ func @add_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @mul_dd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// 
CHECK: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_10:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_9]], %[[VAL_10]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] { @@ -91,20 +91,20 @@ func @mul_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @add_ds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, 
#sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], 
%[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref @@ -155,18 +155,18 @@ func @add_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @mul_ds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ 
"dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step 
%[[VAL_5]] { // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref @@ -195,20 +195,20 @@ func @mul_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @add_sd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: 
%[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref @@ -264,18 +264,18 @@ func @add_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @mul_sd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : 
tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_10:.*]] = 
bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -305,22 +305,22 @@ func @mul_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @add_ss( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] 
: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_15:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], 
pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_15:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_14]], %[[VAL_15]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref @@ -400,19 +400,19 @@ func @add_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @mul_ss( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : 
tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_12:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = 
bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -444,23 +444,23 @@ func @mul_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %argx: te } // CHECK-LABEL: func @add_ss_ss( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ 
dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_16:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ 
"compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", 
"compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_16:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_15]], %[[VAL_16]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -609,23 +609,23 @@ func @add_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #Tss>, } // CHECK-LABEL: func @mul_ss_ss( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: 
%[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_16:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*0]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*2]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers 
%[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values 
%[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_16:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_15]], %[[VAL_16]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref @@ -706,22 +706,22 @@ func @mul_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #Tss>, } // CHECK-LABEL: func @add_sd_ds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: 
%[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_15:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ 
"compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_7]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_15:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_14]], %[[VAL_15]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref @@ -813,20 +813,20 @@ func @add_sd_ds(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32, #Tds>, } // CHECK-LABEL: func @mul_sd_ds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, 
indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf32>) -> tensor<32x16xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to 
memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_5]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<32x16xf32> to memref<32x16xf32> // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -876,12 +876,12 @@ func @mul_sd_ds(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32, #Tds>, // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices 
%[[VAL_0]], %[[VAL_5]] : tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<16xf32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<16xf32> +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16x32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<16xf32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<16xf32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<16xf32> to memref<16xf32> // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref @@ -928,10 +928,10 @@ func @matvec(%argA: tensor<16x32xf32, #Tds>, %argb: tensor<32xf32>, %argx: tenso // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 10 : index // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = 
sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<10x20xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_8:.*]] = memref.alloc() : memref +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<10x20xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_8:.*]] = memref.alloc() : memref // CHECK: memref.copy %[[VAL_7]], %[[VAL_8]] : memref to memref // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref // CHECK: %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_2]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_9]]) -> (f32) { @@ -970,18 +970,18 @@ func @sum_reduction(%arga: tensor<10x20xf32, #Tds>, %argx: tensor) -> tenso } // CHECK-LABEL: func @scale( -// CHECK-SAME: %[[VAL_0:.*]]: tensor>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 2.000000e+00 : f64 -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref -// CHECK: 
%[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor -// CHECK: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_11:.*]] = memref.alloc(%[[VAL_8]], %[[VAL_9]]) : memref +// CHECK-SAME: %[[VAL_0:.*]]: tensor>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 2.000000e+00 : f64 +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref +// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor +// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc(%[[VAL_8]], %[[VAL_9]]) : memref // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref to memref // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_12]]] : memref @@ -1025,20 +1025,20 @@ func @scale(%arga: tensor, %argx: tensor) -> tensor, // CHECK-SAME: %[[VAL_2:.*2]]: tensor, // CHECK-SAME: %[[VAL_3:.*3]]: tensor) -> tensor { -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor> to memref -// CHECK: 
%[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_12:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref -// CHECK: %[[VAL_14:.*]] = tensor.dim %[[VAL_3]], %[[VAL_4]] : tensor -// CHECK: %[[VAL_15:.*]] = tensor.dim %[[VAL_3]], %[[VAL_5]] : tensor -// CHECK: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_17:.*]] = memref.alloc(%[[VAL_14]], %[[VAL_15]]) : memref +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref +// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_3]], %[[VAL_4]] : tensor +// CHECK-DAG: %[[VAL_15:.*]] = tensor.dim %[[VAL_3]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_3]] : memref +// CHECK-DAG: %[[VAL_17:.*]] = memref.alloc(%[[VAL_14]], %[[VAL_15]]) : memref // CHECK: memref.copy %[[VAL_16]], %[[VAL_17]] : memref to memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -1048,9 +1048,9 @@ func @scale(%arga: tensor, %argx: tensor) -> tensor // CHECK: scf.for 
%[[VAL_25:.*]] = %[[VAL_22]] to %[[VAL_24]] step %[[VAL_5]] { -// CHECK-DAG: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_25]]] : memref -// CHECK-DAG: %[[VAL_27:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_25]]] : memref -// CHECK-DAG: %[[VAL_28:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_21]], %[[VAL_26]]] : memref +// CHECK-DAG: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_25]]] : memref +// CHECK-DAG: %[[VAL_27:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_25]]] : memref +// CHECK-DAG: %[[VAL_28:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_21]], %[[VAL_26]]] : memref // CHECK: %[[VAL_29:.*]] = scf.for %[[VAL_30:.*]] = %[[VAL_4]] to %[[VAL_12]] step %[[VAL_5]] iter_args(%[[VAL_31:.*]] = %[[VAL_28]]) -> (f32) { // CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_21]], %[[VAL_30]]] : memref // CHECK: %[[VAL_33:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_30]], %[[VAL_26]]] : memref @@ -1104,22 +1104,22 @@ func @sampled_dense_dense(%args: tensor, // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_8:.*]] = arith.constant true -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_7]] : tensor> to memref -// CHECK: %[[VAL_15:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_7]] : tensor> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor> to memref +// CHECK-DAG: %[[VAL_11:.*]] 
= sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_7]] : tensor> to memref +// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_7]] : tensor> to memref // CHECK: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK: %[[VAL_17:.*]] = sparse_tensor.pointers %[[VAL_2]], %[[VAL_7]] : tensor> to memref -// CHECK: %[[VAL_18:.*]] = sparse_tensor.indices %[[VAL_2]], %[[VAL_7]] : tensor> to memref -// CHECK: %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref -// CHECK: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : memref -// CHECK: %[[VAL_22:.*]] = tensor.dim %[[VAL_5]], %[[VAL_6]] : tensor -// CHECK: %[[VAL_23:.*]] = bufferization.to_memref %[[VAL_5]] : memref -// CHECK: %[[VAL_24:.*]] = memref.alloc(%[[VAL_22]]) : memref +// CHECK-DAG: %[[VAL_17:.*]] = sparse_tensor.pointers %[[VAL_2]], %[[VAL_7]] : tensor> to memref +// CHECK-DAG: %[[VAL_18:.*]] = sparse_tensor.indices %[[VAL_2]], %[[VAL_7]] : tensor> to memref +// CHECK-DAG: %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor> to memref +// CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : memref +// CHECK-DAG: %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : memref +// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_5]], %[[VAL_6]] : tensor +// CHECK-DAG: %[[VAL_23:.*]] = bufferization.to_memref %[[VAL_5]] : memref +// CHECK-DAG: %[[VAL_24:.*]] = memref.alloc(%[[VAL_22]]) : memref // CHECK: memref.copy %[[VAL_23]], %[[VAL_24]] : memref to memref // CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_21]][] : memref // CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref diff 
--git a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir index aea77ac313eb5..648d4f7e68adb 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir @@ -23,18 +23,18 @@ } // CHECK-LABEL: func @add_ddd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, 
#sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] { @@ -65,18 +65,18 @@ func @add_ddd(%arga: tensor<32x16x8xf32, #Tddd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_ddd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// 
CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] { // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] { @@ -107,22 +107,22 @@ func @mul_ddd(%arga: tensor<32x16x8xf32, #Tddd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @add_dds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, 
#sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_15:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = 
sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_15:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_14]], %[[VAL_15]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_16:.*]] = %[[VAL_7]] to %[[VAL_4]] step %[[VAL_9]] { // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_7]] to %[[VAL_5]] step %[[VAL_9]] { @@ -177,20 +177,20 @@ func @add_dds(%arga: tensor<32x16x8xf32, #Tdds>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_dds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 32 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 16 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_7:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], 
pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] { // CHECK: scf.for %[[VAL_15:.*]] = 
%[[VAL_6]] to %[[VAL_5]] step %[[VAL_7]] { @@ -223,21 +223,21 @@ func @mul_dds(%arga: tensor<32x16x8xf32, #Tdds>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @add_dsd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> 
tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_13]], %[[VAL_14]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_7]] to %[[VAL_3]] step %[[VAL_8]] { // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref @@ -296,19 +296,19 @@ func @add_dsd(%arga: tensor<32x16x8xf32, #Tdsd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_dsd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = 
arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_12:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", 
"compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref @@ -341,24 +341,24 @@ func @mul_dsd(%arga: tensor<32x16x8xf32, #Tdsd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @add_dss( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", 
"compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_17:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], 
%[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_17:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_16]], %[[VAL_17]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_8]] to %[[VAL_4]] step %[[VAL_9]] { // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_18]]] : memref @@ -441,21 +441,21 @@ func @add_dss(%arga: tensor<32x16x8xf32, #Tdss>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_dss( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = 
arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], 
%[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_6]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_13]], %[[VAL_14]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] { // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref @@ -490,21 +490,21 @@ func @mul_dss(%arga: tensor<32x16x8xf32, #Tdss>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @add_sdd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> 
tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor<32x16x8xf32, 
#sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_13]], %[[VAL_14]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_8]]] : memref @@ -568,19 +568,19 @@ func @add_sdd(%arga: tensor<32x16x8xf32, #Tsdd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_sdd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = 
sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_12:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : 
memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref @@ -614,24 +614,24 @@ func @mul_sdd(%arga: tensor<32x16x8xf32, #Tsdd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @add_sds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ 
"compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_17:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices 
%[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_17:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_16]], %[[VAL_17]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_8]]] : memref // CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_9]]] : memref @@ -719,21 +719,21 @@ func @add_sds(%arga: tensor<32x16x8xf32, #Tsds>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_sds( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, 
indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers 
%[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "dense", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_14:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_13]], %[[VAL_14]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref @@ -769,23 +769,23 @@ func @mul_sds(%arga: tensor<32x16x8xf32, #Tsds>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @add_ssd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : tensor<32x16x8xf32, 
#sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_16:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_7]] : 
tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_7]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_16:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_15]], %[[VAL_16]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref // CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_8]]] : memref @@ -877,20 +877,20 @@ func @add_ssd(%arga: tensor<32x16x8xf32, #Tssd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_ssd( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> 
tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: 
%[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -926,26 +926,26 @@ func @mul_ssd(%arga: tensor<32x16x8xf32, #Tssd>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @add_sss( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 
}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_14:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_15:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_0]] : 
tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_19:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 16 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant true +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_8]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_9]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], 
pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_19:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_18]], %[[VAL_19]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_8]]] : memref // CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_9]]] : memref @@ -1061,22 +1061,22 @@ func @add_sss(%arga: tensor<32x16x8xf32, #Tsss>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @mul_sss( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ 
dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> -// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> -// CHECK: %[[VAL_15:.*]] = memref.alloc() : memref<32x16x8xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16x8xf32>, +// CHECK-SAME: 
%[[VAL_2:.*]]: tensor<32x16x8xf32>) -> tensor<32x16x8xf32> { +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32> +// CHECK-DAG: 
%[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32> +// CHECK-DAG: %[[VAL_15:.*]] = memref.alloc() : memref<32x16x8xf32> // CHECK: memref.copy %[[VAL_14]], %[[VAL_15]] : memref<32x16x8xf32> to memref<32x16x8xf32> // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref @@ -1125,23 +1125,23 @@ func @mul_sss(%arga: tensor<32x16x8xf32, #Tsss>, %argb: tensor<32x16x8xf32>, %ar } // CHECK-LABEL: func @kernel_3d( -// CHECK-SAME: %[[VAL_0:.*0]]: tensor, -// CHECK-SAME: %[[VAL_1:.*1]]: tensor>, -// CHECK-SAME: %[[VAL_2:.*2]]: tensor, -// CHECK-SAME: %[[VAL_3:.*3]]: tensor) -> tensor { -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor> to memref -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref -// CHECK: %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_5]] : tensor -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref -// CHECK: %[[VAL_13:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor -// CHECK: %[[VAL_14:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor -// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK: %[[VAL_16:.*]] = memref.alloc(%[[VAL_13]], %[[VAL_14]]) : memref +// CHECK-SAME: %[[VAL_0:.*0]]: tensor, +// CHECK-SAME: %[[VAL_1:.*1]]: tensor>, +// CHECK-SAME: %[[VAL_2:.*2]]: tensor, +// CHECK-SAME: %[[VAL_3:.*3]]: tensor) -> tensor { +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: 
%[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> to memref +// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref +// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor +// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor +// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_0]] : memref +// CHECK-DAG: %[[VAL_16:.*]] = memref.alloc(%[[VAL_13]], %[[VAL_14]]) : memref // CHECK: memref.copy %[[VAL_15]], %[[VAL_16]] : memref to memref // CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_5]] to %[[VAL_13]] step %[[VAL_6]] { // CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_5]] to %[[VAL_10]] step %[[VAL_6]] { @@ -1194,17 +1194,17 @@ func @kernel_3d(%arga: tensor, } // CHECK-LABEL: func @sum_reduction( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>>, // CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_2]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}>> -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}>> -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20x30xf32, 
#sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_10:.*]] = memref.alloc() : memref +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_2]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}>> +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}>> +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_10:.*]] = memref.alloc() : memref // CHECK: memref.copy %[[VAL_9]], %[[VAL_10]] : memref to memref // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref @@ -1255,16 +1255,16 @@ func @sum_reduction(%arga: tensor<10x20x30xf32, #Tsss>, %argx: tensor) -> t // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor> // CHECK-SAME: %[[VAL_2:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor -// CHECK: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref -// CHECK: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor> -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref -// 
CHECK: %[[VAL_12:.*]] = memref.alloc() : memref +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor +// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref +// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor> +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref +// CHECK-DAG: %[[VAL_12:.*]] = memref.alloc() : memref // CHECK: memref.copy %[[VAL_11]], %[[VAL_12]] : memref to memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref // CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (f32) { @@ -1310,20 +1310,20 @@ func @sum_reduction_inv(%arga: tensor, } // CHECK-LABEL: func @invariants( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<20xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<30xf32>, -// CHECK-SAME: %[[VAL_3:.*]]: tensor<10x20x30xf32>) -> tensor<10x20x30xf32> { -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 20 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 30 : index -// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20xf32> -// CHECK: %[[VAL_11:.*]] = 
bufferization.to_memref %[[VAL_2]] : memref<30xf32> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<10x20x30xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<10x20x30xf32> +// CHECK-SAME: %[[VAL_0:.*]]: tensor<10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<20xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<30xf32>, +// CHECK-SAME: %[[VAL_3:.*]]: tensor<10x20x30xf32>) -> tensor<10x20x30xf32> { +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 20 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 30 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20xf32> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<30xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<10x20x30xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<10x20x30xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<10x20x30xf32> to memref<10x20x30xf32> // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_7]] to %[[VAL_4]] step %[[VAL_8]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_14]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir index 551b2b6d04b59..8212f1c20650d 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir @@ -21,12 +21,12 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 3 : index // CHECK-DAG: %[[VAL_5:.*]] = 
arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<4xf32> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<32xf32> +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<4xf32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32xf32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32xf32> to memref<32xf32> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<4xf32> // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref @@ -73,12 +73,12 @@ func @mul_inv_dense1d(%arga: tensor<32xf32, #SpVec>, // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : 
tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<34xi32> -// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi32> -// CHECK: %[[VAL_11:.*]] = memref.alloc() : memref<32xi32> +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<34xi32> +// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi32> +// CHECK-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32xi32> // CHECK: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32xi32> to memref<32xi32> // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -125,12 +125,12 @@ func @and_affine_dense1d(%arga: tensor<32xi32, #SpVec>, // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 2 : index // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 3 : index -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<34x19xf64> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf64> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf64> +// CHECK-DAG: %[[VAL_8:.*]] = 
sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<34x19xf64> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf64> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<32x16xf64> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<32x16xf64> to memref<32x16xf64> // CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_3]] { // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir index 7d8461ce2e167..6d427d5824b30 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir @@ -12,14 +12,14 @@ // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 30 : index -// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : 
memref<20x30xf32> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<10x30xf32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<10x30xf32> +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_3]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_3]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_4]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_4]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20x30xf32> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<10x30xf32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<10x30xf32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<10x30xf32> to memref<10x30xf32> // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref @@ -159,14 +159,14 @@ func @matmul2(%A: tensor<4x8xf64, #DCSR>, // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 6 : index -// CHECK: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<8x8xi32> -// CHECK: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: 
%[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<6x6xi32> -// CHECK: %[[VAL_13:.*]] = memref.alloc() : memref<6x6xi32> +// CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<8x8xi32> +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x3xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<6x6xi32> +// CHECK-DAG: %[[VAL_13:.*]] = memref.alloc() : memref<6x6xi32> // CHECK: memref.copy %[[VAL_12]], %[[VAL_13]] : memref<6x6xi32> to memref<6x6xi32> // CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_3]]] : memref // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref @@ -211,14 +211,14 @@ func @conv2d(%input: tensor<8x8xi32>, // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 5 : index -// CHECK: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<5x3xi8> -// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<3x6xi8, 
#sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_5]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_5]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<5x6xi64> -// CHECK: %[[VAL_14:.*]] = memref.alloc() : memref<5x6xi64> +// CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<5x3xi8> +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_5]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_5]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x6xi8, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<5x6xi64> +// CHECK-DAG: %[[VAL_14:.*]] = memref.alloc() : memref<5x6xi64> // CHECK: memref.copy %[[VAL_13]], %[[VAL_14]] : memref<5x6xi64> to memref<5x6xi64> // CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref // CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir index 22a8e3a2c9b53..abde97eac3645 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir @@ -27,12 +27,12 @@ // CHECK-HIR-DAG: %[[VAL_3:.*]] = arith.constant 32 : index // 
CHECK-HIR-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-HIR-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-HIR: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK-HIR: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK-HIR: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK-HIR: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> -// CHECK-HIR: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> -// CHECK-HIR: %[[VAL_11:.*]] = memref.alloc() : memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> +// CHECK-HIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32xf64> // CHECK-HIR: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32xf64> to memref<32xf64> // CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-HIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref @@ -60,12 +60,12 @@ // CHECK-MIR-DAG: %[[VAL_3:.*]] = arith.constant 32 : index // CHECK-MIR-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-MIR-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-MIR: %[[VAL_6:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref -// CHECK-MIR: %[[VAL_7:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_5]]) : 
(!llvm.ptr, index) -> memref -// CHECK-MIR: %[[VAL_8:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-MIR: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> -// CHECK-MIR: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> -// CHECK-MIR: %[[VAL_11:.*]] = memref.alloc() : memref<32xf64> +// CHECK-MIR-DAG: %[[VAL_6:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref +// CHECK-MIR-DAG: %[[VAL_7:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref +// CHECK-MIR-DAG: %[[VAL_8:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref +// CHECK-MIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> +// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> +// CHECK-MIR-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32xf64> // CHECK-MIR: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32xf64> to memref<32xf64> // CHECK-MIR: scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-MIR-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_14]]] : memref @@ -93,10 +93,10 @@ // CHECK-LIR-DAG: %[[VAL_3:.*]] = arith.constant 32 : index // CHECK-LIR-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-LIR-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-LIR: %[[VAL_6:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref -// CHECK-LIR: %[[VAL_7:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref -// CHECK-LIR: %[[VAL_8:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-LIR: %[[VAL_9:.*]] = memref.alloc() : memref<32xf64> +// CHECK-LIR-DAG: %[[VAL_6:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref +// CHECK-LIR-DAG: %[[VAL_7:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref +// CHECK-LIR-DAG: %[[VAL_8:.*]] = call 
@sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref +// CHECK-LIR-DAG: %[[VAL_9:.*]] = memref.alloc() : memref<32xf64> // CHECK-LIR: memref.copy %[[VAL_2]], %[[VAL_9]] : memref<32xf64> to memref<32xf64> // CHECK-LIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-LIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir index d06231bed7c24..122021811ebab 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir @@ -30,12 +30,12 @@ // CHECK-HIR-DAG: %[[VAL_3:.*]] = arith.constant 64 : index // CHECK-HIR-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-HIR-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-HIR: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK-HIR: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK-HIR: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK-HIR: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> -// CHECK-HIR: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> -// CHECK-HIR: %[[VAL_11:.*]] = memref.alloc() : memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{ 
dimLevelType = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]], %[[VAL_5]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> +// CHECK-HIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> +// CHECK-HIR-DAG: %[[VAL_11:.*]] = memref.alloc() : memref<32xf64> // CHECK-HIR: memref.copy %[[VAL_10]], %[[VAL_11]] : memref<32xf64> to memref<32xf64> // CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { // CHECK-HIR: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64> @@ -62,12 +62,12 @@ // CHECK-MIR-DAG: %[[VAL_3:.*]] = arith.constant 64 : index // CHECK-MIR-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-MIR-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-MIR: %[[VAL_7:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref -// CHECK-MIR: %[[VAL_8:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref -// CHECK-MIR: %[[VAL_9:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-MIR: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> -// CHECK-MIR: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> -// CHECK-MIR: %[[VAL_12:.*]] = memref.alloc() : memref<32xf64> +// CHECK-MIR-DAG: %[[VAL_7:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, 
index) -> memref +// CHECK-MIR-DAG: %[[VAL_8:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref +// CHECK-MIR-DAG: %[[VAL_9:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref +// CHECK-MIR-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> +// CHECK-MIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> +// CHECK-MIR-DAG: %[[VAL_12:.*]] = memref.alloc() : memref<32xf64> // CHECK-MIR: memref.copy %[[VAL_11]], %[[VAL_12]] : memref<32xf64> to memref<32xf64> // CHECK-MIR: scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK-MIR: %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<64xf64> @@ -94,10 +94,10 @@ // CHECK-LIR-DAG: %[[VAL_3:.*]] = arith.constant 64 : index // CHECK-LIR-DAG: %[[VAL_5:.*]] = arith.constant 0 : index // CHECK-LIR-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-LIR: %[[VAL_7:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref -// CHECK-LIR: %[[VAL_8:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref -// CHECK-LIR: %[[VAL_9:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-LIR: %[[VAL_10:.*]] = memref.alloc() : memref<32xf64> +// CHECK-LIR-DAG: %[[VAL_7:.*]] = call @sparsePointers(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref +// CHECK-LIR-DAG: %[[VAL_8:.*]] = call @sparseIndices(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref +// CHECK-LIR-DAG: %[[VAL_9:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref +// CHECK-LIR-DAG: %[[VAL_10:.*]] = memref.alloc() : memref<32xf64> // CHECK-LIR: memref.copy %[[VAL_2]], %[[VAL_10]] : memref<32xf64> to memref<32xf64> // CHECK-LIR: scf.for %[[VAL_13:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK-LIR: %[[VAL_14:.*]] = memref.load %[[VAL_1]]{{\[}}%[[VAL_13]]] : memref<64xf64> diff --git a/mlir/test/Dialect/SparseTensor/sparse_nd.mlir 
b/mlir/test/Dialect/SparseTensor/sparse_nd.mlir index 7c6e98fdd566c..5cf64309fd541 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_nd.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_nd.mlir @@ -24,24 +24,24 @@ // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30x40x50x60x70x80xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<10x20x30x40x50x60x70x80xf32>) -> tensor<10x20x30x40x50x60x70x80xf32> { -// CHECK: %[[VAL_3:.*]] = arith.constant 3 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 4 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 10 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 20 : index -// CHECK: %[[VAL_7:.*]] = arith.constant 30 : index -// CHECK: %[[VAL_8:.*]] = arith.constant 60 : index -// CHECK: %[[VAL_9:.*]] = arith.constant 70 : index -// CHECK: %[[VAL_10:.*]] = arith.constant 80 : index -// CHECK: %[[VAL_11:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_12:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_0]] : memref<10x20x30x40x50x60x70x80xf32> -// CHECK: %[[VAL_14:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_15:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_16:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ 
"dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_17:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_18:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref -// CHECK: %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : memref<10x20x30x40x50x60x70x80xf32> -// CHECK: %[[VAL_20:.*]] = memref.alloc() : memref<10x20x30x40x50x60x70x80xf32> +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 10 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 20 : index +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 30 : index +// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 60 : index +// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 70 : index +// CHECK-DAG: %[[VAL_10:.*]] = arith.constant 80 : index +// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_12:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_0]] : memref<10x20x30x40x50x60x70x80xf32> +// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_3]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_3]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ 
dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.pointers %[[VAL_1]], %[[VAL_4]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_17:.*]] = sparse_tensor.indices %[[VAL_1]], %[[VAL_4]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_18:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "dense", "compressed", "compressed", "dense", "dense", "dense" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to memref +// CHECK-DAG: %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : memref<10x20x30x40x50x60x70x80xf32> +// CHECK-DAG: %[[VAL_20:.*]] = memref.alloc() : memref<10x20x30x40x50x60x70x80xf32> // CHECK: memref.copy %[[VAL_19]], %[[VAL_20]] : memref<10x20x30x40x50x60x70x80xf32> to memref<10x20x30x40x50x60x70x80xf32> // CHECK: scf.for %[[VAL_21:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_12]] { // CHECK: scf.for %[[VAL_22:.*]] = %[[VAL_11]] to %[[VAL_9]] step %[[VAL_12]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir index 463db3c47d355..14c8b78d4b752 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir @@ -17,14 +17,14 @@ // CHECK-LABEL: func @sparse_static_dims( // CHECK-SAME: %[[VAL_0:.*]]: tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<20x30x10xf32>) -> tensor<20x30x10xf32> { -// 
CHECK: %[[VAL_2:.*]] = arith.constant 20 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 30 : index -// CHECK: %[[VAL_4:.*]] = arith.constant 10 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>> -// CHECK: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20x30x10xf32> -// CHECK: %[[VAL_9:.*]] = memref.alloc() : memref<20x30x10xf32> +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 20 : index +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 30 : index +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20x30xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20x30x10xf32> +// CHECK-DAG: %[[VAL_9:.*]] = memref.alloc() : memref<20x30x10xf32> // CHECK: memref.copy %[[VAL_8]], %[[VAL_9]] : memref<20x30x10xf32> to memref<20x30x10xf32> // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] { @@ -58,12 +58,12 @@ func @sparse_static_dims(%arga: tensor<10x20x30xf32, #X>, // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> -// CHECK: %[[VAL_6:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor -// CHECK: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor -// CHECK: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor -// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK: %[[VAL_10:.*]] = memref.alloc(%[[VAL_6]], %[[VAL_7]], 
%[[VAL_8]]) : memref +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> +// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor +// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor +// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor +// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-DAG: %[[VAL_10:.*]] = memref.alloc(%[[VAL_6]], %[[VAL_7]], %[[VAL_8]]) : memref // CHECK: memref.copy %[[VAL_9]], %[[VAL_10]] : memref to memref // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] { // CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir index a01e38c0efb63..b9b1fcbb26d26 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir @@ -22,12 +22,12 @@ // CHECK-HIR-DAG: %[[VAL_2:.*]] = arith.constant 1 : index // CHECK-HIR-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-HIR-DAG: %[[VAL_4:.*]] = arith.constant 2 : index -// CHECK-HIR: %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor> -// CHECK-HIR: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor> -// CHECK-HIR: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor> -// CHECK-HIR: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> -// CHECK-HIR: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK-HIR: %[[VAL_10:.*]] = memref.alloc() : memref +// CHECK-HIR-DAG: %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor> +// CHECK-HIR-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor> +// CHECK-HIR-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor> +// CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor> +// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : 
memref +// CHECK-HIR-DAG: %[[VAL_10:.*]] = memref.alloc() : memref // CHECK-HIR: memref.copy %[[VAL_9]], %[[VAL_10]] : memref to memref // CHECK-HIR: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref // CHECK-HIR: %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_3]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f32) { @@ -56,12 +56,12 @@ // CHECK-MIR-DAG: %[[VAL_2:.*]] = arith.constant 2 : index // CHECK-MIR-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-MIR-DAG: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK-MIR: %[[VAL_5:.*]] = call @sparseDimSize(%[[VAL_0]], %[[VAL_4]]) : (!llvm.ptr, index) -> index -// CHECK-MIR: %[[VAL_6:.*]] = call @sparseDimSize(%[[VAL_0]], %[[VAL_3]]) : (!llvm.ptr, index) -> index -// CHECK-MIR: %[[VAL_7:.*]] = call @sparseDimSize(%[[VAL_0]], %[[VAL_2]]) : (!llvm.ptr, index) -> index -// CHECK-MIR: %[[VAL_8:.*]] = call @sparseValuesF32(%[[VAL_0]]) : (!llvm.ptr) -> memref -// CHECK-MIR: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref -// CHECK-MIR: %[[VAL_10:.*]] = memref.alloc() : memref +// CHECK-MIR-DAG: %[[VAL_5:.*]] = call @sparseDimSize(%[[VAL_0]], %[[VAL_4]]) : (!llvm.ptr, index) -> index +// CHECK-MIR-DAG: %[[VAL_6:.*]] = call @sparseDimSize(%[[VAL_0]], %[[VAL_3]]) : (!llvm.ptr, index) -> index +// CHECK-MIR-DAG: %[[VAL_7:.*]] = call @sparseDimSize(%[[VAL_0]], %[[VAL_2]]) : (!llvm.ptr, index) -> index +// CHECK-MIR-DAG: %[[VAL_8:.*]] = call @sparseValuesF32(%[[VAL_0]]) : (!llvm.ptr) -> memref +// CHECK-MIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref +// CHECK-MIR-DAG: %[[VAL_10:.*]] = memref.alloc() : memref // CHECK-MIR: memref.copy %[[VAL_9]], %[[VAL_10]] : memref to memref // CHECK-MIR: %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref // CHECK-MIR: %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_5]] step %[[VAL_3]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f32) { From d8364e3ea4ed50498d9721cdf87e06f81cf23d69 Mon Sep 17 00:00:00 2001 
From: V Donaldson Date: Wed, 16 Feb 2022 15:26:50 -0800 Subject: [PATCH 025/748] [flang] Allow tabs as white space in formats The fortran standard views blanks in IO formats as white space in non-string contexts. Other compilers extend this to also view horizontal tabs as white space. Some compilers additionally add other white space characters to this group. Add recognition of horizontal and vertical tabs to runtime format validation code to match what the runtime code currently does. --- flang/include/flang/Common/format.h | 15 +++++++++++++-- flang/test/Semantics/io08.f90 | 3 +++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/flang/include/flang/Common/format.h b/flang/include/flang/Common/format.h index 7ca3faa79f290..9ba76e919b7fd 100644 --- a/flang/include/flang/Common/format.h +++ b/flang/include/flang/Common/format.h @@ -149,9 +149,20 @@ template class FormatValidator { int maxNesting_{0}; // max level of nested parentheses }; +template static inline bool IsWhite(CHAR c) { + // White space. ' ' is standard. Other characters are extensions. 
+ // Extension candidates: + // '\t' (horizontal tab) + // '\n' (new line) + // '\v' (vertical tab) + // '\f' (form feed) + // '\r' (carriage ret) + return c == ' ' || c == '\t' || c == '\v'; +} + template CHAR FormatValidator::NextChar() { for (++cursor_; cursor_ < end_; ++cursor_) { - if (*cursor_ != ' ') { + if (!IsWhite(*cursor_)) { return toupper(*cursor_); } } @@ -161,7 +172,7 @@ template CHAR FormatValidator::NextChar() { template CHAR FormatValidator::LookAheadChar() { for (laCursor_ = cursor_ + 1; laCursor_ < end_; ++laCursor_) { - if (*laCursor_ != ' ') { + if (!IsWhite(*laCursor_)) { return toupper(*laCursor_); } } diff --git a/flang/test/Semantics/io08.f90 b/flang/test/Semantics/io08.f90 index 843028acfd5bf..b4e8d9f4b6a01 100644 --- a/flang/test/Semantics/io08.f90 +++ b/flang/test/Semantics/io08.f90 @@ -37,6 +37,9 @@ write(*,'($)') write(*,'(\)') write(*,'(RZ,RU,RP,RN,RD,RC,SS,SP,S,3G15.3e2)') + write(*, '(' // achar( 9) // ')') ! horizontal tab + write(*, '(' // achar(11) // ')') ! vertical tab + write(*, '(' // achar(32) // ')') ! space ! C1302 warnings; no errors write(*,'(3P7I2)') From dd8490d207d3a1612091abbea04bf660f133a89f Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Wed, 16 Feb 2022 17:46:53 -0800 Subject: [PATCH 026/748] Add a test for breaking on overloaded functions by name. 
--- .../breakpoint_on_overload/Makefile | 4 +++ .../TestBreakOnOverload.py | 32 +++++++++++++++++++ .../breakpoint_on_overload/main.cpp | 29 +++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/Makefile create mode 100644 lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py create mode 100644 lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/main.cpp diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/Makefile b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/Makefile new file mode 100644 index 0000000000000..a27336ffd9acd --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/Makefile @@ -0,0 +1,4 @@ +CXX_SOURCES := main.cpp +CXXFLAGS_EXTRAS := -std=c++14 + +include Makefile.rules diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py new file mode 100644 index 0000000000000..49d7442f6763d --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py @@ -0,0 +1,32 @@ +""" +Test setting a breakpoint on an overloaded function by name. 
+""" + +import re +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestBreakpointOnOverload(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + def check_breakpoint(self, name): + bkpt = self.target.BreakpointCreateByName(name) + self.assertEqual(bkpt.num_locations, 1, "Got one location") + addr = bkpt.locations[0].GetAddress() + self.assertTrue(addr.function.IsValid(), "Got a real function") + self.assertEqual(addr.function.name, name, "Got the right name") + + def test_break_on_overload(self): + self.build() + self.target = lldbutil.run_to_breakpoint_make_target(self) + self.check_breakpoint("a_function(int)") + self.check_breakpoint("a_function(double)") + self.check_breakpoint("a_function(int, double)") + self.check_breakpoint("a_function(double, int)") + + + diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/main.cpp b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/main.cpp new file mode 100644 index 0000000000000..55afab8ad0c3c --- /dev/null +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/main.cpp @@ -0,0 +1,29 @@ +int a_function(int x) { + return x * x; +} + +int a_function(double x) { + return static_cast(x * x); +} + +int a_function(double x, int y) { + return y * y; +} + +int a_function(int x, double y) { + return static_cast(y * y); +} + +int main(int argc, char const *argv[]) { + // This is a random comment. 
+ + int int_val = 20; + double double_val = 20.0; + + int result = a_function(int_val); + result += a_function(double_val); + result += a_function(double_val, int_val); + result += a_function(int_val, double_val); + + return result; +} From 64f5f6d7592cb0c346759d60e507c23295585de0 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Thu, 17 Feb 2022 03:00:17 +0000 Subject: [PATCH 027/748] [libc] Use '+' constraint on inline assembly As suggested by @mcgrathr in D118099 Reviewed By: lntue Differential Revision: https://reviews.llvm.org/D119978 --- libc/src/math/x86_64/cos.cpp | 5 ++--- libc/src/math/x86_64/sin.cpp | 5 ++--- libc/src/math/x86_64/tan.cpp | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/libc/src/math/x86_64/cos.cpp b/libc/src/math/x86_64/cos.cpp index 3b785a2f78cdf..1d2480e3df147 100644 --- a/libc/src/math/x86_64/cos.cpp +++ b/libc/src/math/x86_64/cos.cpp @@ -12,9 +12,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(double, cos, (double x)) { - double result; - __asm__ __volatile__("fcos" : "=t"(result) : "f"(x) : "cc"); - return result; + __asm__ __volatile__("fcos" : "+t"(x)); + return x; } } // namespace __llvm_libc diff --git a/libc/src/math/x86_64/sin.cpp b/libc/src/math/x86_64/sin.cpp index e94aa1a3f0925..bda4acbe41223 100644 --- a/libc/src/math/x86_64/sin.cpp +++ b/libc/src/math/x86_64/sin.cpp @@ -12,9 +12,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(double, sin, (double x)) { - double result; - __asm__ __volatile__("fsin" : "=t"(result) : "f"(x) : "cc"); - return result; + __asm__ __volatile__("fsin" : "+t"(x)); + return x; } } // namespace __llvm_libc diff --git a/libc/src/math/x86_64/tan.cpp b/libc/src/math/x86_64/tan.cpp index 0503af7a16dde..f25ff77095c88 100644 --- a/libc/src/math/x86_64/tan.cpp +++ b/libc/src/math/x86_64/tan.cpp @@ -13,10 +13,9 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(double, tan, (double x)) { double result; - double one; // The fptan instruction pushes the number 1 on to the FP stack after // 
computing tan. So, we read out the one before popping the actual result. - __asm__ __volatile__("fptan" : "=t"(one) : "f"(x) : "cc"); + __asm__ __volatile__("fptan" : "+t"(x)); __asm__ __volatile__("fstpl %0" : "=m"(result)); return result; } From 05f10ae0d8548e65130475730a1565203da8726d Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Wed, 16 Feb 2022 19:09:01 -0800 Subject: [PATCH 028/748] On Windows, the function name contains the return parameter, so the test has to be "function name contains the name we used to specify the breakpoint" not IS the name... --- .../breakpoint_on_overload/TestBreakOnOverload.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py index 49d7442f6763d..30124f8335d73 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_on_overload/TestBreakOnOverload.py @@ -18,7 +18,12 @@ def check_breakpoint(self, name): self.assertEqual(bkpt.num_locations, 1, "Got one location") addr = bkpt.locations[0].GetAddress() self.assertTrue(addr.function.IsValid(), "Got a real function") - self.assertEqual(addr.function.name, name, "Got the right name") + # On Window, the name of the function includes the return value. + # We still succeed in setting the breakpoint, but the resultant + # name is not the same. + # So just look for the name we used for the breakpoint in the + # function name, rather than doing an equality check. 
+ self.assertIn(name, addr.function.name, "Got the right name") def test_break_on_overload(self): self.build() From 194899caef241fe3b61be092fd5dd81bfd2c3975 Mon Sep 17 00:00:00 2001 From: Serguei Katkov Date: Tue, 15 Feb 2022 18:12:22 +0700 Subject: [PATCH 029/748] [MemoryDependency] Relax the re-ordering of atomic store and unordered load/store Atomic store with Release semantic allows re-ordering of unordered load/store before the store. Implement it. Reviewers: reames Reviewed By: reames Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D119844 --- .../lib/Analysis/MemoryDependenceAnalysis.cpp | 23 ++++++++++++++++--- .../reorder-over-store-atomic.ll | 10 ++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index a4491f481c0fb..aaeba903f43df 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -424,6 +424,16 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( return false; }; + // Return "true" if and only if the instruction I is either a non-unordered + // load or a non-unordered store. + auto isNonUnorderedLoadOrStore = [](Instruction *I) -> bool { + if (auto *LI = dyn_cast(I)) + return !LI->isUnordered(); + if (auto *SI = dyn_cast(I)) + return !SI->isUnordered(); + return false; + }; + // Return "true" if I is not a load and not a store, but it does access // memory. auto isOtherMemAccess = [](Instruction *I) -> bool { @@ -549,11 +559,18 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // A Monotonic store is OK if the query inst is itself not atomic. // FIXME: This is overly conservative. 
if (!SI->isUnordered() && SI->isAtomic()) { - if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) || + if (!QueryInst || isNonUnorderedLoadOrStore(QueryInst) || isOtherMemAccess(QueryInst)) return MemDepResult::getClobber(SI); - if (SI->getOrdering() != AtomicOrdering::Monotonic) - return MemDepResult::getClobber(SI); + // Ok, if we are here the guard above guarantee us that + // QueryInst is a non-atomic or unordered load/store. + // SI is atomic with monotonic or release semantic (seq_cst for store + // is actually a release semantic plus total order over other seq_cst + // instructions, as soon as QueryInst is not seq_cst we can consider it + // as simple release semantic). + // Monotonic and Release semantic allows re-ordering before store + // so we are safe to go further and check the aliasing. It will prohibit + // re-ordering in case locations are may or must alias. } // While volatile access cannot be eliminated, they do not have to clobber diff --git a/llvm/test/Analysis/MemoryDependenceAnalysis/reorder-over-store-atomic.ll b/llvm/test/Analysis/MemoryDependenceAnalysis/reorder-over-store-atomic.ll index a06c7e0792792..b6153078816b5 100644 --- a/llvm/test/Analysis/MemoryDependenceAnalysis/reorder-over-store-atomic.ll +++ b/llvm/test/Analysis/MemoryDependenceAnalysis/reorder-over-store-atomic.ll @@ -40,11 +40,8 @@ define i32 @test_load_acquire_unordered() { define i32 @test_store_cst_unordered(i32 %x) { ; CHECK-LABEL: @test_store_cst_unordered( -; CHECK-NEXT: [[L1:%.*]] = load atomic i32, i32* @w unordered, align 4 ; CHECK-NEXT: store atomic i32 [[X:%.*]], i32* @u seq_cst, align 4 -; CHECK-NEXT: [[L2:%.*]] = load atomic i32, i32* @w unordered, align 4 -; CHECK-NEXT: [[RES:%.*]] = sub i32 [[L1]], [[L2]] -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %l1 = load atomic i32, i32* @w unordered, align 4 store atomic i32 %x, i32* @u seq_cst, align 4 @@ -55,11 +52,8 @@ define i32 @test_store_cst_unordered(i32 %x) { define i32 
@test_store_release_unordered(i32 %x) { ; CHECK-LABEL: @test_store_release_unordered( -; CHECK-NEXT: [[L1:%.*]] = load atomic i32, i32* @w unordered, align 4 ; CHECK-NEXT: store atomic i32 [[X:%.*]], i32* @u release, align 4 -; CHECK-NEXT: [[L2:%.*]] = load atomic i32, i32* @w unordered, align 4 -; CHECK-NEXT: [[RES:%.*]] = sub i32 [[L1]], [[L2]] -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %l1 = load atomic i32, i32* @w unordered, align 4 store atomic i32 %x, i32* @u release, align 4 From 99dd49cf97a49eba074c5c3e060e2ddf1da3bc2b Mon Sep 17 00:00:00 2001 From: Damian Rouson Date: Thu, 11 Nov 2021 11:27:15 -0800 Subject: [PATCH 030/748] [flang] add semantics test for sync all Test a range of acceptable forms of SYNC ALL statements, including combinations with and without the stat-variable and errmsg-variable present. Also test that several invalid forms of SYNC ALL call generate the correct error messages. Differential Revision: https://reviews.llvm.org/D114181 --- flang/test/Semantics/synchronization01.f90 | 80 ++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 flang/test/Semantics/synchronization01.f90 diff --git a/flang/test/Semantics/synchronization01.f90 b/flang/test/Semantics/synchronization01.f90 new file mode 100644 index 0000000000000..5281c68fbb9ae --- /dev/null +++ b/flang/test/Semantics/synchronization01.f90 @@ -0,0 +1,80 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! XFAIL: * +! This test checks for semantic errors in sync all statements based on the +! statement specification in section 11.6.3 of the Fortran 2018 standard. 
+ +program test_sync_all + implicit none + + integer sync_status, co_indexed_integer[*], superfluous_stat, non_scalar(1) + character(len=128) error_message, co_indexed_character[*], superfluous_errmsg + logical invalid_type + + !___ standard-conforming statement ___ + + sync all + sync all(stat=sync_status) + sync all( errmsg=error_message) + sync all(stat=sync_status, errmsg=error_message) + + !___ non-standard-conforming statement ___ + + !______ invalid sync-stat-lists: invalid stat= ____________ + + !ERROR: expected execution part construct + sync all(status=sync_status) + + ! Stat-variable must an integer scalar + !ERROR: TBD + sync all(stat=invalid_type) + + ! Stat-variable must an integer scalar + !ERROR: TBD + sync all(stat=non_scalar) + + ! Invalid sync-stat-list: missing stat-variable + !ERROR: expected execution part construct + sync all(stat) + + ! Invalid sync-stat-list: missing 'stat=' + !ERROR: expected execution part construct + sync all(sync_status) + + !______ invalid sync-stat-lists: invalid errmsg= ____________ + + ! Invalid errmsg-variable keyword + !ERROR: expected execution part construct + sync all(errormsg=error_message) + + !ERROR: TBD + sync all(errmsg=invalid_type) + + ! Invalid sync-stat-list: missing 'errmsg=' + !ERROR: expected execution part construct + sync all(error_message) + + ! Invalid sync-stat-list: missing errmsg-variable + !ERROR: expected execution part construct + sync all(errmsg) + + !______ invalid sync-stat-lists: redundant sync-stat-list ____________ + + ! No specifier shall appear more than once in a given sync-stat-list + !ERROR: to be determined + sync all(stat=sync_status, stat=superfluous_stat) + + ! No specifier shall appear more than once in a given sync-stat-list + !ERROR: to be determined + sync all(errmsg=error_message, errmsg=superfluous_errmsg) + + !______ invalid sync-stat-lists: coindexed stat-variable ____________ + + ! 
Check constraint C1173 from the Fortran 2018 standard + !ERROR: to be determined + sync all(stat=co_indexed_integer[1]) + + ! Check constraint C1173 from the Fortran 2018 standard + !ERROR: to be determined + sync all(errmsg=co_indexed_character[1]) + +end program test_sync_all From abe2dee5ebb97403a953a8b71f8ffa8b72cff861 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 16 Feb 2022 20:35:05 -0800 Subject: [PATCH 031/748] [mlir] NFC Async: always use 'b' for the current builder Currently some of the nested IR building inconsistently uses `nb` and `b`, it's very easy to call wrong builder outside of the current scope, so for simplicity all builders are always called `b`, and in nested IR building regions they just shadow the "parent" builder. Reviewed By: mehdi_amini Differential Revision: https://reviews.llvm.org/D120003 --- .../Async/Transforms/AsyncParallelFor.cpp | 90 +++++++++---------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp index cdd85e5c5b406..e596fc3e73488 100644 --- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp +++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp @@ -373,23 +373,23 @@ static ParallelComputeFunction createParallelComputeFunction( LoopNestBuilder workLoopBuilder = [&](size_t loopIdx) -> LoopBodyBuilder { return [&, loopIdx](OpBuilder &nestedBuilder, Location loc, Value iv, ValueRange args) { - ImplicitLocOpBuilder nb(loc, nestedBuilder); + ImplicitLocOpBuilder b(loc, nestedBuilder); // Compute induction variable for `loopIdx`. - computeBlockInductionVars[loopIdx] = nb.create( - lowerBounds[loopIdx], nb.create(iv, steps[loopIdx])); + computeBlockInductionVars[loopIdx] = b.create( + lowerBounds[loopIdx], b.create(iv, steps[loopIdx])); // Check if we are inside first or last iteration of the loop. 
- isBlockFirstCoord[loopIdx] = nb.create( + isBlockFirstCoord[loopIdx] = b.create( arith::CmpIPredicate::eq, iv, blockFirstCoord[loopIdx]); - isBlockLastCoord[loopIdx] = nb.create( + isBlockLastCoord[loopIdx] = b.create( arith::CmpIPredicate::eq, iv, blockLastCoord[loopIdx]); // Check if the previous loop is in its first or last iteration. if (loopIdx > 0) { - isBlockFirstCoord[loopIdx] = nb.create( + isBlockFirstCoord[loopIdx] = b.create( isBlockFirstCoord[loopIdx], isBlockFirstCoord[loopIdx - 1]); - isBlockLastCoord[loopIdx] = nb.create( + isBlockLastCoord[loopIdx] = b.create( isBlockLastCoord[loopIdx], isBlockLastCoord[loopIdx - 1]); } @@ -398,24 +398,24 @@ static ParallelComputeFunction createParallelComputeFunction( if (loopIdx + 1 >= op.getNumLoops() - numBlockAlignedInnerLoops) { // For block aligned loops we always iterate starting from 0 up to // the loop trip counts. - nb.create(c0, tripCounts[loopIdx + 1], c1, ValueRange(), - workLoopBuilder(loopIdx + 1)); + b.create(c0, tripCounts[loopIdx + 1], c1, ValueRange(), + workLoopBuilder(loopIdx + 1)); } else { // Select nested loop lower/upper bounds depending on our position in // the multi-dimensional iteration space. 
- auto lb = nb.create( - isBlockFirstCoord[loopIdx], blockFirstCoord[loopIdx + 1], c0); + auto lb = b.create(isBlockFirstCoord[loopIdx], + blockFirstCoord[loopIdx + 1], c0); - auto ub = nb.create(isBlockLastCoord[loopIdx], - blockEndCoord[loopIdx + 1], - tripCounts[loopIdx + 1]); + auto ub = b.create(isBlockLastCoord[loopIdx], + blockEndCoord[loopIdx + 1], + tripCounts[loopIdx + 1]); - nb.create(lb, ub, c1, ValueRange(), - workLoopBuilder(loopIdx + 1)); + b.create(lb, ub, c1, ValueRange(), + workLoopBuilder(loopIdx + 1)); } - nb.create(loc); + b.create(loc); return; } @@ -425,7 +425,7 @@ static ParallelComputeFunction createParallelComputeFunction( mapping.map(computeFuncType.captures, captures); for (auto &bodyOp : op.getLoopBody().getOps()) - nb.clone(bodyOp, mapping); + b.clone(bodyOp, mapping); }; }; @@ -602,38 +602,38 @@ static void doAsyncDispatch(ImplicitLocOpBuilder &b, PatternRewriter &rewriter, b.create(arith::CmpIPredicate::eq, blockCount, c1); auto syncDispatch = [&](OpBuilder &nestedBuilder, Location loc) { - ImplicitLocOpBuilder nb(loc, nestedBuilder); + ImplicitLocOpBuilder b(loc, nestedBuilder); // Call parallel compute function for the single block. SmallVector operands = {c0, blockSize}; appendBlockComputeOperands(operands); - nb.create(parallelComputeFunction.func.sym_name(), - parallelComputeFunction.func.getCallableResults(), - operands); - nb.create(); + b.create(parallelComputeFunction.func.sym_name(), + parallelComputeFunction.func.getCallableResults(), + operands); + b.create(); }; auto asyncDispatch = [&](OpBuilder &nestedBuilder, Location loc) { - ImplicitLocOpBuilder nb(loc, nestedBuilder); + ImplicitLocOpBuilder b(loc, nestedBuilder); // Create an async.group to wait on all async tokens from the concurrent // execution of multiple parallel compute function. First block will be // executed synchronously in the caller thread. 
- Value groupSize = nb.create(blockCount, c1); - Value group = nb.create(GroupType::get(ctx), groupSize); + Value groupSize = b.create(blockCount, c1); + Value group = b.create(GroupType::get(ctx), groupSize); // Launch async dispatch function for [0, blockCount) range. SmallVector operands = {group, c0, blockCount, blockSize}; appendBlockComputeOperands(operands); - nb.create(asyncDispatchFunction.sym_name(), - asyncDispatchFunction.getCallableResults(), operands); + b.create(asyncDispatchFunction.sym_name(), + asyncDispatchFunction.getCallableResults(), operands); // Wait for the completion of all parallel compute operations. - nb.create(group); + b.create(group); - nb.create(); + b.create(); }; // Dispatch either single block compute function, or launch async dispatch. @@ -680,7 +680,7 @@ doSequentialDispatch(ImplicitLocOpBuilder &b, PatternRewriter &rewriter, // Induction variable is the index of the block: [0, blockCount). LoopBodyBuilder loopBuilder = [&](OpBuilder &loopBuilder, Location loc, Value iv, ValueRange args) { - ImplicitLocOpBuilder nb(loc, loopBuilder); + ImplicitLocOpBuilder b(loc, loopBuilder); // Call parallel compute function inside the async.execute region. auto executeBodyBuilder = [&](OpBuilder &executeBuilder, @@ -692,10 +692,10 @@ doSequentialDispatch(ImplicitLocOpBuilder &b, PatternRewriter &rewriter, }; // Create async.execute operation to launch parallel computate function. - auto execute = nb.create(TypeRange(), ValueRange(), ValueRange(), - executeBodyBuilder); - nb.create(rewriter.getIndexType(), execute.token(), group); - nb.create(); + auto execute = b.create(TypeRange(), ValueRange(), ValueRange(), + executeBodyBuilder); + b.create(rewriter.getIndexType(), execute.token(), group); + b.create(); }; // Iterate over all compute blocks and launch parallel compute operations. 
@@ -758,7 +758,7 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op, // Compute the parallel block size and dispatch concurrent tasks computing // results for each block. auto dispatch = [&](OpBuilder &nestedBuilder, Location loc) { - ImplicitLocOpBuilder nb(loc, nestedBuilder); + ImplicitLocOpBuilder b(loc, nestedBuilder); // Collect statically known constants defining the loop nest in the parallel // compute function. LLVM can't always push constants across the non-trivial @@ -872,10 +872,10 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op, // Unroll when numUnrollableLoops > 0 && blockSize >= maxIterations. bool staticShouldUnroll = numUnrollableLoops > 0; auto dispatchNotUnrollable = [&](OpBuilder &nestedBuilder, Location loc) { - ImplicitLocOpBuilder nb(loc, nestedBuilder); + ImplicitLocOpBuilder b(loc, nestedBuilder); doDispatch(b, rewriter, notUnrollableParallelComputeFunction, op, blockSize, blockCount, tripCounts); - nb.create(); + b.create(); }; if (staticShouldUnroll) { @@ -888,23 +888,23 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op, rewriter); auto dispatchUnrollable = [&](OpBuilder &nestedBuilder, Location loc) { - ImplicitLocOpBuilder nb(loc, nestedBuilder); + ImplicitLocOpBuilder b(loc, nestedBuilder); // Align the block size to be a multiple of the statically known // number of iterations in the inner loops. 
- Value numIters = nb.create( + Value numIters = b.create( numIterations[op.getNumLoops() - numUnrollableLoops]); - Value alignedBlockSize = nb.create( - nb.create(blockSize, numIters), numIters); + Value alignedBlockSize = b.create( + b.create(blockSize, numIters), numIters); doDispatch(b, rewriter, unrollableParallelComputeFunction, op, alignedBlockSize, blockCount, tripCounts); - nb.create(); + b.create(); }; b.create(TypeRange(), dynamicShouldUnroll, dispatchUnrollable, dispatchNotUnrollable); - nb.create(); + b.create(); } else { - dispatchNotUnrollable(nb, loc); + dispatchNotUnrollable(b, loc); } }; From 5bec1ea7a74895895e7831fd951dd8130d4f3d01 Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Thu, 17 Feb 2022 10:24:10 +0530 Subject: [PATCH 032/748] [mlir] Added oilist primitive This patch attempts to add the `oilist` primitive proposed in the [[ https://llvm.discourse.group/t/rfc-extending-declarative-assembly-format-to-support-order-independent-variadic-segments/4388 | RFC: Extending Declarative Assembly Format to support order-independent variadic segments ]]. This element supports optional order-independent variadic segments for operations. This will allow OpenACC and OpenMP Dialects to have similar and relaxed requirements while encouraging the use of Declarative Assembly Format and avoiding code duplication. An oilist element parses grammar of the form: ``` clause-list := clause clause-list | empty clause := `keyword` clause1 | `otherKeyword` clause2 clause1 := clause2 := ``` AssemblyFormat specification: ``` let assemblyFormat = [{ oilist( `keyword` clause1 | `otherkeyword` clause2 ... 
) }]; ``` Example: ``` oilist( `private` `(` $arg0 `:` type($arg0) `)` | `nowait` | `reduction` custom($arg1, type($arg1))) oilist( `private` `=` $arg0 `:` type($arg0) | `reduction` `=` $arg1 `:` type($arg1) | `firstprivate` `=` $arg3 `:` type($arg2)) ``` Reviewed By: Mogball, rriddle Differential Revision: https://reviews.llvm.org/D115215 --- mlir/docs/OpDefinitions.md | 8 + mlir/test/IR/traits.mlir | 69 ++++++ mlir/test/lib/Dialect/Test/TestDialect.cpp | 17 ++ mlir/test/lib/Dialect/Test/TestOps.td | 48 ++++ mlir/test/mlir-tblgen/op-format-spec.td | 51 +++++ mlir/tools/mlir-tblgen/FormatGen.cpp | 10 +- mlir/tools/mlir-tblgen/FormatGen.h | 3 + mlir/tools/mlir-tblgen/OpFormatGen.cpp | 251 ++++++++++++++++++++- 8 files changed, 455 insertions(+), 2 deletions(-) diff --git a/mlir/docs/OpDefinitions.md b/mlir/docs/OpDefinitions.md index 1058b33480073..e9aa37f5fa76c 100644 --- a/mlir/docs/OpDefinitions.md +++ b/mlir/docs/OpDefinitions.md @@ -619,6 +619,14 @@ The available directives are as follows: - The constraints on `inputs` and `results` are the same as the `input` of the `type` directive. +* `oilist` ( \`keyword\` elements | \`otherKeyword\` elements ...) + + - Represents an optional order-independent list of clauses. Each clause + has a keyword and corresponding assembly format. + - Each clause can appear 0 or 1 time (in any order). + - Only literals, types and variables can be used within an oilist element. + - All the variables must be optional or variadic. + * `operands` - Represents all of the operands of an operation. 
diff --git a/mlir/test/IR/traits.mlir b/mlir/test/IR/traits.mlir index c0fb012975bac..e6283b52caa52 100644 --- a/mlir/test/IR/traits.mlir +++ b/mlir/test/IR/traits.mlir @@ -488,6 +488,75 @@ func @succeededResultSizeAttr() { // ----- +// CHECK-LABEL: @succeededOilistTrivial +func @succeededOilistTrivial() { + // CHECK: test.oilist_with_keywords_only keyword + test.oilist_with_keywords_only keyword + // CHECK: test.oilist_with_keywords_only otherKeyword + test.oilist_with_keywords_only otherKeyword + // CHECK: test.oilist_with_keywords_only keyword otherKeyword + test.oilist_with_keywords_only keyword otherKeyword + // CHECK: test.oilist_with_keywords_only keyword otherKeyword + test.oilist_with_keywords_only otherKeyword keyword + return +} + +// ----- + +// CHECK-LABEL: @succeededOilistSimple +func @succeededOilistSimple(%arg0 : i32, %arg1 : i32, %arg2 : i32) { + // CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32 + test.oilist_with_simple_args keyword %arg0 : i32 + // CHECK: test.oilist_with_simple_args otherKeyword %{{.*}} : i32 + test.oilist_with_simple_args otherKeyword %arg0 : i32 + // CHECK: test.oilist_with_simple_args thirdKeyword %{{.*}} : i32 + test.oilist_with_simple_args thirdKeyword %arg0 : i32 + + // CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32 otherKeyword %{{.*}} : i32 + test.oilist_with_simple_args keyword %arg0 : i32 otherKeyword %arg1 : i32 + // CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32 thirdKeyword %{{.*}} : i32 + test.oilist_with_simple_args keyword %arg0 : i32 thirdKeyword %arg1 : i32 + // CHECK: test.oilist_with_simple_args otherKeyword %{{.*}} : i32 thirdKeyword %{{.*}} : i32 + test.oilist_with_simple_args thirdKeyword %arg0 : i32 otherKeyword %arg1 : i32 + + // CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32 otherKeyword %{{.*}} : i32 thirdKeyword %{{.*}} : i32 + test.oilist_with_simple_args keyword %arg0 : i32 otherKeyword %arg1 : i32 thirdKeyword %arg2 : i32 + // CHECK: 
test.oilist_with_simple_args keyword %{{.*}} : i32 otherKeyword %{{.*}} : i32 thirdKeyword %{{.*}} : i32 + test.oilist_with_simple_args otherKeyword %arg0 : i32 keyword %arg1 : i32 thirdKeyword %arg2 : i32 + // CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32 otherKeyword %{{.*}} : i32 thirdKeyword %{{.*}} : i32 + test.oilist_with_simple_args otherKeyword %arg0 : i32 thirdKeyword %arg1 : i32 keyword %arg2 : i32 + return +} + +// ----- + +// CHECK-LABEL: @succeededOilistVariadic +// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32) +func @succeededOilistVariadic(%arg0: i32, %arg1: i32, %arg2: i32) { + // CHECK: test.oilist_variadic_with_parens keyword(%[[ARG0]], %[[ARG1]] : i32, i32) + test.oilist_variadic_with_parens keyword (%arg0, %arg1 : i32, i32) + // CHECK: test.oilist_variadic_with_parens keyword(%[[ARG0]], %[[ARG1]] : i32, i32) otherKeyword(%[[ARG2]], %[[ARG1]] : i32, i32) + test.oilist_variadic_with_parens otherKeyword (%arg2, %arg1 : i32, i32) keyword (%arg0, %arg1 : i32, i32) + // CHECK: test.oilist_variadic_with_parens keyword(%[[ARG0]], %[[ARG1]] : i32, i32) otherKeyword(%[[ARG0]], %[[ARG1]] : i32, i32) thirdKeyword(%[[ARG2]], %[[ARG0]], %[[ARG1]] : i32, i32, i32) + test.oilist_variadic_with_parens thirdKeyword (%arg2, %arg0, %arg1 : i32, i32, i32) keyword (%arg0, %arg1 : i32, i32) otherKeyword (%arg0, %arg1 : i32, i32) + return +} + +// ----- +// CHECK-LABEL: succeededOilistCustom +// CHECK-SAME: (%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32) +func @succeededOilistCustom(%arg0: i32, %arg1: i32, %arg2: i32) { + // CHECK: test.oilist_custom private(%[[ARG0]], %[[ARG1]] : i32, i32) + test.oilist_custom private (%arg0, %arg1 : i32, i32) + // CHECK: test.oilist_custom private(%[[ARG0]], %[[ARG1]] : i32, i32) nowait + test.oilist_custom private (%arg0, %arg1 : i32, i32) nowait + // CHECK: test.oilist_custom private(%arg0, %arg1 : i32, i32) nowait reduction (%arg1) + test.oilist_custom nowait reduction (%arg1) private 
(%arg0, %arg1 : i32, i32) + return +} + +// ----- + func @failedHasDominanceScopeOutsideDominanceFreeScope() -> () { "test.ssacfg_region"() ({ test.graph_region { diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index 623d51295516e..f3f4d54d26e1d 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -387,6 +387,17 @@ void FoldToCallOp::getCanonicalizationPatterns(RewritePatternSet &results, //===----------------------------------------------------------------------===// // Parsing +static ParseResult +parseCustomOptionalOperand(OpAsmParser &parser, + Optional &optOperand) { + if (succeeded(parser.parseOptionalLParen())) { + optOperand.emplace(); + if (parser.parseOperand(*optOperand) || parser.parseRParen()) + return failure(); + } + return success(); +} + static ParseResult parseCustomDirectiveOperands( OpAsmParser &parser, OpAsmParser::OperandType &operand, Optional &optOperand, @@ -505,6 +516,12 @@ static ParseResult parseCustomDirectiveOptionalOperandRef( //===----------------------------------------------------------------------===// // Printing +static void printCustomOptionalOperand(OpAsmPrinter &printer, Operation *, + Value optOperand) { + if (optOperand) + printer << "(" << optOperand << ") "; +} + static void printCustomDirectiveOperands(OpAsmPrinter &printer, Operation *, Value operand, Value optOperand, OperandRange varOperands) { diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index f5834efe9cb5a..40bec4f4807e4 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -636,6 +636,54 @@ def AttrSizedResultOp : TEST_Op<"attr_sized_results", // is the dialect parser and printer hooks. 
def CustomFormatFallbackOp : TEST_Op<"dialect_custom_format_fallback">; +// Ops related to OIList primitive +def OIListTrivial : TEST_Op<"oilist_with_keywords_only"> { + let assemblyFormat = [{ + oilist( `keyword` + | `otherKeyword`) attr-dict + }]; +} + +def OIListSimple : TEST_Op<"oilist_with_simple_args", [AttrSizedOperandSegments]> { + let arguments = (ins Optional:$arg0, + Optional:$arg1, + Optional:$arg2); + let assemblyFormat = [{ + oilist( `keyword` $arg0 `:` type($arg0) + | `otherKeyword` $arg1 `:` type($arg1) + | `thirdKeyword` $arg2 `:` type($arg2) ) attr-dict + }]; +} + +def OIListVariadic : TEST_Op<"oilist_variadic_with_parens", [AttrSizedOperandSegments]> { + let arguments = (ins Variadic:$arg0, + Variadic:$arg1, + Variadic:$arg2); + let assemblyFormat = [{ + oilist( `keyword` `(` $arg0 `:` type($arg0) `)` + | `otherKeyword` `(` $arg1 `:` type($arg1) `)` + | `thirdKeyword` `(` $arg2 `:` type($arg2) `)`) attr-dict + }]; +} + +def OIListCustom : TEST_Op<"oilist_custom", [AttrSizedOperandSegments]> { + let arguments = (ins Variadic:$arg0, + Optional:$optOperand, + UnitAttr:$nowait); + let assemblyFormat = [{ + oilist( `private` `(` $arg0 `:` type($arg0) `)` + | `nowait` + | `reduction` custom($optOperand) + ) attr-dict + }]; +} + +def OIListAllowedLiteral : TEST_Op<"oilist_allowed_literal"> { + let assemblyFormat = [{ + oilist( `foo` | `bar` ) `buzz` attr-dict + }]; +} + // This is used to test encoding of a string attribute into an SSA name of a // pretty printed value name. 
def StringAttrPrettyNameOp diff --git a/mlir/test/mlir-tblgen/op-format-spec.td b/mlir/test/mlir-tblgen/op-format-spec.td index 1c419424d6021..84edca8e621ac 100644 --- a/mlir/test/mlir-tblgen/op-format-spec.td +++ b/mlir/test/mlir-tblgen/op-format-spec.td @@ -344,6 +344,57 @@ def LiteralValid : TestFormat_Op<[{ attr-dict }]>; +//===----------------------------------------------------------------------===// +// OIList Element +//===----------------------------------------------------------------------===// + +// CHECK: error: format ambiguity because bar is used in two adjacent oilist elements. +def OIListAdjacentOIList : TestFormat_Op<[{ + oilist ( `foo` | `bar` ) oilist ( `bar` | `buzz` ) attr-dict +}]>; +// CHECK: error: expected literal, but got ')' +def OIListErrorExpectedLiteral : TestFormat_Op<[{ + oilist( `keyword` | ) attr-dict +}]>; +// CHECK: error: expected literal, but got ')' +def OIListErrorExpectedEmpty : TestFormat_Op<[{ + oilist() attr-dict +}]>; +// CHECK: error: expected literal, but got '$arg0' +def OIListErrorNoLiteral : TestFormat_Op<[{ + oilist( $arg0 `:` type($arg0) | $arg1 `:` type($arg1) ) attr-dict +}], [AttrSizedOperandSegments]>, Arguments<(ins Optional:$arg0, Optional:$arg1)>; +// CHECK: error: format ambiguity because foo is used both in oilist element and the adjacent literal. 
+def OIListLiteralAmbiguity : TestFormat_Op<[{ + oilist( `foo` | `bar` ) `foo` attr-dict +}]>; +// CHECK: error: expected '(' before oilist argument list +def OIListStartingToken : TestFormat_Op<[{ + oilist `wrong` attr-dict +}]>; + +// CHECK-NOT: error +def OIListTrivial : TestFormat_Op<[{ + oilist(`keyword` `(` `)` | `otherkeyword` `(` `)`) attr-dict +}]>; +def OIListSimple : TestFormat_Op<[{ + oilist( `keyword` $arg0 `:` type($arg0) + | `otherkeyword` $arg1 `:` type($arg1) + | `thirdkeyword` $arg2 `:` type($arg2) ) + attr-dict +}], [AttrSizedOperandSegments]>, Arguments<(ins Optional:$arg0, Optional:$arg1, Optional:$arg2)>; +def OIListVariadic : TestFormat_Op<[{ + oilist( `keyword` `(` $args0 `:` type($args0) `)` + | `otherkeyword` `(` $args1 `:` type($args1) `)` + | `thirdkeyword` `(` $args2 `:` type($args2) `)`) + attr-dict +}], [AttrSizedOperandSegments]>, Arguments<(ins Variadic:$args0, Variadic:$args1, Variadic:$args2)>; +def OIListCustom : TestFormat_Op<[{ + oilist( `private` `(` $arg0 `:` type($arg0) `)` + | `nowait` + | `reduction` custom($arg1, type($arg1))) attr-dict +}], [AttrSizedOperandSegments]>, Arguments<(ins Optional:$arg0, Optional:$arg1)>; + //===----------------------------------------------------------------------===// // Optional Groups //===----------------------------------------------------------------------===// diff --git a/mlir/tools/mlir-tblgen/FormatGen.cpp b/mlir/tools/mlir-tblgen/FormatGen.cpp index a4c9dcf28981f..8d08340800c91 100644 --- a/mlir/tools/mlir-tblgen/FormatGen.cpp +++ b/mlir/tools/mlir-tblgen/FormatGen.cpp @@ -115,6 +115,8 @@ FormatToken FormatLexer::lexToken() { return formToken(FormatToken::r_paren, tokStart); case '*': return formToken(FormatToken::star, tokStart); + case '|': + return formToken(FormatToken::pipe, tokStart); // Ignore whitespace characters. 
case 0: @@ -164,6 +166,7 @@ FormatToken FormatLexer::lexIdentifier(const char *tokStart) { .Case("attr-dict-with-keyword", FormatToken::kw_attr_dict_w_keyword) .Case("custom", FormatToken::kw_custom) .Case("functional-type", FormatToken::kw_functional_type) + .Case("oilist", FormatToken::kw_oilist) .Case("operands", FormatToken::kw_operands) .Case("params", FormatToken::kw_params) .Case("ref", FormatToken::kw_ref) @@ -230,7 +233,12 @@ FailureOr FormatParser::parseLiteral(Context ctx) { "literals may only be used in the top-level section of the format"); } // Get the spelling without the surrounding backticks. - StringRef value = tok.getSpelling().drop_front().drop_back(); + StringRef value = tok.getSpelling(); + // Prevents things like `$arg0` or empty literals (when a literal is expected + // but not found) from getting segmentation faults. + if (value.size() < 2 || value[0] != '`' || value[value.size() - 1] != '`') + return emitError(tok.getLoc(), "expected literal, but got '" + value + "'"); + value = value.drop_front().drop_back(); // The parsed literal is a space element (`` or ` `) or a newline. if (value.empty() || value == " " || value == "\\n") diff --git a/mlir/tools/mlir-tblgen/FormatGen.h b/mlir/tools/mlir-tblgen/FormatGen.h index 4ad591d49ebc6..741e2716f0388 100644 --- a/mlir/tools/mlir-tblgen/FormatGen.h +++ b/mlir/tools/mlir-tblgen/FormatGen.h @@ -54,6 +54,7 @@ class FormatToken { greater, question, star, + pipe, // Keywords. 
keyword_start, @@ -61,6 +62,7 @@ class FormatToken { kw_attr_dict_w_keyword, kw_custom, kw_functional_type, + kw_oilist, kw_operands, kw_params, kw_qualified, @@ -271,6 +273,7 @@ class DirectiveElement : public FormatElementBase { AttrDict, Custom, FunctionalType, + OIList, Operands, Ref, Regions, diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 3e395c2f77310..37e62880b543c 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -185,6 +185,62 @@ class TypeDirective : public DirectiveElementBase { bool shouldBeQualifiedFlag = false; }; + +/// This class represents a group of order-independent optional clauses. Each +/// clause starts with a literal element and has a coressponding parsing +/// element. A parsing element is a continous sequence of format elements. +/// Each clause can appear 0 or 1 time. +class OIListElement : public DirectiveElementBase { +public: + OIListElement(std::vector &&literalElements, + std::vector> &&parsingElements) + : literalElements(std::move(literalElements)), + parsingElements(std::move(parsingElements)) {} + + /// Returns a range to iterate over the LiteralElements. + auto getLiteralElements() const { + // The use of std::function is unfortunate but necessary here. Lambda + // functions cannot be copied but std::function can be copied. This copy + // constructor is used in llvm::zip. + std::function + literalElementCastConverter = + [](FormatElement *el) { return cast(el); }; + return llvm::map_range(literalElements, literalElementCastConverter); + } + + /// Returns a range to iterate over the parsing elements corresponding to the + /// clauses. + ArrayRef> getParsingElements() const { + return parsingElements; + } + + /// Returns a range to iterate over tuples of parsing and literal elements. 
+ auto getClauses() const { + return llvm::zip(getLiteralElements(), getParsingElements()); + } + +private: + /// A vector of `LiteralElement` objects. Each element stores the keyword + /// for one case of oilist element. For example, an oilist element along with + /// the `literalElements` vector: + /// ``` + /// oilist [ `keyword` `=` `(` $arg0 `)` | `otherKeyword` `<` $arg1 `>`] + /// literalElements = { `keyword`, `otherKeyword` } + /// ``` + std::vector literalElements; + + /// A vector of valid declarative assembly format vectors. Each object in + /// parsing elements is a vector of elements in assembly format syntax. + /// For example, an oilist element along with the parsingElements vector: + /// ``` + /// oilist [ `keyword` `=` `(` $arg0 `)` | `otherKeyword` `<` $arg1 `>`] + /// parsingElements = { + /// { `=`, `(`, $arg0, `)` }, + /// { `<`, $arg1, `>` } + /// } + /// ``` + std::vector> parsingElements; +}; } // namespace //===----------------------------------------------------------------------===// @@ -630,6 +686,19 @@ const char *successorParserCode = R"( return ::mlir::failure(); )"; +/// The code snippet used to generate a parser for OIList +/// +/// {0}: literal keyword corresponding to a case for oilist +const char *oilistParserCode = R"( + if ({0}Clause) { + return parser.emitError(parser.getNameLoc()) + << "`{0}` clause can appear at most once in the expansion of the " + "oilist directive"; + } + {0}Clause = true; + result.addAttribute("{0}", UnitAttr::get(parser.getContext())); +)"; + namespace { /// The type of length for a given parse argument. 
enum class ArgumentLengthKind { @@ -720,6 +789,11 @@ static void genElementParserStorage(FormatElement *element, const Operator &op, for (FormatElement *childElement : optional->getElseElements()) genElementParserStorage(childElement, op, body); + } else if (auto *oilist = dyn_cast(element)) { + for (ArrayRef pelement : oilist->getParsingElements()) + for (FormatElement *element : pelement) + genElementParserStorage(element, op, body); + } else if (auto *custom = dyn_cast(element)) { for (FormatElement *paramElement : custom->getArguments()) genElementParserStorage(paramElement, op, body); @@ -1104,6 +1178,31 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body, } body << "\n"; + /// OIList Directive + } else if (OIListElement *oilist = dyn_cast(element)) { + for (LiteralElement *le : oilist->getLiteralElements()) + body << " bool " << le->getSpelling() << "Clause = false;\n"; + + // Generate the parsing loop + body << " while(true) {\n"; + for (auto clause : oilist->getClauses()) { + LiteralElement *lelement = std::get<0>(clause); + ArrayRef pelement = std::get<1>(clause); + body << "if (succeeded(parser.parseOptional"; + genLiteralParser(lelement->getSpelling(), body); + body << ")) {\n"; + StringRef attrName = lelement->getSpelling(); + body << formatv(oilistParserCode, attrName); + inferredAttributes.insert(attrName); + for (FormatElement *el : pelement) + genElementParser(el, body, attrTypeCtx); + body << " } else "; + } + body << " {\n"; + body << " break;\n"; + body << " }\n"; + body << "}\n"; + /// Literals. 
} else if (LiteralElement *literal = dyn_cast(element)) { body << " if (parser.parse"; @@ -1844,6 +1943,26 @@ void OperationFormat::genElementPrinter(FormatElement *element, return; } + // Emit the OIList + if (auto *oilist = dyn_cast(element)) { + genLiteralPrinter(" ", body, shouldEmitSpace, lastWasPunctuation); + for (auto clause : oilist->getClauses()) { + LiteralElement *lelement = std::get<0>(clause); + ArrayRef pelement = std::get<1>(clause); + + body << " if ((*this)->hasAttrOfType(\"" + << lelement->getSpelling() << "\")) {\n"; + genLiteralPrinter(lelement->getSpelling(), body, shouldEmitSpace, + lastWasPunctuation); + for (FormatElement *element : pelement) { + genElementPrinter(element, body, op, shouldEmitSpace, + lastWasPunctuation); + } + body << " }\n"; + } + return; + } + // Emit the attribute dictionary. if (auto *attrDict = dyn_cast(element)) { genAttrDictPrinter(*this, op, body, attrDict->isWithKeyword()); @@ -2061,6 +2180,9 @@ class OpFormatParser : public FormatParser { /// Verify the state of operation successors within the format. LogicalResult verifySuccessors(SMLoc loc); + LogicalResult verifyOIListElements(SMLoc loc, + ArrayRef elements); + /// Given the values of an `AllTypesMatch` trait, check for inferable type /// resolution. 
void handleAllTypesMatchConstraint( @@ -2087,6 +2209,8 @@ class OpFormatParser : public FormatParser { bool withKeyword); FailureOr parseFunctionalTypeDirective(SMLoc loc, Context context); + FailureOr parseOIListDirective(SMLoc loc, Context context); + LogicalResult verifyOIListParsingElement(FormatElement *element, SMLoc loc); FailureOr parseOperandsDirective(SMLoc loc, Context context); FailureOr parseQualifiedDirective(SMLoc loc, Context context); @@ -2157,7 +2281,8 @@ LogicalResult OpFormatParser::verify(SMLoc loc, if (failed(verifyAttributes(loc, elements)) || failed(verifyResults(loc, variableTyResolver)) || failed(verifyOperands(loc, variableTyResolver)) || - failed(verifyRegions(loc)) || failed(verifySuccessors(loc))) + failed(verifyRegions(loc)) || failed(verifySuccessors(loc)) || + failed(verifyOIListElements(loc, elements))) return failure(); // Collect the set of used attributes in the format. @@ -2377,6 +2502,43 @@ LogicalResult OpFormatParser::verifySuccessors(SMLoc loc) { return success(); } +LogicalResult +OpFormatParser::verifyOIListElements(SMLoc loc, + ArrayRef elements) { + // Check that all of the successors are within the format. + SmallVector prohibitedLiterals; + for (FormatElement *it : elements) { + if (auto *oilist = dyn_cast(it)) { + if (!prohibitedLiterals.empty()) { + // We just saw an oilist element in last iteration. Literals should not + // match. 
+ for (LiteralElement *literal : oilist->getLiteralElements()) { + if (find(prohibitedLiterals, literal->getSpelling()) != + prohibitedLiterals.end()) { + return emitError( + loc, "format ambiguity because " + literal->getSpelling() + + " is used in two adjacent oilist elements."); + } + } + } + for (LiteralElement *literal : oilist->getLiteralElements()) + prohibitedLiterals.push_back(literal->getSpelling()); + } else if (auto *literal = dyn_cast(it)) { + if (find(prohibitedLiterals, literal->getSpelling()) != + prohibitedLiterals.end()) { + return emitError( + loc, + "format ambiguity because " + literal->getSpelling() + + " is used both in oilist element and the adjacent literal."); + } + prohibitedLiterals.clear(); + } else { + prohibitedLiterals.clear(); + } + } + return success(); +} + void OpFormatParser::handleAllTypesMatchConstraint( ArrayRef values, llvm::StringMap &variableTyResolver) { @@ -2532,6 +2694,8 @@ OpFormatParser::parseDirectiveImpl(SMLoc loc, FormatToken::Kind kind, return parseReferenceDirective(loc, ctx); case FormatToken::kw_type: return parseTypeDirective(loc, ctx); + case FormatToken::kw_oilist: + return parseOIListDirective(loc, ctx); default: return emitError(loc, "unsupported directive kind"); @@ -2675,6 +2839,91 @@ OpFormatParser::parseSuccessorsDirective(SMLoc loc, Context context) { return create(); } +FailureOr +OpFormatParser::parseOIListDirective(SMLoc loc, Context context) { + if (failed(parseToken(FormatToken::l_paren, + "expected '(' before oilist argument list"))) + return failure(); + std::vector literalElements; + std::vector> parsingElements; + do { + FailureOr lelement = parseLiteral(context); + if (failed(lelement)) + return failure(); + literalElements.push_back(*lelement); + parsingElements.push_back(std::vector()); + std::vector &currParsingElements = parsingElements.back(); + while (peekToken().getKind() != FormatToken::pipe && + peekToken().getKind() != FormatToken::r_paren) { + FailureOr pelement = 
parseElement(context); + if (failed(pelement) || + failed(verifyOIListParsingElement(*pelement, loc))) + return failure(); + currParsingElements.push_back(*pelement); + } + if (peekToken().getKind() == FormatToken::pipe) { + consumeToken(); + continue; + } + if (peekToken().getKind() == FormatToken::r_paren) { + consumeToken(); + break; + } + } while (true); + + return create(std::move(literalElements), + std::move(parsingElements)); +} + +LogicalResult OpFormatParser::verifyOIListParsingElement(FormatElement *element, + SMLoc loc) { + return TypeSwitch(element) + // Only optional attributes can be within an oilist parsing group. + .Case([&](AttributeVariable *attrEle) { + if (!attrEle->getVar()->attr.isOptional()) + return emitError(loc, "only optional attributes can be used to " + "in an oilist parsing group"); + return success(); + }) + // Only optional-like(i.e. variadic) operands can be within an oilist + // parsing group. + .Case([&](OperandVariable *ele) { + if (!ele->getVar()->isVariableLength()) + return emitError(loc, "only variable length operands can be " + "used within an oilist parsing group"); + return success(); + }) + // Only optional-like(i.e. variadic) results can be within an oilist + // parsing group. + .Case([&](ResultVariable *ele) { + if (!ele->getVar()->isVariableLength()) + return emitError(loc, "only variable length results can be " + "used within an oilist parsing group"); + return success(); + }) + .Case([&](RegionVariable *) { + // TODO: When ODS has proper support for marking "optional" regions, add + // a check here. + return success(); + }) + .Case([&](TypeDirective *ele) { + return verifyOIListParsingElement(ele->getArg(), loc); + }) + .Case([&](FunctionalTypeDirective *ele) { + if (failed(verifyOIListParsingElement(ele->getInputs(), loc))) + return failure(); + return verifyOIListParsingElement(ele->getResults(), loc); + }) + // Literals, whitespace, and custom directives may be used. 
+ .Case( + [&](FormatElement *) { return success(); }) + .Default([&](FormatElement *) { + return emitError(loc, "only literals, types, and variables can be " + "used within an oilist group"); + }); +} + FailureOr OpFormatParser::parseTypeDirective(SMLoc loc, Context context) { if (context == TypeDirectiveContext) From fee491a10a3277c51115a7ab6e79fe5223618e86 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 16 Feb 2022 22:18:03 -0800 Subject: [PATCH 033/748] issue-release-workflow: Add support for /cherry-pick command in issue body Reviewed By: kwk Differential Revision: https://reviews.llvm.org/D119312 --- .github/workflows/issue-release-workflow.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml index 318b4c22379be..e0e03a8970ad0 100644 --- a/.github/workflows/issue-release-workflow.yml +++ b/.github/workflows/issue-release-workflow.yml @@ -19,9 +19,12 @@ on: types: - created - edited + issues: + types: + - opened env: - COMMENT_BODY: ${{ github.event.comment.body }} + COMMENT_BODY: ${{ github.event.action == 'opened' && github.event.issue.body || github.event.comment.body }} jobs: backport-commits: @@ -30,7 +33,7 @@ jobs: if: >- (github.repository == 'llvm/llvm-project') && !startswith(github.event.comment.body, '') && - contains(github.event.comment.body, '/cherry-pick') + contains(github.event.action == 'opened' && github.event.issue.body || github.event.comment.body, '/cherry-pick') steps: - name: Fetch LLVM sources uses: actions/checkout@v2 From d4332a88429f27c2924e2bf8309d6ce65d39dbca Mon Sep 17 00:00:00 2001 From: Damian Rouson Date: Thu, 11 Nov 2021 16:20:30 -0800 Subject: [PATCH 034/748] [flang] add semantics test for sync images Test a range of acceptable forms of SYNC IMAGES statements, including combinations with and without the stat-variable and errmsg-variable present. 
Also test that several invalid forms of SYNC IMAGES call generate the correct error messages. Differential Revision: https://reviews.llvm.org/D118933 --- flang/test/Semantics/synchronization02.f90 | 106 +++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 flang/test/Semantics/synchronization02.f90 diff --git a/flang/test/Semantics/synchronization02.f90 b/flang/test/Semantics/synchronization02.f90 new file mode 100644 index 0000000000000..1a2a4b9f18864 --- /dev/null +++ b/flang/test/Semantics/synchronization02.f90 @@ -0,0 +1,106 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! XFAIL: * +! Check for semantic errors in sync images statements + +program test_sync_images + implicit none + + integer, parameter :: invalid_rank(*,*) = reshape([1], [1,1]) + integer sync_status, non_scalar(2), superfluous_stat, coindexed_integer[*], me + character(len=128) error_message, superfluous_errmsg, coindexed_character[*] + logical invalid_type + + !___ standard-conforming statement ___ + + sync images(*, stat=sync_status, errmsg=error_message) + sync images(*, stat=sync_status ) + sync images(*, errmsg=error_message) + sync images(* ) + + sync images(me, stat=sync_status, errmsg=error_message) + sync images(me+1, stat=sync_status, errmsg=error_message) + sync images(1, stat=sync_status, errmsg=error_message) + sync images(1, stat=sync_status ) + sync images(1, errmsg=error_message) + sync images(1 ) + + sync images([1], stat=sync_status, errmsg=error_message) + sync images([1], stat=sync_status ) + sync images([1], errmsg=error_message) + sync images([1] ) + + !___ non-standard-conforming statement ___ + + !______ invalid image sets ______ + + ! Image set shall not depend on the value of stat-variable + !ERROR: TBD + sync images(sync_status, stat=sync_status) + + ! Image set shall not depend on the value of errmsg-variable + !ERROR: TBD + sync images(len(error_message), errmsg=error_message) + + ! 
Image set shall be a scalar or rank-1 array + !ERROR: TBD + sync images(invalid_rank) + + !______ invalid sync-stat-lists: invalid stat= ____________ + + ! Invalid sync-stat-list keyword + !ERROR: expected ')' + sync images(1, status=sync_status) + + !ERROR: TBD + sync images([1], stat=invalid_type) + + ! Stat-variable must an integer scalar + !ERROR: TBD + sync images(*, stat=non_scalar) + + ! Invalid sync-stat-list: missing stat-variable + !ERROR: expected ')' + sync images(1, stat) + + ! Invalid sync-stat-list: missing 'stat=' + !ERROR: expected ')' + sync images([1], sync_status) + + !______ invalid sync-stat-lists: invalid errmsg= ____________ + + ! Invalid errmsg-variable keyword + !ERROR: expected ')' + sync images(*, errormsg=error_message) + + !ERROR: TBD + sync images(1, errmsg=invalid_type) + + ! Invalid sync-stat-list: missing 'errmsg=' + !ERROR: expected ')' + sync images([1], error_message) + + ! Invalid sync-stat-list: missing errmsg-variable + !ERROR: expected ')' + sync images(*, errmsg) + + !______ invalid sync-stat-lists: redundant sync-stat-list ____________ + + ! No specifier shall appear more than once in a given sync-stat-list + !ERROR: to be determined + sync images(1, stat=sync_status, stat=superfluous_stat) + + ! No specifier shall appear more than once in a given sync-stat-list + !ERROR: to be determined + sync images([1], errmsg=error_message, errmsg=superfluous_errmsg) + + !______ invalid sync-stat-lists: coindexed stat-variable ____________ + + ! Check constraint C1173 from the Fortran 2018 standard + !ERROR: to be determined + sync images(*, stat=coindexed_integer[1]) + + ! 
Check constraint C1173 from the Fortran 2018 standard + !ERROR: to be determined + sync images(1, errmsg=coindexed_character[1]) + +end program test_sync_images From 910a642c0a5b66a8d2517026b890a1acdc447f19 Mon Sep 17 00:00:00 2001 From: Pavel Kosov Date: Thu, 17 Feb 2022 10:11:25 +0300 Subject: [PATCH 035/748] [compiler-rt] Implement ARM atomic operations for architectures without SMP support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ARMv5 and older architectures don’t support SMP and do not have atomic instructions. Still they’re in use in IoT world, where one has to stick to libgcc. Reviewed By: mstorsjo Differential Revision: https://reviews.llvm.org/D116088 --- .../cmake/Modules/CompilerRTUtils.cmake | 10 ++++++ compiler-rt/cmake/config-ix.cmake | 5 +++ compiler-rt/lib/builtins/CMakeLists.txt | 1 + compiler-rt/lib/builtins/arm/sync-ops.h | 33 +++++++++++++++++++ 4 files changed, 49 insertions(+) diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake index 052095801aaed..557fa96eea5eb 100644 --- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake @@ -110,6 +110,16 @@ function(check_compile_definition def argstring out_var) cmake_pop_check_state() endfunction() +macro(test_arm_smp_support arch cflags_var) + if (${arch} STREQUAL "arm") + try_compile(HAS_${arch}_SMP ${CMAKE_BINARY_DIR} + ${ARM_SMP_CHECK_SRC} COMPILE_DEFINITIONS "${CMAKE_C_FLAGS} ${_TARGET_${arch}_CFLAGS}") + if (HAS_${arch}_SMP) + list(APPEND ${cflags_var} -DCOMPILER_RT_HAS_SMP_SUPPORT) + endif() + endif() +endmacro() + # test_target_arch( ) # Checks if architecture is supported: runs host compiler with provided # flags to verify that: diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 4299a0589a7b7..ccf57009fd43f 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ 
-200,6 +200,11 @@ set(COMPILER_RT_SUPPORTED_ARCH) set(SIMPLE_SOURCE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/simple.cc) file(WRITE ${SIMPLE_SOURCE} "#include \n#include \nint main() { printf(\"hello, world\"); }\n") +# Check if we have SMP support for particular ARM architecture +# If not use stubs instead of real atomic operations - see sync-ops.h +set(ARM_SMP_CHECK_SRC ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/arm-barrier.cc) +file(WRITE ${ARM_SMP_CHECK_SRC} "int main() { asm(\"dmb\"); return 0; }") + # Detect whether the current target platform is 32-bit or 64-bit, and setup # the correct commandline flags needed to attempt to target 32-bit and 64-bit. if (NOT CMAKE_SIZEOF_VOID_P EQUAL 4 AND diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index ea5ad9cdb8643..12268dc1ce63d 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -740,6 +740,7 @@ else () list(APPEND BUILTIN_CFLAGS_${arch} -fomit-frame-pointer -DCOMPILER_RT_ARMHF_TARGET) endif() + test_arm_smp_support(${arch} BUILTIN_CFLAGS_${arch}) # For RISCV32, we must force enable int128 for compiling long # double routines. 
if("${arch}" STREQUAL "riscv32") diff --git a/compiler-rt/lib/builtins/arm/sync-ops.h b/compiler-rt/lib/builtins/arm/sync-ops.h index c9623249e5d20..b924b33f80eb3 100644 --- a/compiler-rt/lib/builtins/arm/sync-ops.h +++ b/compiler-rt/lib/builtins/arm/sync-ops.h @@ -14,6 +14,8 @@ #include "../assembly.h" +#ifdef COMPILER_RT_HAS_SMP_SUPPORT + #define SYNC_OP_4(op) \ .p2align 2; \ .thumb; \ @@ -45,6 +47,37 @@ dmb; \ pop { r4, r5, r6, pc } +#else + +#define SYNC_OP_4(op) \ + .p2align 2; \ + DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ + LOCAL_LABEL(tryatomic_##op) : \ + mov r12, r0; \ + op(r2, r0, r1); \ + str r2, [r12]; \ + ldr r12, [r12]; \ + cmp r12, r2; \ + bne LOCAL_LABEL(tryatomic_##op); \ + bx lr + +#define SYNC_OP_8(op) \ + .p2align 2; \ + DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ + push {r4, r5, r6, lr}; \ + LOCAL_LABEL(tryatomic_##op) : \ + mov r12, r0; \ + op(r4, r5, r0, r1, r2, r3); \ + stm r12, {r4, r5}; \ + ldm r12, {r6, r12}; \ + cmp r6, r4; \ + bne LOCAL_LABEL(tryatomic_##op); \ + cmp r12, r5; \ + bne LOCAL_LABEL(tryatomic_##op); \ + pop { r4, r5, r6, pc } + +#endif + #define MINMAX_4(rD, rN, rM, cmp_kind) \ cmp rN, rM; \ mov rD, rM; \ From f165c23bf3598990aaf2174a6bc40be75199ee1a Mon Sep 17 00:00:00 2001 From: Pavel Kosov Date: Thu, 17 Feb 2022 10:21:22 +0300 Subject: [PATCH 036/748] [NFC][compiler-rt] Format file lib/builtins/arm/sync-ops.h --- compiler-rt/lib/builtins/arm/sync-ops.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/builtins/arm/sync-ops.h b/compiler-rt/lib/builtins/arm/sync-ops.h index b924b33f80eb3..9f7f23f98f085 100644 --- a/compiler-rt/lib/builtins/arm/sync-ops.h +++ b/compiler-rt/lib/builtins/arm/sync-ops.h @@ -36,7 +36,7 @@ .thumb; \ .syntax unified; \ DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - push {r4, r5, r6, lr}; \ + push{r4, r5, r6, lr}; \ dmb; \ mov r12, r0; \ LOCAL_LABEL(tryatomic_##op) : ldrexd r0, r1, [r12]; \ @@ -52,8 +52,7 @@ 
 #define SYNC_OP_4(op) \ .p2align 2; \ DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - LOCAL_LABEL(tryatomic_##op) : \ - mov r12, r0; \ + LOCAL_LABEL(tryatomic_##op) : mov r12, r0; \ op(r2, r0, r1); \ str r2, [r12]; \ ldr r12, [r12]; \ @@ -64,9 +63,8 @@ #define SYNC_OP_8(op) \ .p2align 2; \ DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - push {r4, r5, r6, lr}; \ - LOCAL_LABEL(tryatomic_##op) : \ - mov r12, r0; \ + push{r4, r5, r6, lr}; \ + LOCAL_LABEL(tryatomic_##op) : mov r12, r0; \ op(r4, r5, r0, r1, r2, r3); \ stm r12, {r4, r5}; \ ldm r12, {r6, r12}; \ From d271fc04d5b97b12e6b797c6067d3c96a8d7470e Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Wed, 16 Feb 2022 15:38:14 +0300 Subject: [PATCH 037/748] [mlir][gpu] Split ops sinking from gpu-kernel-outlining pass into separate pass Previously `gpu-kernel-outlining` pass was also doing index computation sinking into gpu.launch before actual outlining. Split ops sinking from `gpu-kernel-outlining` pass into separate pass, so users can use their own sinking pass before outlining. To achieve old behavior users will need to call both passes: `-gpu-launch-sink-index-computations -gpu-kernel-outlining`.
Differential Revision: https://reviews.llvm.org/D119932 --- mlir/include/mlir/Dialect/GPU/Passes.h | 4 + mlir/include/mlir/Dialect/GPU/Passes.td | 6 ++ .../GPU/Transforms/KernelOutlining.cpp | 29 ++++- mlir/lib/Dialect/GPU/Transforms/PassDetail.h | 1 + mlir/test/Dialect/GPU/outlining.mlir | 4 +- mlir/test/Dialect/GPU/sink-ops.mlir | 100 ++++++++++++++++++ 6 files changed, 138 insertions(+), 6 deletions(-) create mode 100644 mlir/test/Dialect/GPU/sink-ops.mlir diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h index c9c6f8668b4d3..729363ace255b 100644 --- a/mlir/include/mlir/Dialect/GPU/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Passes.h @@ -23,6 +23,10 @@ class Module; } // namespace llvm namespace mlir { +/// Pass that moves ops which are likely an index computation into gpu.launch +/// body. +std::unique_ptr createGpuLauchSinkIndexComputationsPass(); + /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into /// a separate kernel function. 
std::unique_ptr> diff --git a/mlir/include/mlir/Dialect/GPU/Passes.td b/mlir/include/mlir/Dialect/GPU/Passes.td index eaabc6ea36012..0380e1db5e01e 100644 --- a/mlir/include/mlir/Dialect/GPU/Passes.td +++ b/mlir/include/mlir/Dialect/GPU/Passes.td @@ -11,6 +11,12 @@ include "mlir/Pass/PassBase.td" +def GpuLaunchSinkIndexComputations : Pass<"gpu-launch-sink-index-computations"> { + let summary = "Sink index computations into gpu.launch body"; + let constructor = "mlir::createGpuLauchSinkIndexComputationsPass()"; + let dependentDialects = ["mlir::gpu::GPUDialect"]; +} + def GpuKernelOutlining : Pass<"gpu-kernel-outlining", "ModuleOp"> { let summary = "Outline gpu.launch bodies to kernel functions"; let constructor = "mlir::createGpuKernelOutliningPass()"; diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index c8b9b9b2bc9a2..3b0f51444e027 100644 --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -59,7 +59,7 @@ static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, /// Identifies operations that are beneficial to sink into kernels. These /// operations may not have side-effects, as otherwise sinking (and hence /// duplicating them) is not legal. -static bool isLikelyAnIndexComputatio(Operation *op) { +static bool isLikelyAnIndexComputation(Operation *op) { return isa(op); } @@ -232,6 +232,26 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, } namespace { +/// Pass that moves ops which are likely an index computation into gpu.launch +/// body. 
+class GpuLaunchSinkIndexComputationsPass + : public GpuLaunchSinkIndexComputationsBase< + GpuLaunchSinkIndexComputationsPass> { +public: + void runOnOperation() override { + Operation *op = getOperation(); + if (op->walk([](gpu::LaunchOp launch) { + // Pull in instructions that can be sunk + if (failed(sinkOperationsIntoLaunchOp(launch, + isLikelyAnIndexComputation))) + return WalkResult::interrupt(); + + return WalkResult::advance(); + }).wasInterrupted()) + signalPassFailure(); + } +}; + /// Pass that moves the kernel of each LaunchOp into its separate nested module. /// /// This pass moves the kernel code of each LaunchOp into a function created @@ -280,9 +300,6 @@ class GpuKernelOutliningPass std::string kernelFnName = Twine(op->getParentOfType().getName(), "_kernel").str(); - // Pull in instructions that can be sunk - if (failed(sinkOperationsIntoLaunchOp(op, isLikelyAnIndexComputatio))) - return WalkResult::interrupt(); gpu::GPUFuncOp outlinedFunc = outlineKernelFuncImpl(op, kernelFnName, operands); @@ -360,6 +377,10 @@ class GpuKernelOutliningPass } // namespace +std::unique_ptr mlir::createGpuLauchSinkIndexComputationsPass() { + return std::make_unique(); +} + std::unique_ptr> mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) { return std::make_unique(dataLayoutStr); diff --git a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h index 44e99e00fa155..faa9d3cf7231a 100644 --- a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h +++ b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h @@ -11,6 +11,7 @@ #include "mlir/Dialect/Async/IR/Async.h" #include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Pass/Pass.h" namespace mlir { diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index 4b15e5b449066..fc418ca442c46 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt 
-allow-unregistered-dialect -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining=data-layout-str='#dlti.dl_spec<#dlti.dl_entry>' -split-input-file %s | FileCheck --check-prefix CHECK-DL %s +// RUN: mlir-opt -allow-unregistered-dialect -gpu-launch-sink-index-computations -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect -gpu-launch-sink-index-computations -gpu-kernel-outlining=data-layout-str='#dlti.dl_spec<#dlti.dl_entry>' -split-input-file %s | FileCheck --check-prefix CHECK-DL %s // CHECK: module attributes {gpu.container_module} diff --git a/mlir/test/Dialect/GPU/sink-ops.mlir b/mlir/test/Dialect/GPU/sink-ops.mlir new file mode 100644 index 0000000000000..e2b4c238b9ce2 --- /dev/null +++ b/mlir/test/Dialect/GPU/sink-ops.mlir @@ -0,0 +1,100 @@ +// RUN: mlir-opt -allow-unregistered-dialect -gpu-launch-sink-index-computations -split-input-file -verify-diagnostics %s | FileCheck %s + + +// CHECK-LABEL: @extra_constants +// CHECK-SAME: %[[ARG0:.*]]: memref +func @extra_constants(%arg0: memref) { + %cst = arith.constant 8 : index + %cst2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %cst3 = memref.dim %arg0, %c0 : memref + // CHECK: gpu.launch blocks + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, + %grid_z = %cst) + threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst, + %block_z = %cst) { + // CHECK-NOT: arith.constant 8 + // CHECK: %[[CST2:.*]] = arith.constant 2 + // CHECK-NEXT: %[[CST0:.*]] = arith.constant 0 + // CHECK-NEXT: %[[DIM:.*]] = memref.dim %[[ARG0]], %[[CST0]] + // CHECK-NEXT: "use"(%[[CST2]], %[[ARG0]], %[[DIM]]) : (index, memref, index) -> () + // CHECK-NEXT: gpu.terminator + "use"(%cst2, %arg0, %cst3) : (index, memref, index) -> () + gpu.terminator + } + return +} + +// ----- + +// CHECK-LABEL: @extra_constants_not_inlined +// CHECK-SAME: 
%[[ARG0:.*]]: memref +func @extra_constants_not_inlined(%arg0: memref) { + %cst = arith.constant 8 : index + %cst2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + // CHECK: %[[CST_X:.*]] = "secret_constant"() + %cst3 = "secret_constant"() : () -> index + // CHECK: gpu.launch blocks + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, + %grid_z = %cst) + threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst, + %block_z = %cst) { + // CHECK-NOT: arith.constant 8 + // CHECK-NOT: "secret_constant"() + // CHECK: %[[CST2:.*]] = arith.constant 2 + // CHECK-NEXT: "use"(%[[CST2]], %[[ARG0]], %[[CST_X]]) : (index, memref, index) -> () + // CHECK-NEXT: gpu.terminator + "use"(%cst2, %arg0, %cst3) : (index, memref, index) -> () + gpu.terminator + } + return +} + +// ----- + +// CHECK-LABEL: @multiple_uses +func @multiple_uses(%arg0 : memref) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + // CHECK: gpu.launch blocks + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, + %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, + %block_z = %c1) { + // CHECK: %[[C2:.*]] = arith.constant 2 + // CHECK-NEXT: "use1"(%[[C2]], %[[C2]]) + // CHECK-NEXT: "use2"(%[[C2]]) + // CHECK-NEXT: gpu.terminator + "use1"(%c2, %c2) : (index, index) -> () + "use2"(%c2) : (index) -> () + gpu.terminator + } + return +} + +// ----- + +// CHECK-LABEL: @multiple_uses2 +func @multiple_uses2(%arg0 : memref<*xf32>) { + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %d = memref.dim %arg0, %c2 : memref<*xf32> + // CHECK: gpu.launch blocks + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, + %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, + %block_z = %c1) { + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[D:.*]] = memref.dim %[[ARG:.*]], %[[C2]] + // CHECK: "use1"(%[[D]]) + // CHECK: "use2"(%[[C2]], %[[C2]]) + // CHECK: "use3"(%[[ARG]]) 
+ // CHECK: gpu.terminator + "use1"(%d) : (index) -> () + "use2"(%c2, %c2) : (index, index) -> () + "use3"(%arg0) : (memref<*xf32>) -> () + gpu.terminator + } + return +} From 859567725d8971477ed6a14799645c818c7878ad Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Feb 2022 09:28:33 +0100 Subject: [PATCH 038/748] [IndVars] Don't run full optimization pipeline in test (NFC) This extracts the IR prior to IndVarSimplify and only runs the single pass. --- .../Transforms/IndVarSimplify/X86/pr45360.ll | 167 ++++++++---------- 1 file changed, 70 insertions(+), 97 deletions(-) diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr45360.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr45360.ll index 8f43029fa3034..a124dbd970d69 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/pr45360.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr45360.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; TODO: Run under new PM after switch. The IR is the same but basic block labels are different. -; RUN: opt -S -O2 -scev-cheap-expansion-budget=1024 %s -enable-new-pm=0 | FileCheck %s +; RUN: opt -S -indvars -scev-cheap-expansion-budget=1024 %s | FileCheck %s ; See https://bugs.llvm.org/show_bug.cgi?id=45360 ; This is reduced from that (runnable) test. 
@@ -17,123 +16,97 @@ target triple = "x86_64-pc-linux-gnu" @b = dso_local global i32 0, align 4 @e = dso_local global i32 0, align 4 -define dso_local i32 @main() { +define i32 @main() { ; CHECK-LABEL: @main( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[I6:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[I24:%.*]] = load i32, i32* @b, align 4 -; CHECK-NEXT: [[D_PROMOTED7:%.*]] = load i32, i32* @d, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[D_PROMOTED7]], [[I6]] -; CHECK-NEXT: [[I21:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[I21]], label [[BB27_THREAD:%.*]], label [[BB27_PREHEADER:%.*]] -; CHECK: bb27.preheader: -; CHECK-NEXT: [[I26:%.*]] = urem i32 [[I24]], [[TMP0]] -; CHECK-NEXT: store i32 [[I26]], i32* @e, align 4 -; CHECK-NEXT: [[I30_NOT:%.*]] = icmp eq i32 [[I26]], 0 -; CHECK-NEXT: br label [[BB27:%.*]] +; CHECK-NEXT: [[D_PROMOTED10:%.*]] = load i32, i32* @d, align 4 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb13.preheader: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0:%.*]], [[BB5]] ] +; CHECK-NEXT: [[I21:%.*]] = icmp eq i32 [[DOTLCSSA]], 0 +; CHECK-NEXT: br i1 [[I21]], label [[BB27_THREAD:%.*]], label [[BB27:%.*]] +; CHECK: bb5: +; CHECK-NEXT: [[TMP0]] = and i32 [[D_PROMOTED10]], [[I6]] +; CHECK-NEXT: br i1 false, label [[BB5]], label [[BB13_PREHEADER:%.*]] ; CHECK: bb27.thread: -; CHECK-NEXT: store i32 0, i32* @d, align 4 -; CHECK-NEXT: store i32 -1, i32* @f, align 4 +; CHECK-NEXT: [[DOTLCSSA_LCSSA:%.*]] = phi i32 [ [[DOTLCSSA]], [[BB13_PREHEADER]] ] +; CHECK-NEXT: [[I11_LCSSA_LCSSA:%.*]] = phi i32 [ -1, [[BB13_PREHEADER]] ] +; CHECK-NEXT: store i32 [[DOTLCSSA_LCSSA]], i32* @d, align 4 +; CHECK-NEXT: store i32 [[I11_LCSSA_LCSSA]], i32* @f, align 4 ; CHECK-NEXT: store i32 0, i32* @c, align 4 +; CHECK-NEXT: store i32 0, i32* @e, align 4 ; CHECK-NEXT: br label [[BB32:%.*]] ; CHECK: bb27: +; CHECK-NEXT: [[I26:%.*]] = urem i32 [[I24]], [[DOTLCSSA]] +; CHECK-NEXT: store i32 [[I26]], i32* @e, align 
4 +; CHECK-NEXT: [[I30_NOT:%.*]] = icmp eq i32 [[I26]], 0 ; CHECK-NEXT: br i1 [[I30_NOT]], label [[BB32_LOOPEXIT:%.*]], label [[BB36:%.*]] ; CHECK: bb32.loopexit: -; CHECK-NEXT: store i32 [[TMP0]], i32* @d, align 4 -; CHECK-NEXT: store i32 -1, i32* @f, align 4 +; CHECK-NEXT: [[DOTLCSSA_LCSSA15:%.*]] = phi i32 [ [[DOTLCSSA]], [[BB27]] ] +; CHECK-NEXT: [[I11_LCSSA_LCSSA14:%.*]] = phi i32 [ -1, [[BB27]] ] +; CHECK-NEXT: store i32 [[DOTLCSSA_LCSSA15]], i32* @d, align 4 +; CHECK-NEXT: store i32 [[I11_LCSSA_LCSSA14]], i32* @f, align 4 +; CHECK-NEXT: store i32 0, i32* @c, align 4 ; CHECK-NEXT: br label [[BB32]] ; CHECK: bb32: -; CHECK-NEXT: [[C_SINK:%.*]] = phi i32* [ @c, [[BB32_LOOPEXIT]] ], [ @e, [[BB27_THREAD]] ] -; CHECK-NEXT: store i32 0, i32* [[C_SINK]], align 4 ; CHECK-NEXT: ret i32 0 ; CHECK: bb36: ; CHECK-NEXT: store i32 1, i32* @c, align 4 -; CHECK-NEXT: br i1 [[I21]], label [[BB27_THREAD]], label [[BB27]] +; CHECK-NEXT: br label [[BB1]] ; bb: - %i = alloca i32, align 4 - store i32 0, i32* %i, align 4 - br label %bb1 - -bb1: - store i32 0, i32* @f, align 4 - br label %bb2 - -bb2: - %i3 = load i32, i32* @f, align 4 - %i4 = icmp sge i32 %i3, 0 - br i1 %i4, label %bb5, label %bb12 - -bb5: %i6 = load i32, i32* @a, align 4 - %i7 = load i32, i32* @d, align 4 - %i8 = and i32 %i7, %i6 - store i32 %i8, i32* @d, align 4 - br label %bb9 - -bb9: - %i10 = load i32, i32* @f, align 4 - %i11 = add nsw i32 %i10, -1 - store i32 %i11, i32* @f, align 4 - br label %bb2 - -bb12: - store i32 0, i32* @c, align 4 - br label %bb13 - -bb13: - %i14 = load i32, i32* @c, align 4 - %i15 = icmp sle i32 %i14, 0 - br i1 %i15, label %bb16, label %bb39 - -bb16: - %i17 = load i32, i32* @f, align 4 - %i18 = icmp ne i32 %i17, 0 - br i1 %i18, label %bb19, label %bb34 - -bb19: - %i20 = load i32, i32* @d, align 4 - %i21 = icmp eq i32 %i20, 0 - br i1 %i21, label %bb22, label %bb23 - -bb22: - br label %bb27 - -bb23: %i24 = load i32, i32* @b, align 4 - %i25 = load i32, i32* @d, align 4 - %i26 = urem i32 
%i24, %i25 - br label %bb27 - -bb27: - %i28 = phi i32 [ 0, %bb22 ], [ %i26, %bb23 ] - store i32 %i28, i32* @e, align 4 - %i29 = load i32, i32* @e, align 4 - %i30 = icmp ne i32 %i29, 0 - br i1 %i30, label %bb31, label %bb32 + %d.promoted10 = load i32, i32* @d, align 4 + br label %bb1 -bb31: - br label %bb33 +bb1: ; preds = %bb36, %bb + br label %bb5 + +bb13.preheader: ; preds = %bb5 + %.lcssa = phi i32 [ %0, %bb5 ] + %i11.lcssa = phi i32 [ %i11, %bb5 ] + %i21 = icmp eq i32 %.lcssa, 0 + br i1 %i21, label %bb27.thread, label %bb27 + +bb5: ; preds = %bb1, %bb5 + %storemerge6 = phi i32 [ 0, %bb1 ], [ %i11, %bb5 ] + %0 = and i32 %d.promoted10, %i6 + %i11 = add nsw i32 %storemerge6, -1 + %i4 = icmp sgt i32 %storemerge6, 0 + br i1 %i4, label %bb5, label %bb13.preheader + +bb27.thread: ; preds = %bb13.preheader + %.lcssa.lcssa = phi i32 [ %.lcssa, %bb13.preheader ] + %i11.lcssa.lcssa = phi i32 [ %i11.lcssa, %bb13.preheader ] + store i32 %.lcssa.lcssa, i32* @d, align 4 + store i32 %i11.lcssa.lcssa, i32* @f, align 4 + store i32 0, i32* @c, align 4 + store i32 0, i32* @e, align 4 + br label %bb32 + +bb27: ; preds = %bb13.preheader + %i26 = urem i32 %i24, %.lcssa + store i32 %i26, i32* @e, align 4 + %i30.not = icmp eq i32 %i26, 0 + br i1 %i30.not, label %bb32.loopexit, label %bb36 + +bb32.loopexit: ; preds = %bb27 + %.lcssa.lcssa15 = phi i32 [ %.lcssa, %bb27 ] + %i11.lcssa.lcssa14 = phi i32 [ %i11.lcssa, %bb27 ] + store i32 %.lcssa.lcssa15, i32* @d, align 4 + store i32 %i11.lcssa.lcssa14, i32* @f, align 4 + store i32 0, i32* @c, align 4 + br label %bb32 -bb32: +bb32: ; preds = %bb32.loopexit, %bb27.thread ret i32 0 -bb33: - br label %bb35 - -bb34: - store i32 0, i32* @d, align 4 - br label %bb35 - -bb35: - br label %bb36 - -bb36: - %i37 = load i32, i32* @c, align 4 - %i38 = add nsw i32 %i37, 1 - store i32 %i38, i32* @c, align 4 - br label %bb13 - -bb39: +bb36: ; preds = %bb27 + store i32 1, i32* @c, align 4 br label %bb1 } From 0ae2464fcd4d2c2f285b83d16ff6e2426dd722d2 Mon Sep 
17 00:00:00 2001 From: Marek Kurdej Date: Thu, 17 Feb 2022 09:45:52 +0100 Subject: [PATCH 039/748] [clang-format] Fix wrong assertion with non-negative shift when aligning tokens. Fixes https://github.com/llvm/llvm-project/issues/53880. --- clang/lib/Format/WhitespaceManager.cpp | 2 +- clang/unittests/Format/FormatTestSelective.cpp | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 758dc5860888e..55e0b7f8e8d9e 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -406,7 +406,7 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, Changes[i].Spaces += Shift; // We should not remove required spaces unless we break the line before. - assert(Changes[i].NewlinesBefore > 0 || + assert(Shift >= 0 || Changes[i].NewlinesBefore > 0 || Changes[i].Spaces >= static_cast(Changes[i].Tok->SpacesRequiredBefore) || Changes[i].Tok->is(tok::eof)); diff --git a/clang/unittests/Format/FormatTestSelective.cpp b/clang/unittests/Format/FormatTestSelective.cpp index c88d1b8bd8ba2..2725e4cf776f6 100644 --- a/clang/unittests/Format/FormatTestSelective.cpp +++ b/clang/unittests/Format/FormatTestSelective.cpp @@ -603,6 +603,14 @@ TEST_F(FormatTestSelective, KeepsIndentAfterCommentSectionImport) { EXPECT_EQ(Code, format(Code, 47, 1)); } +TEST_F(FormatTestSelective, DontAssert) { + // https://llvm.org/PR53880 + std::string Code = "void f() {\n" + " return a == 8 ? 
32 : 16;\n" + "}\n"; + EXPECT_EQ(Code, format(Code, 40, 0)); +} + } // end namespace } // end namespace format } // end namespace clang From 32d2473a5dba417eb8d34146575289e4e53c91fa Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 16 Feb 2022 20:39:59 -0800 Subject: [PATCH 040/748] [BOLT][NFC] Report errors from createBinaryContext and RewriteInstance ctor Refactor createBinaryContext and RewriteInstance/MachORewriteInstance constructors to report an error in a library and fuzzer-friendly way instead of returning a nullptr or exiting. Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D119658 --- bolt/include/bolt/Core/BinaryContext.h | 2 +- .../bolt/Rewrite/MachORewriteInstance.h | 10 ++- bolt/include/bolt/Rewrite/RewriteInstance.h | 9 ++- bolt/lib/Core/BinaryContext.cpp | 74 +++++++++---------- bolt/lib/Rewrite/DWARFRewriter.cpp | 5 +- bolt/lib/Rewrite/MachORewriteInstance.cpp | 26 ++++++- bolt/lib/Rewrite/RewriteInstance.cpp | 30 ++++++-- bolt/tools/driver/llvm-bolt.cpp | 24 +++++- bolt/tools/heatmap/heatmap.cpp | 7 +- bolt/unittests/Core/MCPlusBuilder.cpp | 4 +- 10 files changed, 132 insertions(+), 59 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index aff770112be1c..ce246d51281a1 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -211,7 +211,7 @@ class BinaryContext { std::map DwarfLineTablesCUMap; public: - static std::unique_ptr + static Expected> createBinaryContext(const ObjectFile *File, bool IsPIC, std::unique_ptr DwCtx); diff --git a/bolt/include/bolt/Rewrite/MachORewriteInstance.h b/bolt/include/bolt/Rewrite/MachORewriteInstance.h index 81a6331b6462d..0d3b72d5eac8c 100644 --- a/bolt/include/bolt/Rewrite/MachORewriteInstance.h +++ b/bolt/include/bolt/Rewrite/MachORewriteInstance.h @@ -65,7 +65,15 @@ class MachORewriteInstance { void rewriteFile(); public: - MachORewriteInstance(object::MachOObjectFile *InputFile, StringRef 
ToolPath); + // This constructor has complex initialization that can fail during + // construction. Constructors can’t return errors, so clients must test \p Err + // after the object is constructed. Use createMachORewriteInstance instead. + MachORewriteInstance(object::MachOObjectFile *InputFile, StringRef ToolPath, + Error &Err); + + static Expected> + createMachORewriteInstance(object::MachOObjectFile *InputFile, + StringRef ToolPath); ~MachORewriteInstance(); Error setProfile(StringRef FileName); diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index e0cb8b1fd631f..6c8d91042e0a3 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -41,8 +41,15 @@ class ProfileReaderBase; /// events. class RewriteInstance { public: + // This constructor has complex initialization that can fail during + // construction. Constructors can’t return errors, so clients must test \p Err + // after the object is constructed. Use createRewriteInstance instead. RewriteInstance(llvm::object::ELFObjectFileBase *File, const int Argc, - const char *const *Argv, StringRef ToolPath); + const char *const *Argv, StringRef ToolPath, Error &Err); + + static Expected> + createRewriteInstance(llvm::object::ELFObjectFileBase *File, const int Argc, + const char *const *Argv, StringRef ToolPath); ~RewriteInstance(); /// Assign profile from \p Filename to this instance. 
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 36745580217ed..36092e3a945f7 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -33,6 +33,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" #include "llvm/Support/Regex.h" #include #include @@ -115,7 +116,7 @@ BinaryContext::~BinaryContext() { /// Create BinaryContext for a given architecture \p ArchName and /// triple \p TripleName. -std::unique_ptr +Expected> BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC, std::unique_ptr DwCtx) { StringRef ArchName = ""; @@ -131,8 +132,8 @@ BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC, "+fullfp16,+spe,+fuse-aes,+rcpc"; break; default: - errs() << "BOLT-ERROR: Unrecognized machine in ELF file.\n"; - return nullptr; + return createStringError(std::errc::not_supported, + "BOLT-ERROR: Unrecognized machine in ELF file"); } auto TheTriple = std::make_unique(File->makeTriple()); @@ -141,39 +142,37 @@ BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC, std::string Error; const Target *TheTarget = TargetRegistry::lookupTarget(std::string(ArchName), *TheTriple, Error); - if (!TheTarget) { - errs() << "BOLT-ERROR: " << Error; - return nullptr; - } + if (!TheTarget) + return createStringError(make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: ", Error)); std::unique_ptr MRI( TheTarget->createMCRegInfo(TripleName)); - if (!MRI) { - errs() << "BOLT-ERROR: no register info for target " << TripleName << "\n"; - return nullptr; - } + if (!MRI) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: no register info for target ", TripleName)); // Set up disassembler. 
std::unique_ptr AsmInfo( TheTarget->createMCAsmInfo(*MRI, TripleName, MCTargetOptions())); - if (!AsmInfo) { - errs() << "BOLT-ERROR: no assembly info for target " << TripleName << "\n"; - return nullptr; - } + if (!AsmInfo) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: no assembly info for target ", TripleName)); std::unique_ptr STI( TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr)); - if (!STI) { - errs() << "BOLT-ERROR: no subtarget info for target " << TripleName << "\n"; - return nullptr; - } + if (!STI) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: no subtarget info for target ", TripleName)); std::unique_ptr MII(TheTarget->createMCInstrInfo()); - if (!MII) { - errs() << "BOLT-ERROR: no instruction info for target " << TripleName - << "\n"; - return nullptr; - } + if (!MII) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: no instruction info for target ", TripleName)); std::unique_ptr Ctx( new MCContext(*TheTriple, AsmInfo.get(), MRI.get(), STI.get())); @@ -198,28 +197,27 @@ BinaryContext::createBinaryContext(const ObjectFile *File, bool IsPIC, std::unique_ptr DisAsm( TheTarget->createMCDisassembler(*STI, *Ctx)); - if (!DisAsm) { - errs() << "BOLT-ERROR: no disassembler for target " << TripleName << "\n"; - return nullptr; - } + if (!DisAsm) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: no disassembler info for target ", TripleName)); std::unique_ptr MIA( TheTarget->createMCInstrAnalysis(MII.get())); - if (!MIA) { - errs() << "BOLT-ERROR: failed to create instruction analysis for target" - << TripleName << "\n"; - return nullptr; - } + if (!MIA) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: failed to create instruction analysis for target ", + TripleName)); int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); 
std::unique_ptr InstructionPrinter( TheTarget->createMCInstPrinter(*TheTriple, AsmPrinterVariant, *AsmInfo, *MII, *MRI)); - if (!InstructionPrinter) { - errs() << "BOLT-ERROR: no instruction printer for target " << TripleName - << '\n'; - return nullptr; - } + if (!InstructionPrinter) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: no instruction printer for target ", TripleName)); InstructionPrinter->setPrintImmHex(true); std::unique_ptr MCE( diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 071759a436f04..40d43bf858f8f 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/ToolOutputFile.h" @@ -863,11 +864,11 @@ namespace { std::unique_ptr createDwarfOnlyBC(const object::ObjectFile &File) { - return BinaryContext::createBinaryContext( + return cantFail(BinaryContext::createBinaryContext( &File, false, DWARFContext::create(File, DWARFContext::ProcessDebugRelocations::Ignore, nullptr, "", WithColor::defaultErrorHandler, - WithColor::defaultWarningHandler)); + WithColor::defaultWarningHandler))); } StringMap diff --git a/bolt/lib/Rewrite/MachORewriteInstance.cpp b/bolt/lib/Rewrite/MachORewriteInstance.cpp index 207f6c070fa8f..00fe8cf9fb5f2 100644 --- a/bolt/lib/Rewrite/MachORewriteInstance.cpp +++ b/bolt/lib/Rewrite/MachORewriteInstance.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/ToolOutputFile.h" +#include namespace opts { @@ -82,11 +83,28 @@ MCPlusBuilder *createMCPlusBuilder(const Triple::ArchType Arch, #define DEBUG_TYPE "bolt" +Expected> +MachORewriteInstance::createMachORewriteInstance( + object::MachOObjectFile *InputFile, 
StringRef ToolPath) { + Error Err = Error::success(); + auto MachORI = + std::make_unique(InputFile, ToolPath, Err); + if (Err) + return std::move(Err); + return MachORI; +} + MachORewriteInstance::MachORewriteInstance(object::MachOObjectFile *InputFile, - StringRef ToolPath) - : InputFile(InputFile), ToolPath(ToolPath), - BC(BinaryContext::createBinaryContext(InputFile, /* IsPIC */ true, - DWARFContext::create(*InputFile))) { + StringRef ToolPath, Error &Err) + : InputFile(InputFile), ToolPath(ToolPath) { + ErrorAsOutParameter EAO(&Err); + auto BCOrErr = BinaryContext::createBinaryContext( + InputFile, /* IsPIC */ true, DWARFContext::create(*InputFile)); + if (Error E = BCOrErr.takeError()) { + Err = std::move(E); + return; + } + BC = std::move(BCOrErr.get()); BC->initializeTarget(std::unique_ptr(createMCPlusBuilder( BC->TheTriple->getArch(), BC->MIA.get(), BC->MII.get(), BC->MRI.get()))); if (opts::Instrument) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index b07f849b6fe90..2671df8ebc31c 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -47,6 +47,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/ManagedStatic.h" @@ -55,6 +56,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include #include #undef DEBUG_TYPE @@ -353,14 +355,28 @@ bool refersToReorderedSection(ErrorOr Section) { } // anonymous namespace +Expected> +RewriteInstance::createRewriteInstance(ELFObjectFileBase *File, const int Argc, + const char *const *Argv, + StringRef ToolPath) { + Error Err = Error::success(); + auto RI = std::make_unique(File, Argc, Argv, ToolPath, Err); + if (Err) + return std::move(Err); + return RI; +} + RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const int Argc, - const char 
*const *Argv, StringRef ToolPath) + const char *const *Argv, StringRef ToolPath, + Error &Err) : InputFile(File), Argc(Argc), Argv(Argv), ToolPath(ToolPath), SHStrTab(StringTableBuilder::ELF) { + ErrorAsOutParameter EAO(&Err); auto ELF64LEFile = dyn_cast(InputFile); if (!ELF64LEFile) { - errs() << "BOLT-ERROR: only 64-bit LE ELF binaries are supported\n"; - exit(1); + Err = createStringError(errc::not_supported, + "Only 64-bit LE ELF binaries are supported"); + return; } bool IsPIC = false; @@ -371,13 +387,17 @@ RewriteInstance::RewriteInstance(ELFObjectFileBase *File, const int Argc, IsPIC = true; } - BC = BinaryContext::createBinaryContext( + auto BCOrErr = BinaryContext::createBinaryContext( File, IsPIC, DWARFContext::create(*File, DWARFContext::ProcessDebugRelocations::Ignore, nullptr, opts::DWPPathName, WithColor::defaultErrorHandler, WithColor::defaultWarningHandler)); - + if (Error E = BCOrErr.takeError()) { + Err = std::move(E); + return; + } + BC = std::move(BCOrErr.get()); BC->initializeTarget(std::unique_ptr(createMCPlusBuilder( BC->TheTriple->getArch(), BC->MIA.get(), BC->MII.get(), BC->MRI.get()))); diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp index 0e522891229db..2c8d1d28f840b 100644 --- a/bolt/tools/driver/llvm-bolt.cpp +++ b/bolt/tools/driver/llvm-bolt.cpp @@ -216,7 +216,11 @@ int main(int argc, char **argv) { Binary &Binary = *BinaryOrErr.get().getBinary(); if (auto *e = dyn_cast(&Binary)) { - RewriteInstance RI(e, argc, argv, ToolPath); + auto RIOrErr = + RewriteInstance::createRewriteInstance(e, argc, argv, ToolPath); + if (Error E = RIOrErr.takeError()) + report_error(opts::InputFilename, std::move(E)); + RewriteInstance &RI = *RIOrErr.get(); if (!opts::PerfData.empty()) { if (!opts::AggregateOnly) { errs() << ToolName @@ -239,7 +243,11 @@ int main(int argc, char **argv) { RI.run(); } else if (auto *O = dyn_cast(&Binary)) { - MachORewriteInstance MachORI(O, ToolPath); + auto MachORIOrErr = + 
MachORewriteInstance::createMachORewriteInstance(O, ToolPath); + if (Error E = MachORIOrErr.takeError()) + report_error(opts::InputFilename, std::move(E)); + MachORewriteInstance &MachORI = *MachORIOrErr.get(); if (!opts::InputDataFilename.empty()) if (Error E = MachORI.setProfile(opts::InputDataFilename)) @@ -266,10 +274,18 @@ int main(int argc, char **argv) { Binary &Binary2 = *BinaryOrErr2.get().getBinary(); if (auto *ELFObj1 = dyn_cast(&Binary1)) { if (auto *ELFObj2 = dyn_cast(&Binary2)) { - RewriteInstance RI1(ELFObj1, argc, argv, ToolPath); + auto RI1OrErr = + RewriteInstance::createRewriteInstance(ELFObj1, argc, argv, ToolPath); + if (Error E = RI1OrErr.takeError()) + report_error(opts::InputFilename, std::move(E)); + RewriteInstance &RI1 = *RI1OrErr.get(); if (Error E = RI1.setProfile(opts::InputDataFilename)) report_error(opts::InputDataFilename, std::move(E)); - RewriteInstance RI2(ELFObj2, argc, argv, ToolPath); + auto RI2OrErr = + RewriteInstance::createRewriteInstance(ELFObj2, argc, argv, ToolPath); + if (Error E = RI2OrErr.takeError()) + report_error(opts::InputFilename2, std::move(E)); + RewriteInstance &RI2 = *RI2OrErr.get(); if (Error E = RI2.setProfile(opts::InputDataFilename2)) report_error(opts::InputDataFilename2, std::move(E)); outs() << "BOLT-DIFF: *** Analyzing binary 1: " << opts::InputFilename diff --git a/bolt/tools/heatmap/heatmap.cpp b/bolt/tools/heatmap/heatmap.cpp index 887a120de6cf1..0ab6a4fa52b7c 100644 --- a/bolt/tools/heatmap/heatmap.cpp +++ b/bolt/tools/heatmap/heatmap.cpp @@ -85,7 +85,12 @@ int main(int argc, char **argv) { Binary &Binary = *BinaryOrErr.get().getBinary(); if (auto *e = dyn_cast(&Binary)) { - RewriteInstance RI(e, argc, argv, ToolPath); + auto RIOrErr = + RewriteInstance::createRewriteInstance(e, argc, argv, ToolPath); + if (Error E = RIOrErr.takeError()) + report_error("RewriteInstance", std::move(E)); + + RewriteInstance &RI = *RIOrErr.get(); if (Error E = RI.setProfile(opts::PerfData)) 
report_error(opts::PerfData, std::move(E)); diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp index ec881bf4ead1d..2158f652c0719 100644 --- a/bolt/unittests/Core/MCPlusBuilder.cpp +++ b/bolt/unittests/Core/MCPlusBuilder.cpp @@ -47,8 +47,8 @@ struct MCPlusBuilderTester : public testing::TestWithParam { } void initializeBolt() { - BC = BinaryContext::createBinaryContext( - ObjFile.get(), true, DWARFContext::create(*ObjFile.get())); + BC = cantFail(BinaryContext::createBinaryContext( + ObjFile.get(), true, DWARFContext::create(*ObjFile.get()))); ASSERT_FALSE(!BC); BC->initializeTarget(std::unique_ptr(createMCPlusBuilder( GetParam(), BC->MIA.get(), BC->MII.get(), BC->MRI.get()))); From 77c7ce03845d31b1c92cc2a93d56f2efa485964f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 20 Jan 2022 09:30:57 +0000 Subject: [PATCH 041/748] [libcxx] [test] Make the put_long_double test pass on mingw, clarify quirks in put_double Expect the same NAN formatting on Windows as on Glibc. (Both MSVC and MinGW produce the same formatting there.) The hex float formatting tests pass on MinGW, so opt in to those tests. Document exactly what issues are remaining in Clang-cl/MSVC configurations. (It's easily possible to make the tests pass there too, but it requires a whole lot of small-scope ifndefs in the test file; around 60 ifdefs in total for those both test files. Those could be avoided if the CI environment could run with a newer version of UCRT, but that's nontrivial to fix right away.) 
Differential Revision: https://reviews.llvm.org/D119766 --- .../facet.num.put.members/put_double.pass.cpp | 14 ++++++++++++- .../put_long_double.pass.cpp | 21 ++++++++++++++++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp index 6554e603d3646..08c9718e68e72 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp @@ -13,7 +13,19 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const; // FIXME: The printf functions in Microsoft's CRT have a couple quirks in -// corner cases, failing this test. +// corner cases, failing this test: +// - With the Microsoft UCRT, printf("%#.*g", 0, 0.0) produces "0.0" while +// other C runtimes produce "0.". For other precisions than 0, Microsoft's +// consistently produce one digit more than others. In the MinGW test setups, +// the code is built with __USE_MINGW_ANSI_STDIO=1, which uses MinGW's own +// reimplementation of stdio functions, which doesn't have this issue. +// This bug requires excluding everything that runs with showpoint() enabled. +// https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 +// This issue is fixed in newer UCRT versions, since 10.0.19041.0. +// - With the Microsoft UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0" +// while other C runtimes produce just "0x0p+0". This requires omitting all +// tests of hex float formatting. 
+// https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 // XFAIL: msvc // XFAIL: LIBCXX-AIX-FIXME diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp index a41efe746db62..2eebc1cabc958 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp @@ -12,7 +12,22 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const; -// XFAIL: LIBCXX-WINDOWS-FIXME +// FIXME: The printf functions in Microsoft's CRT have a couple quirks in +// corner cases, failing this test: +// - With the Microsoft UCRT, printf("%#.*g", 0, 0.0) produces "0.0" while +// other C runtimes produce "0.". For other precisions than 0, Microsoft's +// consistently produce one digit more than others. In the MinGW test setups, +// the code is built with __USE_MINGW_ANSI_STDIO=1, which uses MinGW's own +// reimplementation of stdio functions, which doesn't have this issue. +// This bug requires excluding everything that runs with showpoint() enabled. +// https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 +// This issue is fixed in newer UCRT versions, since 10.0.19041.0. +// - With the Microsoft UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0" +// while other C runtimes produce just "0x0p+0". This requires omitting all +// tests of hex float formatting. 
+// https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 +// XFAIL: msvc + // XFAIL: LIBCXX-AIX-FIXME #include @@ -10717,7 +10732,7 @@ void test5() std::locale lc = std::locale::classic(); std::locale lg(lc, new my_numpunct); const my_facet f(1); -#if defined(TEST_HAS_GLIBC) +#if defined(TEST_HAS_GLIBC) || defined(_WIN32) std::string pnan_sign = "+"; std::string pnan_padding25 = "*********************"; #else @@ -24410,7 +24425,7 @@ void test12() { std::locale lc = std::locale::classic(); std::locale lg(lc, new my_numpunct); -#if (defined(__APPLE__) || defined(TEST_HAS_GLIBC)) && defined(__x86_64__) +#if (defined(__APPLE__) || defined(TEST_HAS_GLIBC) || defined(__MINGW32__)) && defined(__x86_64__) // This test is failing on FreeBSD, possibly due to different representations // of the floating point numbers. const my_facet f(1); From f081cc50372f9415ef4fa2204a4b7f54153af455 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 25 Jan 2022 10:32:55 +0000 Subject: [PATCH 042/748] [libcxx] [test] Fix the locale get_one_wide test for windows and glibc Differential Revision: https://reviews.llvm.org/D119790 --- .../get_one_wide.pass.cpp | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_one_wide.pass.cpp b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_one_wide.pass.cpp index c762f970016ba..3aac42048ee41 100644 --- a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_one_wide.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_one_wide.pass.cpp @@ -9,8 +9,6 @@ // NetBSD does not support LC_TIME at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-WINDOWS-FIXME - // XFAIL: libcpp-has-no-wide-characters // REQUIRES: locale.en_US.UTF-8 @@ -25,9 +23,6 @@ // iter_type 
get(iter_type s, iter_type end, ios_base& f, // ios_base::iostate& err, tm *t, char format, char modifier = 0) const; -// TODO: investigation needed -// XFAIL: target={{.*}}-linux-gnu{{.*}} - #include #include #include "test_macros.h" @@ -54,7 +49,15 @@ int main(int, char**) std::tm t; { const my_facet f(LOCALE_en_US_UTF_8, 1); +#ifdef _WIN32 + // On Windows, the "%c" format lacks the leading week day, which + // means that t.tm_wday doesn't get set when parsing the string. + const wchar_t in[] = L"12/31/2061 11:55:59 PM"; +#elif defined(TEST_HAS_GLIBC) + const wchar_t in[] = L"Sat 31 Dec 2061 11:55:59 PM"; +#else const wchar_t in[] = L"Sat Dec 31 23:55:59 2061"; +#endif err = std::ios_base::goodbit; t = std::tm(); I i = f.get(I(in), I(in+sizeof(in)/sizeof(in[0])-1), ios, err, &t, 'c'); @@ -65,12 +68,18 @@ int main(int, char**) assert(t.tm_mday == 31); assert(t.tm_mon == 11); assert(t.tm_year == 161); +#ifndef _WIN32 assert(t.tm_wday == 6); +#endif assert(err == std::ios_base::eofbit); } { const my_facet f(LOCALE_en_US_UTF_8, 1); +#if defined(_WIN32) || defined(TEST_HAS_GLIBC) + const wchar_t in[] = L"11:55:59 PM"; +#else const wchar_t in[] = L"23:55:59"; +#endif err = std::ios_base::goodbit; t = std::tm(); I i = f.get(I(in), I(in+sizeof(in)/sizeof(in[0])-1), ios, err, &t, 'X'); @@ -82,7 +91,13 @@ int main(int, char**) } { const my_facet f(LOCALE_fr_FR_UTF_8, 1); +#ifdef _WIN32 + const wchar_t in[] = L"31/12/2061 23:55:59"; +#elif defined(TEST_HAS_GLIBC) + const wchar_t in[] = L"sam. 31 d" L"\xE9" L"c. 
2061 23:55:59"; +#else const wchar_t in[] = L"Sam 31 d" L"\xE9" L"c 23:55:59 2061"; +#endif err = std::ios_base::goodbit; t = std::tm(); I i = f.get(I(in), I(in+sizeof(in)/sizeof(in[0])-1), ios, err, &t, 'c'); @@ -93,7 +108,9 @@ int main(int, char**) assert(t.tm_mday == 31); assert(t.tm_mon == 11); assert(t.tm_year == 161); +#ifndef _WIN32 assert(t.tm_wday == 6); +#endif assert(err == std::ios_base::eofbit); } { @@ -164,7 +181,11 @@ int main(int, char**) #endif { const my_facet f(LOCALE_zh_CN_UTF_8, 1); +#ifdef _WIN32 + const wchar_t in[] = L"23:55:59"; +#else const wchar_t in[] = L"23" L"\x65F6" L"55" L"\x5206" L"59" L"\x79D2"; +#endif err = std::ios_base::goodbit; t = std::tm(); I i = f.get(I(in), I(in+sizeof(in)/sizeof(in[0])-1), ios, err, &t, 'X'); From 83c2aa467e22f85a0952fb1788771647829d9633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 24 Jan 2022 22:25:41 +0000 Subject: [PATCH 043/748] [libcxx] [test] Fix locale.time.get.byname get_date and get_date_wide on Windows Also apply the same fix on glibc. This takes the test one step closer to passing on glibc, but it still fails on the zh_CN test (which requires a more involved fix in libc++ itself). 
Differential Revision: https://reviews.llvm.org/D119791 --- .../locale.time.get.byname/get_date.pass.cpp | 9 +++++---- .../locale.time.get.byname/get_date_wide.pass.cpp | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date.pass.cpp b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date.pass.cpp index 9989f8cd801c9..4f17a019dbccf 100644 --- a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date.pass.cpp @@ -14,12 +14,9 @@ // REQUIRES: locale.ru_RU.UTF-8 // REQUIRES: locale.zh_CN.UTF-8 -// GLIBC Expects "10/06/2009" for fr_FR as opposed to "10.06.2009" -// GLIBC also fails on the zh_CN test. +// GLIBC fails on the zh_CN test. // XFAIL: linux -// XFAIL: LIBCXX-WINDOWS-FIXME - // // class time_get_byname @@ -66,7 +63,11 @@ int main(int, char**) } { const my_facet f(LOCALE_fr_FR_UTF_8, 1); +#if defined(_WIN32) || defined(TEST_HAS_GLIBC) + const char in[] = "10/06/2009"; +#else const char in[] = "10.06.2009"; +#endif err = std::ios_base::goodbit; t = std::tm(); I i = f.get_date(I(in), I(in+sizeof(in)/sizeof(in[0])-1), ios, err, &t); diff --git a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date_wide.pass.cpp b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date_wide.pass.cpp index 69dd3893e3faf..f7f15568e440c 100644 --- a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date_wide.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_date_wide.pass.cpp @@ -16,12 +16,9 @@ // REQUIRES: locale.ru_RU.UTF-8 // REQUIRES: locale.zh_CN.UTF-8 -// GLIBC Expects "10/06/2009" for fr_FR as opposed to "10.06.2009" -// 
GLIBC also fails on the zh_CN test. +// GLIBC fails on the zh_CN test. // XFAIL: linux -// XFAIL: LIBCXX-WINDOWS-FIXME - // // class time_get_byname @@ -68,7 +65,11 @@ int main(int, char**) } { const my_facet f(LOCALE_fr_FR_UTF_8, 1); +#if defined(_WIN32) || defined(TEST_HAS_GLIBC) + const wchar_t in[] = L"10/06/2009"; +#else const wchar_t in[] = L"10.06.2009"; +#endif err = std::ios_base::goodbit; t = std::tm(); I i = f.get_date(I(in), I(in+sizeof(in)/sizeof(in[0])-1), ios, err, &t); From c3c5280b0ef4f8ea85695befaac70485cb538f11 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Feb 2022 09:54:07 +0100 Subject: [PATCH 044/748] [InstSimplify] Delay creation of constants for offsets (NFC) Return APInt from stripAndComputeConstantOffsets(), and only create corresponding Constants later, if we actually need them. --- llvm/lib/Analysis/InstructionSimplify.cpp | 65 +++++++++-------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 19db5e37cdddb..23f2e06b6e777 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -692,37 +692,29 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW, /// Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and -/// accumulates the total constant offset applied in the returned constant. It -/// returns 0 if V is not a pointer, and returns the constant '0' if there are -/// no constant offsets applied. +/// accumulates the total constant offset applied in the returned constant. +/// It returns zero if there are no constant offsets applied. /// -/// This is very similar to GetPointerBaseWithConstantOffset except it doesn't -/// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc. -/// folding. 
-static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, - bool AllowNonInbounds = false) { +/// This is very similar to stripAndAccumulateConstantOffsets(), except it +/// normalizes the offset bitwidth to the stripped pointer type, not the +/// original pointer type. +static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V, + bool AllowNonInbounds = false) { assert(V->getType()->isPtrOrPtrVectorTy()); APInt Offset = APInt::getZero(DL.getIndexTypeSizeInBits(V->getType())); - V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds); // As that strip may trace through `addrspacecast`, need to sext or trunc // the offset calculated. - Type *IntIdxTy = DL.getIndexType(V->getType())->getScalarType(); - Offset = Offset.sextOrTrunc(IntIdxTy->getIntegerBitWidth()); - - Constant *OffsetIntPtr = ConstantInt::get(IntIdxTy, Offset); - if (VectorType *VecTy = dyn_cast(V->getType())) - return ConstantVector::getSplat(VecTy->getElementCount(), OffsetIntPtr); - return OffsetIntPtr; + return Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(V->getType())); } /// Compute the constant difference between two pointer values. /// If the difference is not a constant, returns zero. static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, Value *RHS) { - Constant *LHSOffset = stripAndComputeConstantOffsets(DL, LHS); - Constant *RHSOffset = stripAndComputeConstantOffsets(DL, RHS); + APInt LHSOffset = stripAndComputeConstantOffsets(DL, LHS); + APInt RHSOffset = stripAndComputeConstantOffsets(DL, RHS); // If LHS and RHS are not related via constant offsets to the same base // value, there is nothing we can do here. 
@@ -733,7 +725,10 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, // LHS - RHS // = (LHSOffset + Base) - (RHSOffset + Base) // = LHSOffset - RHSOffset - return ConstantExpr::getSub(LHSOffset, RHSOffset); + Constant *Res = ConstantInt::get(LHS->getContext(), LHSOffset - RHSOffset); + if (auto *VecTy = dyn_cast(LHS->getType())) + Res = ConstantVector::getSplat(VecTy->getElementCount(), Res); + return Res; } /// Given operands for a Sub, see if we can fold the result. @@ -2592,15 +2587,14 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, // Even if an non-inbounds GEP occurs along the path we can still optimize // equality comparisons concerning the result. bool AllowNonInbounds = ICmpInst::isEquality(Pred); - Constant *LHSOffset = - stripAndComputeConstantOffsets(DL, LHS, AllowNonInbounds); - Constant *RHSOffset = - stripAndComputeConstantOffsets(DL, RHS, AllowNonInbounds); + APInt LHSOffset = stripAndComputeConstantOffsets(DL, LHS, AllowNonInbounds); + APInt RHSOffset = stripAndComputeConstantOffsets(DL, RHS, AllowNonInbounds); // If LHS and RHS are related via constant offsets to the same base // value, we can replace it with an icmp which just compares the offsets. if (LHS == RHS) - return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset); + return ConstantInt::get( + GetCompareTy(LHS), ICmpInst::compare(LHSOffset, RHSOffset, Pred)); // Various optimizations for (in)equality comparisons. if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { @@ -2635,32 +2629,23 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, // address, due to canonicalization and constant folding. 
if (isa(LHS) && (isa(RHS) || isa(RHS))) { - ConstantInt *LHSOffsetCI = dyn_cast(LHSOffset); - ConstantInt *RHSOffsetCI = dyn_cast(RHSOffset); uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; Opts.NullIsUnknownSize = NullPointerIsDefined(cast(LHS)->getFunction()); - if (LHSOffsetCI && RHSOffsetCI && - getObjectSize(LHS, LHSSize, DL, TLI, Opts) && - getObjectSize(RHS, RHSSize, DL, TLI, Opts)) { - const APInt &LHSOffsetValue = LHSOffsetCI->getValue(); - const APInt &RHSOffsetValue = RHSOffsetCI->getValue(); - if (!LHSOffsetValue.isNegative() && - !RHSOffsetValue.isNegative() && - LHSOffsetValue.ult(LHSSize) && - RHSOffsetValue.ult(RHSSize)) { - return ConstantInt::get(GetCompareTy(LHS), - !CmpInst::isTrueWhenEqual(Pred)); - } + if (getObjectSize(LHS, LHSSize, DL, TLI, Opts) && + getObjectSize(RHS, RHSSize, DL, TLI, Opts) && + !LHSOffset.isNegative() && !RHSOffset.isNegative() && + LHSOffset.ult(LHSSize) && RHSOffset.ult(RHSSize)) { + return ConstantInt::get(GetCompareTy(LHS), + !CmpInst::isTrueWhenEqual(Pred)); } // Repeat the above check but this time without depending on DataLayout // or being able to compute a precise size. 
if (!cast(LHS->getType())->isEmptyTy() && !cast(RHS->getType())->isEmptyTy() && - LHSOffset->isNullValue() && - RHSOffset->isNullValue()) + LHSOffset.isNullValue() && RHSOffset.isNullValue()) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } From 25f1d50ca5a0193713b6b95927230bf7aea987f2 Mon Sep 17 00:00:00 2001 From: Lorenzo Chelini Date: Thu, 17 Feb 2022 10:06:16 +0100 Subject: [PATCH 045/748] [MLIR][PDL] Fix typo (NFC) --- mlir/include/mlir-c/Dialect/PDL.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir-c/Dialect/PDL.h b/mlir/include/mlir-c/Dialect/PDL.h index 1b152899948ca..8bd7976e2a491 100644 --- a/mlir/include/mlir-c/Dialect/PDL.h +++ b/mlir/include/mlir-c/Dialect/PDL.h @@ -71,4 +71,4 @@ MLIR_CAPI_EXPORTED MlirType mlirPDLValueTypeGet(MlirContext ctx); } #endif -#endif // MLIR_C_DIALECT_QUANT_H +#endif // MLIR_C_DIALECT_PDL_H From dd4dde8d39a9c36ea692635bdfc0c90cc8d755fd Mon Sep 17 00:00:00 2001 From: Stanislav Gatev Date: Wed, 16 Feb 2022 16:47:37 +0000 Subject: [PATCH 046/748] [clang][dataflow] Add transfer functions for logical and, or, not. This is part of the implementation of the dataflow analysis framework. See "[RFC] A dataflow analysis framework for Clang AST" on cfe-dev. 
Reviewed-by: xazax.hun Differential Revision: https://reviews.llvm.org/D119953 --- .../FlowSensitive/DataflowAnalysisContext.h | 12 +- .../FlowSensitive/DataflowEnvironment.h | 2 +- .../clang/Analysis/FlowSensitive/Transfer.h | 13 +- .../clang/Analysis/FlowSensitive/Value.h | 97 +++++++++++++- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 71 ++++++++-- .../TypeErasedDataflowAnalysis.cpp | 44 +++++-- .../Analysis/FlowSensitive/TransferTest.cpp | 124 +++++++++++++++++- .../TypeErasedDataflowAnalysisTest.cpp | 3 +- 8 files changed, 334 insertions(+), 32 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h index 5c1b41d538921..52f738d59b812 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h @@ -34,8 +34,8 @@ namespace dataflow { class DataflowAnalysisContext { public: DataflowAnalysisContext() - : TrueVal(&takeOwnership(std::make_unique())), - FalseVal(&takeOwnership(std::make_unique())) {} + : TrueVal(takeOwnership(std::make_unique())), + FalseVal(takeOwnership(std::make_unique())) {} /// Takes ownership of `Loc` and returns a reference to it. /// @@ -115,8 +115,8 @@ class DataflowAnalysisContext { /// Returns a symbolic boolean value that models a boolean literal equal to /// `Value`. - BoolValue &getBoolLiteralValue(bool Value) const { - return Value ? *TrueVal : *FalseVal; + AtomicBoolValue &getBoolLiteralValue(bool Value) const { + return Value ? TrueVal : FalseVal; } private: @@ -135,8 +135,8 @@ class DataflowAnalysisContext { StorageLocation *ThisPointeeLoc = nullptr; // FIXME: Add support for boolean expressions. 
- BoolValue *TrueVal; - BoolValue *FalseVal; + AtomicBoolValue &TrueVal; + AtomicBoolValue &FalseVal; }; } // namespace dataflow diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index cebfb66ef242f..af613c95bb8dc 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -226,7 +226,7 @@ class Environment { /// Returns a symbolic boolean value that models a boolean literal equal to /// `Value` - BoolValue &getBoolLiteralValue(bool Value) const { + AtomicBoolValue &getBoolLiteralValue(bool Value) const { return DACtx->getBoolLiteralValue(Value); } diff --git a/clang/include/clang/Analysis/FlowSensitive/Transfer.h b/clang/include/clang/Analysis/FlowSensitive/Transfer.h index a12674a173be4..a6b663b997fd6 100644 --- a/clang/include/clang/Analysis/FlowSensitive/Transfer.h +++ b/clang/include/clang/Analysis/FlowSensitive/Transfer.h @@ -20,12 +20,23 @@ namespace clang { namespace dataflow { +/// Maps statements to the environments of basic blocks that contain them. +class StmtToEnvMap { +public: + virtual ~StmtToEnvMap() = default; + + /// Returns the environment of the basic block that contains `S` or nullptr if + /// there isn't one. + /// FIXME: Ensure that the result can't be null and return a const reference. + virtual const Environment *getEnvironment(const Stmt &S) const = 0; +}; + /// Evaluates `S` and updates `Env` accordingly. /// /// Requirements: /// /// The type of `S` must not be `ParenExpr`. 
-void transfer(const Stmt &S, Environment &Env); +void transfer(const StmtToEnvMap &StmtToEnv, const Stmt &S, Environment &Env); } // namespace dataflow } // namespace clang diff --git a/clang/include/clang/Analysis/FlowSensitive/Value.h b/clang/include/clang/Analysis/FlowSensitive/Value.h index da04f926c597b..7c02cc6c3505b 100644 --- a/clang/include/clang/Analysis/FlowSensitive/Value.h +++ b/clang/include/clang/Analysis/FlowSensitive/Value.h @@ -28,7 +28,19 @@ namespace dataflow { /// Base class for all values computed by abstract interpretation. class Value { public: - enum class Kind { Bool, Integer, Reference, Pointer, Struct }; + enum class Kind { + Integer, + Reference, + Pointer, + Struct, + + // Synthetic boolean values are either atomic values or composites that + // represent conjunctions, disjunctions, and negations. + AtomicBool, + Conjunction, + Disjunction, + Negation + }; explicit Value(Kind ValKind) : ValKind(ValKind) {} @@ -43,9 +55,88 @@ class Value { /// Models a boolean. class BoolValue : public Value { public: - explicit BoolValue() : Value(Kind::Bool) {} + explicit BoolValue(Kind ValueKind) : Value(ValueKind) {} - static bool classof(const Value *Val) { return Val->getKind() == Kind::Bool; } + static bool classof(const Value *Val) { + return Val->getKind() == Kind::AtomicBool || + Val->getKind() == Kind::Conjunction || + Val->getKind() == Kind::Disjunction || + Val->getKind() == Kind::Negation; + } +}; + +/// Models an atomic boolean. +class AtomicBoolValue : public BoolValue { +public: + explicit AtomicBoolValue() : BoolValue(Kind::AtomicBool) {} + + static bool classof(const Value *Val) { + return Val->getKind() == Kind::AtomicBool; + } +}; + +/// Models a boolean conjunction. +// FIXME: Consider representing binary and unary boolean operations similar +// to how they are represented in the AST. This might become more pressing +// when such operations need to be added for other data types. 
+class ConjunctionValue : public BoolValue { +public: + explicit ConjunctionValue(BoolValue &LeftSubVal, BoolValue &RightSubVal) + : BoolValue(Kind::Conjunction), LeftSubVal(LeftSubVal), + RightSubVal(RightSubVal) {} + + static bool classof(const Value *Val) { + return Val->getKind() == Kind::Conjunction; + } + + /// Returns the left sub-value of the conjunction. + BoolValue &getLeftSubValue() const { return LeftSubVal; } + + /// Returns the right sub-value of the conjunction. + BoolValue &getRightSubValue() const { return RightSubVal; } + +private: + BoolValue &LeftSubVal; + BoolValue &RightSubVal; +}; + +/// Models a boolean disjunction. +class DisjunctionValue : public BoolValue { +public: + explicit DisjunctionValue(BoolValue &LeftSubVal, BoolValue &RightSubVal) + : BoolValue(Kind::Disjunction), LeftSubVal(LeftSubVal), + RightSubVal(RightSubVal) {} + + static bool classof(const Value *Val) { + return Val->getKind() == Kind::Disjunction; + } + + /// Returns the left sub-value of the disjunction. + BoolValue &getLeftSubValue() const { return LeftSubVal; } + + /// Returns the right sub-value of the disjunction. + BoolValue &getRightSubValue() const { return RightSubVal; } + +private: + BoolValue &LeftSubVal; + BoolValue &RightSubVal; +}; + +/// Models a boolean negation. +class NegationValue : public BoolValue { +public: + explicit NegationValue(BoolValue &SubVal) + : BoolValue(Kind::Negation), SubVal(SubVal) {} + + static bool classof(const Value *Val) { + return Val->getKind() == Kind::Negation; + } + + /// Returns the sub-value of the negation. + BoolValue &getSubVal() const { return SubVal; } + +private: + BoolValue &SubVal; }; /// Models an integer. 
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 51a86b727e339..72475e0c79d90 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -39,10 +39,12 @@ static const Expr *skipExprWithCleanups(const Expr *E) { class TransferVisitor : public ConstStmtVisitor { public: - TransferVisitor(Environment &Env) : Env(Env) {} + TransferVisitor(const StmtToEnvMap &StmtToEnv, Environment &Env) + : StmtToEnv(StmtToEnv), Env(Env) {} void VisitBinaryOperator(const BinaryOperator *S) { - if (S->getOpcode() == BO_Assign) { + switch (S->getOpcode()) { + case BO_Assign: { // The CFG does not contain `ParenExpr` as top-level statements in basic // blocks, however sub-expressions can still be of that type. assert(S->getLHS() != nullptr); @@ -51,7 +53,7 @@ class TransferVisitor : public ConstStmtVisitor { assert(LHS != nullptr); auto *LHSLoc = Env.getStorageLocation(*LHS, SkipPast::Reference); if (LHSLoc == nullptr) - return; + break; // The CFG does not contain `ParenExpr` as top-level statements in basic // blocks, however sub-expressions can still be of that type. @@ -61,15 +63,57 @@ class TransferVisitor : public ConstStmtVisitor { assert(RHS != nullptr); Value *RHSVal = Env.getValue(*RHS, SkipPast::Reference); if (RHSVal == nullptr) - return; + break; // Assign a value to the storage location of the left-hand side. Env.setValue(*LHSLoc, *RHSVal); // Assign a storage location for the whole expression. Env.setStorageLocation(*S, *LHSLoc); + break; + } + case BO_LAnd: + case BO_LOr: { + const Expr *LHS = S->getLHS(); + assert(LHS != nullptr); + + const Expr *RHS = S->getRHS(); + assert(RHS != nullptr); + + BoolValue *LHSVal = + dyn_cast_or_null(Env.getValue(*LHS, SkipPast::Reference)); + + // `RHS` and `S` might be part of different basic blocks. We need to + // access their values from the corresponding environments. 
+ BoolValue *RHSVal = nullptr; + const Environment *RHSEnv = StmtToEnv.getEnvironment(*RHS); + if (RHSEnv != nullptr) + RHSVal = dyn_cast_or_null( + RHSEnv->getValue(*RHS, SkipPast::Reference)); + + // Create fresh values for unknown boolean expressions. + // FIXME: Consider providing a `GetOrCreateFresh` util in case this style + // is expected to be common or make sure that all expressions are assigned + // values and drop this. + if (LHSVal == nullptr) + LHSVal = &Env.takeOwnership(std::make_unique()); + if (RHSVal == nullptr) + RHSVal = &Env.takeOwnership(std::make_unique()); + + auto &Loc = Env.createStorageLocation(*S); + Env.setStorageLocation(*S, Loc); + if (S->getOpcode() == BO_LAnd) + Env.setValue(Loc, Env.takeOwnership(std::make_unique( + *LHSVal, *RHSVal))); + else + Env.setValue(Loc, Env.takeOwnership(std::make_unique( + *LHSVal, *RHSVal))); + break; + } + default: + // FIXME: Add support for BO_EQ, BO_NE. + break; } - // FIXME: Add support for BO_EQ, BO_NE. } void VisitDeclRefExpr(const DeclRefExpr *S) { @@ -212,8 +256,18 @@ class TransferVisitor : public ConstStmtVisitor { Env.setValue(PointerLoc, PointerVal); break; } + case UO_LNot: { + auto *SubExprVal = + dyn_cast_or_null(Env.getValue(*SubExpr, SkipPast::None)); + if (SubExprVal == nullptr) + return; + + auto &ExprLoc = Env.createStorageLocation(*S); + Env.setStorageLocation(*S, ExprLoc); + Env.setValue(ExprLoc, Env.takeOwnership( + std::make_unique(*SubExprVal))); + } default: - // FIXME: Add support for UO_LNot. 
break; } } @@ -450,12 +504,13 @@ class TransferVisitor : public ConstStmtVisitor { } private: + const StmtToEnvMap &StmtToEnv; Environment &Env; }; -void transfer(const Stmt &S, Environment &Env) { +void transfer(const StmtToEnvMap &StmtToEnv, const Stmt &S, Environment &Env) { assert(!isa(&S)); - TransferVisitor(Env).Visit(&S); + TransferVisitor(StmtToEnv, Env).Visit(&S); } } // namespace dataflow diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp index 6b14b5ceaf69a..3acfc656a9c66 100644 --- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp +++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp @@ -24,6 +24,7 @@ #include "clang/Analysis/FlowSensitive/Transfer.h" #include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h" #include "clang/Analysis/FlowSensitive/Value.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" @@ -32,6 +33,27 @@ namespace clang { namespace dataflow { +class StmtToEnvMapImpl : public StmtToEnvMap { +public: + StmtToEnvMapImpl( + const ControlFlowContext &CFCtx, + llvm::ArrayRef> + BlockToState) + : CFCtx(CFCtx), BlockToState(BlockToState) {} + + const Environment *getEnvironment(const Stmt &S) const override { + auto BlockIT = CFCtx.getStmtToBlock().find(&S); + assert(BlockIT != CFCtx.getStmtToBlock().end()); + const auto &State = BlockToState[BlockIT->getSecond()->getBlockID()]; + assert(State.hasValue()); + return &State.getValue().Env; + } + +private: + const ControlFlowContext &CFCtx; + llvm::ArrayRef> BlockToState; +}; + /// Computes the input state for a given basic block by joining the output /// states of its predecessors. /// @@ -42,7 +64,7 @@ namespace dataflow { /// `llvm::None` represent basic blocks that are not evaluated yet. 
static TypeErasedDataflowAnalysisState computeBlockInputState( const ControlFlowContext &CFCtx, - std::vector> &BlockStates, + llvm::ArrayRef> BlockStates, const CFGBlock &Block, const Environment &InitEnv, TypeErasedDataflowAnalysis &Analysis) { llvm::DenseSet Preds; @@ -111,17 +133,19 @@ static TypeErasedDataflowAnalysisState computeBlockInputState( /// Transfers `State` by evaluating `CfgStmt` in the context of `Analysis`. /// `HandleTransferredStmt` (if provided) will be applied to `CfgStmt`, after it /// is evaluated. -static void -transferCFGStmt(const CFGStmt &CfgStmt, TypeErasedDataflowAnalysis &Analysis, - TypeErasedDataflowAnalysisState &State, - std::function - HandleTransferredStmt) { +static void transferCFGStmt( + const ControlFlowContext &CFCtx, + llvm::ArrayRef> BlockStates, + const CFGStmt &CfgStmt, TypeErasedDataflowAnalysis &Analysis, + TypeErasedDataflowAnalysisState &State, + std::function + HandleTransferredStmt) { const Stmt *S = CfgStmt.getStmt(); assert(S != nullptr); if (Analysis.applyBuiltinTransfer()) - transfer(*S, State.Env); + transfer(StmtToEnvMapImpl(CFCtx, BlockStates), *S, State.Env); Analysis.transferTypeErased(S, State.Lattice, State.Env); if (HandleTransferredStmt != nullptr) @@ -176,8 +200,8 @@ TypeErasedDataflowAnalysisState transferBlock( for (const CFGElement &Element : Block) { switch (Element.getKind()) { case CFGElement::Statement: - transferCFGStmt(*Element.getAs(), Analysis, State, - HandleTransferredStmt); + transferCFGStmt(CFCtx, BlockStates, *Element.getAs(), Analysis, + State, HandleTransferredStmt); break; case CFGElement::Initializer: if (Analysis.applyBuiltinTransfer()) diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 978768333c386..83ccba1a25382 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -2006,6 +2006,42 @@ TEST_F(TransferTest, 
AssignFromBoolLiteral) { // [[p]] } )"; + runDataflow(Code, + [](llvm::ArrayRef< + std::pair>> + Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo"); + ASSERT_THAT(FooDecl, NotNull()); + + const auto *FooVal = dyn_cast_or_null( + Env.getValue(*FooDecl, SkipPast::None)); + ASSERT_THAT(FooVal, NotNull()); + + const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); + ASSERT_THAT(BarDecl, NotNull()); + + const auto *BarVal = dyn_cast_or_null( + Env.getValue(*BarDecl, SkipPast::None)); + ASSERT_THAT(BarVal, NotNull()); + + EXPECT_EQ(FooVal, &Env.getBoolLiteralValue(true)); + EXPECT_EQ(BarVal, &Env.getBoolLiteralValue(false)); + }); +} + +TEST_F(TransferTest, AssignFromBoolConjunction) { + std::string Code = R"( + void target() { + bool Foo = true; + bool Bar = true; + bool Baz = (Foo) && (Bar); + // [[p]] + } + )"; runDataflow( Code, [](llvm::ArrayRef< std::pair>> @@ -2028,9 +2064,93 @@ TEST_F(TransferTest, AssignFromBoolLiteral) { dyn_cast_or_null(Env.getValue(*BarDecl, SkipPast::None)); ASSERT_THAT(BarVal, NotNull()); - EXPECT_EQ(FooVal, &Env.getBoolLiteralValue(true)); - EXPECT_EQ(BarVal, &Env.getBoolLiteralValue(false)); + const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz"); + ASSERT_THAT(BazDecl, NotNull()); + + const auto *BazVal = dyn_cast_or_null( + Env.getValue(*BazDecl, SkipPast::None)); + ASSERT_THAT(BazVal, NotNull()); + + EXPECT_EQ(&BazVal->getLeftSubValue(), FooVal); + EXPECT_EQ(&BazVal->getRightSubValue(), BarVal); }); } +TEST_F(TransferTest, AssignFromBoolDisjunction) { + std::string Code = R"( + void target() { + bool Foo = true; + bool Bar = true; + bool Baz = (Foo) || (Bar); + // [[p]] + } + )"; + runDataflow( + Code, [](llvm::ArrayRef< + std::pair>> + Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl 
*FooDecl = findValueDecl(ASTCtx, "Foo"); + ASSERT_THAT(FooDecl, NotNull()); + + const auto *FooVal = + dyn_cast_or_null(Env.getValue(*FooDecl, SkipPast::None)); + ASSERT_THAT(FooVal, NotNull()); + + const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); + ASSERT_THAT(BarDecl, NotNull()); + + const auto *BarVal = + dyn_cast_or_null(Env.getValue(*BarDecl, SkipPast::None)); + ASSERT_THAT(BarVal, NotNull()); + + const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz"); + ASSERT_THAT(BazDecl, NotNull()); + + const auto *BazVal = dyn_cast_or_null( + Env.getValue(*BazDecl, SkipPast::None)); + ASSERT_THAT(BazVal, NotNull()); + + EXPECT_EQ(&BazVal->getLeftSubValue(), FooVal); + EXPECT_EQ(&BazVal->getRightSubValue(), BarVal); + }); +} + +TEST_F(TransferTest, AssignFromBoolNegation) { + std::string Code = R"( + void target() { + bool Foo = true; + bool Bar = !(Foo); + // [[p]] + } + )"; + runDataflow(Code, + [](llvm::ArrayRef< + std::pair>> + Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo"); + ASSERT_THAT(FooDecl, NotNull()); + + const auto *FooVal = dyn_cast_or_null( + Env.getValue(*FooDecl, SkipPast::None)); + ASSERT_THAT(FooVal, NotNull()); + + const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); + ASSERT_THAT(BarDecl, NotNull()); + + const auto *BarVal = dyn_cast_or_null( + Env.getValue(*BarDecl, SkipPast::None)); + ASSERT_THAT(BarVal, NotNull()); + + EXPECT_EQ(&BarVal->getSubVal(), FooVal); + }); +} + } // namespace diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp index 90d7d73c85a55..faeac009725a2 100644 --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -387,7 +387,8 @@ class WideningTest : public 
Test { Code, "target", [this](ASTContext &Context, Environment &Env) { assert(HasValueTop == nullptr); - HasValueTop = &Env.takeOwnership(std::make_unique()); + HasValueTop = + &Env.takeOwnership(std::make_unique()); return OptionalIntAnalysis(Context, *HasValueTop); }, [&Match]( From d4a53f3bfa3e29d412e571765a8568bee7da5483 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Wed, 16 Feb 2022 14:54:54 +0100 Subject: [PATCH 047/748] [mlir] call target materialization more in dialect conversion During dialect conversion, target materialization is triggered to create cast-like operations when a type mismatch occurs between the value that replaces a rewritten operation and the type that another operations expects as operands processed by the type conversion. First, a dummy cast is inserted to make sure the pattern application can proceed. The decision to trigger the user-provided materialization hook is taken later based on the result of the dummy cast having uses. However, it only has uses if other patterns constructed new operations using the casted value as operand. If existing (legal) operations use the replaced value, they may have not been updated to use the casted value yet. The conversion infra would then delete the dummy cast first, and then would replace the uses with now-invalid (null in the bast case) value. When deciding whether to trigger cast materialization, check for liveness the uses not only of the casted value, but also of all the values that it replaces. This was discovered in the finalizing bufferize pass that cleans up mutually-cancelling casts without touching other operations. It is not impossible that there are other scenarios where the dialect converison infra could produce invalid operand uses because of dummy casts erased too eagerly. 
Reviewed By: springerm Differential Revision: https://reviews.llvm.org/D119937 --- .../Transforms/Utils/DialectConversion.cpp | 5 ++ ...galize-target-materialization-no-uses.mlir | 27 ++++++++++ mlir/test/lib/Dialect/Test/TestOps.td | 2 + mlir/test/lib/Dialect/Test/TestPatterns.cpp | 53 +++++++++++++++++++ 4 files changed, 87 insertions(+) create mode 100644 mlir/test/Transforms/test-legalize-target-materialization-no-uses.mlir diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index de15a23b906df..51eed1fcddb6c 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2588,6 +2588,11 @@ static void computeNecessaryMaterializations( return !necessaryMaterializations.count(matIt->second); return rewriterImpl.isOpIgnored(user); }; + // This value may be replacing another value that has a live user. + for (Value inv : inverseMapping.lookup(value)) + if (llvm::find_if_not(inv.getUsers(), findFn) != inv.user_end()) + return true; + // Or have live users itself. return llvm::find_if_not(value.getUsers(), findFn) != value.user_end(); }; diff --git a/mlir/test/Transforms/test-legalize-target-materialization-no-uses.mlir b/mlir/test/Transforms/test-legalize-target-materialization-no-uses.mlir new file mode 100644 index 0000000000000..0918ca4216e61 --- /dev/null +++ b/mlir/test/Transforms/test-legalize-target-materialization-no-uses.mlir @@ -0,0 +1,27 @@ +// RUN: mlir-opt -test-target-materialization-with-no-uses %s | FileCheck %s + +// The conversion is set up as follows: +// - type_changer ops are illegal; +// - type_changer ops are replaced with their operands; +// - i16 types are converted to i64 by the type conversion; +// - the rest of the types are legal. +// The first type_changer is replaced with its operand. 
For the pattern to +// apply to the second type_changer, the conversion infra creates a dummy +// cast operation to cast from the i32 to i64 because the original op takes an +// (illegal) i16 that became i64. This dummy operation should be replaced by +// the one produced by the target materialization hook. At the moment when the +// materialization decision is taken, the i64 replacement of the first type +// change (the result of the dummy cast) has no uses, but the value it replaces +// does, so the infra must call the materialization rather than assume the +// dummy cast to be dead. + +// CHECK-LABEL: @foo +func @foo() { + %0 = "test.type_producer"() : () -> i32 + // CHECK: test.cast + // CHECK-NOT: test.type_changer + %1 = "test.type_changer"(%0) : (i32) -> i16 + %2 = "test.type_changer"(%1) : (i16) -> i64 + "test.type_consumer"(%2) : (i64) -> () + return +} diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 40bec4f4807e4..4bf82ada9aac6 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -1603,6 +1603,8 @@ def TestAnotherTypeProducerOp : TEST_Op<"another_type_producer">, Results<(outs AnyType)>; def TestTypeConsumerOp : TEST_Op<"type_consumer">, Arguments<(ins AnyType)>; +def TestTypeChangerOp : TEST_Op<"type_changer">, + Arguments<(ins AnyType)>, Results<(outs AnyType)>; def TestValidOp : TEST_Op<"valid", [Terminator]>, Arguments<(ins Variadic)>; diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 53661511ee324..5e0c253a77861 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1135,6 +1135,58 @@ struct TestTypeConversionDriver }; } // namespace +//===----------------------------------------------------------------------===// +// Test Target Materialization With No Uses +//===----------------------------------------------------------------------===// + 
+namespace { +struct ForwardOperandPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(TestTypeChangerOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + rewriter.replaceOp(op, adaptor.getOperands()); + return success(); + } +}; + +struct TestTargetMaterializationWithNoUses + : public PassWrapper> { + StringRef getArgument() const final { + return "test-target-materialization-with-no-uses"; + } + StringRef getDescription() const final { + return "Test a special case of target materialization in DialectConversion"; + } + + void runOnOperation() override { + TypeConverter converter; + converter.addConversion([](Type t) { return t; }); + converter.addConversion([](IntegerType intTy) -> Type { + if (intTy.getWidth() == 16) + return IntegerType::get(intTy.getContext(), 64); + return intTy; + }); + converter.addTargetMaterialization( + [](OpBuilder &builder, Type type, ValueRange inputs, Location loc) { + return builder.create(loc, type, inputs).getResult(); + }); + + ConversionTarget target(getContext()); + target.addIllegalOp(); + + RewritePatternSet patterns(&getContext()); + patterns.add(converter, &getContext()); + + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace + //===----------------------------------------------------------------------===// // Test Block Merging //===----------------------------------------------------------------------===// @@ -1317,6 +1369,7 @@ void registerPatternsTestPass() { PassRegistration(); PassRegistration(); + PassRegistration(); PassRegistration(); PassRegistration(); From 371fcb720e15906e8c63600253afcb806b9b10d0 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Thu, 17 Feb 2022 12:07:17 +0300 Subject: [PATCH 048/748] [SimplifyCFG][PhaseOrdering] Defer lowering switch into an integer range comparison and branch until after at least the IPSCCP That 
transformation is lossy, as discussed in https://github.com/llvm/llvm-project/issues/53853 and https://github.com/rust-lang/rust/issues/85133#issuecomment-904185574 This is an alternative to D119839, which would add a limited IPSCCP into SimplifyCFG. Unlike lowering switch to lookup, we still want this transformation to happen relatively early, but after giving a chance for the things like CVP to do their thing. It seems like deferring it just until the IPSCCP is enough for the tests at hand, but perhaps we need to be more aggressive and disable it until CVP. Fixes https://github.com/llvm/llvm-project/issues/53853 Refs. https://github.com/rust-lang/rust/issues/85133 Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D119854 --- .../Transforms/Utils/SimplifyCFGOptions.h | 5 ++ llvm/lib/Passes/PassBuilder.cpp | 2 + llvm/lib/Passes/PassBuilderPipelines.cpp | 48 ++++++++++++------- llvm/lib/Passes/PassRegistry.def | 1 + .../Target/AArch64/AArch64TargetMachine.cpp | 1 + .../Target/Hexagon/HexagonTargetMachine.cpp | 1 + .../lib/Transforms/IPO/PassManagerBuilder.cpp | 29 +++++++---- .../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 9 ++++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 4 +- llvm/test/Other/new-pm-print-pipeline.ll | 4 +- .../Coroutines/coro-catchswitch-cleanuppad.ll | 2 +- .../Transforms/LoopUnroll/ARM/upperbound.ll | 2 +- ...witch-lowering-vs-correlatedpropagation.ll | 28 +++++------ llvm/test/Transforms/SimplifyCFG/DeadSetCC.ll | 2 +- .../SimplifyCFG/EqualPHIEdgeBlockMerge.ll | 2 +- ...risonIntoPredecessors-no-new-successors.ll | 2 +- .../ForwardSwitchConditionToPHI.ll | 8 ++-- .../SimplifyCFG/preserve-branchweights.ll | 2 +- .../SimplifyCFG/switch-dead-default.ll | 2 +- .../SimplifyCFG/switch-range-to-icmp.ll | 2 +- .../Transforms/SimplifyCFG/switch-to-icmp.ll | 2 +- .../SimplifyCFG/switch_create-custom-dl.ll | 2 +- .../Transforms/SimplifyCFG/switch_create.ll | 4 +- 23 files changed, 104 insertions(+), 60 deletions(-) diff --git 
a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h index fb3a7490346f4..7af879638a4d8 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h @@ -23,6 +23,7 @@ class AssumptionCache; struct SimplifyCFGOptions { int BonusInstThreshold = 1; bool ForwardSwitchCondToPhi = false; + bool ConvertSwitchRangeToICmp = false; bool ConvertSwitchToLookupTable = false; bool NeedCanonicalLoop = true; bool HoistCommonInsts = false; @@ -41,6 +42,10 @@ struct SimplifyCFGOptions { ForwardSwitchCondToPhi = B; return *this; } + SimplifyCFGOptions &convertSwitchRangeToICmp(bool B) { + ConvertSwitchRangeToICmp = B; + return *this; + } SimplifyCFGOptions &convertSwitchToLookupTable(bool B) { ConvertSwitchToLookupTable = B; return *this; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 015ca1eec4df3..dedfc81f11bba 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -679,6 +679,8 @@ Expected parseSimplifyCFGOptions(StringRef Params) { bool Enable = !ParamName.consume_front("no-"); if (ParamName == "forward-switch-cond") { Result.forwardSwitchCondToPhi(Enable); + } else if (ParamName == "switch-range-to-icmp") { + Result.convertSwitchRangeToICmp(Enable); } else if (ParamName == "switch-to-lookup") { Result.convertSwitchToLookupTable(Enable); } else if (ParamName == "keep-loops") { diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 93637c890c4fa..2aba7ef262340 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -259,14 +259,16 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); // Hoisting of scalars and load expressions. 
- FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); FPM.addPass(LibCallsShrinkWrapPass()); invokePeepholeEPCallbacks(FPM, Level); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); // Form canonically associated expression trees, and simplify the trees using // basic mathematical properties. For example, this will form (nearly) @@ -335,7 +337,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. // *All* loop passes must preserve it, in order to be able to use it. @@ -373,7 +376,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, // the simplifications and basic cleanup after all the simplifications. // TODO: Investigate if this is too expensive. FPM.addPass(ADCEPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); @@ -408,7 +412,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Global value numbering based sinking. 
if (EnableGVNSink) { FPM.addPass(GVNSinkPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); } if (EnableConstraintElimination) @@ -421,7 +426,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(JumpThreadingPass()); FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); if (Level == OptimizationLevel::O3) FPM.addPass(AggressiveInstCombinePass()); @@ -438,7 +444,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(PGOMemOPSizeOpt()); FPM.addPass(TailCallElimPass()); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); // Form canonically associated expression trees, and simplify the trees using // basic mathematical properties. For example, this will form (nearly) @@ -510,7 +517,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); FPM.addPass(InstCombinePass()); // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 
@@ -575,8 +583,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, for (auto &C : ScalarOptimizerLateEPCallbacks) C(FPM, Level); - FPM.addPass(SimplifyCFGPass( - SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true))); + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() + .convertSwitchRangeToICmp(true) + .hoistCommonInsts(true) + .sinkCommonInsts(true))); FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); @@ -614,7 +624,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, FunctionPassManager FPM; FPM.addPass(SROAPass()); FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. - FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks. + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove basic blocks. FPM.addPass(InstCombinePass()); // Combine silly sequences. invokePeepholeEPCallbacks(FPM, Level); @@ -928,7 +939,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, GlobalCleanupPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(GlobalCleanupPM, Level); - GlobalCleanupPM.addPass(SimplifyCFGPass()); + GlobalCleanupPM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), PTO.EagerlyInvalidateAnalyses)); @@ -1015,7 +1027,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, ExtraPasses.addPass( createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - ExtraPasses.addPass(SimplifyCFGPass()); + ExtraPasses.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); ExtraPasses.addPass(InstCombinePass()); FPM.addPass(std::move(ExtraPasses)); } @@ -1031,6 +1044,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, // before SLP vectorization. 
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) @@ -1202,7 +1216,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. - OptimizePM.addPass(SimplifyCFGPass()); + OptimizePM.addPass( + SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); OptimizePM.addPass(CoroCleanupPass()); @@ -1676,8 +1691,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // Add late LTO optimization passes. // Delete basic blocks, which optimization passes may have killed. - MPM.addPass(createModuleToFunctionPassAdaptor( - SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)))); + MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts( + true)))); // Drop bodies of available eternally objects to improve GlobalDCE. 
MPM.addPass(EliminateAvailableExternallyPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 8e0af11b854d0..69d8d8c432675 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -423,6 +423,7 @@ FUNCTION_PASS_WITH_PARAMS("simplifycfg", }, parseSimplifyCFGOptions, "no-forward-switch-cond;forward-switch-cond;" + "no-switch-range-to-icmp;switch-range-to-icmp;" "no-switch-to-lookup;switch-to-lookup;" "no-keep-loops;keep-loops;" "no-hoist-common-insts;hoist-common-insts;" diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index ab8ee0beca041..59832da3a07a8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -531,6 +531,7 @@ void AArch64PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 305375778b539..0744c186d451e 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -344,6 +344,7 @@ void HexagonPassConfig::addIRPasses() { if (EnableInitialCFGCleanup) addPass(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 74f68531b89a7..d13eedf80e3bf 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -365,7 +365,9 @@ void 
PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM, MPM.add(createFunctionInliningPass(IP)); MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Combine silly seq's addExtensionsToPM(EP_Peephole, MPM); } @@ -404,7 +406,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createGVNHoistPass()); if (EnableGVNSink) { MPM.add(createGVNSinkPass()); - MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); } } @@ -418,7 +421,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createJumpThreadingPass()); // Thread jumps. MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals } - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs // Combine silly seq's if (OptLevel > 2) MPM.add(createAggressiveInstCombinerPass()); @@ -434,7 +439,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // TODO: Investigate the cost/benefit of tail call elimination on debugging. if (OptLevel > 1) MPM.add(createTailCallEliminationPass()); // Eliminate tail calls - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // The matrix extension can introduce large vector operations early, which can @@ -465,7 +472,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // FIXME: We break the loop pass pipeline here in order to do full // simplifycfg. 
Eventually loop-simplifycfg should be enhanced to replace the // need for this. - MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); MPM.add(createInstructionCombiningPass()); // We resume loop passes creating a second loop pipeline here. if (EnableLoopFlatten) { @@ -582,7 +590,8 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, PM.add(createInstructionCombiningPass()); PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - PM.add(createCFGSimplificationPass()); + PM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); PM.add(createInstructionCombiningPass()); } @@ -597,6 +606,7 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, // before SLP vectorization. PM.add(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) + .convertSwitchRangeToICmp(true) .convertSwitchToLookupTable(true) .needCanonicalLoops(false) .hoistCommonInsts(true) @@ -772,7 +782,9 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE + MPM.add( + createCFGSimplificationPass(SimplifyCFGOptions().convertSwitchRangeToICmp( + true))); // Clean up after IPCP & DAE // For SamplePGO in ThinLTO compile phase, we do not want to do indirect // call promotion as it will change the CFG too much to make the 2nd @@ -972,7 +984,8 @@ void PassManagerBuilder::populateModulePassManager( // LoopSink (and other loop passes since the last simplifyCFG) might have // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 
- MPM.add(createCFGSimplificationPass()); + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().convertSwitchRangeToICmp(true))); addExtensionsToPM(EP_OptimizerLast, MPM); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index ee17da1875e50..b8972751066d8 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -59,6 +59,11 @@ static cl::opt UserKeepLoops( "keep-loops", cl::Hidden, cl::init(true), cl::desc("Preserve canonical loop structure (default = true)")); +static cl::opt UserSwitchRangeToICmp( + "switch-range-to-icmp", cl::Hidden, cl::init(false), + cl::desc( + "Convert switches into an integer range comparison (default = false)")); + static cl::opt UserSwitchToLookup( "switch-to-lookup", cl::Hidden, cl::init(false), cl::desc("Convert switches to lookup tables (default = false)")); @@ -311,6 +316,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) { Options.BonusInstThreshold = UserBonusInstThreshold; if (UserForwardSwitchCond.getNumOccurrences()) Options.ForwardSwitchCondToPhi = UserForwardSwitchCond; + if (UserSwitchRangeToICmp.getNumOccurrences()) + Options.ConvertSwitchRangeToICmp = UserSwitchRangeToICmp; if (UserSwitchToLookup.getNumOccurrences()) Options.ConvertSwitchToLookupTable = UserSwitchToLookup; if (UserKeepLoops.getNumOccurrences()) @@ -337,6 +344,8 @@ void SimplifyCFGPass::printPipeline( OS << "<"; OS << "bonus-inst-threshold=" << Options.BonusInstThreshold << ";"; OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;"; + OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-") + << "switch-range-to-icmp;"; OS << (Options.ConvertSwitchToLookupTable ? "" : "no-") << "switch-to-lookup;"; OS << (Options.NeedCanonicalLoop ? 
"" : "no-") << "keep-loops;"; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 865a236a048c2..dbf22ab1e2298 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6559,7 +6559,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { } // Try to transform the switch into an icmp and a branch. - if (TurnSwitchRangeIntoICmp(SI, Builder)) + // The conversion from switch to comparison may lose information on + // impossible switch values, so disable it early in the pipeline. + if (Options.ConvertSwitchRangeToICmp && TurnSwitchRangeIntoICmp(SI, Builder)) return requestResimplify(); // Remove unreachable cases. diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll index 20d624a968526..3abf54ff4b491 100644 --- a/llvm/test/Other/new-pm-print-pipeline.ll +++ b/llvm/test/Other/new-pm-print-pipeline.ll @@ -1,7 +1,7 @@ ;; Test that the -print-pipeline-passes option correctly prints some explicitly specified pipelines. 
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(adce),function(simplifycfg)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-0 -; CHECK-0: function(adce),function(simplifycfg) +; CHECK-0: function(adce),function(simplifycfg) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='module(rpo-function-attrs,require,function(float2int,lower-constant-intrinsics,loop(loop-rotate)),invalidate)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-1 ; CHECK-1: rpo-function-attrs,require,function(float2int,lower-constant-intrinsics,loop(loop-rotate)),invalidate @@ -56,7 +56,7 @@ ; CHECK-17: function(print,print) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg,simplifycfg)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18 -; CHECK-18: function(simplifycfg,simplifycfg) +; CHECK-18: function(simplifycfg,simplifycfg) ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize,loop-vectorize)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19 ; CHECK-19: function(loop-vectorize,loop-vectorize) diff --git a/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll b/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll index 4650820c84eac..c14094e5b48df 100644 --- a/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll +++ b/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll @@ -1,6 +1,6 @@ ; Tests the PHI nodes in cleanuppads for catchswitch instructions are correctly ; split up. -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s +; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s declare i32 @__CxxFrameHandler3(...) 
define i8* @f2(i1 %val) "coroutine.presplit"="1" personality i32 (...)* @__CxxFrameHandler3 { diff --git a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll index 33151c68b3198..ffa474afcd508 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -loop-unroll -simplifycfg -instcombine -simplifycfg -S -mtriple arm-none-eabi -mcpu=cortex-m7 %s | FileCheck %s +; RUN: opt -loop-unroll -simplifycfg -switch-range-to-icmp -instcombine -simplifycfg -S -mtriple arm-none-eabi -mcpu=cortex-m7 %s | FileCheck %s ; This test is meant to check that this loop is unrolled into three iterations. define void @test(i32* %x, i32 %n) { diff --git a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll index 4e1863304e6ba..0ca042f9ad7ff 100644 --- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll +++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-switch-lowering-vs-correlatedpropagation.ll @@ -14,11 +14,9 @@ define i64 @test1(i64 %x) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SWITCH:%.*]] = icmp eq i64 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[X]], 100 -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TMP0]], i64 200, i64 10 -; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = select i1 [[SWITCH]], i64 0, i64 [[DOT]] -; CHECK-NEXT: ret i64 [[COMMON_RET_OP]] +; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i64 [[X:%.*]], 0 +; CHECK-NEXT: [[SWITCH_SELECT:%.*]] = select i1 [[SWITCH_SELECTCMP]], i64 0, i64 10 +; CHECK-NEXT: ret i64 [[SWITCH_SELECT]] ; entry: switch i64 %x, label %bb3 [ @@ -42,11 +40,9 @@ bb5: define i64 @test2(i64 %x) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; 
CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i64 [[X:%.*]], 101 -; CHECK-NEXT: [[SWITCH_SELECT:%.*]] = select i1 [[SWITCH_SELECTCMP]], i64 200, i64 10 -; CHECK-NEXT: [[SWITCH_SELECTCMP1:%.*]] = icmp eq i64 [[X]], 1 -; CHECK-NEXT: [[SWITCH_SELECT2:%.*]] = select i1 [[SWITCH_SELECTCMP1]], i64 0, i64 [[SWITCH_SELECT]] -; CHECK-NEXT: ret i64 [[SWITCH_SELECT2]] +; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i64 [[X:%.*]], 1 +; CHECK-NEXT: [[SWITCH_SELECT:%.*]] = select i1 [[SWITCH_SELECTCMP]], i64 0, i64 10 +; CHECK-NEXT: ret i64 [[SWITCH_SELECT]] ; entry: switch i64 %x, label %bb3 [ @@ -96,10 +92,8 @@ define i64 @test_fail1(i64 %x) { ; CHECK-LABEL: @test_fail1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SWITCH:%.*]] = icmp eq i64 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[X]], 100 -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TMP0]], i64 200, i64 10 -; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = select i1 [[SWITCH]], i64 0, i64 [[DOT]] -; CHECK-NEXT: ret i64 [[COMMON_RET_OP]] +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[SWITCH]], i64 0, i64 10 +; CHECK-NEXT: ret i64 [[SPEC_SELECT]] ; entry: switch i64 %x, label %bb3 [ @@ -124,9 +118,9 @@ bb5: define i64 @test_fail2(i64 %x) { ; CHECK-LABEL: @test_fail2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SWITCH:%.*]] = icmp eq i64 [[X:%.*]], 0 -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[SWITCH]], i64 2, i64 [[X]] -; CHECK-NEXT: ret i64 [[SPEC_SELECT]] +; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i64 [[X:%.*]], 0 +; CHECK-NEXT: [[SWITCH_SELECT:%.*]] = select i1 [[SWITCH_SELECTCMP]], i64 2, i64 1 +; CHECK-NEXT: ret i64 [[SWITCH_SELECT]] ; entry: switch i64 %x, label %bb2 [ diff --git a/llvm/test/Transforms/SimplifyCFG/DeadSetCC.ll b/llvm/test/Transforms/SimplifyCFG/DeadSetCC.ll index fd322a938f8f2..ff9c3be3397d4 100644 --- a/llvm/test/Transforms/SimplifyCFG/DeadSetCC.ll +++ b/llvm/test/Transforms/SimplifyCFG/DeadSetCC.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt 
< %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s +; RUN: opt < %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp -S | FileCheck %s ; Check that simplifycfg deletes a dead 'seteq' instruction when it ; folds a conditional branch into a switch instruction. diff --git a/llvm/test/Transforms/SimplifyCFG/EqualPHIEdgeBlockMerge.ll b/llvm/test/Transforms/SimplifyCFG/EqualPHIEdgeBlockMerge.ll index 96cd0469a38d7..c550abd9ad822 100644 --- a/llvm/test/Transforms/SimplifyCFG/EqualPHIEdgeBlockMerge.ll +++ b/llvm/test/Transforms/SimplifyCFG/EqualPHIEdgeBlockMerge.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; Test merging of blocks with phi nodes. ; -; RUN: opt < %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s +; RUN: opt < %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp -S | FileCheck %s ; ; ModuleID = '' diff --git a/llvm/test/Transforms/SimplifyCFG/FoldValueComparisonIntoPredecessors-no-new-successors.ll b/llvm/test/Transforms/SimplifyCFG/FoldValueComparisonIntoPredecessors-no-new-successors.ll index 04d5d4d5a645b..3fbd5bfe4574b 100644 --- a/llvm/test/Transforms/SimplifyCFG/FoldValueComparisonIntoPredecessors-no-new-successors.ll +++ b/llvm/test/Transforms/SimplifyCFG/FoldValueComparisonIntoPredecessors-no-new-successors.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s +; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck %s define void @widget(i32 %arg) { ; CHECK-LABEL: @widget( diff --git a/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll b/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll index b57284d16688b..bebe18abfb73a 100644 --- 
a/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll +++ b/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -forward-switch-cond=false -S | FileCheck %s --check-prefix=NO_FWD -; RUN: opt < %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -forward-switch-cond=true -S | FileCheck %s --check-prefix=FWD +; RUN: opt < %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -forward-switch-cond=false -switch-range-to-icmp -S | FileCheck %s --check-prefix=NO_FWD +; RUN: opt < %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -forward-switch-cond=true -switch-range-to-icmp -S | FileCheck %s --check-prefix=FWD -; RUN: opt < %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=NO_FWD -; RUN: opt < %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=FWD +; RUN: opt < %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=NO_FWD +; RUN: opt < %s -passes='simplifycfg' -S | FileCheck %s --check-prefix=FWD ; PR10131 diff --git a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll index f113ad84dc8fc..fc200e041125e 100644 --- a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll +++ b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals -; RUN: opt -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -o - < %s | FileCheck %s +; RUN: opt -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp -S -o - < %s | FileCheck %s declare void @helper(i32) diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll index 154ecb2166310..1662bb99f27bc 100644 --- 
a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt %s -S -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 | FileCheck %s +; RUN: opt %s -S -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" declare void @foo(i32) diff --git a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll index 56a6895f29b8f..5a6cb463bd6f1 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s +; RUN: opt %s -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp -S | FileCheck %s declare i32 @f(i32) diff --git a/llvm/test/Transforms/SimplifyCFG/switch-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-to-icmp.ll index bfb27cd461147..c9ba839ec0a5b 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-to-icmp.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-to-icmp.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s +; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck %s define zeroext i1 @test1(i32 %x) nounwind readnone ssp noredzone { entry: diff --git a/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll b/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll index 499064e40a907..59064fbd8dc1a 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll 
@@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s +; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck %s target datalayout="p:40:64:64:32" declare void @foo1() diff --git a/llvm/test/Transforms/SimplifyCFG/switch_create.ll b/llvm/test/Transforms/SimplifyCFG/switch_create.ll index 1ba0971fc4f01..d4ae9471a2810 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_create.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_create.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s -; RUN: opt -S -data-layout="p:32:32-p1:16:16" -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s +; RUN: opt -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck %s +; RUN: opt -S -data-layout="p:32:32-p1:16:16" -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s declare void @foo1() From 0b93e90971c0a43199e2da70c9422ebb073080ae Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Thu, 17 Feb 2022 17:24:38 +0800 Subject: [PATCH 049/748] Revert "[RISCV] LUI used for address computation should not isAsCheapAsAMove" This reverts commit 23a50736004e94704a2393aa36a905d737f2b20f. Although this patch achieved better codegen in most cases, it is really important to accurately describe the cost of instructions. So I revert it. 
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 -- llvm/test/CodeGen/RISCV/unroll-loop-cse.ll | 20 ++++++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index a5f072c7c2601..8f931c6ad1d9d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -998,8 +998,6 @@ bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { return (MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0) || (MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0); - case RISCV::LUI: - return MI.getOperand(1).getTargetFlags() != RISCVII::MO_HI; } return MI.isAsCheapAsAMove(); } diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll index 91aec53c47210..00b0d32e07d30 100644 --- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll +++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll @@ -18,20 +18,28 @@ define signext i32 @unroll_loop_cse() { ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: addi a1, a1, %lo(x) -; CHECK-NEXT: lw a3, 4(a1) +; CHECK-NEXT: lw a1, 4(a1) ; CHECK-NEXT: addi a2, a2, %lo(check) -; CHECK-NEXT: lw a4, 4(a2) -; CHECK-NEXT: bne a3, a4, .LBB0_6 +; CHECK-NEXT: lw a2, 4(a2) +; CHECK-NEXT: bne a1, a2, .LBB0_6 ; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: lui a1, %hi(x) +; CHECK-NEXT: addi a1, a1, %lo(x) ; CHECK-NEXT: lw a3, 8(a1) +; CHECK-NEXT: lui a2, %hi(check) +; CHECK-NEXT: addi a2, a2, %lo(check) ; CHECK-NEXT: lw a4, 8(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: lw a3, 12(a1) -; CHECK-NEXT: lw a4, 12(a2) -; CHECK-NEXT: bne a3, a4, .LBB0_6 +; CHECK-NEXT: lw a1, 12(a1) +; CHECK-NEXT: lw a2, 12(a2) +; CHECK-NEXT: bne a1, a2, .LBB0_6 ; CHECK-NEXT: # %bb.4: +; CHECK-NEXT: lui a1, %hi(x) +; CHECK-NEXT: addi a1, a1, %lo(x) ; CHECK-NEXT: lw a3, 16(a1) +; CHECK-NEXT: lui a2, %hi(check) +; CHECK-NEXT: addi a2, a2, 
%lo(check) ; CHECK-NEXT: lw a4, 16(a2) ; CHECK-NEXT: bne a3, a4, .LBB0_6 ; CHECK-NEXT: # %bb.5: From 3c9229c6635ecae1991e3229f446f1cac130f4fa Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 10:56:56 +0000 Subject: [PATCH 050/748] [CodeGen] Return better Changed status from DetectDeadLanes Differential Revision: https://reviews.llvm.org/D119940 --- llvm/lib/CodeGen/DetectDeadLanes.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/DetectDeadLanes.cpp b/llvm/lib/CodeGen/DetectDeadLanes.cpp index 1337e57f360bb..6b08d8149fa04 100644 --- a/llvm/lib/CodeGen/DetectDeadLanes.cpp +++ b/llvm/lib/CodeGen/DetectDeadLanes.cpp @@ -93,7 +93,7 @@ class DetectDeadLanes : public MachineFunctionPass { LaneBitmask transferUsedLanes(const MachineInstr &MI, LaneBitmask UsedLanes, const MachineOperand &MO) const; - bool runOnce(MachineFunction &MF); + std::pair runOnce(MachineFunction &MF); LaneBitmask determineInitialDefinedLanes(unsigned Reg); LaneBitmask determineInitialUsedLanes(unsigned Reg); @@ -487,7 +487,7 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO, return true; } -bool DetectDeadLanes::runOnce(MachineFunction &MF) { +std::pair DetectDeadLanes::runOnce(MachineFunction &MF) { // First pass: Populate defs/uses of vregs with initial values unsigned NumVirtRegs = MRI->getNumVirtRegs(); for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) { @@ -528,6 +528,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { dbgs() << "\n"; }); + bool Changed = false; bool Again = false; // Mark operands as dead/unused. 
for (MachineBasicBlock &MBB : MF) { @@ -544,6 +545,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as dead in " << MI); MO.setIsDead(); + Changed = true; } if (MO.readsReg()) { bool CrossCopy = false; @@ -551,10 +553,12 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in " << MI); MO.setIsUndef(); + Changed = true; } else if (isUndefInput(MO, &CrossCopy)) { LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in " << MI); MO.setIsUndef(); + Changed = true; if (CrossCopy) Again = true; } @@ -563,7 +567,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) { } } - return Again; + return std::make_pair(Changed, Again); } bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) { @@ -585,13 +589,16 @@ bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) { WorklistMembers.resize(NumVirtRegs); DefinedByCopy.resize(NumVirtRegs); + bool Changed = false; bool Again; do { - Again = runOnce(MF); + bool LocalChanged; + std::tie(LocalChanged, Again) = runOnce(MF); + Changed |= LocalChanged; } while(Again); DefinedByCopy.clear(); WorklistMembers.clear(); delete[] VRegInfos; - return true; + return Changed; } From f0092f9ded34ce733d36a302fd1f33f134418594 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 10:57:50 +0000 Subject: [PATCH 051/748] [CodeGen] Return false from LiveIntervals::runOnMachineFunction This is an analysis pass so it does not modify the MachineFunction. 
Differential Revision: https://reviews.llvm.org/D119941 --- llvm/lib/CodeGen/LiveIntervals.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 2b23de2859531..37114d862ca62 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -149,7 +149,7 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { getRegUnit(i); } LLVM_DEBUG(dump()); - return true; + return false; } void LiveIntervals::print(raw_ostream &OS, const Module* ) const { From 50ddb5d2d12087c9b0c8da021179739324214e95 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 10:58:54 +0000 Subject: [PATCH 052/748] [CodeGen] Return better Changed status from LocalStackSlotAllocation Differential Revision: https://reviews.llvm.org/D119942 --- llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 37fd3e4853acf..75c59d9b5814e 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -118,7 +118,7 @@ bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) { // If the target doesn't want/need this pass, or if there are no locals // to consider, early exit. if (LocalObjectCount == 0 || !TRI->requiresVirtualBaseRegisters(MF)) - return true; + return false; // Make sure we have enough space to store the local offsets. 
LocalOffsets.resize(MFI.getObjectIndexEnd()); From 77e793d0255b67454176257d8306c932704ebd94 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 10:59:39 +0000 Subject: [PATCH 053/748] [AMDGPU] Return better Changed status from AMDGPUAnnotateUniformValues Differential Revision: https://reviews.llvm.org/D119943 --- .../AMDGPU/AMDGPUAnnotateUniformValues.cpp | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 440c527addff9..74be0336851cc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -33,6 +33,17 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, MemorySSA *MSSA; AliasAnalysis *AA; bool isEntryFunc; + bool Changed; + + void setUniformMetadata(Instruction *I) { + I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {})); + Changed = true; + } + + void setNoClobberMetadata(Instruction *I) { + I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {})); + Changed = true; + } public: static char ID; @@ -66,13 +77,6 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, char AMDGPUAnnotateUniformValues::ID = 0; -static void setUniformMetadata(Instruction *I) { - I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {})); -} -static void setNoClobberMetadata(Instruction *I) { - I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {})); -} - void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { if (DA->isUniform(&I)) setUniformMetadata(&I); @@ -109,8 +113,9 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { AA = &getAnalysis().getAAResults(); isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); + Changed = false; visit(F); - return true; + return Changed; } FunctionPass * From 1822a5ecdd363ffcf465a7ad2e4e6fb92cab69f7 Mon Sep 17 00:00:00 2001 
From: Jay Foad Date: Wed, 16 Feb 2022 11:00:05 +0000 Subject: [PATCH 054/748] [AMDGPU] Return better Changed status from AMDGPUPerfHintAnalysis Differential Revision: https://reviews.llvm.org/D119944 --- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 8ad344816ad2d..de97b76b1e093 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -267,19 +267,23 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) { << " LSMInst cost: " << Info->LSMInstCost << '\n' << " TotalInst cost: " << Info->InstCost << '\n'); + bool Changed = false; + if (isMemBound(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); NumMemBound++; F.addFnAttr("amdgpu-memory-bound", "true"); + Changed = true; } if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) { LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); NumLimitWave++; F.addFnAttr("amdgpu-wave-limiter", "true"); + Changed = true; } - return true; + return Changed; } bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { From a48084156653e8f3791d13c19ce64df13c44a11e Mon Sep 17 00:00:00 2001 From: Stanislav Gatev Date: Thu, 17 Feb 2022 09:37:02 +0000 Subject: [PATCH 055/748] Add missing break statement in switch. 
--- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 72475e0c79d90..cd9b8b0e454e4 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -260,12 +260,13 @@ class TransferVisitor : public ConstStmtVisitor { auto *SubExprVal = dyn_cast_or_null(Env.getValue(*SubExpr, SkipPast::None)); if (SubExprVal == nullptr) - return; + break; auto &ExprLoc = Env.createStorageLocation(*S); Env.setStorageLocation(*S, ExprLoc); Env.setValue(ExprLoc, Env.takeOwnership( std::make_unique(*SubExprVal))); + break; } default: break; From 78ebb1dd241b0804fcb762d488f8d6a05ae5808c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 11:00:25 +0000 Subject: [PATCH 056/748] [AMDGPU] Return better Changed status from SIAnnotateControlFlow Differential Revision: https://reviews.llvm.org/D119945 --- .../Target/AMDGPU/SIAnnotateControlFlow.cpp | 58 +++++++++++-------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index b81fac36fc957..afd2a38b11ec5 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -73,19 +73,19 @@ class SIAnnotateControlFlow : public FunctionPass { bool hasKill(const BasicBlock *BB); - void eraseIfUnused(PHINode *Phi); + bool eraseIfUnused(PHINode *Phi); - void openIf(BranchInst *Term); + bool openIf(BranchInst *Term); - void insertElse(BranchInst *Term); + bool insertElse(BranchInst *Term); Value * handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term); - void handleLoop(BranchInst *Term); + bool handleLoop(BranchInst *Term); - void closeControlFlow(BasicBlock *BB); + bool closeControlFlow(BasicBlock *BB); public: static char ID; @@ -193,31 
+193,34 @@ bool SIAnnotateControlFlow::hasKill(const BasicBlock *BB) { return false; } -// Erase "Phi" if it is not used any more -void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (RecursivelyDeleteDeadPHINode(Phi)) { +// Erase "Phi" if it is not used any more. Return true if any change was made. +bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + bool Changed = RecursivelyDeleteDeadPHINode(Phi); + if (Changed) LLVM_DEBUG(dbgs() << "Erased unused condition phi\n"); - } + return Changed; } /// Open a new "If" block -void SIAnnotateControlFlow::openIf(BranchInst *Term) { +bool SIAnnotateControlFlow::openIf(BranchInst *Term) { if (isUniform(Term)) - return; + return false; Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + return true; } /// Close the last "If" block and open a new "Else" block -void SIAnnotateControlFlow::insertElse(BranchInst *Term) { +bool SIAnnotateControlFlow::insertElse(BranchInst *Term) { if (isUniform(Term)) { - return; + return false; } Value *Ret = CallInst::Create(Else, popSaved(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); + return true; } /// Recursively handle the condition leading to a loop @@ -255,14 +258,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition( } /// Handle a back edge (loop) -void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { +bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { if (isUniform(Term)) - return; + return false; BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); if (!L) - return; + return false; BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front()); @@ -286,10 +289,12 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) 
{ Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); push(Term->getSuccessor(0), Arg); + + return true; } /// Close the last opened control flow -void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { +bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); assert(Stack.back().first == BB); @@ -322,6 +327,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { } CallInst::Create(EndCf, Exec, "", FirstInsertionPt); } + + return true; } /// Annotate the control flow with intrinsics so the backend can @@ -333,6 +340,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { TargetPassConfig &TPC = getAnalysis(); const TargetMachine &TM = TPC.getTM(); + bool Changed = false; initialize(*F.getParent(), TM.getSubtarget(F)); for (df_iterator I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { @@ -341,32 +349,32 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { if (!Term || Term->isUnconditional()) { if (isTopOfStack(BB)) - closeControlFlow(BB); + Changed |= closeControlFlow(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(BB)) - closeControlFlow(BB); + Changed |= closeControlFlow(BB); if (DT->dominates(Term->getSuccessor(1), BB)) - handleLoop(Term); + Changed |= handleLoop(Term); continue; } if (isTopOfStack(BB)) { PHINode *Phi = dyn_cast(Term->getCondition()); if (Phi && Phi->getParent() == BB && isElse(Phi) && !hasKill(BB)) { - insertElse(Term); - eraseIfUnused(Phi); + Changed |= insertElse(Term); + Changed |= eraseIfUnused(Phi); continue; } - closeControlFlow(BB); + Changed |= closeControlFlow(BB); } - openIf(Term); + Changed |= openIf(Term); } if (!Stack.empty()) { @@ -374,7 +382,7 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { report_fatal_error("failed to annotate CFG"); } - return true; + return Changed; } /// Create the annotation pass From c08896d292562a4aa3f4a47494b6220b384e6078 Mon Sep 17 00:00:00 2001 From: 
Jay Foad Date: Wed, 16 Feb 2022 11:00:44 +0000 Subject: [PATCH 057/748] [AMDGPU] Return better Changed status from SILowerI1Copies Differential Revision: https://reviews.llvm.org/D119946 --- llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 33 +++++++++++++++------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 672266f0c11e7..5fb545b50228a 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -79,9 +79,9 @@ class SILowerI1Copies : public MachineFunctionPass { } private: - void lowerCopiesFromI1(); - void lowerPhis(); - void lowerCopiesToI1(); + bool lowerCopiesFromI1(); + bool lowerPhis(); + bool lowerCopiesToI1(); bool isConstantLaneMask(Register Reg, bool &Val) const; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, @@ -473,15 +473,17 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { OrN2Op = AMDGPU::S_ORN2_B64; } - lowerCopiesFromI1(); - lowerPhis(); - lowerCopiesToI1(); + bool Changed = false; + Changed |= lowerCopiesFromI1(); + Changed |= lowerPhis(); + Changed |= lowerCopiesToI1(); + assert(Changed || ConstrainRegs.empty()); for (unsigned Reg : ConstrainRegs) MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); ConstrainRegs.clear(); - return true; + return Changed; } #ifndef NDEBUG @@ -493,7 +495,8 @@ static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, } #endif -void SILowerI1Copies::lowerCopiesFromI1() { +bool SILowerI1Copies::lowerCopiesFromI1() { + bool Changed = false; SmallVector DeadCopies; for (MachineBasicBlock &MBB : *MF) { @@ -509,6 +512,8 @@ void SILowerI1Copies::lowerCopiesFromI1() { if (isLaneMaskReg(DstReg) || isVreg1(DstReg)) continue; + Changed = true; + // Copy into a 32-bit vector register. 
LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); @@ -530,9 +535,10 @@ void SILowerI1Copies::lowerCopiesFromI1() { MI->eraseFromParent(); DeadCopies.clear(); } + return Changed; } -void SILowerI1Copies::lowerPhis() { +bool SILowerI1Copies::lowerPhis() { MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); PhiIncomingAnalysis PIA(*PDT); @@ -550,6 +556,8 @@ void SILowerI1Copies::lowerPhis() { Vreg1Phis.push_back(&MI); } } + if (Vreg1Phis.empty()) + return false; MachineBasicBlock *PrevMBB = nullptr; for (MachineInstr *MI : Vreg1Phis) { @@ -662,9 +670,11 @@ void SILowerI1Copies::lowerPhis() { IncomingRegs.clear(); IncomingUpdated.clear(); } + return true; } -void SILowerI1Copies::lowerCopiesToI1() { +bool SILowerI1Copies::lowerCopiesToI1() { + bool Changed = false; MachineSSAUpdater SSAUpdater(*MF); LoopFinder LF(*DT, *PDT); SmallVector DeadCopies; @@ -681,6 +691,8 @@ void SILowerI1Copies::lowerCopiesToI1() { if (!isVreg1(DstReg)) continue; + Changed = true; + if (MRI->use_empty(DstReg)) { DeadCopies.push_back(&MI); continue; @@ -731,6 +743,7 @@ void SILowerI1Copies::lowerCopiesToI1() { MI->eraseFromParent(); DeadCopies.clear(); } + return Changed; } bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const { From f3bc7fd5465a3a919388b6a0307553ef4a6d39c9 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 17 Feb 2022 10:03:28 +0000 Subject: [PATCH 058/748] [AArch64] Cleanup for performCommonVectorExtendCombine. NFC This is some NFC (hopefully!) cleanup for performCommonVectorExtendCombine and related methods, removing conditions that cannot occur and otherwise cleaning up the code a little. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 31 +++++-------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a371d3bef15d9..9f8b183635012 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13352,7 +13352,7 @@ static bool IsSVECntIntrinsic(SDValue S) { /// /// \returns The type representing the \p Extend source type, or \p MVT::Other /// if no valid type can be determined -static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { +static EVT calculatePreExtendType(SDValue Extend) { switch (Extend.getOpcode()) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -13385,15 +13385,12 @@ static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { default: return MVT::Other; } - - llvm_unreachable("Code path unhandled in calculatePreExtendType!"); } /// Combines a dup(sext/zext) node pattern into sext/zext(dup) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, SelectionDAG &DAG) { - ShuffleVectorSDNode *ShuffleNode = dyn_cast(VectorShuffle.getNode()); if (!ShuffleNode) @@ -13424,24 +13421,14 @@ static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) return SDValue(); - EVT TargetType = VectorShuffle.getValueType(); - EVT PreExtendType = calculatePreExtendType(Extend, DAG); - - if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && - TargetType != MVT::v2i64) || - (PreExtendType == MVT::Other)) - return SDValue(); - // Restrict valid pre-extend data type + EVT PreExtendType = calculatePreExtendType(Extend); if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && PreExtendType != MVT::i32) return SDValue(); + EVT TargetType = VectorShuffle.getValueType(); EVT PreExtendVT = 
TargetType.changeVectorElementType(PreExtendType); - - if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) - return SDValue(); - if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) return SDValue(); @@ -13458,17 +13445,16 @@ static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, DAG.getUNDEF(PreExtendVT), ShuffleMask); - SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - DL, TargetType, VectorShuffleNode); - - return ExtendNode; + return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + TargetType, VectorShuffleNode); } /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { // If the value type isn't a vector, none of the operands are going to be dups - if (!Mul->getValueType(0).isVector()) + EVT VT = Mul->getValueType(0); + if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) return SDValue(); SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); @@ -13479,8 +13465,7 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { return SDValue(); SDLoc DL(Mul); - return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), - Op0 ? Op0 : Mul->getOperand(0), + return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0), Op1 ? Op1 : Mul->getOperand(1)); } From 24a37a396a9bd6b73b05b4eafce8b87e7a748cf9 Mon Sep 17 00:00:00 2001 From: Siddharth Bhat Date: Thu, 17 Feb 2022 15:40:19 +0530 Subject: [PATCH 059/748] [MLIR] add entry block to MLIR grammar. The MLIR parser allows regions to have an unnamed entry block. Make this explicit in the language grammar. 
Reviewed By: mehdi_amini Differential Revision: https://reviews.llvm.org/D119950 --- mlir/docs/LangRef.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mlir/docs/LangRef.md b/mlir/docs/LangRef.md index 92a5413a656d8..2fe1c6248c8a4 100644 --- a/mlir/docs/LangRef.md +++ b/mlir/docs/LangRef.md @@ -443,7 +443,8 @@ entry block cannot be listed as a successor of any other block. The syntax for a region is as follows: ``` -region ::= `{` block* `}` +region ::= `{` entry-block? block* `}` +entry-block ::= operation+ ``` A function body is an example of a region: it consists of a CFG of blocks and @@ -454,6 +455,11 @@ arguments must match the result types of the function signature. Similarly, the function arguments must match the types and count of the region arguments. In general, operations with regions can define these correspondences arbitrarily. +An *entry block* is a block with no label and no arguments that may occur at +the beginning of a region. It enables a common pattern of using a region to +open a new scope. + + ### Value Scoping Regions provide hierarchical encapsulation of programs: it is impossible to From f75da0c8e65cf1b09012a8b62cd7f3e9a646bbc9 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Fri, 11 Feb 2022 21:42:40 +0300 Subject: [PATCH 060/748] [llvm-objcopy][NFC] Move core implementation of llvm-objcopy into separate library. This patch moves core implementation of llvm-objcopy into Object library (http://lists.llvm.org/pipermail/llvm-dev/2020-September/145075.html). The functionality for parsing input options is left inside tools/llvm-objcopy. 
The interface of ObjCopy library: ObjCopy/ELF/ELFObjcopy.h ``` Error executeObjcopyOnIHex(const CopyConfig &Config, MemoryBuffer &In, Buffer &Out); Error executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In, Buffer &Out); Error executeObjcopyOnBinary(const CopyConfig &Config, object::ELFObjectFileBase &In, Buffer &Out); ``` ObjCopy/COFF/COFFObjcopy.h ``` Error executeObjcopyOnBinary(const CopyConfig &Config, object::COFFObjectFile &In, Buffer &Out); ``` ObjCopy/MachO/MachOObjcopy.h ``` Error executeObjcopyOnBinary(const CopyConfig &Config, object::MachOObjectFile &In, Buffer &Out); ``` ObjCopy/wasm/WasmObjcopy.h ``` Error executeObjcopyOnBinary(const CopyConfig &Config, object::WasmObjectFile &In, Buffer &Out); ``` Differential Revision: https://reviews.llvm.org/D88827 --- .../llvm/ObjCopy}/COFF/COFFConfig.h | 6 +- .../llvm/ObjCopy}/COFF/COFFObjcopy.h | 6 +- .../llvm/ObjCopy}/CommonConfig.h | 6 +- llvm/include/llvm/ObjCopy/ConfigManager.h | 46 ++++++ .../llvm/ObjCopy}/ELF/ELFConfig.h | 6 +- .../llvm/ObjCopy}/ELF/ELFObjcopy.h | 6 +- .../llvm/ObjCopy}/MachO/MachOConfig.h | 6 +- .../llvm/ObjCopy}/MachO/MachOObjcopy.h | 6 +- .../llvm/ObjCopy}/MultiFormatConfig.h | 6 +- llvm/include/llvm/ObjCopy/ObjCopy.h | 42 +++++ .../llvm/ObjCopy}/wasm/WasmConfig.h | 6 +- .../llvm/ObjCopy}/wasm/WasmObjcopy.h | 6 +- llvm/lib/CMakeLists.txt | 1 + llvm/lib/ObjCopy/Archive.cpp | 105 ++++++++++++ .../llvm-objcopy.h => lib/ObjCopy/Archive.h} | 25 ++- llvm/lib/ObjCopy/CMakeLists.txt | 31 ++++ .../ObjCopy}/COFF/COFFObjcopy.cpp | 6 +- .../ObjCopy}/COFF/Object.cpp | 0 .../ObjCopy}/COFF/Object.h | 6 +- .../ObjCopy}/COFF/Reader.cpp | 0 .../ObjCopy}/COFF/Reader.h | 6 +- .../ObjCopy}/COFF/Writer.cpp | 0 .../ObjCopy}/COFF/Writer.h | 6 +- llvm/lib/ObjCopy/ConfigManager.cpp | 70 ++++++++ .../ObjCopy}/ELF/ELFObjcopy.cpp | 9 +- .../ObjCopy}/ELF/Object.cpp | 6 +- .../llvm-objcopy => lib/ObjCopy}/ELF/Object.h | 12 +- .../ObjCopy}/MachO/MachOLayoutBuilder.cpp | 0 
.../ObjCopy}/MachO/MachOLayoutBuilder.h | 8 +- .../ObjCopy}/MachO/MachOObjcopy.cpp | 11 +- .../ObjCopy}/MachO/MachOReader.cpp | 0 .../ObjCopy}/MachO/MachOReader.h | 7 +- .../ObjCopy}/MachO/MachOWriter.cpp | 0 .../ObjCopy}/MachO/MachOWriter.h | 7 +- .../ObjCopy}/MachO/Object.cpp | 0 .../ObjCopy}/MachO/Object.h | 6 +- llvm/lib/ObjCopy/ObjCopy.cpp | 79 +++++++++ .../ObjCopy}/wasm/Object.cpp | 0 .../ObjCopy}/wasm/Object.h | 6 +- .../ObjCopy}/wasm/Reader.cpp | 0 .../ObjCopy}/wasm/Reader.h | 6 +- .../ObjCopy}/wasm/WasmObjcopy.cpp | 4 +- .../ObjCopy}/wasm/Writer.cpp | 0 .../ObjCopy}/wasm/Writer.h | 6 +- llvm/tools/llvm-objcopy/CMakeLists.txt | 18 +-- .../{ConfigManager.cpp => ObjcopyOptions.cpp} | 87 ++-------- .../{ConfigManager.h => ObjcopyOptions.h} | 34 +--- llvm/tools/llvm-objcopy/llvm-objcopy.cpp | 153 ++---------------- llvm/unittests/ObjCopy/CMakeLists.txt | 11 ++ llvm/unittests/ObjCopy/ObjCopyTest.cpp | 118 ++++++++++++++ 50 files changed, 634 insertions(+), 358 deletions(-) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/COFF/COFFConfig.h (82%) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/COFF/COFFObjcopy.h (86%) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/CommonConfig.h (98%) create mode 100644 llvm/include/llvm/ObjCopy/ConfigManager.h rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/ELF/ELFConfig.h (88%) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/ELF/ELFObjcopy.h (90%) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/MachO/MachOConfig.h (87%) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/MachO/MachOObjcopy.h (89%) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/MultiFormatConfig.h (86%) create mode 100644 llvm/include/llvm/ObjCopy/ObjCopy.h rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/wasm/WasmConfig.h (78%) rename llvm/{tools/llvm-objcopy => include/llvm/ObjCopy}/wasm/WasmObjcopy.h (83%) create mode 100644 llvm/lib/ObjCopy/Archive.cpp rename 
llvm/{tools/llvm-objcopy/llvm-objcopy.h => lib/ObjCopy/Archive.h} (61%) create mode 100644 llvm/lib/ObjCopy/CMakeLists.txt rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/COFF/COFFObjcopy.cpp (98%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/COFF/Object.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/COFF/Object.h (98%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/COFF/Reader.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/COFF/Reader.h (89%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/COFF/Writer.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/COFF/Writer.h (92%) create mode 100644 llvm/lib/ObjCopy/ConfigManager.cpp rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/ELF/ELFObjcopy.cpp (99%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/ELF/Object.cpp (99%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/ELF/Object.h (99%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/MachOLayoutBuilder.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/MachOLayoutBuilder.h (94%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/MachOObjcopy.cpp (98%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/MachOReader.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/MachOReader.h (91%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/MachOWriter.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/MachOWriter.h (91%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/Object.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/MachO/Object.h (99%) create mode 100644 llvm/lib/ObjCopy/ObjCopy.cpp rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/wasm/Object.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/wasm/Object.h (90%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/wasm/Reader.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/wasm/Reader.h (83%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/wasm/WasmObjcopy.cpp (98%) rename llvm/{tools/llvm-objcopy => 
lib/ObjCopy}/wasm/Writer.cpp (100%) rename llvm/{tools/llvm-objcopy => lib/ObjCopy}/wasm/Writer.h (91%) rename llvm/tools/llvm-objcopy/{ConfigManager.cpp => ObjcopyOptions.cpp} (93%) rename llvm/tools/llvm-objcopy/{ConfigManager.h => ObjcopyOptions.h} (68%) create mode 100644 llvm/unittests/ObjCopy/CMakeLists.txt create mode 100644 llvm/unittests/ObjCopy/ObjCopyTest.cpp diff --git a/llvm/tools/llvm-objcopy/COFF/COFFConfig.h b/llvm/include/llvm/ObjCopy/COFF/COFFConfig.h similarity index 82% rename from llvm/tools/llvm-objcopy/COFF/COFFConfig.h rename to llvm/include/llvm/ObjCopy/COFF/COFFConfig.h index 7bf673fa4af9b..29d56d75698be 100644 --- a/llvm/tools/llvm-objcopy/COFF/COFFConfig.h +++ b/llvm/include/llvm/ObjCopy/COFF/COFFConfig.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_COFF_COFFCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_COFF_COFFCONFIG_H +#ifndef LLVM_OBJCOPY_COFF_COFFCONFIG_H +#define LLVM_OBJCOPY_COFF_COFFCONFIG_H #include "llvm/ADT/Optional.h" @@ -24,4 +24,4 @@ struct COFFConfig { } // namespace objcopy } // namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_COFF_COFFCONFIG_H +#endif // LLVM_OBJCOPY_COFF_COFFCONFIG_H diff --git a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h b/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h similarity index 86% rename from llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h rename to llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h index 2c7ccd34653d7..f8925e21159be 100644 --- a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h +++ b/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H +#ifndef LLVM_OBJCOPY_COFF_COFFOBJCOPY_H +#define LLVM_OBJCOPY_COFF_COFFOBJCOPY_H namespace llvm { class Error; @@ -30,4 +30,4 @@ Error executeObjcopyOnBinary(const CommonConfig &Config, const COFFConfig &, 
} // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_COFFOBJCOPY_H +#endif // LLVM_OBJCOPY_COFF_COFFOBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h similarity index 98% rename from llvm/tools/llvm-objcopy/CommonConfig.h rename to llvm/include/llvm/ObjCopy/CommonConfig.h index ea39a6da2ba56..ecb169a4e8ec2 100644 --- a/llvm/tools/llvm-objcopy/CommonConfig.h +++ b/llvm/include/llvm/ObjCopy/CommonConfig.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_COMMONCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_COMMONCONFIG_H +#ifndef LLVM_OBJCOPY_COMMONCONFIG_H +#define LLVM_OBJCOPY_COMMONCONFIG_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/CachedHashString.h" @@ -257,4 +257,4 @@ struct CommonConfig { } // namespace objcopy } // namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_COMMONCONFIG_H +#endif // LLVM_OBJCOPY_COMMONCONFIG_H diff --git a/llvm/include/llvm/ObjCopy/ConfigManager.h b/llvm/include/llvm/ObjCopy/ConfigManager.h new file mode 100644 index 0000000000000..3aac601fca9a3 --- /dev/null +++ b/llvm/include/llvm/ObjCopy/ConfigManager.h @@ -0,0 +1,46 @@ +//===- ConfigManager.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJCOPY_CONFIGMANAGER_H +#define LLVM_OBJCOPY_CONFIGMANAGER_H + +#include "llvm/ObjCopy/COFF/COFFConfig.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/ELF/ELFConfig.h" +#include "llvm/ObjCopy/MachO/MachOConfig.h" +#include "llvm/ObjCopy/MultiFormatConfig.h" +#include "llvm/ObjCopy/wasm/WasmConfig.h" + +namespace llvm { +namespace objcopy { + +struct ConfigManager : public MultiFormatConfig { + virtual ~ConfigManager() {} + + const CommonConfig &getCommonConfig() const override { return Common; } + + Expected getELFConfig() const override { return ELF; } + + Expected getCOFFConfig() const override; + + Expected getMachOConfig() const override; + + Expected getWasmConfig() const override; + + // All configs. + CommonConfig Common; + ELFConfig ELF; + COFFConfig COFF; + MachOConfig MachO; + WasmConfig Wasm; +}; + +} // namespace objcopy +} // namespace llvm + +#endif // LLVM_OBJCOPY_CONFIGMANAGER_H diff --git a/llvm/tools/llvm-objcopy/ELF/ELFConfig.h b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h similarity index 88% rename from llvm/tools/llvm-objcopy/ELF/ELFConfig.h rename to llvm/include/llvm/ObjCopy/ELF/ELFConfig.h index 229a8d61fb83c..52bc728e36ffa 100644 --- a/llvm/tools/llvm-objcopy/ELF/ELFConfig.h +++ b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_ELF_ELFCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_ELF_ELFCONFIG_H +#ifndef LLVM_OBJCOPY_ELF_ELFCONFIG_H +#define LLVM_OBJCOPY_ELF_ELFCONFIG_H #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringRef.h" @@ -35,4 +35,4 @@ struct ELFConfig { } // namespace objcopy } // namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_ELF_ELFCONFIG_H +#endif // LLVM_OBJCOPY_ELF_ELFCONFIG_H diff --git 
a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h b/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h similarity index 90% rename from llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h rename to llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h index 852661e68f37b..676af4bec0844 100644 --- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h +++ b/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H +#ifndef LLVM_OBJCOPY_ELF_ELFOBJCOPY_H +#define LLVM_OBJCOPY_ELF_ELFOBJCOPY_H namespace llvm { class Error; @@ -37,4 +37,4 @@ Error executeObjcopyOnBinary(const CommonConfig &Config, } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H +#endif // LLVM_OBJCOPY_ELF_ELFOBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOConfig.h b/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h similarity index 87% rename from llvm/tools/llvm-objcopy/MachO/MachOConfig.h rename to llvm/include/llvm/ObjCopy/MachO/MachOConfig.h index 93f9facfcf0bb..f65cea36a920b 100644 --- a/llvm/tools/llvm-objcopy/MachO/MachOConfig.h +++ b/llvm/include/llvm/ObjCopy/MachO/MachOConfig.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H +#ifndef LLVM_OBJCOPY_MACHO_MACHOCONFIG_H +#define LLVM_OBJCOPY_MACHO_MACHOCONFIG_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -40,4 +40,4 @@ struct MachOConfig { } // namespace objcopy } // namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_MACHO_MACHOCONFIG_H +#endif // LLVM_OBJCOPY_MACHO_MACHOCONFIG_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h b/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h similarity index 89% rename from llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h rename to 
llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h index d03eee9d5fdbb..79f6ba4cf8a84 100644 --- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h +++ b/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_MACHOOBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_MACHOOBJCOPY_H +#ifndef LLVM_OBJCOPY_MACHO_MACHOOBJCOPY_H +#define LLVM_OBJCOPY_MACHO_MACHOOBJCOPY_H namespace llvm { class Error; @@ -36,4 +36,4 @@ Error executeObjcopyOnMachOUniversalBinary( } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_MACHOOBJCOPY_H +#endif // LLVM_OBJCOPY_MACHO_MACHOOBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/MultiFormatConfig.h b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h similarity index 86% rename from llvm/tools/llvm-objcopy/MultiFormatConfig.h rename to llvm/include/llvm/ObjCopy/MultiFormatConfig.h index 31d9883d6d3a3..022751b6228bb 100644 --- a/llvm/tools/llvm-objcopy/MultiFormatConfig.h +++ b/llvm/include/llvm/ObjCopy/MultiFormatConfig.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_MULTIFORMATCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_MULTIFORMATCONFIG_H +#ifndef LLVM_OBJCOPY_MULTIFORMATCONFIG_H +#define LLVM_OBJCOPY_MULTIFORMATCONFIG_H #include "llvm/Support/Error.h" @@ -34,4 +34,4 @@ class MultiFormatConfig { } // namespace objcopy } // namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_MULTIFORMATCONFIG_H +#endif // LLVM_OBJCOPY_MULTIFORMATCONFIG_H diff --git a/llvm/include/llvm/ObjCopy/ObjCopy.h b/llvm/include/llvm/ObjCopy/ObjCopy.h new file mode 100644 index 0000000000000..023814002c727 --- /dev/null +++ b/llvm/include/llvm/ObjCopy/ObjCopy.h @@ -0,0 +1,42 @@ +//===- ObjCopy.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJCOPY_OBJCOPY_H +#define LLVM_OBJCOPY_OBJCOPY_H + +#include "llvm/Support/Error.h" + +namespace llvm { +class raw_ostream; + +namespace object { +class Archive; +class Binary; +} // end namespace object + +namespace objcopy { +class MultiFormatConfig; + +/// Applies the transformations described by \p Config to +/// each member in archive \p Ar. +/// Writes a result in a file specified by \p Config.OutputFilename. +/// \returns any Error encountered whilst performing the operation. +Error executeObjcopyOnArchive(const MultiFormatConfig &Config, + const object::Archive &Ar); + +/// Applies the transformations described by \p Config to \p In and writes +/// the result into \p Out. This function does the dispatch based on the +/// format of the input binary (COFF, ELF, MachO or wasm). +/// \returns any Error encountered whilst performing the operation. 
+Error executeObjcopyOnBinary(const MultiFormatConfig &Config, + object::Binary &In, raw_ostream &Out); + +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_OBJCOPY_OBJCOPY_H diff --git a/llvm/tools/llvm-objcopy/wasm/WasmConfig.h b/llvm/include/llvm/ObjCopy/wasm/WasmConfig.h similarity index 78% rename from llvm/tools/llvm-objcopy/wasm/WasmConfig.h rename to llvm/include/llvm/ObjCopy/wasm/WasmConfig.h index 4e40926ae4530..56a7055da9a77 100644 --- a/llvm/tools/llvm-objcopy/wasm/WasmConfig.h +++ b/llvm/include/llvm/ObjCopy/wasm/WasmConfig.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMCONFIG_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMCONFIG_H +#ifndef LLVM_OBJCOPY_WASM_WASMCONFIG_H +#define LLVM_OBJCOPY_WASM_WASMCONFIG_H namespace llvm { namespace objcopy { @@ -18,4 +18,4 @@ struct WasmConfig {}; } // namespace objcopy } // namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMCONFIG_H +#endif // LLVM_OBJCOPY_WASM_WASMCONFIG_H diff --git a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h b/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h similarity index 83% rename from llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h rename to llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h index 28268e38c5849..36a9103a35df3 100644 --- a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h +++ b/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMOBJCOPY_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMOBJCOPY_H +#ifndef LLVM_OBJCOPY_WASM_WASMOBJCOPY_H +#define LLVM_OBJCOPY_WASM_WASMOBJCOPY_H namespace llvm { class Error; @@ -29,4 +29,4 @@ Error executeObjcopyOnBinary(const CommonConfig &Config, const WasmConfig &, } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_WASMOBJCOPY_H +#endif // LLVM_OBJCOPY_WASM_WASMOBJCOPY_H 
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt index 57bf209a67699..5ecdf5af956a3 100644 --- a/llvm/lib/CMakeLists.txt +++ b/llvm/lib/CMakeLists.txt @@ -21,6 +21,7 @@ add_subdirectory(Analysis) add_subdirectory(LTO) add_subdirectory(MC) add_subdirectory(MCA) +add_subdirectory(ObjCopy) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) diff --git a/llvm/lib/ObjCopy/Archive.cpp b/llvm/lib/ObjCopy/Archive.cpp new file mode 100644 index 0000000000000..ef893ccb409cb --- /dev/null +++ b/llvm/lib/ObjCopy/Archive.cpp @@ -0,0 +1,105 @@ +//===- Archive.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Archive.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/MultiFormatConfig.h" +#include "llvm/ObjCopy/ObjCopy.h" +#include "llvm/Object/Error.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" + +namespace llvm { +namespace objcopy { + +using namespace llvm::object; + +Expected> +createNewArchiveMembers(const MultiFormatConfig &Config, const Archive &Ar) { + std::vector NewArchiveMembers; + Error Err = Error::success(); + for (const Archive::Child &Child : Ar.children(Err)) { + Expected ChildNameOrErr = Child.getName(); + if (!ChildNameOrErr) + return createFileError(Ar.getFileName(), ChildNameOrErr.takeError()); + + Expected> ChildOrErr = Child.getAsBinary(); + if (!ChildOrErr) + return createFileError(Ar.getFileName() + "(" + *ChildNameOrErr + ")", + ChildOrErr.takeError()); + + SmallVector Buffer; + raw_svector_ostream MemStream(Buffer); + + if (Error E = executeObjcopyOnBinary(Config, *ChildOrErr->get(), MemStream)) + return std::move(E); + + 
Expected Member = NewArchiveMember::getOldMember( + Child, Config.getCommonConfig().DeterministicArchives); + if (!Member) + return createFileError(Ar.getFileName(), Member.takeError()); + + Member->Buf = std::make_unique( + std::move(Buffer), ChildNameOrErr.get()); + Member->MemberName = Member->Buf->getBufferIdentifier(); + NewArchiveMembers.push_back(std::move(*Member)); + } + if (Err) + return createFileError(Config.getCommonConfig().InputFilename, + std::move(Err)); + return std::move(NewArchiveMembers); +} + +// For regular archives this function simply calls llvm::writeArchive, +// For thin archives it writes the archive file itself as well as its members. +static Error deepWriteArchive(StringRef ArcName, + ArrayRef NewMembers, + bool WriteSymtab, object::Archive::Kind Kind, + bool Deterministic, bool Thin) { + if (Error E = writeArchive(ArcName, NewMembers, WriteSymtab, Kind, + Deterministic, Thin)) + return createFileError(ArcName, std::move(E)); + + if (!Thin) + return Error::success(); + + for (const NewArchiveMember &Member : NewMembers) { + // For regular files (as is the case for deepWriteArchive), + // FileOutputBuffer::create will return OnDiskBuffer. + // OnDiskBuffer uses a temporary file and then renames it. So in reality + // there is no inefficiency / duplicated in-memory buffers in this case. For + // now in-memory buffers can not be completely avoided since + // NewArchiveMember still requires them even though writeArchive does not + // write them on disk. 
+ Expected> FB = + FileOutputBuffer::create(Member.MemberName, Member.Buf->getBufferSize(), + FileOutputBuffer::F_executable); + if (!FB) + return FB.takeError(); + std::copy(Member.Buf->getBufferStart(), Member.Buf->getBufferEnd(), + (*FB)->getBufferStart()); + if (Error E = (*FB)->commit()) + return E; + } + return Error::success(); +} + +Error executeObjcopyOnArchive(const MultiFormatConfig &Config, + const object::Archive &Ar) { + Expected> NewArchiveMembersOrErr = + createNewArchiveMembers(Config, Ar); + if (!NewArchiveMembersOrErr) + return NewArchiveMembersOrErr.takeError(); + const CommonConfig &CommonConfig = Config.getCommonConfig(); + return deepWriteArchive(CommonConfig.OutputFilename, *NewArchiveMembersOrErr, + Ar.hasSymbolTable(), Ar.kind(), + CommonConfig.DeterministicArchives, Ar.isThin()); +} + +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.h b/llvm/lib/ObjCopy/Archive.h similarity index 61% rename from llvm/tools/llvm-objcopy/llvm-objcopy.h rename to llvm/lib/ObjCopy/Archive.h index 182c95dc64c8c..08aae563505ce 100644 --- a/llvm/tools/llvm-objcopy/llvm-objcopy.h +++ b/llvm/lib/ObjCopy/Archive.h @@ -1,4 +1,4 @@ -//===- llvm-objcopy.h -------------------------------------------*- C++ -*-===// +//===- Archive.h ------------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,24 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_OBJCOPY_H -#define LLVM_TOOLS_OBJCOPY_OBJCOPY_H +#ifndef LLVM_LIB_OBJCOPY_ARCHIVE_H +#define LLVM_LIB_OBJCOPY_ARCHIVE_H +#include "llvm/Object/ArchiveWriter.h" #include "llvm/Support/Error.h" -#include "llvm/Support/raw_ostream.h" +#include namespace llvm { - -struct NewArchiveMember; - -namespace object { - -class Archive; - -} // end namespace object - namespace objcopy { + class MultiFormatConfig; + +/// Applies the transformations described by \p Config to +/// each member in archive \p Ar. +/// \returns Vector of transformed archive members. Expected> createNewArchiveMembers(const MultiFormatConfig &Config, const object::Archive &Ar); @@ -31,4 +28,4 @@ createNewArchiveMembers(const MultiFormatConfig &Config, } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_OBJCOPY_H +#endif // LLVM_LIB_OBJCOPY_ARCHIVE_H diff --git a/llvm/lib/ObjCopy/CMakeLists.txt b/llvm/lib/ObjCopy/CMakeLists.txt new file mode 100644 index 0000000000000..c272d2637bdcc --- /dev/null +++ b/llvm/lib/ObjCopy/CMakeLists.txt @@ -0,0 +1,31 @@ +add_llvm_component_library(LLVMObjCopy + Archive.cpp + ObjCopy.cpp + ConfigManager.cpp + COFF/COFFObjcopy.cpp + COFF/Object.cpp + COFF/Reader.cpp + COFF/Writer.cpp + ELF/ELFObjcopy.cpp + ELF/Object.cpp + MachO/MachOObjcopy.cpp + MachO/MachOReader.cpp + MachO/MachOWriter.cpp + MachO/MachOLayoutBuilder.cpp + MachO/Object.cpp + wasm/Object.cpp + wasm/Reader.cpp + wasm/Writer.cpp + wasm/WasmObjcopy.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Object + + DEPENDS + intrinsics_gen + + LINK_COMPONENTS + Object + Support + MC + ) diff --git a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp similarity index 98% rename from llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp rename to llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp index e93d2775665dd..31801231e46be 100644 
--- a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp +++ b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "COFFObjcopy.h" -#include "COFFConfig.h" -#include "CommonConfig.h" +#include "llvm/ObjCopy/COFF/COFFObjcopy.h" #include "Object.h" #include "Reader.h" #include "Writer.h" +#include "llvm/ObjCopy/COFF/COFFConfig.h" +#include "llvm/ObjCopy/CommonConfig.h" #include "llvm/Object/Binary.h" #include "llvm/Object/COFF.h" diff --git a/llvm/tools/llvm-objcopy/COFF/Object.cpp b/llvm/lib/ObjCopy/COFF/Object.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/COFF/Object.cpp rename to llvm/lib/ObjCopy/COFF/Object.cpp diff --git a/llvm/tools/llvm-objcopy/COFF/Object.h b/llvm/lib/ObjCopy/COFF/Object.h similarity index 98% rename from llvm/tools/llvm-objcopy/COFF/Object.h rename to llvm/lib/ObjCopy/COFF/Object.h index 4a478cda8c151..2f4d8af41fdee 100644 --- a/llvm/tools/llvm-objcopy/COFF/Object.h +++ b/llvm/lib/ObjCopy/COFF/Object.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H -#define LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H +#ifndef LLVM_LIB_OBJCOPY_COFF_OBJECT_H +#define LLVM_LIB_OBJCOPY_COFF_OBJECT_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -209,4 +209,4 @@ void copyPeHeader(PeHeader1Ty &Dest, const PeHeader2Ty &Src) { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_COFF_OBJECT_H +#endif // LLVM_LIB_OBJCOPY_COFF_OBJECT_H diff --git a/llvm/tools/llvm-objcopy/COFF/Reader.cpp b/llvm/lib/ObjCopy/COFF/Reader.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/COFF/Reader.cpp rename to llvm/lib/ObjCopy/COFF/Reader.cpp diff --git a/llvm/tools/llvm-objcopy/COFF/Reader.h b/llvm/lib/ObjCopy/COFF/Reader.h similarity index 89% rename from llvm/tools/llvm-objcopy/COFF/Reader.h rename to llvm/lib/ObjCopy/COFF/Reader.h 
index 48c050b6ea115..9e4d5124829c7 100644 --- a/llvm/tools/llvm-objcopy/COFF/Reader.h +++ b/llvm/lib/ObjCopy/COFF/Reader.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_COFF_READER_H -#define LLVM_TOOLS_OBJCOPY_COFF_READER_H +#ifndef LLVM_LIB_OBJCOPY_COFF_READER_H +#define LLVM_LIB_OBJCOPY_COFF_READER_H #include "llvm/BinaryFormat/COFF.h" #include "llvm/Object/COFF.h" @@ -38,4 +38,4 @@ class COFFReader { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_COFF_READER_H +#endif // LLVM_LIB_OBJCOPY_COFF_READER_H diff --git a/llvm/tools/llvm-objcopy/COFF/Writer.cpp b/llvm/lib/ObjCopy/COFF/Writer.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/COFF/Writer.cpp rename to llvm/lib/ObjCopy/COFF/Writer.cpp diff --git a/llvm/tools/llvm-objcopy/COFF/Writer.h b/llvm/lib/ObjCopy/COFF/Writer.h similarity index 92% rename from llvm/tools/llvm-objcopy/COFF/Writer.h rename to llvm/lib/ObjCopy/COFF/Writer.h index eed43b3e58146..5856c0f30b9f0 100644 --- a/llvm/tools/llvm-objcopy/COFF/Writer.h +++ b/llvm/lib/ObjCopy/COFF/Writer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_COFF_WRITER_H -#define LLVM_TOOLS_OBJCOPY_COFF_WRITER_H +#ifndef LLVM_LIB_OBJCOPY_COFF_WRITER_H +#define LLVM_LIB_OBJCOPY_COFF_WRITER_H #include "llvm/MC/StringTableBuilder.h" #include "llvm/Support/Error.h" @@ -60,4 +60,4 @@ class COFFWriter { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_COFF_WRITER_H +#endif // LLVM_LIB_OBJCOPY_COFF_WRITER_H diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp new file mode 100644 index 0000000000000..6f6e1bd1a74f8 --- /dev/null +++ b/llvm/lib/ObjCopy/ConfigManager.cpp @@ -0,0 +1,70 @@ +//===- ConfigManager.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the 
Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjCopy/ConfigManager.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" + +namespace llvm { +namespace objcopy { + +Expected ConfigManager::getCOFFConfig() const { + if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || + !Common.AllocSectionsPrefix.empty() || !Common.DumpSection.empty() || + !Common.KeepSection.empty() || !Common.SymbolsToGlobalize.empty() || + !Common.SymbolsToKeep.empty() || !Common.SymbolsToLocalize.empty() || + !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || + !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || + Common.ExtractDWO || Common.PreserveDates || Common.StripDWO || + Common.StripNonAlloc || Common.StripSections || Common.Weaken || + Common.DecompressDebugSections || + Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty()) + return createStringError(llvm::errc::invalid_argument, + "option is not supported for COFF"); + + return COFF; +} + +Expected ConfigManager::getMachOConfig() const { + if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || + !Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() || + !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() || + !Common.SymbolsToLocalize.empty() || !Common.SymbolsToWeaken.empty() || + !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || + !Common.UnneededSymbolsToRemove.empty() || + !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() || + Common.ExtractDWO || Common.PreserveDates || Common.StripAllGNU || + Common.StripDWO || Common.StripNonAlloc || Common.StripSections || + Common.Weaken || Common.DecompressDebugSections || Common.StripUnneeded || + 
Common.DiscardMode == DiscardType::Locals || !Common.SymbolsToAdd.empty()) + return createStringError(llvm::errc::invalid_argument, + "option is not supported for MachO"); + + return MachO; +} + +Expected ConfigManager::getWasmConfig() const { + if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition || + !Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || + !Common.AllocSectionsPrefix.empty() || + Common.DiscardMode != DiscardType::None || !Common.SymbolsToAdd.empty() || + !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToLocalize.empty() || + !Common.SymbolsToKeep.empty() || !Common.SymbolsToRemove.empty() || + !Common.UnneededSymbolsToRemove.empty() || + !Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || + !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || + !Common.SetSectionFlags.empty() || !Common.SymbolsToRename.empty()) + return createStringError(llvm::errc::invalid_argument, + "only flags for section dumping, removal, and " + "addition are supported"); + + return Wasm; +} + +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp similarity index 99% rename from llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp rename to llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp index f8521fa0d5b70..9e41a04919522 100644 --- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp @@ -6,11 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "ELFObjcopy.h" -#include "CommonConfig.h" -#include "ELFConfig.h" +#include "llvm/ObjCopy/ELF/ELFObjcopy.h" #include "Object.h" -#include "llvm-objcopy.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" @@ -20,6 +17,8 @@ #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCTargetOptions.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include 
"llvm/ObjCopy/ELF/ELFConfig.h" #include "llvm/Object/Binary.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ELFTypes.h" @@ -169,7 +168,7 @@ static std::unique_ptr createWriter(const CommonConfig &Config, template static Error makeStringError(std::error_code EC, const Twine &Msg, - Ts &&... Args) { + Ts &&...Args) { std::string FullMsg = (EC.message() + ": " + Msg).str(); return createStringError(EC, FullMsg.c_str(), std::forward(Args)...); } diff --git a/llvm/tools/llvm-objcopy/ELF/Object.cpp b/llvm/lib/ObjCopy/ELF/Object.cpp similarity index 99% rename from llvm/tools/llvm-objcopy/ELF/Object.cpp rename to llvm/lib/ObjCopy/ELF/Object.cpp index 3b4152a74a4af..be255470ebc8e 100644 --- a/llvm/tools/llvm-objcopy/ELF/Object.cpp +++ b/llvm/lib/ObjCopy/ELF/Object.cpp @@ -1813,9 +1813,9 @@ template Error ELFBuilder::readSectionHeaders() { Sec->EntrySize = Shdr.sh_entsize; Sec->Index = Index++; Sec->OriginalIndex = Sec->Index; - Sec->OriginalData = - ArrayRef(ElfFile.base() + Shdr.sh_offset, - (Shdr.sh_type == SHT_NOBITS) ? (size_t)0 : Shdr.sh_size); + Sec->OriginalData = ArrayRef( + ElfFile.base() + Shdr.sh_offset, + (Shdr.sh_type == SHT_NOBITS) ? 
(size_t)0 : Shdr.sh_size); } return Error::success(); diff --git a/llvm/tools/llvm-objcopy/ELF/Object.h b/llvm/lib/ObjCopy/ELF/Object.h similarity index 99% rename from llvm/tools/llvm-objcopy/ELF/Object.h rename to llvm/lib/ObjCopy/ELF/Object.h index 681ab8f56381c..b14f7f2c72384 100644 --- a/llvm/tools/llvm-objcopy/ELF/Object.h +++ b/llvm/lib/ObjCopy/ELF/Object.h @@ -6,15 +6,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_OBJCOPY_OBJECT_H -#define LLVM_TOOLS_OBJCOPY_OBJECT_H +#ifndef LLVM_LIB_OBJCOPY_ELF_OBJECT_H +#define LLVM_LIB_OBJCOPY_ELF_OBJECT_H -#include "CommonConfig.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/StringTableBuilder.h" +#include "llvm/ObjCopy/CommonConfig.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" @@ -998,7 +998,7 @@ class IHexReader : public Reader { std::move(E)); } template - Error parseError(size_t LineNo, char const *Fmt, const Ts &... Vals) const { + Error parseError(size_t LineNo, char const *Fmt, const Ts &...Vals) const { Error E = createStringError(errc::invalid_argument, Fmt, Vals...); return parseError(LineNo, std::move(E)); } @@ -1088,7 +1088,7 @@ class Object { std::function ToRemove); Error replaceSections(const DenseMap &FromTo); Error removeSymbols(function_ref ToRemove); - template T &addSection(Ts &&... 
Args) { + template T &addSection(Ts &&...Args) { auto Sec = std::make_unique(std::forward(Args)...); auto Ptr = Sec.get(); MustBeRelocatable |= isa(*Ptr); @@ -1110,4 +1110,4 @@ class Object { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_OBJCOPY_OBJECT_H +#endif // LLVM_LIB_OBJCOPY_ELF_OBJECT_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp rename to llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp diff --git a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h similarity index 94% rename from llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h rename to llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h index 44d03b4af7e83..709534306fda9 100644 --- a/llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h +++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H -#define LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H +#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H +#define LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H -#include "MachOObjcopy.h" #include "Object.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" namespace llvm { namespace objcopy { @@ -94,4 +94,4 @@ class MachOLayoutBuilder { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H +#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp similarity index 98% rename from llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp rename to llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp index 0f92ca516bef7..b8e21222aa3e1 100644 --- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp @@ -6,14 
+6,15 @@ // //===----------------------------------------------------------------------===// -#include "MachOObjcopy.h" -#include "../llvm-objcopy.h" -#include "CommonConfig.h" -#include "MachO/MachOConfig.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" +#include "Archive.h" #include "MachOReader.h" #include "MachOWriter.h" -#include "MultiFormatConfig.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/MachO/MachOConfig.h" +#include "llvm/ObjCopy/MultiFormatConfig.h" +#include "llvm/ObjCopy/ObjCopy.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Object/MachOUniversalWriter.h" diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/MachO/MachOReader.cpp rename to llvm/lib/ObjCopy/MachO/MachOReader.cpp diff --git a/llvm/tools/llvm-objcopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h similarity index 91% rename from llvm/tools/llvm-objcopy/MachO/MachOReader.h rename to llvm/lib/ObjCopy/MachO/MachOReader.h index b29e86ca642e4..fee2112845a5e 100644 --- a/llvm/tools/llvm-objcopy/MachO/MachOReader.h +++ b/llvm/lib/ObjCopy/MachO/MachOReader.h @@ -6,9 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "MachOObjcopy.h" +#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H +#define LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H + #include "Object.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" #include "llvm/Object/MachO.h" #include @@ -55,3 +58,5 @@ class MachOReader : public Reader { } // end namespace macho } // end namespace objcopy } // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H diff --git a/llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp rename to 
llvm/lib/ObjCopy/MachO/MachOWriter.cpp diff --git a/llvm/tools/llvm-objcopy/MachO/MachOWriter.h b/llvm/lib/ObjCopy/MachO/MachOWriter.h similarity index 91% rename from llvm/tools/llvm-objcopy/MachO/MachOWriter.h rename to llvm/lib/ObjCopy/MachO/MachOWriter.h index a172534dac8a3..2898df6c4bf0f 100644 --- a/llvm/tools/llvm-objcopy/MachO/MachOWriter.h +++ b/llvm/lib/ObjCopy/MachO/MachOWriter.h @@ -6,10 +6,13 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOWRITER_H +#define LLVM_LIB_OBJCOPY_MACHO_MACHOWRITER_H + #include "MachOLayoutBuilder.h" -#include "MachOObjcopy.h" #include "Object.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" #include "llvm/Object/MachO.h" namespace llvm { @@ -69,3 +72,5 @@ class MachOWriter { } // end namespace macho } // end namespace objcopy } // end namespace llvm + +#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOWRITER_H diff --git a/llvm/tools/llvm-objcopy/MachO/Object.cpp b/llvm/lib/ObjCopy/MachO/Object.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/MachO/Object.cpp rename to llvm/lib/ObjCopy/MachO/Object.cpp diff --git a/llvm/tools/llvm-objcopy/MachO/Object.h b/llvm/lib/ObjCopy/MachO/Object.h similarity index 99% rename from llvm/tools/llvm-objcopy/MachO/Object.h rename to llvm/lib/ObjCopy/MachO/Object.h index 13aaf42634b09..bb7f1fa81800f 100644 --- a/llvm/tools/llvm-objcopy/MachO/Object.h +++ b/llvm/lib/ObjCopy/MachO/Object.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_OBJCOPY_MACHO_OBJECT_H -#define LLVM_OBJCOPY_MACHO_OBJECT_H +#ifndef LLVM_LIB_OBJCOPY_MACHO_OBJECT_H +#define LLVM_LIB_OBJCOPY_MACHO_OBJECT_H #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringRef.h" @@ -371,4 +371,4 @@ struct Object { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_OBJCOPY_MACHO_OBJECT_H +#endif // LLVM_LIB_OBJCOPY_MACHO_OBJECT_H diff --git 
a/llvm/lib/ObjCopy/ObjCopy.cpp b/llvm/lib/ObjCopy/ObjCopy.cpp new file mode 100644 index 0000000000000..a62f476567f56 --- /dev/null +++ b/llvm/lib/ObjCopy/ObjCopy.cpp @@ -0,0 +1,79 @@ +//===- Objcopy.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjCopy/ObjCopy.h" +#include "llvm/ObjCopy/COFF/COFFConfig.h" +#include "llvm/ObjCopy/COFF/COFFObjcopy.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/ELF/ELFConfig.h" +#include "llvm/ObjCopy/ELF/ELFObjcopy.h" +#include "llvm/ObjCopy/MachO/MachOConfig.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" +#include "llvm/ObjCopy/MultiFormatConfig.h" +#include "llvm/ObjCopy/wasm/WasmConfig.h" +#include "llvm/ObjCopy/wasm/WasmObjcopy.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/Error.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" +#include "llvm/Object/Wasm.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" + +namespace llvm { +namespace objcopy { + +using namespace llvm::object; + +/// The function executeObjcopyOnBinary does the dispatch based on the format +/// of the input binary (ELF, MachO or COFF). 
+Error executeObjcopyOnBinary(const MultiFormatConfig &Config, + object::Binary &In, raw_ostream &Out) { + if (auto *ELFBinary = dyn_cast(&In)) { + Expected ELFConfig = Config.getELFConfig(); + if (!ELFConfig) + return ELFConfig.takeError(); + + return elf::executeObjcopyOnBinary(Config.getCommonConfig(), *ELFConfig, + *ELFBinary, Out); + } + if (auto *COFFBinary = dyn_cast(&In)) { + Expected COFFConfig = Config.getCOFFConfig(); + if (!COFFConfig) + return COFFConfig.takeError(); + + return coff::executeObjcopyOnBinary(Config.getCommonConfig(), *COFFConfig, + *COFFBinary, Out); + } + if (auto *MachOBinary = dyn_cast(&In)) { + Expected MachOConfig = Config.getMachOConfig(); + if (!MachOConfig) + return MachOConfig.takeError(); + + return macho::executeObjcopyOnBinary(Config.getCommonConfig(), *MachOConfig, + *MachOBinary, Out); + } + if (auto *MachOUniversalBinary = + dyn_cast(&In)) { + return macho::executeObjcopyOnMachOUniversalBinary( + Config, *MachOUniversalBinary, Out); + } + if (auto *WasmBinary = dyn_cast(&In)) { + Expected WasmConfig = Config.getWasmConfig(); + if (!WasmConfig) + return WasmConfig.takeError(); + + return objcopy::wasm::executeObjcopyOnBinary(Config.getCommonConfig(), + *WasmConfig, *WasmBinary, Out); + } + return createStringError(object_error::invalid_file_type, + "unsupported object file format"); +} + +} // end namespace objcopy +} // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/wasm/Object.cpp b/llvm/lib/ObjCopy/wasm/Object.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/wasm/Object.cpp rename to llvm/lib/ObjCopy/wasm/Object.cpp diff --git a/llvm/tools/llvm-objcopy/wasm/Object.h b/llvm/lib/ObjCopy/wasm/Object.h similarity index 90% rename from llvm/tools/llvm-objcopy/wasm/Object.h rename to llvm/lib/ObjCopy/wasm/Object.h index 9db91c41e2e26..e58fc8c454962 100644 --- a/llvm/tools/llvm-objcopy/wasm/Object.h +++ b/llvm/lib/ObjCopy/wasm/Object.h @@ -6,8 +6,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_OBJECT_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_OBJECT_H +#ifndef LLVM_LIB_OBJCOPY_WASM_OBJECT_H +#define LLVM_LIB_OBJCOPY_WASM_OBJECT_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" @@ -44,4 +44,4 @@ struct Object { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_OBJECT_H +#endif // LLVM_LIB_OBJCOPY_WASM_OBJECT_H diff --git a/llvm/tools/llvm-objcopy/wasm/Reader.cpp b/llvm/lib/ObjCopy/wasm/Reader.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/wasm/Reader.cpp rename to llvm/lib/ObjCopy/wasm/Reader.cpp diff --git a/llvm/tools/llvm-objcopy/wasm/Reader.h b/llvm/lib/ObjCopy/wasm/Reader.h similarity index 83% rename from llvm/tools/llvm-objcopy/wasm/Reader.h rename to llvm/lib/ObjCopy/wasm/Reader.h index 2dcf7dde029a0..d8dd541894541 100644 --- a/llvm/tools/llvm-objcopy/wasm/Reader.h +++ b/llvm/lib/ObjCopy/wasm/Reader.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_READER_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_READER_H +#ifndef LLVM_LIB_OBJCOPY_WASM_READER_H +#define LLVM_LIB_OBJCOPY_WASM_READER_H #include "Object.h" @@ -28,4 +28,4 @@ class Reader { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_READER_H +#endif // LLVM_LIB_OBJCOPY_WASM_READER_H diff --git a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp similarity index 98% rename from llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp rename to llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp index 397d09757e54c..69b5e6fe8bee4 100644 --- a/llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp +++ b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "WasmObjcopy.h" -#include 
"CommonConfig.h" +#include "llvm/ObjCopy/wasm/WasmObjcopy.h" #include "Object.h" #include "Reader.h" #include "Writer.h" +#include "llvm/ObjCopy/CommonConfig.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" diff --git a/llvm/tools/llvm-objcopy/wasm/Writer.cpp b/llvm/lib/ObjCopy/wasm/Writer.cpp similarity index 100% rename from llvm/tools/llvm-objcopy/wasm/Writer.cpp rename to llvm/lib/ObjCopy/wasm/Writer.cpp diff --git a/llvm/tools/llvm-objcopy/wasm/Writer.h b/llvm/lib/ObjCopy/wasm/Writer.h similarity index 91% rename from llvm/tools/llvm-objcopy/wasm/Writer.h rename to llvm/lib/ObjCopy/wasm/Writer.h index 4404cd8caf843..332b96e892516 100644 --- a/llvm/tools/llvm-objcopy/wasm/Writer.h +++ b/llvm/lib/ObjCopy/wasm/Writer.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_WASM_WRITER_H -#define LLVM_TOOLS_LLVM_OBJCOPY_WASM_WRITER_H +#ifndef LLVM_LIB_OBJCOPY_WASM_WRITER_H +#define LLVM_LIB_OBJCOPY_WASM_WRITER_H #include "Object.h" #include @@ -46,4 +46,4 @@ class Writer { } // end namespace objcopy } // end namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_WASM_WRITER_H +#endif // LLVM_LIB_OBJCOPY_WASM_WRITER_H diff --git a/llvm/tools/llvm-objcopy/CMakeLists.txt b/llvm/tools/llvm-objcopy/CMakeLists.txt index d14d2135f5db7..99e884a8cf0fa 100644 --- a/llvm/tools/llvm-objcopy/CMakeLists.txt +++ b/llvm/tools/llvm-objcopy/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_LINK_COMPONENTS Object + ObjCopy Option Support MC @@ -22,23 +23,8 @@ tablegen(LLVM StripOpts.inc -gen-opt-parser-defs) add_public_tablegen_target(StripOptsTableGen) add_llvm_tool(llvm-objcopy - ConfigManager.cpp + ObjcopyOptions.cpp llvm-objcopy.cpp - COFF/COFFObjcopy.cpp - COFF/Object.cpp - COFF/Reader.cpp - COFF/Writer.cpp - ELF/ELFObjcopy.cpp - ELF/Object.cpp - MachO/MachOObjcopy.cpp - MachO/MachOReader.cpp - MachO/MachOWriter.cpp - MachO/MachOLayoutBuilder.cpp - MachO/Object.cpp - wasm/Object.cpp - 
wasm/Reader.cpp - wasm/Writer.cpp - wasm/WasmObjcopy.cpp DEPENDS ObjcopyOptsTableGen InstallNameToolOptsTableGen diff --git a/llvm/tools/llvm-objcopy/ConfigManager.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp similarity index 93% rename from llvm/tools/llvm-objcopy/ConfigManager.cpp rename to llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index 90730c421a46d..65bbd033d3c28 100644 --- a/llvm/tools/llvm-objcopy/ConfigManager.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -1,4 +1,4 @@ -//===- ConfigManager.cpp --------------------------------------------------===// +//===- ObjcopyOptions.cpp -------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// -#include "ConfigManager.h" +#include "ObjcopyOptions.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/ObjCopy/ConfigManager.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/CRC.h" @@ -20,8 +21,6 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/StringSaver.h" -#include using namespace llvm; using namespace llvm::objcopy; @@ -559,68 +558,6 @@ static Expected parseNewSymbolInfo(StringRef FlagValue) { return SI; } -Expected ConfigManager::getELFConfig() const { - return ELF; -} - -Expected ConfigManager::getCOFFConfig() const { - if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || - !Common.AllocSectionsPrefix.empty() || !Common.DumpSection.empty() || - !Common.KeepSection.empty() || !Common.SymbolsToGlobalize.empty() || - !Common.SymbolsToKeep.empty() || !Common.SymbolsToLocalize.empty() || - 
!Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || - !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || - Common.ExtractDWO || Common.PreserveDates || Common.StripDWO || - Common.StripNonAlloc || Common.StripSections || Common.Weaken || - Common.DecompressDebugSections || - Common.DiscardMode == DiscardType::Locals || - !Common.SymbolsToAdd.empty()) { - return createStringError(llvm::errc::invalid_argument, - "option not supported by llvm-objcopy for COFF"); - } - - return COFF; -} - -Expected ConfigManager::getMachOConfig() const { - if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || - !Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() || - !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() || - !Common.SymbolsToLocalize.empty() || !Common.SymbolsToWeaken.empty() || - !Common.SymbolsToKeepGlobal.empty() || !Common.SectionsToRename.empty() || - !Common.UnneededSymbolsToRemove.empty() || - !Common.SetSectionAlignment.empty() || !Common.SetSectionFlags.empty() || - Common.ExtractDWO || Common.PreserveDates || Common.StripAllGNU || - Common.StripDWO || Common.StripNonAlloc || Common.StripSections || - Common.Weaken || Common.DecompressDebugSections || Common.StripUnneeded || - Common.DiscardMode == DiscardType::Locals || - !Common.SymbolsToAdd.empty()) { - return createStringError(llvm::errc::invalid_argument, - "option not supported by llvm-objcopy for MachO"); - } - - return MachO; -} - -Expected ConfigManager::getWasmConfig() const { - if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition || - !Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() || - !Common.AllocSectionsPrefix.empty() || - Common.DiscardMode != DiscardType::None || !Common.SymbolsToAdd.empty() || - !Common.SymbolsToGlobalize.empty() || !Common.SymbolsToLocalize.empty() || - !Common.SymbolsToKeep.empty() || !Common.SymbolsToRemove.empty() || - !Common.UnneededSymbolsToRemove.empty() || - 
!Common.SymbolsToWeaken.empty() || !Common.SymbolsToKeepGlobal.empty() || - !Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() || - !Common.SetSectionFlags.empty() || !Common.SymbolsToRename.empty()) { - return createStringError( - llvm::errc::invalid_argument, - "only flags for section dumping, removal, and addition are supported"); - } - - return Wasm; -} - // ParseObjcopyOptions returns the config and sets the input arguments. If a // help flag is set then ParseObjcopyOptions will print the help messege and // exit. @@ -695,11 +632,10 @@ objcopy::parseObjcopyOptions(ArrayRef RawArgsArr, MatchStyle SectionMatchStyle = InputArgs.hasArg(OBJCOPY_regex) ? MatchStyle::Regex : MatchStyle::Wildcard; - MatchStyle SymbolMatchStyle = InputArgs.hasArg(OBJCOPY_regex) - ? MatchStyle::Regex - : InputArgs.hasArg(OBJCOPY_wildcard) - ? MatchStyle::Wildcard - : MatchStyle::Literal; + MatchStyle SymbolMatchStyle + = InputArgs.hasArg(OBJCOPY_regex) ? MatchStyle::Regex + : InputArgs.hasArg(OBJCOPY_wildcard) ? MatchStyle::Wildcard + : MatchStyle::Literal; StringRef InputFormat, OutputFormat; if (InputArgs.hasArg(OBJCOPY_target)) { InputFormat = InputArgs.getLastArgValue(OBJCOPY_target); @@ -1337,11 +1273,10 @@ objcopy::parseStripOptions(ArrayRef RawArgsArr, "--regex and --wildcard are incompatible"); MatchStyle SectionMatchStyle = InputArgs.hasArg(STRIP_regex) ? MatchStyle::Regex : MatchStyle::Wildcard; - MatchStyle SymbolMatchStyle = InputArgs.hasArg(STRIP_regex) - ? MatchStyle::Regex - : InputArgs.hasArg(STRIP_wildcard) - ? MatchStyle::Wildcard - : MatchStyle::Literal; + MatchStyle SymbolMatchStyle + = InputArgs.hasArg(STRIP_regex) ? MatchStyle::Regex + : InputArgs.hasArg(STRIP_wildcard) ? 
MatchStyle::Wildcard + : MatchStyle::Literal; ELFConfig.AllowBrokenLinks = InputArgs.hasArg(STRIP_allow_broken_links); Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug); diff --git a/llvm/tools/llvm-objcopy/ConfigManager.h b/llvm/tools/llvm-objcopy/ObjcopyOptions.h similarity index 68% rename from llvm/tools/llvm-objcopy/ConfigManager.h rename to llvm/tools/llvm-objcopy/ObjcopyOptions.h index c0d0e8bbc7219..d5c1fad10b7ab 100644 --- a/llvm/tools/llvm-objcopy/ConfigManager.h +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.h @@ -1,4 +1,4 @@ -//===- ConfigManager.h ----------------------------------------------------===// +//===- ObjcopyOptions.h ---------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,40 +6,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_LLVM_OBJCOPY_CONFIGMANAGER_H -#define LLVM_TOOLS_LLVM_OBJCOPY_CONFIGMANAGER_H +#ifndef LLVM_TOOLS_LLVM_OBJCOPY_OBJCOPYOPTIONS_H +#define LLVM_TOOLS_LLVM_OBJCOPY_OBJCOPYOPTIONS_H -#include "COFF/COFFConfig.h" -#include "CommonConfig.h" -#include "ELF/ELFConfig.h" -#include "MachO/MachOConfig.h" -#include "MultiFormatConfig.h" -#include "wasm/WasmConfig.h" +#include "llvm/ObjCopy/ConfigManager.h" #include "llvm/Support/Allocator.h" #include namespace llvm { namespace objcopy { -// ConfigManager keeps all configurations and prepare -// format-specific options. -struct ConfigManager : public MultiFormatConfig { - virtual ~ConfigManager() {} - - const CommonConfig &getCommonConfig() const override { return Common; } - Expected getELFConfig() const override; - Expected getCOFFConfig() const override; - Expected getMachOConfig() const override; - Expected getWasmConfig() const override; - - // All configs. 
- CommonConfig Common; - ELFConfig ELF; - COFFConfig COFF; - MachOConfig MachO; - WasmConfig Wasm; -}; - // Configuration for the overall invocation of this tool. When invoked as // objcopy, will always contain exactly one CopyConfig. When invoked as strip, // will contain one or more CopyConfigs. @@ -77,4 +53,4 @@ parseStripOptions(ArrayRef ArgsArr, } // namespace objcopy } // namespace llvm -#endif // LLVM_TOOLS_LLVM_OBJCOPY_CONFIGMANAGER_H +#endif // LLVM_TOOLS_LLVM_OBJCOPY_OBJCOPYOPTIONS_H diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp index a5963985f78ab..26484d5d8e5b1 100644 --- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp @@ -6,23 +6,22 @@ // //===----------------------------------------------------------------------===// -#include "llvm-objcopy.h" -#include "COFF/COFFConfig.h" -#include "COFF/COFFObjcopy.h" -#include "CommonConfig.h" -#include "ConfigManager.h" -#include "ELF/ELFConfig.h" -#include "ELF/ELFObjcopy.h" -#include "MachO/MachOConfig.h" -#include "MachO/MachOObjcopy.h" -#include "wasm/WasmConfig.h" -#include "wasm/WasmObjcopy.h" - +#include "ObjcopyOptions.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/ObjCopy/COFF/COFFConfig.h" +#include "llvm/ObjCopy/COFF/COFFObjcopy.h" +#include "llvm/ObjCopy/CommonConfig.h" +#include "llvm/ObjCopy/ELF/ELFConfig.h" +#include "llvm/ObjCopy/ELF/ELFObjcopy.h" +#include "llvm/ObjCopy/MachO/MachOConfig.h" +#include "llvm/ObjCopy/MachO/MachOObjcopy.h" +#include "llvm/ObjCopy/ObjCopy.h" +#include "llvm/ObjCopy/wasm/WasmConfig.h" +#include "llvm/ObjCopy/wasm/WasmObjcopy.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/Binary.h" @@ -96,40 +95,6 @@ static Expected getDriverConfig(ArrayRef Args) { return parseObjcopyOptions(Args, reportWarning); } 
-// For regular archives this function simply calls llvm::writeArchive, -// For thin archives it writes the archive file itself as well as its members. -static Error deepWriteArchive(StringRef ArcName, - ArrayRef NewMembers, - bool WriteSymtab, object::Archive::Kind Kind, - bool Deterministic, bool Thin) { - if (Error E = writeArchive(ArcName, NewMembers, WriteSymtab, Kind, - Deterministic, Thin)) - return createFileError(ArcName, std::move(E)); - - if (!Thin) - return Error::success(); - - for (const NewArchiveMember &Member : NewMembers) { - // For regular files (as is the case for deepWriteArchive), - // FileOutputBuffer::create will return OnDiskBuffer. - // OnDiskBuffer uses a temporary file and then renames it. So in reality - // there is no inefficiency / duplicated in-memory buffers in this case. For - // now in-memory buffers can not be completely avoided since - // NewArchiveMember still requires them even though writeArchive does not - // write them on disk. - Expected> FB = - FileOutputBuffer::create(Member.MemberName, Member.Buf->getBufferSize(), - FileOutputBuffer::F_executable); - if (!FB) - return FB.takeError(); - std::copy(Member.Buf->getBufferStart(), Member.Buf->getBufferEnd(), - (*FB)->getBufferStart()); - if (Error E = (*FB)->commit()) - return E; - } - return Error::success(); -} - /// The function executeObjcopyOnIHex does the dispatch based on the format /// of the output specified by the command line options. static Error executeObjcopyOnIHex(ConfigManager &ConfigMgr, MemoryBuffer &In, @@ -166,102 +131,6 @@ static Error executeObjcopyOnRawBinary(ConfigManager &ConfigMgr, llvm_unreachable("unsupported output format"); } -/// The function executeObjcopyOnBinary does the dispatch based on the format -/// of the input binary (ELF, MachO or COFF). 
-static Error executeObjcopyOnBinary(const MultiFormatConfig &Config, - object::Binary &In, raw_ostream &Out) { - if (auto *ELFBinary = dyn_cast(&In)) { - Expected ELFConfig = Config.getELFConfig(); - if (!ELFConfig) - return ELFConfig.takeError(); - - return elf::executeObjcopyOnBinary(Config.getCommonConfig(), *ELFConfig, - *ELFBinary, Out); - } else if (auto *COFFBinary = dyn_cast(&In)) { - Expected COFFConfig = Config.getCOFFConfig(); - if (!COFFConfig) - return COFFConfig.takeError(); - - return coff::executeObjcopyOnBinary(Config.getCommonConfig(), *COFFConfig, - *COFFBinary, Out); - } else if (auto *MachOBinary = dyn_cast(&In)) { - Expected MachOConfig = Config.getMachOConfig(); - if (!MachOConfig) - return MachOConfig.takeError(); - - return macho::executeObjcopyOnBinary(Config.getCommonConfig(), *MachOConfig, - *MachOBinary, Out); - } else if (auto *MachOUniversalBinary = - dyn_cast(&In)) { - return macho::executeObjcopyOnMachOUniversalBinary( - Config, *MachOUniversalBinary, Out); - } else if (auto *WasmBinary = dyn_cast(&In)) { - Expected WasmConfig = Config.getWasmConfig(); - if (!WasmConfig) - return WasmConfig.takeError(); - - return objcopy::wasm::executeObjcopyOnBinary(Config.getCommonConfig(), - *WasmConfig, *WasmBinary, Out); - } else - return createStringError(object_error::invalid_file_type, - "unsupported object file format"); -} - -namespace llvm { -namespace objcopy { - -Expected> -createNewArchiveMembers(const MultiFormatConfig &Config, const Archive &Ar) { - std::vector NewArchiveMembers; - Error Err = Error::success(); - for (const Archive::Child &Child : Ar.children(Err)) { - Expected ChildNameOrErr = Child.getName(); - if (!ChildNameOrErr) - return createFileError(Ar.getFileName(), ChildNameOrErr.takeError()); - - Expected> ChildOrErr = Child.getAsBinary(); - if (!ChildOrErr) - return createFileError(Ar.getFileName() + "(" + *ChildNameOrErr + ")", - ChildOrErr.takeError()); - - SmallVector Buffer; - raw_svector_ostream MemStream(Buffer); 
- - if (Error E = executeObjcopyOnBinary(Config, *ChildOrErr->get(), MemStream)) - return std::move(E); - - Expected Member = NewArchiveMember::getOldMember( - Child, Config.getCommonConfig().DeterministicArchives); - if (!Member) - return createFileError(Ar.getFileName(), Member.takeError()); - - Member->Buf = std::make_unique( - std::move(Buffer), ChildNameOrErr.get(), - /*RequiresNullTerminator=*/false); - Member->MemberName = Member->Buf->getBufferIdentifier(); - NewArchiveMembers.push_back(std::move(*Member)); - } - if (Err) - return createFileError(Config.getCommonConfig().InputFilename, - std::move(Err)); - return std::move(NewArchiveMembers); -} - -} // end namespace objcopy -} // end namespace llvm - -static Error executeObjcopyOnArchive(const ConfigManager &ConfigMgr, - const object::Archive &Ar) { - Expected> NewArchiveMembersOrErr = - createNewArchiveMembers(ConfigMgr, Ar); - if (!NewArchiveMembersOrErr) - return NewArchiveMembersOrErr.takeError(); - const CommonConfig &Config = ConfigMgr.getCommonConfig(); - return deepWriteArchive(Config.OutputFilename, *NewArchiveMembersOrErr, - Ar.hasSymbolTable(), Ar.kind(), - Config.DeterministicArchives, Ar.isThin()); -} - static Error restoreStatOnFile(StringRef Filename, const sys::fs::file_status &Stat, const ConfigManager &ConfigMgr) { diff --git a/llvm/unittests/ObjCopy/CMakeLists.txt b/llvm/unittests/ObjCopy/CMakeLists.txt new file mode 100644 index 0000000000000..b44fd832e437f --- /dev/null +++ b/llvm/unittests/ObjCopy/CMakeLists.txt @@ -0,0 +1,11 @@ +set(LLVM_LINK_COMPONENTS + Object + ObjCopy + ObjectYAML + ) + +add_llvm_unittest(ObjCopyTests + ObjCopyTest.cpp + ) + +target_link_libraries(ObjCopyTests PRIVATE LLVMTestingSupport) diff --git a/llvm/unittests/ObjCopy/ObjCopyTest.cpp b/llvm/unittests/ObjCopy/ObjCopyTest.cpp new file mode 100644 index 0000000000000..8d208be988a71 --- /dev/null +++ b/llvm/unittests/ObjCopy/ObjCopyTest.cpp @@ -0,0 +1,118 @@ +//===- ObjCopyTest.cpp 
----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjCopy/ObjCopy.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ObjCopy/ConfigManager.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/ObjectYAML/yaml2obj.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace object; +using namespace objcopy; +using namespace yaml; + +void copySimpleInMemoryFileImpl( + const char *YamlCreationString, + std::function IsValidFormat) { + auto ErrHandler = [&](const Twine &Msg) { FAIL() << "Error: " << Msg; }; + + // Create Object file from YAML description. + SmallVector Storage; + std::unique_ptr Obj = + yaml2ObjectFile(Storage, YamlCreationString, ErrHandler); + ASSERT_TRUE(Obj); + ASSERT_TRUE(IsValidFormat(*Obj)); + + ConfigManager Config; + Config.Common.OutputFilename = "a.out"; + + // Call executeObjcopyOnBinary() + SmallVector DataVector; + raw_svector_ostream OutStream(DataVector); + Error Err = objcopy::executeObjcopyOnBinary(Config, *Obj.get(), OutStream); + ASSERT_FALSE(std::move(Err)); + + MemoryBufferRef Buffer(StringRef(DataVector.data(), DataVector.size()), + Config.Common.OutputFilename); + + // Check copied file. 
+ Expected> Result = createBinary(Buffer); + ASSERT_THAT_EXPECTED(Result, Succeeded()); + ASSERT_TRUE(IsValidFormat(**Result)); +} + +TEST(CopySimpleInMemoryFile, COFF) { + SCOPED_TRACE("CopySimpleInMemoryFileCOFF"); + + copySimpleInMemoryFileImpl( + R"( +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: .text + Characteristics: [ ] + Alignment: 4 + SectionData: E800000000C3C3C3 +symbols: +... +)", + [](const Binary &File) { return File.isCOFF(); }); +} + +TEST(CopySimpleInMemoryFile, ELF) { + SCOPED_TRACE("CopySimpleInMemoryFileELF"); + + copySimpleInMemoryFileImpl( + R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL)", + [](const Binary &File) { return File.isELF(); }); +} + +TEST(CopySimpleInMemoryFile, MachO) { + SCOPED_TRACE("CopySimpleInMemoryFileMachO"); + + copySimpleInMemoryFileImpl( + R"( +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x80000003 + filetype: 0x00000002 + ncmds: 0 + sizeofcmds: 0 + flags: 0x00218085 + reserved: 0x00000000 +... +)", + [](const Binary &File) { return File.isMachO(); }); +} + +TEST(CopySimpleInMemoryFile, Wasm) { + SCOPED_TRACE("CopySimpleInMemoryFileWasm"); + + copySimpleInMemoryFileImpl( + R"( +--- !WASM +FileHeader: + Version: 0x00000001 +... +)", + [](const Binary &File) { return File.isWasm(); }); +} From d20e01bb06fb81a7584c4ae7c7daf5aec2ed0a1a Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 17 Feb 2022 02:16:25 -0800 Subject: [PATCH 061/748] Revert "[NFC][compiler-rt] Format file lib/builtins/arm/sync-ops.h" This reverts commit f165c23bf3598990aaf2174a6bc40be75199ee1a. Part of revert sequence for 910a642c0a. 
--- compiler-rt/lib/builtins/arm/sync-ops.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/builtins/arm/sync-ops.h b/compiler-rt/lib/builtins/arm/sync-ops.h index 9f7f23f98f085..b924b33f80eb3 100644 --- a/compiler-rt/lib/builtins/arm/sync-ops.h +++ b/compiler-rt/lib/builtins/arm/sync-ops.h @@ -36,7 +36,7 @@ .thumb; \ .syntax unified; \ DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - push{r4, r5, r6, lr}; \ + push {r4, r5, r6, lr}; \ dmb; \ mov r12, r0; \ LOCAL_LABEL(tryatomic_##op) : ldrexd r0, r1, [r12]; \ @@ -52,7 +52,8 @@ #define SYNC_OP_4(op) \ .p2align 2; \ DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - LOCAL_LABEL(tryatomic_##op) : mov r12, r0; \ + LOCAL_LABEL(tryatomic_##op) : \ + mov r12, r0; \ op(r2, r0, r1); \ str r2, [r12]; \ ldr r12, [r12]; \ @@ -63,8 +64,9 @@ #define SYNC_OP_8(op) \ .p2align 2; \ DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - push{r4, r5, r6, lr}; \ - LOCAL_LABEL(tryatomic_##op) : mov r12, r0; \ + push {r4, r5, r6, lr}; \ + LOCAL_LABEL(tryatomic_##op) : \ + mov r12, r0; \ op(r4, r5, r0, r1, r2, r3); \ stm r12, {r4, r5}; \ ldm r12, {r6, r12}; \ From 0389f2edf7c252e49323ae0ca6adfea9f2614f6d Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 17 Feb 2022 02:17:27 -0800 Subject: [PATCH 062/748] Revert "[compiler-rt] Implement ARM atomic operations for architectures without SMP support" This reverts commit 910a642c0a5b66a8d2517026b890a1acdc447f19. There are serious correctness issues with the current approach: __sync_* routines which are not actually atomic should not be enabled by default. I'll continue discussion on the review. 
--- .../cmake/Modules/CompilerRTUtils.cmake | 10 ------ compiler-rt/cmake/config-ix.cmake | 5 --- compiler-rt/lib/builtins/CMakeLists.txt | 1 - compiler-rt/lib/builtins/arm/sync-ops.h | 33 ------------------- 4 files changed, 49 deletions(-) diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake index 557fa96eea5eb..052095801aaed 100644 --- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake @@ -110,16 +110,6 @@ function(check_compile_definition def argstring out_var) cmake_pop_check_state() endfunction() -macro(test_arm_smp_support arch cflags_var) - if (${arch} STREQUAL "arm") - try_compile(HAS_${arch}_SMP ${CMAKE_BINARY_DIR} - ${ARM_SMP_CHECK_SRC} COMPILE_DEFINITIONS "${CMAKE_C_FLAGS} ${_TARGET_${arch}_CFLAGS}") - if (HAS_${arch}_SMP) - list(APPEND ${cflags_var} -DCOMPILER_RT_HAS_SMP_SUPPORT) - endif() - endif() -endmacro() - # test_target_arch( ) # Checks if architecture is supported: runs host compiler with provided # flags to verify that: diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index ccf57009fd43f..4299a0589a7b7 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -200,11 +200,6 @@ set(COMPILER_RT_SUPPORTED_ARCH) set(SIMPLE_SOURCE ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/simple.cc) file(WRITE ${SIMPLE_SOURCE} "#include \n#include \nint main() { printf(\"hello, world\"); }\n") -# Check if we have SMP support for particular ARM architecture -# If not use stubs instead of real atomic operations - see sync-ops.h -set(ARM_SMP_CHECK_SRC ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/arm-barrier.cc) -file(WRITE ${ARM_SMP_CHECK_SRC} "int main() { asm(\"dmb\"); return 0; }") - # Detect whether the current target platform is 32-bit or 64-bit, and setup # the correct commandline flags needed to attempt to target 32-bit and 64-bit. 
if (NOT CMAKE_SIZEOF_VOID_P EQUAL 4 AND diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 12268dc1ce63d..ea5ad9cdb8643 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -740,7 +740,6 @@ else () list(APPEND BUILTIN_CFLAGS_${arch} -fomit-frame-pointer -DCOMPILER_RT_ARMHF_TARGET) endif() - test_arm_smp_support(${arch} BUILTIN_CFLAGS_${arch}) # For RISCV32, we must force enable int128 for compiling long # double routines. if("${arch}" STREQUAL "riscv32") diff --git a/compiler-rt/lib/builtins/arm/sync-ops.h b/compiler-rt/lib/builtins/arm/sync-ops.h index b924b33f80eb3..c9623249e5d20 100644 --- a/compiler-rt/lib/builtins/arm/sync-ops.h +++ b/compiler-rt/lib/builtins/arm/sync-ops.h @@ -14,8 +14,6 @@ #include "../assembly.h" -#ifdef COMPILER_RT_HAS_SMP_SUPPORT - #define SYNC_OP_4(op) \ .p2align 2; \ .thumb; \ @@ -47,37 +45,6 @@ dmb; \ pop { r4, r5, r6, pc } -#else - -#define SYNC_OP_4(op) \ - .p2align 2; \ - DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - LOCAL_LABEL(tryatomic_##op) : \ - mov r12, r0; \ - op(r2, r0, r1); \ - str r2, [r12]; \ - ldr r12, [r12]; \ - cmp r12, r2; \ - bne LOCAL_LABEL(tryatomic_##op); \ - bx lr - -#define SYNC_OP_8(op) \ - .p2align 2; \ - DEFINE_COMPILERRT_THUMB_FUNCTION(__sync_fetch_and_##op) \ - push {r4, r5, r6, lr}; \ - LOCAL_LABEL(tryatomic_##op) : \ - mov r12, r0; \ - op(r4, r5, r0, r1, r2, r3); \ - stm r12, {r4, r5}; \ - ldm r12, {r6, r12}; \ - cmp r6, r4; \ - bne LOCAL_LABEL(tryatomic_##op); \ - cmp r12, r5; \ - bne LOCAL_LABEL(tryatomic_##op); \ - pop { r4, r5, r6, pc } - -#endif - #define MINMAX_4(rD, rN, rM, cmp_kind) \ cmp rN, rM; \ mov rD, rM; \ From 093ecccdab47640c5d94ace7ad440972f19f66a9 Mon Sep 17 00:00:00 2001 From: Zakk Chen Date: Fri, 28 Jan 2022 08:16:25 -0800 Subject: [PATCH 063/748] [RISCV] Add the passthru operand for vadc/vsbc/vmerge/vfmerge IR intrinsics. 
The goal is support tail and mask policy in RVV builtins. We focus on IR part first. If the passthru operand is undef, we use tail agnostic, otherwise use tail undisturbed. Reviewed By: rogfer01 Differential Revision: https://reviews.llvm.org/D119686 --- clang/include/clang/Basic/riscv_vector.td | 10 +- .../RISCV/rvv-intrinsics-overloaded/vadc.c | 176 ++++++------ .../RISCV/rvv-intrinsics-overloaded/vfmerge.c | 18 +- .../RISCV/rvv-intrinsics-overloaded/vmerge.c | 194 +++++++------- .../RISCV/rvv-intrinsics-overloaded/vsbc.c | 176 ++++++------ .../test/CodeGen/RISCV/rvv-intrinsics/vadc.c | 176 ++++++------ .../CodeGen/RISCV/rvv-intrinsics/vfmerge.c | 30 +-- .../CodeGen/RISCV/rvv-intrinsics/vmerge.c | 206 +++++++------- .../test/CodeGen/RISCV/rvv-intrinsics/vsbc.c | 176 ++++++------ llvm/include/llvm/IR/IntrinsicsRISCV.td | 8 +- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 113 +++++++- llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll | 251 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vadc-rv32.ll | 110 ++++++++ llvm/test/CodeGen/RISCV/rvv/vadc-rv64.ll | 110 ++++++++ llvm/test/CodeGen/RISCV/rvv/vfmerge.ll | 75 ++++++ llvm/test/CodeGen/RISCV/rvv/vmerge-rv32.ll | 140 ++++++++++ llvm/test/CodeGen/RISCV/rvv/vmerge-rv64.ll | 140 ++++++++++ llvm/test/CodeGen/RISCV/rvv/vsbc-rv32.ll | 88 ++++++ llvm/test/CodeGen/RISCV/rvv/vsbc-rv64.ll | 88 ++++++ 19 files changed, 1691 insertions(+), 594 deletions(-) diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index cb00f9436f5d2..efc074aba246a 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -1633,10 +1633,12 @@ let Log2LMUL = [-3, -2, -1, 0] in { // 12.4. 
Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions let HasMask = false, HasPolicy = false in { - defm vadc : RVVCarryinBuiltinSet; + let HasNoMaskPassThru = true in { + defm vadc : RVVCarryinBuiltinSet; + defm vsbc : RVVCarryinBuiltinSet; + } defm vmadc : RVVCarryOutInBuiltinSet<"vmadc_carry_in">; defm vmadc : RVVIntMaskOutBuiltinSet; - defm vsbc : RVVCarryinBuiltinSet; defm vmsbc : RVVCarryOutInBuiltinSet<"vmsbc_borrow_in">; defm vmsbc : RVVIntMaskOutBuiltinSet; } @@ -1742,6 +1744,8 @@ let HasMask = false, HasPolicy = false, ManualCodegen = [{ std::rotate(Ops.begin(), Ops.begin() + 1, Ops.begin() + 3); IntrinsicTypes = {ResultType, Ops[1]->getType(), Ops[3]->getType()}; + // insert undef passthru + Ops.insert(Ops.begin(), llvm::UndefValue::get(ResultType)); }] in { defm vmerge : RVVOutOp1BuiltinSet<"vmerge", "csil", [["vvm", "v", "vmvv"], @@ -1876,6 +1880,8 @@ let HasMask = false, HasPolicy = false, ManualCodegen = [{ std::rotate(Ops.begin(), Ops.begin() + 1, Ops.begin() + 3); IntrinsicTypes = {ResultType, Ops[1]->getType(), Ops[3]->getType()}; + // insert undef passthru + Ops.insert(Ops.begin(), llvm::UndefValue::get(ResultType)); }] in { defm vmerge : RVVOutOp1BuiltinSet<"vmerge", "xfd", [["vvm", "v", "vmvv"]]>; diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vadc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vadc.c index db3ea976f4803..224915dc12b60 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vadc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vadc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vadc_vvm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vadc_vvm_i8mf8(vint8mf8_t op1, 
vint8mf8_t op2, @@ -16,7 +16,7 @@ vint8mf8_t test_vadc_vvm_i8mf8(vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vadc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t carryin, @@ -26,7 +26,7 @@ vint8mf8_t test_vadc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vadc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, @@ -36,7 +36,7 @@ vint8mf4_t test_vadc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vadc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t carryin, @@ -46,7 +46,7 @@ vint8mf4_t test_vadc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vadc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, @@ -56,7 +56,7 @@ vint8mf2_t test_vadc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vadc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t carryin, @@ -66,7 +66,7 @@ vint8mf2_t test_vadc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vadc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t carryin, @@ -76,7 +76,7 @@ vint8m1_t test_vadc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vadc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t carryin, @@ -86,7 +86,7 @@ vint8m1_t test_vadc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m2( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vadc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t carryin, @@ -96,7 +96,7 @@ vint8m2_t test_vadc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vadc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t carryin, @@ -106,7 +106,7 @@ vint8m2_t test_vadc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vadc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t carryin, @@ -116,7 +116,7 @@ vint8m4_t test_vadc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: 
ret [[TMP0]] // vint8m4_t test_vadc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t carryin, @@ -126,7 +126,7 @@ vint8m4_t test_vadc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vadc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t carryin, @@ -136,7 +136,7 @@ vint8m8_t test_vadc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vadc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t carryin, @@ -146,7 +146,7 @@ vint8m8_t test_vadc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vadc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, @@ -156,7 +156,7 @@ vint16mf4_t test_vadc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( 
[[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vadc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, @@ -166,7 +166,7 @@ vint16mf4_t test_vadc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vadc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, @@ -176,7 +176,7 @@ vint16mf2_t test_vadc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vadc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, @@ -186,7 +186,7 @@ vint16mf2_t test_vadc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vadc_vvm_i16m1(vint16m1_t op1, vint16m1_t op2, @@ -196,7 +196,7 @@ vint16m1_t test_vadc_vvm_i16m1(vint16m1_t op1, vint16m1_t 
op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vadc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t carryin, @@ -206,7 +206,7 @@ vint16m1_t test_vadc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vadc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, vbool8_t carryin, @@ -216,7 +216,7 @@ vint16m2_t test_vadc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vadc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t carryin, @@ -226,7 +226,7 @@ vint16m2_t test_vadc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( 
undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vadc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, vbool4_t carryin, @@ -236,7 +236,7 @@ vint16m4_t test_vadc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vadc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t carryin, @@ -246,7 +246,7 @@ vint16m4_t test_vadc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vadc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, vbool2_t carryin, @@ -256,7 +256,7 @@ vint16m8_t test_vadc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vadc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t carryin, @@ -266,7 +266,7 @@ vint16m8_t test_vadc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t carryin, // 
CHECK-RV64-LABEL: @test_vadc_vvm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vadc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, @@ -276,7 +276,7 @@ vint32mf2_t test_vadc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vadc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, @@ -286,7 +286,7 @@ vint32mf2_t test_vadc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vadc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, @@ -296,7 +296,7 @@ vint32m1_t test_vadc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] 
// vint32m1_t test_vadc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t carryin, @@ -306,7 +306,7 @@ vint32m1_t test_vadc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vadc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, @@ -316,7 +316,7 @@ vint32m2_t test_vadc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vadc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t carryin, @@ -326,7 +326,7 @@ vint32m2_t test_vadc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vadc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, vbool8_t carryin, @@ -336,7 +336,7 @@ vint32m4_t test_vadc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( 
[[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vadc_vxm_i32m4(vint32m4_t op1, int32_t op2, vbool8_t carryin, @@ -346,7 +346,7 @@ vint32m4_t test_vadc_vxm_i32m4(vint32m4_t op1, int32_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vadc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, vbool4_t carryin, @@ -356,7 +356,7 @@ vint32m8_t test_vadc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vadc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t carryin, @@ -366,7 +366,7 @@ vint32m8_t test_vadc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vadc_vvm_i64m1(vint64m1_t op1, 
vint64m1_t op2, @@ -376,7 +376,7 @@ vint64m1_t test_vadc_vvm_i64m1(vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vadc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t carryin, @@ -386,7 +386,7 @@ vint64m1_t test_vadc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vadc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, @@ -396,7 +396,7 @@ vint64m2_t test_vadc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vadc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t carryin, @@ -406,7 +406,7 @@ vint64m2_t test_vadc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vadc_vvm_i64m4(vint64m4_t op1, vint64m4_t op2, @@ -416,7 +416,7 @@ vint64m4_t test_vadc_vvm_i64m4(vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vadc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t carryin, @@ -426,7 +426,7 @@ vint64m4_t test_vadc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vadc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, vbool8_t carryin, @@ -436,7 +436,7 @@ vint64m8_t test_vadc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vadc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t carryin, @@ -446,7 +446,7 @@ vint64m8_t test_vadc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t 
carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vadc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, @@ -456,7 +456,7 @@ vuint8mf8_t test_vadc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vadc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, vbool64_t carryin, @@ -466,7 +466,7 @@ vuint8mf8_t test_vadc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, vbool64_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vadc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, @@ -476,7 +476,7 @@ vuint8mf4_t test_vadc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vadc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, vbool32_t carryin, @@ -486,7 +486,7 @@ vuint8mf4_t test_vadc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vadc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, @@ -496,7 +496,7 @@ vuint8mf2_t test_vadc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vadc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, vbool16_t carryin, @@ -506,7 +506,7 @@ vuint8mf2_t test_vadc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vadc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t carryin, @@ -516,7 +516,7 @@ vuint8m1_t test_vadc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vadc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t carryin, @@ -526,7 +526,7 @@ vuint8m1_t test_vadc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vadc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t carryin, @@ -536,7 +536,7 @@ vuint8m2_t test_vadc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vadc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t carryin, @@ -546,7 +546,7 @@ vuint8m2_t test_vadc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t 
test_vadc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t carryin, @@ -556,7 +556,7 @@ vuint8m4_t test_vadc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vadc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t carryin, @@ -566,7 +566,7 @@ vuint8m4_t test_vadc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vadc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, vbool1_t carryin, @@ -576,7 +576,7 @@ vuint8m8_t test_vadc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vadc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t carryin, @@ -586,7 +586,7 @@ vuint8m8_t test_vadc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vadc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, @@ -596,7 +596,7 @@ vuint16mf4_t test_vadc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vadc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, @@ -606,7 +606,7 @@ vuint16mf4_t test_vadc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vadc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, @@ -616,7 +616,7 @@ vuint16mf2_t test_vadc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vadc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, @@ -626,7 
+626,7 @@ vuint16mf2_t test_vadc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vadc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, @@ -636,7 +636,7 @@ vuint16m1_t test_vadc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vadc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, @@ -646,7 +646,7 @@ vuint16m1_t test_vadc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vadc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, @@ -656,7 +656,7 @@ vuint16m2_t test_vadc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 
[[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vadc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, vbool8_t carryin, @@ -666,7 +666,7 @@ vuint16m2_t test_vadc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vadc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, @@ -676,7 +676,7 @@ vuint16m4_t test_vadc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vadc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, vbool4_t carryin, @@ -686,7 +686,7 @@ vuint16m4_t test_vadc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vadc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, @@ -696,7 +696,7 @@ vuint16m8_t test_vadc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m8( // CHECK-RV64-NEXT: entry: 
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vadc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, vbool2_t carryin, @@ -706,7 +706,7 @@ vuint16m8_t test_vadc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vadc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, @@ -716,7 +716,7 @@ vuint32mf2_t test_vadc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vadc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, @@ -726,7 +726,7 @@ vuint32mf2_t test_vadc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t 
test_vadc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, @@ -736,7 +736,7 @@ vuint32m1_t test_vadc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vadc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, @@ -746,7 +746,7 @@ vuint32m1_t test_vadc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vadc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, @@ -756,7 +756,7 @@ vuint32m2_t test_vadc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vadc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, @@ -766,7 +766,7 @@ vuint32m2_t test_vadc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vadc_vvm_u32m4(vuint32m4_t op1, vuint32m4_t op2, @@ -776,7 +776,7 @@ vuint32m4_t test_vadc_vvm_u32m4(vuint32m4_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vadc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, vbool8_t carryin, @@ -786,7 +786,7 @@ vuint32m4_t test_vadc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vadc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, @@ -796,7 +796,7 @@ vuint32m8_t test_vadc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vadc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, vbool4_t carryin, @@ -806,7 +806,7 @@ vuint32m8_t test_vadc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, vbool4_t carryin, // 
CHECK-RV64-LABEL: @test_vadc_vvm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vadc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, @@ -816,7 +816,7 @@ vuint64m1_t test_vadc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vadc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, @@ -826,7 +826,7 @@ vuint64m1_t test_vadc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vadc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, @@ -836,7 +836,7 @@ vuint64m2_t test_vadc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] 
// vuint64m2_t test_vadc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, @@ -846,7 +846,7 @@ vuint64m2_t test_vadc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vadc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, @@ -856,7 +856,7 @@ vuint64m4_t test_vadc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vadc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, @@ -866,7 +866,7 @@ vuint64m4_t test_vadc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vadc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, @@ -876,7 +876,7 @@ vuint64m8_t test_vadc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vadc_vxm_u64m8(vuint64m8_t op1, uint64_t op2, vbool8_t carryin, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmerge.c index a945c0bafdfa5..96e6282f1dba2 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmerge.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmerge.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmerge_vfm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfmerge_vfm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmerge_vfm_f32m1(vbool32_t mask, vfloat32m1_t op1, float op2, @@ -27,7 +27,7 @@ vfloat32m1_t test_vfmerge_vfm_f32m1(vbool32_t mask, vfloat32m1_t op1, float op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f32.f32.i64( 
undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmerge_vfm_f32m2(vbool16_t mask, vfloat32m2_t op1, float op2, @@ -37,7 +37,7 @@ vfloat32m2_t test_vfmerge_vfm_f32m2(vbool16_t mask, vfloat32m2_t op1, float op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmerge_vfm_f32m4(vbool8_t mask, vfloat32m4_t op1, float op2, @@ -47,7 +47,7 @@ vfloat32m4_t test_vfmerge_vfm_f32m4(vbool8_t mask, vfloat32m4_t op1, float op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmerge_vfm_f32m8(vbool4_t mask, vfloat32m8_t op1, float op2, @@ -57,7 +57,7 @@ vfloat32m8_t test_vfmerge_vfm_f32m8(vbool4_t mask, vfloat32m8_t op1, float op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f64.f64.i64( undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmerge_vfm_f64m1(vbool64_t mask, vfloat64m1_t op1, @@ -67,7 +67,7 @@ vfloat64m1_t test_vfmerge_vfm_f64m1(vbool64_t mask, vfloat64m1_t op1, // 
CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f64.f64.i64( undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmerge_vfm_f64m2(vbool32_t mask, vfloat64m2_t op1, @@ -77,7 +77,7 @@ vfloat64m2_t test_vfmerge_vfm_f64m2(vbool32_t mask, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f64.f64.i64( undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmerge_vfm_f64m4(vbool16_t mask, vfloat64m4_t op1, @@ -87,7 +87,7 @@ vfloat64m4_t test_vfmerge_vfm_f64m4(vbool16_t mask, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f64.f64.i64( undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmerge_vfm_f64m8(vbool8_t mask, vfloat64m8_t op1, double op2, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmerge.c index b36e8f8a153a5..9e0588a2adb84 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmerge.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmerge.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vmerge_vvm_i8mf8( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmerge_vvm_i8mf8(vbool64_t mask, vint8mf8_t op1, vint8mf8_t op2, @@ -17,7 +17,7 @@ vint8mf8_t test_vmerge_vvm_i8mf8(vbool64_t mask, vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmerge_vxm_i8mf8(vbool64_t mask, vint8mf8_t op1, int8_t op2, @@ -27,7 +27,7 @@ vint8mf8_t test_vmerge_vxm_i8mf8(vbool64_t mask, vint8mf8_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmerge_vvm_i8mf4(vbool32_t mask, vint8mf4_t op1, vint8mf4_t op2, @@ -37,7 +37,7 @@ vint8mf4_t test_vmerge_vvm_i8mf4(vbool32_t mask, vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vint8mf4_t test_vmerge_vxm_i8mf4(vbool32_t mask, vint8mf4_t op1, int8_t op2, @@ -47,7 +47,7 @@ vint8mf4_t test_vmerge_vxm_i8mf4(vbool32_t mask, vint8mf4_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmerge_vvm_i8mf2(vbool16_t mask, vint8mf2_t op1, vint8mf2_t op2, @@ -57,7 +57,7 @@ vint8mf2_t test_vmerge_vvm_i8mf2(vbool16_t mask, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmerge_vxm_i8mf2(vbool16_t mask, vint8mf2_t op1, int8_t op2, @@ -67,7 +67,7 @@ vint8mf2_t test_vmerge_vxm_i8mf2(vbool16_t mask, vint8mf2_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmerge_vvm_i8m1(vbool8_t mask, vint8m1_t op1, vint8m1_t op2, @@ -77,7 +77,7 @@ vint8m1_t test_vmerge_vvm_i8m1(vbool8_t mask, vint8m1_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( 
[[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmerge_vxm_i8m1(vbool8_t mask, vint8m1_t op1, int8_t op2, @@ -87,7 +87,7 @@ vint8m1_t test_vmerge_vxm_i8m1(vbool8_t mask, vint8m1_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmerge_vvm_i8m2(vbool4_t mask, vint8m2_t op1, vint8m2_t op2, @@ -97,7 +97,7 @@ vint8m2_t test_vmerge_vvm_i8m2(vbool4_t mask, vint8m2_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmerge_vxm_i8m2(vbool4_t mask, vint8m2_t op1, int8_t op2, @@ -107,7 +107,7 @@ vint8m2_t test_vmerge_vxm_i8m2(vbool4_t mask, vint8m2_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmerge_vvm_i8m4(vbool2_t mask, vint8m4_t op1, vint8m4_t op2, @@ -117,7 +117,7 @@ 
vint8m4_t test_vmerge_vvm_i8m4(vbool2_t mask, vint8m4_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmerge_vxm_i8m4(vbool2_t mask, vint8m4_t op1, int8_t op2, @@ -127,7 +127,7 @@ vint8m4_t test_vmerge_vxm_i8m4(vbool2_t mask, vint8m4_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmerge_vvm_i8m8(vbool1_t mask, vint8m8_t op1, vint8m8_t op2, @@ -137,7 +137,7 @@ vint8m8_t test_vmerge_vvm_i8m8(vbool1_t mask, vint8m8_t op1, vint8m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmerge_vxm_i8m8(vbool1_t mask, vint8m8_t op1, int8_t op2, @@ -147,7 +147,7 @@ vint8m8_t test_vmerge_vxm_i8m8(vbool1_t mask, vint8m8_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmerge_vvm_i16mf4(vbool64_t mask, vint16mf4_t op1, @@ -157,7 +157,7 @@ vint16mf4_t test_vmerge_vvm_i16mf4(vbool64_t mask, vint16mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmerge_vxm_i16mf4(vbool64_t mask, vint16mf4_t op1, int16_t op2, @@ -167,7 +167,7 @@ vint16mf4_t test_vmerge_vxm_i16mf4(vbool64_t mask, vint16mf4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmerge_vvm_i16mf2(vbool32_t mask, vint16mf2_t op1, @@ -177,7 +177,7 @@ vint16mf2_t test_vmerge_vvm_i16mf2(vbool32_t mask, vint16mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmerge_vxm_i16mf2(vbool32_t mask, vint16mf2_t op1, int16_t op2, @@ -187,7 +187,7 @@ vint16mf2_t test_vmerge_vxm_i16mf2(vbool32_t mask, vint16mf2_t op1, int16_t op2, // 
CHECK-RV64-LABEL: @test_vmerge_vvm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmerge_vvm_i16m1(vbool16_t mask, vint16m1_t op1, vint16m1_t op2, @@ -197,7 +197,7 @@ vint16m1_t test_vmerge_vvm_i16m1(vbool16_t mask, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmerge_vxm_i16m1(vbool16_t mask, vint16m1_t op1, int16_t op2, @@ -207,7 +207,7 @@ vint16m1_t test_vmerge_vxm_i16m1(vbool16_t mask, vint16m1_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmerge_vvm_i16m2(vbool8_t mask, vint16m2_t op1, vint16m2_t op2, @@ -217,7 +217,7 @@ vint16m2_t test_vmerge_vvm_i16m2(vbool8_t mask, vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.i16.i64( undef, 
[[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmerge_vxm_i16m2(vbool8_t mask, vint16m2_t op1, int16_t op2, @@ -227,7 +227,7 @@ vint16m2_t test_vmerge_vxm_i16m2(vbool8_t mask, vint16m2_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmerge_vvm_i16m4(vbool4_t mask, vint16m4_t op1, vint16m4_t op2, @@ -237,7 +237,7 @@ vint16m4_t test_vmerge_vvm_i16m4(vbool4_t mask, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmerge_vxm_i16m4(vbool4_t mask, vint16m4_t op1, int16_t op2, @@ -247,7 +247,7 @@ vint16m4_t test_vmerge_vxm_i16m4(vbool4_t mask, vint16m4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmerge_vvm_i16m8(vbool2_t mask, vint16m8_t op1, vint16m8_t op2, @@ -257,7 +257,7 @@ vint16m8_t test_vmerge_vvm_i16m8(vbool2_t mask, vint16m8_t op1, vint16m8_t op2, // 
CHECK-RV64-LABEL: @test_vmerge_vxm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmerge_vxm_i16m8(vbool2_t mask, vint16m8_t op1, int16_t op2, @@ -267,7 +267,7 @@ vint16m8_t test_vmerge_vxm_i16m8(vbool2_t mask, vint16m8_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmerge_vvm_i32mf2(vbool64_t mask, vint32mf2_t op1, @@ -277,7 +277,7 @@ vint32mf2_t test_vmerge_vvm_i32mf2(vbool64_t mask, vint32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmerge_vxm_i32mf2(vbool64_t mask, vint32mf2_t op1, int32_t op2, @@ -287,7 +287,7 @@ vint32mf2_t test_vmerge_vxm_i32mf2(vbool64_t mask, vint32mf2_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], 
[[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmerge_vvm_i32m1(vbool32_t mask, vint32m1_t op1, vint32m1_t op2, @@ -297,7 +297,7 @@ vint32m1_t test_vmerge_vvm_i32m1(vbool32_t mask, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmerge_vxm_i32m1(vbool32_t mask, vint32m1_t op1, int32_t op2, @@ -307,7 +307,7 @@ vint32m1_t test_vmerge_vxm_i32m1(vbool32_t mask, vint32m1_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmerge_vvm_i32m2(vbool16_t mask, vint32m2_t op1, vint32m2_t op2, @@ -317,7 +317,7 @@ vint32m2_t test_vmerge_vvm_i32m2(vbool16_t mask, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmerge_vxm_i32m2(vbool16_t mask, vint32m2_t op1, int32_t op2, @@ -327,7 +327,7 @@ vint32m2_t test_vmerge_vxm_i32m2(vbool16_t mask, vint32m2_t op1, int32_t op2, // CHECK-RV64-LABEL: 
@test_vmerge_vvm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmerge_vvm_i32m4(vbool8_t mask, vint32m4_t op1, vint32m4_t op2, @@ -337,7 +337,7 @@ vint32m4_t test_vmerge_vvm_i32m4(vbool8_t mask, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmerge_vxm_i32m4(vbool8_t mask, vint32m4_t op1, int32_t op2, @@ -347,7 +347,7 @@ vint32m4_t test_vmerge_vxm_i32m4(vbool8_t mask, vint32m4_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmerge_vvm_i32m8(vbool4_t mask, vint32m8_t op1, vint32m8_t op2, @@ -357,7 +357,7 @@ vint32m8_t test_vmerge_vvm_i32m8(vbool4_t mask, vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 
[[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmerge_vxm_i32m8(vbool4_t mask, vint32m8_t op1, int32_t op2, @@ -367,7 +367,7 @@ vint32m8_t test_vmerge_vxm_i32m8(vbool4_t mask, vint32m8_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmerge_vvm_i64m1(vbool64_t mask, vint64m1_t op1, vint64m1_t op2, @@ -377,7 +377,7 @@ vint64m1_t test_vmerge_vvm_i64m1(vbool64_t mask, vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmerge_vxm_i64m1(vbool64_t mask, vint64m1_t op1, int64_t op2, @@ -387,7 +387,7 @@ vint64m1_t test_vmerge_vxm_i64m1(vbool64_t mask, vint64m1_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmerge_vvm_i64m2(vbool32_t mask, vint64m2_t op1, vint64m2_t op2, @@ -397,7 +397,7 @@ vint64m2_t test_vmerge_vvm_i64m2(vbool32_t mask, vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: 
@test_vmerge_vxm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmerge_vxm_i64m2(vbool32_t mask, vint64m2_t op1, int64_t op2, @@ -407,7 +407,7 @@ vint64m2_t test_vmerge_vxm_i64m2(vbool32_t mask, vint64m2_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmerge_vvm_i64m4(vbool16_t mask, vint64m4_t op1, vint64m4_t op2, @@ -417,7 +417,7 @@ vint64m4_t test_vmerge_vvm_i64m4(vbool16_t mask, vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmerge_vxm_i64m4(vbool16_t mask, vint64m4_t op1, int64_t op2, @@ -427,7 +427,7 @@ vint64m4_t test_vmerge_vxm_i64m4(vbool16_t mask, vint64m4_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], 
[[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmerge_vvm_i64m8(vbool8_t mask, vint64m8_t op1, vint64m8_t op2, @@ -437,7 +437,7 @@ vint64m8_t test_vmerge_vvm_i64m8(vbool8_t mask, vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmerge_vxm_i64m8(vbool8_t mask, vint64m8_t op1, int64_t op2, @@ -447,7 +447,7 @@ vint64m8_t test_vmerge_vxm_i64m8(vbool8_t mask, vint64m8_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmerge_vvm_u8mf8(vbool64_t mask, vuint8mf8_t op1, @@ -457,7 +457,7 @@ vuint8mf8_t test_vmerge_vvm_u8mf8(vbool64_t mask, vuint8mf8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmerge_vxm_u8mf8(vbool64_t mask, vuint8mf8_t op1, uint8_t op2, @@ -467,7 +467,7 @@ vuint8mf8_t test_vmerge_vxm_u8mf8(vbool64_t mask, vuint8mf8_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8mf4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmerge_vvm_u8mf4(vbool32_t mask, vuint8mf4_t op1, @@ -477,7 +477,7 @@ vuint8mf4_t test_vmerge_vvm_u8mf4(vbool32_t mask, vuint8mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmerge_vxm_u8mf4(vbool32_t mask, vuint8mf4_t op1, uint8_t op2, @@ -487,7 +487,7 @@ vuint8mf4_t test_vmerge_vxm_u8mf4(vbool32_t mask, vuint8mf4_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmerge_vvm_u8mf2(vbool16_t mask, vuint8mf2_t op1, @@ -497,7 +497,7 @@ vuint8mf2_t test_vmerge_vvm_u8mf2(vbool16_t mask, vuint8mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmerge_vxm_u8mf2(vbool16_t mask, 
vuint8mf2_t op1, uint8_t op2, @@ -507,7 +507,7 @@ vuint8mf2_t test_vmerge_vxm_u8mf2(vbool16_t mask, vuint8mf2_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmerge_vvm_u8m1(vbool8_t mask, vuint8m1_t op1, vuint8m1_t op2, @@ -517,7 +517,7 @@ vuint8m1_t test_vmerge_vvm_u8m1(vbool8_t mask, vuint8m1_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmerge_vxm_u8m1(vbool8_t mask, vuint8m1_t op1, uint8_t op2, @@ -527,7 +527,7 @@ vuint8m1_t test_vmerge_vxm_u8m1(vbool8_t mask, vuint8m1_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmerge_vvm_u8m2(vbool4_t mask, vuint8m2_t op1, vuint8m2_t op2, @@ -537,7 +537,7 @@ vuint8m2_t test_vmerge_vvm_u8m2(vbool4_t mask, vuint8m2_t op1, vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], 
[[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmerge_vxm_u8m2(vbool4_t mask, vuint8m2_t op1, uint8_t op2, @@ -547,7 +547,7 @@ vuint8m2_t test_vmerge_vxm_u8m2(vbool4_t mask, vuint8m2_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmerge_vvm_u8m4(vbool2_t mask, vuint8m4_t op1, vuint8m4_t op2, @@ -557,7 +557,7 @@ vuint8m4_t test_vmerge_vvm_u8m4(vbool2_t mask, vuint8m4_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmerge_vxm_u8m4(vbool2_t mask, vuint8m4_t op1, uint8_t op2, @@ -567,7 +567,7 @@ vuint8m4_t test_vmerge_vxm_u8m4(vbool2_t mask, vuint8m4_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmerge_vvm_u8m8(vbool1_t mask, vuint8m8_t op1, vuint8m8_t op2, @@ -577,7 +577,7 @@ vuint8m8_t 
test_vmerge_vvm_u8m8(vbool1_t mask, vuint8m8_t op1, vuint8m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmerge_vxm_u8m8(vbool1_t mask, vuint8m8_t op1, uint8_t op2, @@ -587,7 +587,7 @@ vuint8m8_t test_vmerge_vxm_u8m8(vbool1_t mask, vuint8m8_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmerge_vvm_u16mf4(vbool64_t mask, vuint16mf4_t op1, @@ -597,7 +597,7 @@ vuint16mf4_t test_vmerge_vvm_u16mf4(vbool64_t mask, vuint16mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmerge_vxm_u16mf4(vbool64_t mask, vuint16mf4_t op1, @@ -607,7 +607,7 @@ vuint16mf4_t test_vmerge_vxm_u16mf4(vbool64_t mask, vuint16mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmerge_vvm_u16mf2(vbool32_t mask, vuint16mf2_t op1, @@ -617,7 +617,7 @@ vuint16mf2_t test_vmerge_vvm_u16mf2(vbool32_t mask, vuint16mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmerge_vxm_u16mf2(vbool32_t mask, vuint16mf2_t op1, @@ -627,7 +627,7 @@ vuint16mf2_t test_vmerge_vxm_u16mf2(vbool32_t mask, vuint16mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmerge_vvm_u16m1(vbool16_t mask, vuint16m1_t op1, @@ -637,7 +637,7 @@ vuint16m1_t test_vmerge_vvm_u16m1(vbool16_t mask, vuint16m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmerge_vxm_u16m1(vbool16_t mask, vuint16m1_t op1, uint16_t op2, @@ -647,7 +647,7 @@ vuint16m1_t test_vmerge_vxm_u16m1(vbool16_t mask, vuint16m1_t op1, uint16_t op2, // CHECK-RV64-LABEL: 
@test_vmerge_vvm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmerge_vvm_u16m2(vbool8_t mask, vuint16m2_t op1, @@ -657,7 +657,7 @@ vuint16m2_t test_vmerge_vvm_u16m2(vbool8_t mask, vuint16m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmerge_vxm_u16m2(vbool8_t mask, vuint16m2_t op1, uint16_t op2, @@ -667,7 +667,7 @@ vuint16m2_t test_vmerge_vxm_u16m2(vbool8_t mask, vuint16m2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmerge_vvm_u16m4(vbool4_t mask, vuint16m4_t op1, @@ -677,7 +677,7 @@ vuint16m4_t test_vmerge_vvm_u16m4(vbool4_t mask, vuint16m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmerge_vxm_u16m4(vbool4_t mask, vuint16m4_t op1, uint16_t op2, @@ -687,7 +687,7 @@ vuint16m4_t test_vmerge_vxm_u16m4(vbool4_t mask, vuint16m4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmerge_vvm_u16m8(vbool2_t mask, vuint16m8_t op1, @@ -697,7 +697,7 @@ vuint16m8_t test_vmerge_vvm_u16m8(vbool2_t mask, vuint16m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmerge_vxm_u16m8(vbool2_t mask, vuint16m8_t op1, uint16_t op2, @@ -707,7 +707,7 @@ vuint16m8_t test_vmerge_vxm_u16m8(vbool2_t mask, vuint16m8_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmerge_vvm_u32mf2(vbool64_t mask, vuint32mf2_t op1, @@ -717,7 +717,7 @@ vuint32mf2_t test_vmerge_vvm_u32mf2(vbool64_t mask, vuint32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmerge.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmerge_vxm_u32mf2(vbool64_t mask, vuint32mf2_t op1, @@ -727,7 +727,7 @@ vuint32mf2_t test_vmerge_vxm_u32mf2(vbool64_t mask, vuint32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmerge_vvm_u32m1(vbool32_t mask, vuint32m1_t op1, @@ -737,7 +737,7 @@ vuint32m1_t test_vmerge_vvm_u32m1(vbool32_t mask, vuint32m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmerge_vxm_u32m1(vbool32_t mask, vuint32m1_t op1, uint32_t op2, @@ -747,7 +747,7 @@ vuint32m1_t test_vmerge_vxm_u32m1(vbool32_t mask, vuint32m1_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmerge_vvm_u32m2(vbool16_t mask, 
vuint32m2_t op1, @@ -757,7 +757,7 @@ vuint32m2_t test_vmerge_vvm_u32m2(vbool16_t mask, vuint32m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmerge_vxm_u32m2(vbool16_t mask, vuint32m2_t op1, uint32_t op2, @@ -767,7 +767,7 @@ vuint32m2_t test_vmerge_vxm_u32m2(vbool16_t mask, vuint32m2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmerge_vvm_u32m4(vbool8_t mask, vuint32m4_t op1, @@ -777,7 +777,7 @@ vuint32m4_t test_vmerge_vvm_u32m4(vbool8_t mask, vuint32m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmerge_vxm_u32m4(vbool8_t mask, vuint32m4_t op1, uint32_t op2, @@ -787,7 +787,7 @@ vuint32m4_t test_vmerge_vxm_u32m4(vbool8_t mask, vuint32m4_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmerge_vvm_u32m8(vbool4_t mask, vuint32m8_t op1, @@ -797,7 +797,7 @@ vuint32m8_t test_vmerge_vvm_u32m8(vbool4_t mask, vuint32m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmerge_vxm_u32m8(vbool4_t mask, vuint32m8_t op1, uint32_t op2, @@ -807,7 +807,7 @@ vuint32m8_t test_vmerge_vxm_u32m8(vbool4_t mask, vuint32m8_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmerge_vvm_u64m1(vbool64_t mask, vuint64m1_t op1, @@ -817,7 +817,7 @@ vuint64m1_t test_vmerge_vvm_u64m1(vbool64_t mask, vuint64m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmerge_vxm_u64m1(vbool64_t mask, vuint64m1_t op1, uint64_t op2, @@ -827,7 +827,7 @@ vuint64m1_t test_vmerge_vxm_u64m1(vbool64_t mask, vuint64m1_t 
op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmerge_vvm_u64m2(vbool32_t mask, vuint64m2_t op1, @@ -837,7 +837,7 @@ vuint64m2_t test_vmerge_vvm_u64m2(vbool32_t mask, vuint64m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmerge_vxm_u64m2(vbool32_t mask, vuint64m2_t op1, uint64_t op2, @@ -847,7 +847,7 @@ vuint64m2_t test_vmerge_vxm_u64m2(vbool32_t mask, vuint64m2_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmerge_vvm_u64m4(vbool16_t mask, vuint64m4_t op1, @@ -857,7 +857,7 @@ vuint64m4_t test_vmerge_vvm_u64m4(vbool16_t mask, vuint64m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], 
[[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmerge_vxm_u64m4(vbool16_t mask, vuint64m4_t op1, uint64_t op2, @@ -867,7 +867,7 @@ vuint64m4_t test_vmerge_vxm_u64m4(vbool16_t mask, vuint64m4_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmerge_vvm_u64m8(vbool8_t mask, vuint64m8_t op1, @@ -877,7 +877,7 @@ vuint64m8_t test_vmerge_vvm_u64m8(vbool8_t mask, vuint64m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmerge_vxm_u64m8(vbool8_t mask, vuint64m8_t op1, uint64_t op2, @@ -887,7 +887,7 @@ vuint64m8_t test_vmerge_vxm_u64m8(vbool8_t mask, vuint64m8_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f32.nxv1f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f32.nxv1f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vmerge_vvm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, @@ -897,7 +897,7 @@ vfloat32mf2_t test_vmerge_vvm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m1( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f32.nxv2f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f32.nxv2f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vmerge_vvm_f32m1(vbool32_t mask, vfloat32m1_t op1, @@ -907,7 +907,7 @@ vfloat32m1_t test_vmerge_vvm_f32m1(vbool32_t mask, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f32.nxv4f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f32.nxv4f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vmerge_vvm_f32m2(vbool16_t mask, vfloat32m2_t op1, @@ -917,7 +917,7 @@ vfloat32m2_t test_vmerge_vvm_f32m2(vbool16_t mask, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f32.nxv8f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f32.nxv8f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vmerge_vvm_f32m4(vbool8_t mask, vfloat32m4_t op1, @@ -927,7 +927,7 @@ vfloat32m4_t test_vmerge_vvm_f32m4(vbool8_t mask, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16f32.nxv16f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16f32.nxv16f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vmerge_vvm_f32m8(vbool4_t 
mask, vfloat32m8_t op1, @@ -937,7 +937,7 @@ vfloat32m8_t test_vmerge_vvm_f32m8(vbool4_t mask, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f64.nxv1f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f64.nxv1f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vmerge_vvm_f64m1(vbool64_t mask, vfloat64m1_t op1, @@ -947,7 +947,7 @@ vfloat64m1_t test_vmerge_vvm_f64m1(vbool64_t mask, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f64.nxv2f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f64.nxv2f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vmerge_vvm_f64m2(vbool32_t mask, vfloat64m2_t op1, @@ -957,7 +957,7 @@ vfloat64m2_t test_vmerge_vvm_f64m2(vbool32_t mask, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f64.nxv4f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f64.nxv4f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vmerge_vvm_f64m4(vbool16_t mask, vfloat64m4_t op1, @@ -967,7 +967,7 @@ vfloat64m4_t test_vmerge_vvm_f64m4(vbool16_t mask, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f64.nxv8f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vmerge.nxv8f64.nxv8f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vmerge_vvm_f64m8(vbool8_t mask, vfloat64m8_t op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsbc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsbc.c index 79c0453a514ca..806ab724b1beb 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsbc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vsbc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vsbc_vvm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vsbc_vvm_i8mf8(vint8mf8_t op1, vint8mf8_t op2, @@ -16,7 +16,7 @@ vint8mf8_t test_vsbc_vvm_i8mf8(vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vsbc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t borrowin, @@ -26,7 +26,7 @@ vint8mf8_t test_vsbc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vint8mf4_t test_vsbc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, @@ -36,7 +36,7 @@ vint8mf4_t test_vsbc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vsbc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t borrowin, @@ -46,7 +46,7 @@ vint8mf4_t test_vsbc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vsbc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, @@ -56,7 +56,7 @@ vint8mf2_t test_vsbc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vsbc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t borrowin, @@ -66,7 +66,7 @@ vint8mf2_t test_vsbc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], 
i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vsbc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t borrowin, @@ -76,7 +76,7 @@ vint8m1_t test_vsbc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vsbc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t borrowin, @@ -86,7 +86,7 @@ vint8m1_t test_vsbc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vsbc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t borrowin, @@ -96,7 +96,7 @@ vint8m2_t test_vsbc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vsbc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t borrowin, @@ -106,7 +106,7 @@ vint8m2_t 
test_vsbc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vsbc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t borrowin, @@ -116,7 +116,7 @@ vint8m4_t test_vsbc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vsbc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t borrowin, @@ -126,7 +126,7 @@ vint8m4_t test_vsbc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vsbc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t borrowin, @@ -136,7 +136,7 @@ vint8m8_t test_vsbc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vsbc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t borrowin, @@ -146,7 +146,7 @@ vint8m8_t test_vsbc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vsbc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, @@ -156,7 +156,7 @@ vint16mf4_t test_vsbc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vsbc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, @@ -166,7 +166,7 @@ vint16mf4_t test_vsbc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vsbc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, @@ -176,7 +176,7 @@ vint16mf2_t test_vsbc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16mf2( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vsbc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, @@ -186,7 +186,7 @@ vint16mf2_t test_vsbc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vsbc_vvm_i16m1(vint16m1_t op1, vint16m1_t op2, @@ -196,7 +196,7 @@ vint16m1_t test_vsbc_vvm_i16m1(vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vsbc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t borrowin, @@ -206,7 +206,7 @@ vint16m1_t test_vsbc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vint16m2_t test_vsbc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, @@ -216,7 +216,7 @@ vint16m2_t test_vsbc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vsbc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t borrowin, @@ -226,7 +226,7 @@ vint16m2_t test_vsbc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vsbc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, @@ -236,7 +236,7 @@ vint16m4_t test_vsbc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vsbc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t borrowin, @@ -246,7 +246,7 @@ vint16m4_t test_vsbc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], 
[[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vsbc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, @@ -256,7 +256,7 @@ vint16m8_t test_vsbc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vsbc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t borrowin, @@ -266,7 +266,7 @@ vint16m8_t test_vsbc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vsbc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, @@ -276,7 +276,7 @@ vint32mf2_t test_vsbc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vsbc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, @@ -286,7 +286,7 @@ vint32mf2_t 
test_vsbc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vsbc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, @@ -296,7 +296,7 @@ vint32m1_t test_vsbc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vsbc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t borrowin, @@ -306,7 +306,7 @@ vint32m1_t test_vsbc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vsbc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, @@ -316,7 +316,7 @@ vint32m2_t test_vsbc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( undef, [[OP1:%.*]], 
i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vsbc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t borrowin, @@ -326,7 +326,7 @@ vint32m2_t test_vsbc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vsbc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, @@ -336,7 +336,7 @@ vint32m4_t test_vsbc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vsbc_vxm_i32m4(vint32m4_t op1, int32_t op2, vbool8_t borrowin, @@ -346,7 +346,7 @@ vint32m4_t test_vsbc_vxm_i32m4(vint32m4_t op1, int32_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vsbc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, @@ -356,7 +356,7 @@ vint32m8_t test_vsbc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m8( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vsbc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t borrowin, @@ -366,7 +366,7 @@ vint32m8_t test_vsbc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vsbc_vvm_i64m1(vint64m1_t op1, vint64m1_t op2, @@ -376,7 +376,7 @@ vint64m1_t test_vsbc_vvm_i64m1(vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vsbc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t borrowin, @@ -386,7 +386,7 @@ vint64m1_t test_vsbc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vint64m2_t test_vsbc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, @@ -396,7 +396,7 @@ vint64m2_t test_vsbc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vsbc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t borrowin, @@ -406,7 +406,7 @@ vint64m2_t test_vsbc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vsbc_vvm_i64m4(vint64m4_t op1, vint64m4_t op2, @@ -416,7 +416,7 @@ vint64m4_t test_vsbc_vvm_i64m4(vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vsbc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t borrowin, @@ -426,7 +426,7 @@ vint64m4_t test_vsbc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], 
[[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vsbc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, @@ -436,7 +436,7 @@ vint64m8_t test_vsbc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vsbc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t borrowin, @@ -446,7 +446,7 @@ vint64m8_t test_vsbc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vsbc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, @@ -456,7 +456,7 @@ vuint8mf8_t test_vsbc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vsbc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, @@ -466,7 +466,7 @@ vuint8mf8_t test_vsbc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, 
// CHECK-RV64-LABEL: @test_vsbc_vvm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vsbc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, @@ -476,7 +476,7 @@ vuint8mf4_t test_vsbc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vsbc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, @@ -486,7 +486,7 @@ vuint8mf4_t test_vsbc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vsbc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, @@ -496,7 +496,7 @@ vuint8mf2_t test_vsbc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vuint8mf2_t test_vsbc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, @@ -506,7 +506,7 @@ vuint8mf2_t test_vsbc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vsbc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t borrowin, @@ -516,7 +516,7 @@ vuint8m1_t test_vsbc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vsbc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t borrowin, @@ -526,7 +526,7 @@ vuint8m1_t test_vsbc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vsbc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t borrowin, @@ -536,7 +536,7 @@ vuint8m2_t test_vsbc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( [[OP1:%.*]], 
i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vsbc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t borrowin, @@ -546,7 +546,7 @@ vuint8m2_t test_vsbc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vsbc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t borrowin, @@ -556,7 +556,7 @@ vuint8m4_t test_vsbc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vsbc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t borrowin, @@ -566,7 +566,7 @@ vuint8m4_t test_vsbc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vsbc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, 
vbool1_t borrowin, @@ -576,7 +576,7 @@ vuint8m8_t test_vsbc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vsbc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t borrowin, @@ -586,7 +586,7 @@ vuint8m8_t test_vsbc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vsbc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, @@ -596,7 +596,7 @@ vuint16mf4_t test_vsbc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vsbc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, @@ -606,7 +606,7 @@ vuint16mf4_t test_vsbc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vsbc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, @@ -616,7 +616,7 @@ vuint16mf2_t test_vsbc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vsbc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, @@ -626,7 +626,7 @@ vuint16mf2_t test_vsbc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vsbc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, @@ -636,7 +636,7 @@ vuint16m1_t test_vsbc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vsbc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, @@ -646,7 +646,7 @@ vuint16m1_t test_vsbc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, // CHECK-RV64-LABEL: 
@test_vsbc_vvm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vsbc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, @@ -656,7 +656,7 @@ vuint16m2_t test_vsbc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vsbc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, @@ -666,7 +666,7 @@ vuint16m2_t test_vsbc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vsbc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, @@ -676,7 +676,7 @@ vuint16m4_t test_vsbc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vuint16m4_t test_vsbc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, @@ -686,7 +686,7 @@ vuint16m4_t test_vsbc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vsbc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, @@ -696,7 +696,7 @@ vuint16m8_t test_vsbc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vsbc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, @@ -706,7 +706,7 @@ vuint16m8_t test_vsbc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vsbc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, @@ -716,7 +716,7 @@ vuint32mf2_t test_vsbc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vsbc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, @@ -726,7 +726,7 @@ vuint32mf2_t test_vsbc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vsbc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, @@ -736,7 +736,7 @@ vuint32m1_t test_vsbc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vsbc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, @@ -746,7 +746,7 @@ vuint32m1_t test_vsbc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vsbc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, @@ -756,7 +756,7 @@ vuint32m2_t test_vsbc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m2( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vsbc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, @@ -766,7 +766,7 @@ vuint32m2_t test_vsbc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vsbc_vvm_u32m4(vuint32m4_t op1, vuint32m4_t op2, @@ -776,7 +776,7 @@ vuint32m4_t test_vsbc_vvm_u32m4(vuint32m4_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vsbc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, @@ -786,7 +786,7 @@ vuint32m4_t test_vsbc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t 
test_vsbc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, @@ -796,7 +796,7 @@ vuint32m8_t test_vsbc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vsbc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, @@ -806,7 +806,7 @@ vuint32m8_t test_vsbc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vsbc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, @@ -816,7 +816,7 @@ vuint64m1_t test_vsbc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vsbc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, @@ -826,7 +826,7 @@ vuint64m1_t test_vsbc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vsbc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, @@ -836,7 +836,7 @@ vuint64m2_t test_vsbc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vsbc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, @@ -846,7 +846,7 @@ vuint64m2_t test_vsbc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vsbc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, @@ -856,7 +856,7 @@ vuint64m4_t test_vsbc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vsbc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, @@ -866,7 +866,7 @@ vuint64m4_t test_vsbc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m8( // CHECK-RV64-NEXT: entry: 
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vsbc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, @@ -876,7 +876,7 @@ vuint64m8_t test_vsbc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vsbc_vxm_u64m8(vuint64m8_t op1, uint64_t op2, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vadc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vadc.c index c9134fc0d2bb7..b0697da1443ee 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vadc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vadc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vadc_vvm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vadc_vvm_i8mf8(vint8mf8_t op1, vint8mf8_t op2, @@ -16,7 +16,7 @@ vint8mf8_t test_vadc_vvm_i8mf8(vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vadc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t carryin, @@ -26,7 +26,7 @@ vint8mf8_t test_vadc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vadc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, @@ -36,7 +36,7 @@ vint8mf4_t test_vadc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vadc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t carryin, @@ -46,7 +46,7 @@ vint8mf4_t test_vadc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vadc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, @@ -56,7 +56,7 @@ vint8mf2_t test_vadc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i8mf2( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vadc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t carryin, @@ -66,7 +66,7 @@ vint8mf2_t test_vadc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vadc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t carryin, @@ -76,7 +76,7 @@ vint8m1_t test_vadc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vadc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t carryin, @@ -86,7 +86,7 @@ vint8m1_t test_vadc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t 
test_vadc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t carryin, @@ -96,7 +96,7 @@ vint8m2_t test_vadc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vadc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t carryin, @@ -106,7 +106,7 @@ vint8m2_t test_vadc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vadc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t carryin, @@ -116,7 +116,7 @@ vint8m4_t test_vadc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vadc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t carryin, @@ -126,7 +126,7 @@ vint8m4_t test_vadc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], 
[[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vadc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t carryin, @@ -136,7 +136,7 @@ vint8m8_t test_vadc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vadc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t carryin, @@ -146,7 +146,7 @@ vint8m8_t test_vadc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vadc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, @@ -156,7 +156,7 @@ vint16mf4_t test_vadc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vadc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, @@ -166,7 +166,7 @@ vint16mf4_t 
test_vadc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vadc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, @@ -176,7 +176,7 @@ vint16mf2_t test_vadc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vadc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, @@ -186,7 +186,7 @@ vint16mf2_t test_vadc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vadc_vvm_i16m1(vint16m1_t op1, vint16m1_t op2, @@ -196,7 +196,7 @@ vint16m1_t test_vadc_vvm_i16m1(vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], 
[[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vadc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t carryin, @@ -206,7 +206,7 @@ vint16m1_t test_vadc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vadc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, vbool8_t carryin, @@ -216,7 +216,7 @@ vint16m2_t test_vadc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vadc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t carryin, @@ -226,7 +226,7 @@ vint16m2_t test_vadc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vadc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, vbool4_t carryin, @@ -236,7 +236,7 @@ vint16m4_t test_vadc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: 
@test_vadc_vxm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vadc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t carryin, @@ -246,7 +246,7 @@ vint16m4_t test_vadc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vadc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, vbool2_t carryin, @@ -256,7 +256,7 @@ vint16m8_t test_vadc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vadc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t carryin, @@ -266,7 +266,7 @@ vint16m8_t test_vadc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], 
[[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vadc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, @@ -276,7 +276,7 @@ vint32mf2_t test_vadc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vadc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, @@ -286,7 +286,7 @@ vint32mf2_t test_vadc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vadc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, @@ -296,7 +296,7 @@ vint32m1_t test_vadc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vadc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t carryin, @@ -306,7 +306,7 @@ vint32m1_t test_vadc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vadc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, @@ -316,7 +316,7 @@ vint32m2_t test_vadc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vadc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t carryin, @@ -326,7 +326,7 @@ vint32m2_t test_vadc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vadc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, vbool8_t carryin, @@ -336,7 +336,7 @@ vint32m4_t test_vadc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vadc_vxm_i32m4(vint32m4_t op1, 
int32_t op2, vbool8_t carryin, @@ -346,7 +346,7 @@ vint32m4_t test_vadc_vxm_i32m4(vint32m4_t op1, int32_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vadc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, vbool4_t carryin, @@ -356,7 +356,7 @@ vint32m8_t test_vadc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vadc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t carryin, @@ -366,7 +366,7 @@ vint32m8_t test_vadc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vadc_vvm_i64m1(vint64m1_t op1, vint64m1_t op2, @@ -376,7 +376,7 @@ vint64m1_t test_vadc_vvm_i64m1(vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], 
i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vadc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t carryin, @@ -386,7 +386,7 @@ vint64m1_t test_vadc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vadc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, @@ -396,7 +396,7 @@ vint64m2_t test_vadc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vadc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t carryin, @@ -406,7 +406,7 @@ vint64m2_t test_vadc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vadc_vvm_i64m4(vint64m4_t op1, vint64m4_t op2, @@ -416,7 +416,7 @@ vint64m4_t test_vadc_vvm_i64m4(vint64m4_t op1, 
vint64m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vadc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t carryin, @@ -426,7 +426,7 @@ vint64m4_t test_vadc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vadc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, vbool8_t carryin, @@ -436,7 +436,7 @@ vint64m8_t test_vadc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vadc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t carryin, @@ -446,7 +446,7 @@ vint64m8_t test_vadc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vadc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, @@ -456,7 +456,7 @@ vuint8mf8_t test_vadc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vadc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, vbool64_t carryin, @@ -466,7 +466,7 @@ vuint8mf8_t test_vadc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, vbool64_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vadc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, @@ -476,7 +476,7 @@ vuint8mf4_t test_vadc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vadc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, vbool32_t carryin, @@ -486,7 +486,7 @@ vuint8mf4_t test_vadc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, vbool32_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8mf2( 
// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vadc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, @@ -496,7 +496,7 @@ vuint8mf2_t test_vadc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vadc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, vbool16_t carryin, @@ -506,7 +506,7 @@ vuint8mf2_t test_vadc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, vbool16_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vadc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t carryin, @@ -516,7 +516,7 @@ vuint8m1_t test_vadc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vuint8m1_t test_vadc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t carryin, @@ -526,7 +526,7 @@ vuint8m1_t test_vadc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vadc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t carryin, @@ -536,7 +536,7 @@ vuint8m2_t test_vadc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vadc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t carryin, @@ -546,7 +546,7 @@ vuint8m2_t test_vadc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vadc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t carryin, @@ -556,7 +556,7 @@ vuint8m4_t test_vadc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vadc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t carryin, @@ -566,7 +566,7 @@ vuint8m4_t test_vadc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vadc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, vbool1_t carryin, @@ -576,7 +576,7 @@ vuint8m8_t test_vadc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vxm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vadc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t carryin, @@ -586,7 +586,7 @@ vuint8m8_t test_vadc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t 
test_vadc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, @@ -596,7 +596,7 @@ vuint16mf4_t test_vadc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vadc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, @@ -606,7 +606,7 @@ vuint16mf4_t test_vadc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vadc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, @@ -616,7 +616,7 @@ vuint16mf2_t test_vadc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vadc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, @@ -626,7 +626,7 @@ vuint16mf2_t test_vadc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vadc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, @@ -636,7 +636,7 @@ vuint16m1_t test_vadc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vadc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, @@ -646,7 +646,7 @@ vuint16m1_t test_vadc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vadc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, @@ -656,7 +656,7 @@ vuint16m2_t test_vadc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vadc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, vbool8_t carryin, @@ -666,7 +666,7 @@ vuint16m2_t test_vadc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: 
@test_vadc_vvm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vadc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, @@ -676,7 +676,7 @@ vuint16m4_t test_vadc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vadc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, vbool4_t carryin, @@ -686,7 +686,7 @@ vuint16m4_t test_vadc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vadc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, @@ -696,7 +696,7 @@ vuint16m8_t test_vadc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vadc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, vbool2_t carryin, @@ -706,7 +706,7 @@ vuint16m8_t test_vadc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, vbool2_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vadc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, @@ -716,7 +716,7 @@ vuint32mf2_t test_vadc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vadc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, @@ -726,7 +726,7 @@ vuint32mf2_t test_vadc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vadc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, @@ -736,7 +736,7 @@ vuint32m1_t test_vadc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( 
[[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vadc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, @@ -746,7 +746,7 @@ vuint32m1_t test_vadc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vadc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, @@ -756,7 +756,7 @@ vuint32m2_t test_vadc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vadc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, @@ -766,7 +766,7 @@ vuint32m2_t test_vadc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vadc_vvm_u32m4(vuint32m4_t op1, vuint32m4_t op2, @@ -776,7 +776,7 @@ vuint32m4_t test_vadc_vvm_u32m4(vuint32m4_t op1, 
vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vadc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, vbool8_t carryin, @@ -786,7 +786,7 @@ vuint32m4_t test_vadc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, vbool8_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vadc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, @@ -796,7 +796,7 @@ vuint32m8_t test_vadc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vadc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, vbool4_t carryin, @@ -806,7 +806,7 @@ vuint32m8_t test_vadc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, vbool4_t carryin, // CHECK-RV64-LABEL: @test_vadc_vvm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.nxv1i64.i64( undef, 
[[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vadc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, @@ -816,7 +816,7 @@ vuint64m1_t test_vadc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vadc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, @@ -826,7 +826,7 @@ vuint64m1_t test_vadc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vadc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, @@ -836,7 +836,7 @@ vuint64m2_t test_vadc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vadc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, @@ -846,7 +846,7 @@ vuint64m2_t test_vadc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vadc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vadc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, @@ -856,7 +856,7 @@ vuint64m4_t test_vadc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vadc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, @@ -866,7 +866,7 @@ vuint64m4_t test_vadc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vadc_vvm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vadc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, @@ -876,7 +876,7 @@ vuint64m8_t test_vadc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, // CHECK-RV64-LABEL: @test_vadc_vxm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vadc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[CARRYIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vadc_vxm_u64m8(vuint64m8_t op1, uint64_t op2, vbool8_t carryin, diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmerge.c index 48f0506267435..42a3ceae0bbc3 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmerge.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmerge.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmerge_vfm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfmerge_vfm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmerge_vfm_f32m1(vbool32_t mask, vfloat32m1_t op1, float op2, @@ -28,7 +28,7 @@ vfloat32m1_t test_vfmerge_vfm_f32m1(vbool32_t mask, vfloat32m1_t op1, float op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmerge_vfm_f32m2(vbool16_t mask, vfloat32m2_t op1, float op2, @@ -38,7 +38,7 @@ vfloat32m2_t test_vfmerge_vfm_f32m2(vbool16_t mask, vfloat32m2_t op1, float op2, // 
CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmerge_vfm_f32m4(vbool8_t mask, vfloat32m4_t op1, float op2, @@ -48,7 +48,7 @@ vfloat32m4_t test_vfmerge_vfm_f32m4(vbool8_t mask, vfloat32m4_t op1, float op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16f32.f32.i64( [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16f32.f32.i64( undef, [[OP1:%.*]], float [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmerge_vfm_f32m8(vbool4_t mask, vfloat32m8_t op1, float op2, @@ -58,7 +58,7 @@ vfloat32m8_t test_vfmerge_vfm_f32m8(vbool4_t mask, vfloat32m8_t op1, float op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f64.f64.i64( undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmerge_vfm_f64m1(vbool64_t mask, vfloat64m1_t op1, @@ -68,7 +68,7 @@ vfloat64m1_t test_vfmerge_vfm_f64m1(vbool64_t mask, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f64.f64.i64( 
undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmerge_vfm_f64m2(vbool32_t mask, vfloat64m2_t op1, @@ -78,7 +78,7 @@ vfloat64m2_t test_vfmerge_vfm_f64m2(vbool32_t mask, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f64.f64.i64( undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmerge_vfm_f64m4(vbool16_t mask, vfloat64m4_t op1, @@ -88,7 +88,7 @@ vfloat64m4_t test_vfmerge_vfm_f64m4(vbool16_t mask, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f64.f64.i64( [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f64.f64.i64( undef, [[OP1:%.*]], double [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmerge_vfm_f64m8(vbool8_t mask, vfloat64m8_t op1, double op2, @@ -98,7 +98,7 @@ vfloat64m8_t test_vfmerge_vfm_f64m8(vbool8_t mask, vfloat64m8_t op1, double op2, // CHECK-RV64-LABEL: @test_vfmerge_vfm_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f16.f16.i64( [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1f16.f16.i64( undef, [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmerge_vfm_f16mf4 (vbool64_t mask, vfloat16mf4_t op1, _Float16 op2, size_t vl) { @@ -107,7 +107,7 @@ vfloat16mf4_t test_vfmerge_vfm_f16mf4 (vbool64_t mask, vfloat16mf4_t op1, _Float // 
CHECK-RV64-LABEL: @test_vfmerge_vfm_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f16.f16.i64( [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2f16.f16.i64( undef, [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmerge_vfm_f16mf2 (vbool32_t mask, vfloat16mf2_t op1, _Float16 op2, size_t vl) { @@ -116,7 +116,7 @@ vfloat16mf2_t test_vfmerge_vfm_f16mf2 (vbool32_t mask, vfloat16mf2_t op1, _Float // CHECK-RV64-LABEL: @test_vfmerge_vfm_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f16.f16.i64( [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4f16.f16.i64( undef, [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmerge_vfm_f16m1 (vbool16_t mask, vfloat16m1_t op1, _Float16 op2, size_t vl) { @@ -125,7 +125,7 @@ vfloat16m1_t test_vfmerge_vfm_f16m1 (vbool16_t mask, vfloat16m1_t op1, _Float16 // CHECK-RV64-LABEL: @test_vfmerge_vfm_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f16.f16.i64( [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8f16.f16.i64( undef, [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmerge_vfm_f16m2 (vbool8_t mask, vfloat16m2_t op1, _Float16 op2, size_t vl) { @@ -134,7 +134,7 @@ vfloat16m2_t test_vfmerge_vfm_f16m2 (vbool8_t mask, vfloat16m2_t op1, _Float16 o // CHECK-RV64-LABEL: @test_vfmerge_vfm_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16f16.f16.i64( [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16f16.f16.i64( undef, [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmerge_vfm_f16m4 (vbool4_t mask, vfloat16m4_t op1, _Float16 op2, size_t vl) { @@ -143,7 +143,7 @@ vfloat16m4_t test_vfmerge_vfm_f16m4 (vbool4_t mask, vfloat16m4_t op1, _Float16 o // CHECK-RV64-LABEL: @test_vfmerge_vfm_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv32f16.f16.i64( [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv32f16.f16.i64( undef, [[OP1:%.*]], half [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmerge_vfm_f16m8 (vbool2_t mask, vfloat16m8_t op1, _Float16 op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmerge.c index a512ad402edb6..f58f90e89c394 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmerge.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmerge.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vmerge_vvm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmerge_vvm_i8mf8(vbool64_t mask, vint8mf8_t op1, vint8mf8_t op2, @@ -18,7 +18,7 @@ vint8mf8_t test_vmerge_vvm_i8mf8(vbool64_t mask, vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( 
undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmerge_vxm_i8mf8(vbool64_t mask, vint8mf8_t op1, int8_t op2, @@ -28,7 +28,7 @@ vint8mf8_t test_vmerge_vxm_i8mf8(vbool64_t mask, vint8mf8_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmerge_vvm_i8mf4(vbool32_t mask, vint8mf4_t op1, vint8mf4_t op2, @@ -38,7 +38,7 @@ vint8mf4_t test_vmerge_vvm_i8mf4(vbool32_t mask, vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmerge_vxm_i8mf4(vbool32_t mask, vint8mf4_t op1, int8_t op2, @@ -48,7 +48,7 @@ vint8mf4_t test_vmerge_vxm_i8mf4(vbool32_t mask, vint8mf4_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmerge_vvm_i8mf2(vbool16_t mask, vint8mf2_t op1, vint8mf2_t op2, @@ -58,7 +58,7 @@ vint8mf2_t test_vmerge_vvm_i8mf2(vbool16_t mask, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: 
@test_vmerge_vxm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmerge_vxm_i8mf2(vbool16_t mask, vint8mf2_t op1, int8_t op2, @@ -68,7 +68,7 @@ vint8mf2_t test_vmerge_vxm_i8mf2(vbool16_t mask, vint8mf2_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmerge_vvm_i8m1(vbool8_t mask, vint8m1_t op1, vint8m1_t op2, @@ -78,7 +78,7 @@ vint8m1_t test_vmerge_vvm_i8m1(vbool8_t mask, vint8m1_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmerge_vxm_i8m1(vbool8_t mask, vint8m1_t op1, int8_t op2, @@ -88,7 +88,7 @@ vint8m1_t test_vmerge_vxm_i8m1(vbool8_t mask, vint8m1_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmerge_vvm_i8m2(vbool4_t mask, vint8m2_t op1, vint8m2_t op2, @@ -98,7 +98,7 @@ vint8m2_t test_vmerge_vvm_i8m2(vbool4_t mask, vint8m2_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmerge_vxm_i8m2(vbool4_t mask, vint8m2_t op1, int8_t op2, @@ -108,7 +108,7 @@ vint8m2_t test_vmerge_vxm_i8m2(vbool4_t mask, vint8m2_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmerge_vvm_i8m4(vbool2_t mask, vint8m4_t op1, vint8m4_t op2, @@ -118,7 +118,7 @@ vint8m4_t test_vmerge_vvm_i8m4(vbool2_t mask, vint8m4_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmerge_vxm_i8m4(vbool2_t mask, vint8m4_t op1, int8_t op2, @@ -128,7 +128,7 @@ vint8m4_t test_vmerge_vxm_i8m4(vbool2_t mask, vint8m4_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmerge_vvm_i8m8(vbool1_t mask, vint8m8_t op1, vint8m8_t op2, @@ -138,7 +138,7 @@ vint8m8_t test_vmerge_vvm_i8m8(vbool1_t mask, vint8m8_t op1, vint8m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmerge_vxm_i8m8(vbool1_t mask, vint8m8_t op1, int8_t op2, @@ -148,7 +148,7 @@ vint8m8_t test_vmerge_vxm_i8m8(vbool1_t mask, vint8m8_t op1, int8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmerge_vvm_i16mf4(vbool64_t mask, vint16mf4_t op1, @@ -158,7 +158,7 @@ vint16mf4_t test_vmerge_vvm_i16mf4(vbool64_t mask, vint16mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmerge_vxm_i16mf4(vbool64_t mask, vint16mf4_t 
op1, int16_t op2, @@ -168,7 +168,7 @@ vint16mf4_t test_vmerge_vxm_i16mf4(vbool64_t mask, vint16mf4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmerge_vvm_i16mf2(vbool32_t mask, vint16mf2_t op1, @@ -178,7 +178,7 @@ vint16mf2_t test_vmerge_vvm_i16mf2(vbool32_t mask, vint16mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmerge_vxm_i16mf2(vbool32_t mask, vint16mf2_t op1, int16_t op2, @@ -188,7 +188,7 @@ vint16mf2_t test_vmerge_vxm_i16mf2(vbool32_t mask, vint16mf2_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmerge_vvm_i16m1(vbool16_t mask, vint16m1_t op1, vint16m1_t op2, @@ -198,7 +198,7 @@ vint16m1_t test_vmerge_vvm_i16m1(vbool16_t mask, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], 
[[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmerge_vxm_i16m1(vbool16_t mask, vint16m1_t op1, int16_t op2, @@ -208,7 +208,7 @@ vint16m1_t test_vmerge_vxm_i16m1(vbool16_t mask, vint16m1_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmerge_vvm_i16m2(vbool8_t mask, vint16m2_t op1, vint16m2_t op2, @@ -218,7 +218,7 @@ vint16m2_t test_vmerge_vvm_i16m2(vbool8_t mask, vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmerge_vxm_i16m2(vbool8_t mask, vint16m2_t op1, int16_t op2, @@ -228,7 +228,7 @@ vint16m2_t test_vmerge_vxm_i16m2(vbool8_t mask, vint16m2_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmerge_vvm_i16m4(vbool4_t mask, vint16m4_t op1, vint16m4_t op2, @@ 
-238,7 +238,7 @@ vint16m4_t test_vmerge_vvm_i16m4(vbool4_t mask, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmerge_vxm_i16m4(vbool4_t mask, vint16m4_t op1, int16_t op2, @@ -248,7 +248,7 @@ vint16m4_t test_vmerge_vxm_i16m4(vbool4_t mask, vint16m4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmerge_vvm_i16m8(vbool2_t mask, vint16m8_t op1, vint16m8_t op2, @@ -258,7 +258,7 @@ vint16m8_t test_vmerge_vvm_i16m8(vbool2_t mask, vint16m8_t op1, vint16m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmerge_vxm_i16m8(vbool2_t mask, vint16m8_t op1, int16_t op2, @@ -268,7 +268,7 @@ vint16m8_t test_vmerge_vxm_i16m8(vbool2_t mask, vint16m8_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmerge_vvm_i32mf2(vbool64_t mask, vint32mf2_t op1, @@ -278,7 +278,7 @@ vint32mf2_t test_vmerge_vvm_i32mf2(vbool64_t mask, vint32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmerge_vxm_i32mf2(vbool64_t mask, vint32mf2_t op1, int32_t op2, @@ -288,7 +288,7 @@ vint32mf2_t test_vmerge_vxm_i32mf2(vbool64_t mask, vint32mf2_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmerge_vvm_i32m1(vbool32_t mask, vint32m1_t op1, vint32m1_t op2, @@ -298,7 +298,7 @@ vint32m1_t test_vmerge_vvm_i32m1(vbool32_t mask, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmerge_vxm_i32m1(vbool32_t mask, vint32m1_t op1, int32_t op2, @@ -308,7 +308,7 @@ vint32m1_t 
test_vmerge_vxm_i32m1(vbool32_t mask, vint32m1_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmerge_vvm_i32m2(vbool16_t mask, vint32m2_t op1, vint32m2_t op2, @@ -318,7 +318,7 @@ vint32m2_t test_vmerge_vvm_i32m2(vbool16_t mask, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmerge_vxm_i32m2(vbool16_t mask, vint32m2_t op1, int32_t op2, @@ -328,7 +328,7 @@ vint32m2_t test_vmerge_vxm_i32m2(vbool16_t mask, vint32m2_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmerge_vvm_i32m4(vbool8_t mask, vint32m4_t op1, vint32m4_t op2, @@ -338,7 +338,7 @@ vint32m4_t test_vmerge_vvm_i32m4(vbool8_t mask, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmerge_vxm_i32m4(vbool8_t mask, vint32m4_t op1, int32_t op2, @@ -348,7 +348,7 @@ vint32m4_t test_vmerge_vxm_i32m4(vbool8_t mask, vint32m4_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmerge_vvm_i32m8(vbool4_t mask, vint32m8_t op1, vint32m8_t op2, @@ -358,7 +358,7 @@ vint32m8_t test_vmerge_vvm_i32m8(vbool4_t mask, vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmerge_vxm_i32m8(vbool4_t mask, vint32m8_t op1, int32_t op2, @@ -368,7 +368,7 @@ vint32m8_t test_vmerge_vxm_i32m8(vbool4_t mask, vint32m8_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmerge_vvm_i64m1(vbool64_t mask, vint64m1_t op1, vint64m1_t op2, @@ -378,7 +378,7 @@ vint64m1_t 
test_vmerge_vvm_i64m1(vbool64_t mask, vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmerge_vxm_i64m1(vbool64_t mask, vint64m1_t op1, int64_t op2, @@ -388,7 +388,7 @@ vint64m1_t test_vmerge_vxm_i64m1(vbool64_t mask, vint64m1_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmerge_vvm_i64m2(vbool32_t mask, vint64m2_t op1, vint64m2_t op2, @@ -398,7 +398,7 @@ vint64m2_t test_vmerge_vvm_i64m2(vbool32_t mask, vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmerge_vxm_i64m2(vbool32_t mask, vint64m2_t op1, int64_t op2, @@ -408,7 +408,7 @@ vint64m2_t test_vmerge_vxm_i64m2(vbool32_t mask, vint64m2_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmerge_vvm_i64m4(vbool16_t mask, vint64m4_t op1, vint64m4_t op2, @@ -418,7 +418,7 @@ vint64m4_t test_vmerge_vvm_i64m4(vbool16_t mask, vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmerge_vxm_i64m4(vbool16_t mask, vint64m4_t op1, int64_t op2, @@ -428,7 +428,7 @@ vint64m4_t test_vmerge_vxm_i64m4(vbool16_t mask, vint64m4_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmerge_vvm_i64m8(vbool8_t mask, vint64m8_t op1, vint64m8_t op2, @@ -438,7 +438,7 @@ vint64m8_t test_vmerge_vvm_i64m8(vbool8_t mask, vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmerge_vxm_i64m8(vbool8_t mask, vint64m8_t op1, int64_t op2, @@ -448,7 +448,7 @@ vint64m8_t 
test_vmerge_vxm_i64m8(vbool8_t mask, vint64m8_t op1, int64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmerge_vvm_u8mf8(vbool64_t mask, vuint8mf8_t op1, @@ -458,7 +458,7 @@ vuint8mf8_t test_vmerge_vvm_u8mf8(vbool64_t mask, vuint8mf8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmerge_vxm_u8mf8(vbool64_t mask, vuint8mf8_t op1, uint8_t op2, @@ -468,7 +468,7 @@ vuint8mf8_t test_vmerge_vxm_u8mf8(vbool64_t mask, vuint8mf8_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmerge_vvm_u8mf4(vbool32_t mask, vuint8mf4_t op1, @@ -478,7 +478,7 @@ vuint8mf4_t test_vmerge_vvm_u8mf4(vbool32_t mask, vuint8mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i8.i8.i64( undef, 
[[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmerge_vxm_u8mf4(vbool32_t mask, vuint8mf4_t op1, uint8_t op2, @@ -488,7 +488,7 @@ vuint8mf4_t test_vmerge_vxm_u8mf4(vbool32_t mask, vuint8mf4_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmerge_vvm_u8mf2(vbool16_t mask, vuint8mf2_t op1, @@ -498,7 +498,7 @@ vuint8mf2_t test_vmerge_vvm_u8mf2(vbool16_t mask, vuint8mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmerge_vxm_u8mf2(vbool16_t mask, vuint8mf2_t op1, uint8_t op2, @@ -508,7 +508,7 @@ vuint8mf2_t test_vmerge_vxm_u8mf2(vbool16_t mask, vuint8mf2_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmerge_vvm_u8m1(vbool8_t mask, vuint8m1_t op1, vuint8m1_t op2, @@ -518,7 +518,7 @@ vuint8m1_t test_vmerge_vvm_u8m1(vbool8_t mask, vuint8m1_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m1( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmerge_vxm_u8m1(vbool8_t mask, vuint8m1_t op1, uint8_t op2, @@ -528,7 +528,7 @@ vuint8m1_t test_vmerge_vxm_u8m1(vbool8_t mask, vuint8m1_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmerge_vvm_u8m2(vbool4_t mask, vuint8m2_t op1, vuint8m2_t op2, @@ -538,7 +538,7 @@ vuint8m2_t test_vmerge_vvm_u8m2(vbool4_t mask, vuint8m2_t op1, vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmerge_vxm_u8m2(vbool4_t mask, vuint8m2_t op1, uint8_t op2, @@ -548,7 +548,7 @@ vuint8m2_t test_vmerge_vxm_u8m2(vbool4_t mask, vuint8m2_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmerge_vvm_u8m4(vbool2_t mask, vuint8m4_t op1, vuint8m4_t op2, @@ -558,7 +558,7 @@ vuint8m4_t test_vmerge_vvm_u8m4(vbool2_t mask, vuint8m4_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmerge_vxm_u8m4(vbool2_t mask, vuint8m4_t op1, uint8_t op2, @@ -568,7 +568,7 @@ vuint8m4_t test_vmerge_vxm_u8m4(vbool2_t mask, vuint8m4_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmerge_vvm_u8m8(vbool1_t mask, vuint8m8_t op1, vuint8m8_t op2, @@ -578,7 +578,7 @@ vuint8m8_t test_vmerge_vvm_u8m8(vbool1_t mask, vuint8m8_t op1, vuint8m8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vxm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmerge_vxm_u8m8(vbool1_t mask, vuint8m8_t op1, uint8_t op2, @@ -588,7 +588,7 @@ vuint8m8_t test_vmerge_vxm_u8m8(vbool1_t mask, vuint8m8_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmerge_vvm_u16mf4(vbool64_t mask, vuint16mf4_t op1, @@ -598,7 +598,7 @@ vuint16mf4_t test_vmerge_vvm_u16mf4(vbool64_t mask, vuint16mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmerge_vxm_u16mf4(vbool64_t mask, vuint16mf4_t op1, @@ -608,7 +608,7 @@ vuint16mf4_t test_vmerge_vxm_u16mf4(vbool64_t mask, vuint16mf4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmerge_vvm_u16mf2(vbool32_t mask, vuint16mf2_t op1, @@ -618,7 +618,7 @@ vuint16mf2_t test_vmerge_vvm_u16mf2(vbool32_t mask, vuint16mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmerge_vxm_u16mf2(vbool32_t mask, vuint16mf2_t 
op1, @@ -628,7 +628,7 @@ vuint16mf2_t test_vmerge_vxm_u16mf2(vbool32_t mask, vuint16mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmerge_vvm_u16m1(vbool16_t mask, vuint16m1_t op1, @@ -638,7 +638,7 @@ vuint16m1_t test_vmerge_vvm_u16m1(vbool16_t mask, vuint16m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmerge_vxm_u16m1(vbool16_t mask, vuint16m1_t op1, uint16_t op2, @@ -648,7 +648,7 @@ vuint16m1_t test_vmerge_vxm_u16m1(vbool16_t mask, vuint16m1_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmerge_vvm_u16m2(vbool8_t mask, vuint16m2_t op1, @@ -658,7 +658,7 @@ vuint16m2_t test_vmerge_vvm_u16m2(vbool8_t mask, vuint16m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmerge.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmerge_vxm_u16m2(vbool8_t mask, vuint16m2_t op1, uint16_t op2, @@ -668,7 +668,7 @@ vuint16m2_t test_vmerge_vxm_u16m2(vbool8_t mask, vuint16m2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmerge_vvm_u16m4(vbool4_t mask, vuint16m4_t op1, @@ -678,7 +678,7 @@ vuint16m4_t test_vmerge_vvm_u16m4(vbool4_t mask, vuint16m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmerge_vxm_u16m4(vbool4_t mask, vuint16m4_t op1, uint16_t op2, @@ -688,7 +688,7 @@ vuint16m4_t test_vmerge_vxm_u16m4(vbool4_t mask, vuint16m4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmerge_vvm_u16m8(vbool2_t mask, vuint16m8_t op1, @@ -698,7 +698,7 @@ vuint16m8_t test_vmerge_vvm_u16m8(vbool2_t mask, vuint16m8_t op1, // 
CHECK-RV64-LABEL: @test_vmerge_vxm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmerge_vxm_u16m8(vbool2_t mask, vuint16m8_t op1, uint16_t op2, @@ -708,7 +708,7 @@ vuint16m8_t test_vmerge_vxm_u16m8(vbool2_t mask, vuint16m8_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmerge_vvm_u32mf2(vbool64_t mask, vuint32mf2_t op1, @@ -718,7 +718,7 @@ vuint32mf2_t test_vmerge_vvm_u32mf2(vbool64_t mask, vuint32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmerge_vxm_u32mf2(vbool64_t mask, vuint32mf2_t op1, @@ -728,7 +728,7 @@ vuint32mf2_t test_vmerge_vxm_u32mf2(vbool64_t mask, vuint32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], 
[[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmerge_vvm_u32m1(vbool32_t mask, vuint32m1_t op1, @@ -738,7 +738,7 @@ vuint32m1_t test_vmerge_vvm_u32m1(vbool32_t mask, vuint32m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmerge_vxm_u32m1(vbool32_t mask, vuint32m1_t op1, uint32_t op2, @@ -748,7 +748,7 @@ vuint32m1_t test_vmerge_vxm_u32m1(vbool32_t mask, vuint32m1_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmerge_vvm_u32m2(vbool16_t mask, vuint32m2_t op1, @@ -758,7 +758,7 @@ vuint32m2_t test_vmerge_vvm_u32m2(vbool16_t mask, vuint32m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmerge_vxm_u32m2(vbool16_t mask, vuint32m2_t op1, uint32_t op2, @@ -768,7 +768,7 @@ vuint32m2_t test_vmerge_vxm_u32m2(vbool16_t mask, vuint32m2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmerge_vvm_u32m4(vbool8_t mask, vuint32m4_t op1, @@ -778,7 +778,7 @@ vuint32m4_t test_vmerge_vvm_u32m4(vbool8_t mask, vuint32m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmerge_vxm_u32m4(vbool8_t mask, vuint32m4_t op1, uint32_t op2, @@ -788,7 +788,7 @@ vuint32m4_t test_vmerge_vxm_u32m4(vbool8_t mask, vuint32m4_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmerge_vvm_u32m8(vbool4_t mask, vuint32m8_t op1, @@ -798,7 +798,7 @@ vuint32m8_t test_vmerge_vvm_u32m8(vbool4_t mask, vuint32m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmerge_vxm_u32m8(vbool4_t 
mask, vuint32m8_t op1, uint32_t op2, @@ -808,7 +808,7 @@ vuint32m8_t test_vmerge_vxm_u32m8(vbool4_t mask, vuint32m8_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmerge_vvm_u64m1(vbool64_t mask, vuint64m1_t op1, @@ -818,7 +818,7 @@ vuint64m1_t test_vmerge_vvm_u64m1(vbool64_t mask, vuint64m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmerge_vxm_u64m1(vbool64_t mask, vuint64m1_t op1, uint64_t op2, @@ -828,7 +828,7 @@ vuint64m1_t test_vmerge_vxm_u64m1(vbool64_t mask, vuint64m1_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmerge_vvm_u64m2(vbool32_t mask, vuint64m2_t op1, @@ -838,7 +838,7 @@ vuint64m2_t test_vmerge_vvm_u64m2(vbool32_t mask, vuint64m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmerge_vxm_u64m2(vbool32_t mask, vuint64m2_t op1, uint64_t op2, @@ -848,7 +848,7 @@ vuint64m2_t test_vmerge_vxm_u64m2(vbool32_t mask, vuint64m2_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmerge_vvm_u64m4(vbool16_t mask, vuint64m4_t op1, @@ -858,7 +858,7 @@ vuint64m4_t test_vmerge_vvm_u64m4(vbool16_t mask, vuint64m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmerge_vxm_u64m4(vbool16_t mask, vuint64m4_t op1, uint64_t op2, @@ -868,7 +868,7 @@ vuint64m4_t test_vmerge_vxm_u64m4(vbool16_t mask, vuint64m4_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmerge_vvm_u64m8(vbool8_t mask, vuint64m8_t op1, @@ -878,7 +878,7 @@ vuint64m8_t 
test_vmerge_vvm_u64m8(vbool8_t mask, vuint64m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vxm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmerge_vxm_u64m8(vbool8_t mask, vuint64m8_t op1, uint64_t op2, @@ -888,7 +888,7 @@ vuint64m8_t test_vmerge_vxm_u64m8(vbool8_t mask, vuint64m8_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f32.nxv1f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f32.nxv1f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vmerge_vvm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, @@ -898,7 +898,7 @@ vfloat32mf2_t test_vmerge_vvm_f32mf2(vbool64_t mask, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f32.nxv2f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f32.nxv2f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vmerge_vvm_f32m1(vbool32_t mask, vfloat32m1_t op1, @@ -908,7 +908,7 @@ vfloat32m1_t test_vmerge_vvm_f32m1(vbool32_t mask, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f32.nxv4f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmerge.nxv4f32.nxv4f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vmerge_vvm_f32m2(vbool16_t mask, vfloat32m2_t op1, @@ -918,7 +918,7 @@ vfloat32m2_t test_vmerge_vvm_f32m2(vbool16_t mask, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f32.nxv8f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f32.nxv8f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vmerge_vvm_f32m4(vbool8_t mask, vfloat32m4_t op1, @@ -928,7 +928,7 @@ vfloat32m4_t test_vmerge_vvm_f32m4(vbool8_t mask, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16f32.nxv16f32.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16f32.nxv16f32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vmerge_vvm_f32m8(vbool4_t mask, vfloat32m8_t op1, @@ -938,7 +938,7 @@ vfloat32m8_t test_vmerge_vvm_f32m8(vbool4_t mask, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f64.nxv1f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f64.nxv1f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vmerge_vvm_f64m1(vbool64_t mask, vfloat64m1_t op1, @@ -948,7 +948,7 @@ vfloat64m1_t test_vmerge_vvm_f64m1(vbool64_t mask, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m2( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f64.nxv2f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f64.nxv2f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vmerge_vvm_f64m2(vbool32_t mask, vfloat64m2_t op1, @@ -958,7 +958,7 @@ vfloat64m2_t test_vmerge_vvm_f64m2(vbool32_t mask, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f64.nxv4f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f64.nxv4f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vmerge_vvm_f64m4(vbool16_t mask, vfloat64m4_t op1, @@ -968,7 +968,7 @@ vfloat64m4_t test_vmerge_vvm_f64m4(vbool16_t mask, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f64.nxv8f64.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f64.nxv8f64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vmerge_vvm_f64m8(vbool8_t mask, vfloat64m8_t op1, @@ -978,7 +978,7 @@ vfloat64m8_t test_vmerge_vvm_f64m8(vbool8_t mask, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vmerge_vvm_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f16.nxv1f16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv1f16.nxv1f16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t 
test_vmerge_vvm_f16mf4 (vbool64_t mask, vfloat16mf4_t op1, vfloat16mf4_t op2, size_t vl) { @@ -987,7 +987,7 @@ vfloat16mf4_t test_vmerge_vvm_f16mf4 (vbool64_t mask, vfloat16mf4_t op1, vfloat1 // CHECK-RV64-LABEL: @test_vmerge_vvm_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f16.nxv2f16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv2f16.nxv2f16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vmerge_vvm_f16mf2 (vbool32_t mask, vfloat16mf2_t op1, vfloat16mf2_t op2, size_t vl) { @@ -996,7 +996,7 @@ vfloat16mf2_t test_vmerge_vvm_f16mf2 (vbool32_t mask, vfloat16mf2_t op1, vfloat1 // CHECK-RV64-LABEL: @test_vmerge_vvm_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f16.nxv4f16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv4f16.nxv4f16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vmerge_vvm_f16m1 (vbool16_t mask, vfloat16m1_t op1, vfloat16m1_t op2, size_t vl) { @@ -1005,7 +1005,7 @@ vfloat16m1_t test_vmerge_vvm_f16m1 (vbool16_t mask, vfloat16m1_t op1, vfloat16m1 // CHECK-RV64-LABEL: @test_vmerge_vvm_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f16.nxv8f16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv8f16.nxv8f16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vmerge_vvm_f16m2 (vbool8_t mask, vfloat16m2_t op1, vfloat16m2_t op2, size_t vl) { @@ -1014,7 +1014,7 @@ vfloat16m2_t test_vmerge_vvm_f16m2 (vbool8_t mask, vfloat16m2_t op1, vfloat16m2_ // CHECK-RV64-LABEL: 
@test_vmerge_vvm_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16f16.nxv16f16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv16f16.nxv16f16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vmerge_vvm_f16m4 (vbool4_t mask, vfloat16m4_t op1, vfloat16m4_t op2, size_t vl) { @@ -1023,7 +1023,7 @@ vfloat16m4_t test_vmerge_vvm_f16m4 (vbool4_t mask, vfloat16m4_t op1, vfloat16m4_ // CHECK-RV64-LABEL: @test_vmerge_vvm_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32f16.nxv32f16.i64( [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmerge.nxv32f16.nxv32f16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[MASK:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vmerge_vvm_f16m8 (vbool2_t mask, vfloat16m8_t op1, vfloat16m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vsbc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vsbc.c index b3cac73c30f7e..dc224bcacda84 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vsbc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vsbc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vsbc_vvm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vsbc_vvm_i8mf8(vint8mf8_t op1, vint8mf8_t op2, @@ -16,7 +16,7 @@ vint8mf8_t test_vsbc_vvm_i8mf8(vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vsbc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t borrowin, @@ -26,7 +26,7 @@ vint8mf8_t test_vsbc_vxm_i8mf8(vint8mf8_t op1, int8_t op2, vbool64_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vsbc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, @@ -36,7 +36,7 @@ vint8mf4_t test_vsbc_vvm_i8mf4(vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vsbc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t borrowin, @@ -46,7 +46,7 @@ vint8mf4_t test_vsbc_vxm_i8mf4(vint8mf4_t op1, int8_t op2, vbool32_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vsbc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, @@ 
-56,7 +56,7 @@ vint8mf2_t test_vsbc_vvm_i8mf2(vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vsbc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t borrowin, @@ -66,7 +66,7 @@ vint8mf2_t test_vsbc_vxm_i8mf2(vint8mf2_t op1, int8_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vsbc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t borrowin, @@ -76,7 +76,7 @@ vint8m1_t test_vsbc_vvm_i8m1(vint8m1_t op1, vint8m1_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vsbc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t borrowin, @@ -86,7 +86,7 @@ vint8m1_t test_vsbc_vxm_i8m1(vint8m1_t op1, int8_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vsbc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t borrowin, @@ -96,7 +96,7 @@ vint8m2_t test_vsbc_vvm_i8m2(vint8m2_t op1, vint8m2_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vsbc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t borrowin, @@ -106,7 +106,7 @@ vint8m2_t test_vsbc_vxm_i8m2(vint8m2_t op1, int8_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vsbc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t borrowin, @@ -116,7 +116,7 @@ vint8m4_t test_vsbc_vvm_i8m4(vint8m4_t op1, vint8m4_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vsbc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t borrowin, @@ -126,7 +126,7 @@ vint8m4_t test_vsbc_vxm_i8m4(vint8m4_t op1, int8_t op2, vbool2_t borrowin, // 
CHECK-RV64-LABEL: @test_vsbc_vvm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vsbc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t borrowin, @@ -136,7 +136,7 @@ vint8m8_t test_vsbc_vvm_i8m8(vint8m8_t op1, vint8m8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vsbc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t borrowin, @@ -146,7 +146,7 @@ vint8m8_t test_vsbc_vxm_i8m8(vint8m8_t op1, int8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vsbc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, @@ -156,7 +156,7 @@ vint16mf4_t test_vsbc_vvm_i16mf4(vint16mf4_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], 
[[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vsbc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, @@ -166,7 +166,7 @@ vint16mf4_t test_vsbc_vxm_i16mf4(vint16mf4_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vsbc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, @@ -176,7 +176,7 @@ vint16mf2_t test_vsbc_vvm_i16mf2(vint16mf2_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vsbc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, @@ -186,7 +186,7 @@ vint16mf2_t test_vsbc_vxm_i16mf2(vint16mf2_t op1, int16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vsbc_vvm_i16m1(vint16m1_t op1, vint16m1_t op2, @@ -196,7 +196,7 @@ vint16m1_t test_vsbc_vvm_i16m1(vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( [[OP1:%.*]], i16 
[[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vsbc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t borrowin, @@ -206,7 +206,7 @@ vint16m1_t test_vsbc_vxm_i16m1(vint16m1_t op1, int16_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vsbc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, @@ -216,7 +216,7 @@ vint16m2_t test_vsbc_vvm_i16m2(vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vsbc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t borrowin, @@ -226,7 +226,7 @@ vint16m2_t test_vsbc_vxm_i16m2(vint16m2_t op1, int16_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vsbc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, @@ -236,7 +236,7 @@ 
vint16m4_t test_vsbc_vvm_i16m4(vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vsbc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t borrowin, @@ -246,7 +246,7 @@ vint16m4_t test_vsbc_vxm_i16m4(vint16m4_t op1, int16_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vsbc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, @@ -256,7 +256,7 @@ vint16m8_t test_vsbc_vvm_i16m8(vint16m8_t op1, vint16m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vsbc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t borrowin, @@ -266,7 +266,7 @@ vint16m8_t test_vsbc_vxm_i16m8(vint16m8_t op1, int16_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vsbc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, @@ -276,7 +276,7 @@ vint32mf2_t test_vsbc_vvm_i32mf2(vint32mf2_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vsbc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, @@ -286,7 +286,7 @@ vint32mf2_t test_vsbc_vxm_i32mf2(vint32mf2_t op1, int32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vsbc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, @@ -296,7 +296,7 @@ vint32m1_t test_vsbc_vvm_i32m1(vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vsbc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t borrowin, @@ -306,7 +306,7 @@ vint32m1_t test_vsbc_vxm_i32m1(vint32m1_t op1, int32_t op2, vbool32_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m2( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vsbc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, @@ -316,7 +316,7 @@ vint32m2_t test_vsbc_vvm_i32m2(vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vsbc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t borrowin, @@ -326,7 +326,7 @@ vint32m2_t test_vsbc_vxm_i32m2(vint32m2_t op1, int32_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vsbc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, @@ -336,7 +336,7 @@ vint32m4_t test_vsbc_vvm_i32m4(vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vint32m4_t test_vsbc_vxm_i32m4(vint32m4_t op1, int32_t op2, vbool8_t borrowin, @@ -346,7 +346,7 @@ vint32m4_t test_vsbc_vxm_i32m4(vint32m4_t op1, int32_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vsbc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, @@ -356,7 +356,7 @@ vint32m8_t test_vsbc_vvm_i32m8(vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vsbc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t borrowin, @@ -366,7 +366,7 @@ vint32m8_t test_vsbc_vxm_i32m8(vint32m8_t op1, int32_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vsbc_vvm_i64m1(vint64m1_t op1, vint64m1_t op2, @@ -376,7 +376,7 @@ vint64m1_t test_vsbc_vvm_i64m1(vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( [[OP1:%.*]], i64 
[[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vsbc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t borrowin, @@ -386,7 +386,7 @@ vint64m1_t test_vsbc_vxm_i64m1(vint64m1_t op1, int64_t op2, vbool64_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vsbc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, @@ -396,7 +396,7 @@ vint64m2_t test_vsbc_vvm_i64m2(vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vsbc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t borrowin, @@ -406,7 +406,7 @@ vint64m2_t test_vsbc_vxm_i64m2(vint64m2_t op1, int64_t op2, vbool32_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vsbc_vvm_i64m4(vint64m4_t op1, vint64m4_t op2, @@ -416,7 +416,7 @@ 
vint64m4_t test_vsbc_vvm_i64m4(vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vsbc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t borrowin, @@ -426,7 +426,7 @@ vint64m4_t test_vsbc_vxm_i64m4(vint64m4_t op1, int64_t op2, vbool16_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vsbc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, @@ -436,7 +436,7 @@ vint64m8_t test_vsbc_vvm_i64m8(vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vsbc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t borrowin, @@ -446,7 +446,7 @@ vint64m8_t test_vsbc_vxm_i64m8(vint64m8_t op1, int64_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv1i8.nxv1i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vsbc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, @@ -456,7 +456,7 @@ vuint8mf8_t test_vsbc_vvm_u8mf8(vuint8mf8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vsbc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, @@ -466,7 +466,7 @@ vuint8mf8_t test_vsbc_vxm_u8mf8(vuint8mf8_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.nxv2i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vsbc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, @@ -476,7 +476,7 @@ vuint8mf4_t test_vsbc_vvm_u8mf4(vuint8mf4_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vsbc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, @@ -486,7 +486,7 @@ vuint8mf4_t test_vsbc_vxm_u8mf4(vuint8mf4_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.nxv4i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vsbc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, @@ -496,7 +496,7 @@ vuint8mf2_t test_vsbc_vvm_u8mf2(vuint8mf2_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vsbc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, @@ -506,7 +506,7 @@ vuint8mf2_t test_vsbc_vxm_u8mf2(vuint8mf2_t op1, uint8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.nxv8i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vsbc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t borrowin, @@ -516,7 +516,7 @@ vuint8m1_t test_vsbc_vvm_u8m1(vuint8m1_t op1, vuint8m1_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vsbc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t borrowin, @@ -526,7 
+526,7 @@ vuint8m1_t test_vsbc_vxm_u8m1(vuint8m1_t op1, uint8_t op2, vbool8_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.nxv16i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vsbc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t borrowin, @@ -536,7 +536,7 @@ vuint8m2_t test_vsbc_vvm_u8m2(vuint8m2_t op1, vuint8m2_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vsbc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t borrowin, @@ -546,7 +546,7 @@ vuint8m2_t test_vsbc_vxm_u8m2(vuint8m2_t op1, uint8_t op2, vbool4_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.nxv32i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vsbc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t borrowin, @@ -556,7 +556,7 @@ vuint8m4_t test_vsbc_vvm_u8m4(vuint8m4_t op1, vuint8m4_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vsbc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t borrowin, @@ -566,7 +566,7 @@ vuint8m4_t test_vsbc_vxm_u8m4(vuint8m4_t op1, uint8_t op2, vbool2_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.nxv64i8.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vsbc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, vbool1_t borrowin, @@ -576,7 +576,7 @@ vuint8m8_t test_vsbc_vvm_u8m8(vuint8m8_t op1, vuint8m8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vxm_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.i8.i64( [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv64i8.i8.i64( undef, [[OP1:%.*]], i8 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vsbc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t borrowin, @@ -586,7 +586,7 @@ vuint8m8_t test_vsbc_vxm_u8m8(vuint8m8_t op1, uint8_t op2, vbool1_t borrowin, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.nxv1i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vsbc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, @@ -596,7 +596,7 @@ vuint16mf4_t 
test_vsbc_vvm_u16mf4(vuint16mf4_t op1, vuint16mf4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vsbc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, @@ -606,7 +606,7 @@ vuint16mf4_t test_vsbc_vxm_u16mf4(vuint16mf4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.nxv2i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vsbc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, @@ -616,7 +616,7 @@ vuint16mf2_t test_vsbc_vvm_u16mf2(vuint16mf2_t op1, vuint16mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vsbc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, @@ -626,7 +626,7 @@ vuint16mf2_t test_vsbc_vxm_u16mf2(vuint16mf2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.nxv4i16.i64( undef, 
[[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vsbc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, @@ -636,7 +636,7 @@ vuint16m1_t test_vsbc_vvm_u16m1(vuint16m1_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vsbc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, @@ -646,7 +646,7 @@ vuint16m1_t test_vsbc_vxm_u16m1(vuint16m1_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.nxv8i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vsbc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, @@ -656,7 +656,7 @@ vuint16m2_t test_vsbc_vvm_u16m2(vuint16m2_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vsbc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, @@ -666,7 +666,7 @@ vuint16m2_t test_vsbc_vxm_u16m2(vuint16m2_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.nxv16i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vsbc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, @@ -676,7 +676,7 @@ vuint16m4_t test_vsbc_vvm_u16m4(vuint16m4_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vsbc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, @@ -686,7 +686,7 @@ vuint16m4_t test_vsbc_vxm_u16m4(vuint16m4_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.nxv32i16.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vsbc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, @@ -696,7 +696,7 @@ vuint16m8_t test_vsbc_vvm_u16m8(vuint16m8_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv32i16.i16.i64( undef, [[OP1:%.*]], i16 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vsbc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, @@ -706,7 +706,7 @@ 
vuint16m8_t test_vsbc_vxm_u16m8(vuint16m8_t op1, uint16_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.nxv1i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vsbc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, @@ -716,7 +716,7 @@ vuint32mf2_t test_vsbc_vvm_u32mf2(vuint32mf2_t op1, vuint32mf2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vsbc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, @@ -726,7 +726,7 @@ vuint32mf2_t test_vsbc_vxm_u32mf2(vuint32mf2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.nxv2i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vsbc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, @@ -736,7 +736,7 @@ vuint32m1_t test_vsbc_vvm_u32m1(vuint32m1_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i32.i32.i64( undef, 
[[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vsbc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, @@ -746,7 +746,7 @@ vuint32m1_t test_vsbc_vxm_u32m1(vuint32m1_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.nxv4i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vsbc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, @@ -756,7 +756,7 @@ vuint32m2_t test_vsbc_vvm_u32m2(vuint32m2_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vsbc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, @@ -766,7 +766,7 @@ vuint32m2_t test_vsbc_vxm_u32m2(vuint32m2_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.nxv8i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vsbc_vvm_u32m4(vuint32m4_t op1, vuint32m4_t op2, @@ -776,7 +776,7 @@ vuint32m4_t test_vsbc_vvm_u32m4(vuint32m4_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vsbc.nxv8i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vsbc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, @@ -786,7 +786,7 @@ vuint32m4_t test_vsbc_vxm_u32m4(vuint32m4_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.nxv16i32.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vsbc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, @@ -796,7 +796,7 @@ vuint32m8_t test_vsbc_vvm_u32m8(vuint32m8_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv16i32.i32.i64( undef, [[OP1:%.*]], i32 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vsbc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, @@ -806,7 +806,7 @@ vuint32m8_t test_vsbc_vxm_u32m8(vuint32m8_t op1, uint32_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.nxv1i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vsbc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, @@ -816,7 +816,7 @@ 
vuint64m1_t test_vsbc_vvm_u64m1(vuint64m1_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv1i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vsbc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, @@ -826,7 +826,7 @@ vuint64m1_t test_vsbc_vxm_u64m1(vuint64m1_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.nxv2i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vsbc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, @@ -836,7 +836,7 @@ vuint64m2_t test_vsbc_vvm_u64m2(vuint64m2_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv2i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vsbc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, @@ -846,7 +846,7 @@ vuint64m2_t test_vsbc_vxm_u64m2(vuint64m2_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.nxv4i64.i64( undef, [[OP1:%.*]], 
[[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vsbc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, @@ -856,7 +856,7 @@ vuint64m4_t test_vsbc_vvm_u64m4(vuint64m4_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv4i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vsbc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, @@ -866,7 +866,7 @@ vuint64m4_t test_vsbc_vxm_u64m4(vuint64m4_t op1, uint64_t op2, // CHECK-RV64-LABEL: @test_vsbc_vvm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.nxv8i64.i64( undef, [[OP1:%.*]], [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vsbc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, @@ -876,7 +876,7 @@ vuint64m8_t test_vsbc_vvm_u64m8(vuint64m8_t op1, vuint64m8_t op2, // CHECK-RV64-LABEL: @test_vsbc_vxm_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vsbc.nxv8i64.i64.i64( undef, [[OP1:%.*]], i64 [[OP2:%.*]], [[BORROWIN:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vsbc_vxm_u64m8(vuint64m8_t op1, uint64_t op2, diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 553c7e00a244d..2c338f139cdb5 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td 
@@ -527,15 +527,15 @@ let TargetPrefix = "riscv" in { let VLOperand = 4; } // For binary operations with V0 as input. - // Input: (vector_in, vector_in/scalar_in, V0, vl) + // Input: (passthru, vector_in, vector_in/scalar_in, V0, vl) class RISCVBinaryWithV0 : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_any_ty, + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_any_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let SplatOperand = 1; - let VLOperand = 3; + let SplatOperand = 2; + let VLOperand = 4; } // For binary operations with mask type output and V0 as input. // Output: (mask type output) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 03ed195e5198d..f0caf72e01204 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2353,6 +2353,13 @@ multiclass VPseudoVCALU_VM_XM_IM { Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; defm "" : VPseudoBinaryV_IM, Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; + // Tied versions to allow codegen control over the tail elements + defm "" : VPseudoTiedBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_IM, + Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; } multiclass VPseudoVCALU_VM_XM { @@ -2360,6 +2367,11 @@ multiclass VPseudoVCALU_VM_XM { Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; defm "" : VPseudoBinaryV_XM, Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + // Tied versions to allow codegen control over the tail elements + defm "" : VPseudoTiedBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoTiedBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; } multiclass VPseudoVCALUM_VM_XM_IM { @@ 
-3383,6 +3395,42 @@ multiclass VPatBinarySwapped; } +multiclass VPatBinaryCarryInTAIL +{ + def : Pat<(result_type (!cast(intrinsic) + (result_type undef), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), + VLOpFrag)), + (!cast(inst#"_"#kind#"_"#vlmul.MX) + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), GPR:$vl, sew)>; + def : Pat<(result_type (!cast(intrinsic) + (result_type result_reg_class:$merge), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), + VLOpFrag)), + (!cast(inst#"_"#kind#"_"#vlmul.MX#"_TU") + (result_type result_reg_class:$merge), + (op1_type op1_reg_class:$rs1), + (op2_type op2_kind:$rs2), + (mask_type V0), GPR:$vl, sew)>; +} + multiclass VPatBinaryCarryIn; } +multiclass VPatBinaryV_VM_TAIL vtilist = AllIntegerVectors> { + foreach vti = vtilist in + defm : VPatBinaryCarryInTAIL; +} + +multiclass VPatBinaryV_XM_TAIL vtilist = AllIntegerVectors> { + foreach vti = vtilist in + defm : VPatBinaryCarryInTAIL; +} + +multiclass VPatBinaryV_IM_TAIL { + foreach vti = AllIntegerVectors in + defm : VPatBinaryCarryInTAIL; +} + multiclass VPatBinaryV_V { foreach vti = AllIntegerVectors in defm : VPatBinaryMaskOut; multiclass VPatBinaryV_VM_XM_IM - : VPatBinaryV_VM, - VPatBinaryV_XM, - VPatBinaryV_IM; + : VPatBinaryV_VM_TAIL, + VPatBinaryV_XM_TAIL, + VPatBinaryV_IM_TAIL; multiclass VPatBinaryM_VM_XM_IM : VPatBinaryV_VM, @@ -3782,8 +3863,8 @@ multiclass VPatBinaryM_V_X_I VPatBinaryV_I; multiclass VPatBinaryV_VM_XM - : VPatBinaryV_VM, - VPatBinaryV_XM; + : VPatBinaryV_VM_TAIL, + VPatBinaryV_XM_TAIL; multiclass VPatBinaryM_VM_XM : VPatBinaryV_VM, @@ -5134,19 +5215,27 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; // We can use vmerge.vvm to support vector-vector vfmerge. // NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses // int_riscv_vmerge. Support both for compatibility. 
-defm : VPatBinaryV_VM<"int_riscv_vmerge", "PseudoVMERGE", - /*CarryOut = */0, /*vtilist=*/AllFloatVectors>; -defm : VPatBinaryV_VM<"int_riscv_vfmerge", "PseudoVMERGE", - /*CarryOut = */0, /*vtilist=*/AllFloatVectors>; -defm : VPatBinaryV_XM<"int_riscv_vfmerge", "PseudoVFMERGE", - /*CarryOut = */0, /*vtilist=*/AllFloatVectors>; +defm : VPatBinaryV_VM_TAIL<"int_riscv_vmerge", "PseudoVMERGE", + /*CarryOut = */0, /*vtilist=*/AllFloatVectors>; +defm : VPatBinaryV_VM_TAIL<"int_riscv_vfmerge", "PseudoVMERGE", + /*CarryOut = */0, /*vtilist=*/AllFloatVectors>; +defm : VPatBinaryV_XM_TAIL<"int_riscv_vfmerge", "PseudoVFMERGE", + /*CarryOut = */0, /*vtilist=*/AllFloatVectors>; foreach fvti = AllFloatVectors in { defvar instr = !cast("PseudoVMERGE_VIM_"#fvti.LMul.MX); - def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$rs2), + def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector undef), + (fvti.Vector fvti.RegClass:$rs2), (fvti.Scalar (fpimm0)), (fvti.Mask V0), VLOpFrag)), (instr fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; + defvar instr_tu = !cast("PseudoVMERGE_VIM_"#fvti.LMul.MX#"_TU"); + def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$merge), + (fvti.Vector fvti.RegClass:$rs2), + (fvti.Scalar (fpimm0)), + (fvti.Mask V0), VLOpFrag)), + (instr_tu fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, + (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll index 8157ae1b0e403..8147a08481923 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll @@ -2984,3 +2984,254 @@ entry: ret %a } + +declare @llvm.riscv.vadc.nxv1i8.nxv1i8( + , + , + , + , + iXLen); + +define @intrinsic_vadc_vvm_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, %3, iXLen %4) nounwind { +; RV32-LABEL: intrinsic_vadc_vvm_nxv1i8_nxv1i8_nxv1i8: +; RV32: # 
%bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; RV32-NEXT: vadc.vvm v8, v9, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vadc_vvm_nxv1i8_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; RV64-NEXT: vadc.vvm v8, v9, v10, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vadc.nxv1i8.nxv1i8( + %0, + %1, + %2, + %3, + iXLen %4) + + ret %a +} + +declare @llvm.riscv.vsbc.nxv1i8.nxv1i8( + , + , + , + , + iXLen); + +define @intrinsic_vsbc_vvm_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, %3, iXLen %4) nounwind { +; RV32-LABEL: intrinsic_vsbc_vvm_nxv1i8_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; RV32-NEXT: vsbc.vvm v8, v9, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vsbc_vvm_nxv1i8_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; RV64-NEXT: vsbc.vvm v8, v9, v10, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vsbc.nxv1i8.nxv1i8( + %0, + %1, + %2, + %3, + iXLen %4) + + ret %a +} + +declare @llvm.riscv.vmerge.nxv1i8.nxv1i8( + , + , + , + , + iXLen); + +define @intrinsic_vmerge_vvm_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, %3, iXLen %4) nounwind { +; RV32-LABEL: intrinsic_vmerge_vvm_nxv1i8_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; RV32-NEXT: vmerge.vvm v8, v9, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmerge_vvm_nxv1i8_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; RV64-NEXT: vmerge.vvm v8, v9, v10, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmerge.nxv1i8.nxv1i8( + %0, + %1, + %2, + %3, + iXLen %4) + + ret %a +} + +declare @llvm.riscv.vmerge.nxv8i64.i64( + , + , + i64, + , + iXLen); + +define @intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64( %0, %1, i64 %2, %3, iXLen %4) nounwind { +; RV32-LABEL: intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; 
RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v24, (a0), zero +; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, mu +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m8, tu, mu +; RV64-NEXT: vmerge.vxm v8, v16, a0, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmerge.nxv8i64.i64( + %0, + %1, + i64 %2, + %3, + iXLen %4) + + ret %a +} + +define @intrinsic_vmerge_vim_nxv8i64_nxv8i64_i64( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vmerge_vim_nxv8i64_nxv8i64_i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: li a1, 15 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: li a1, -1 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v24, (a0), zero +; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, mu +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmerge_vim_nxv8i64_nxv8i64_i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, -1 +; RV64-NEXT: srli a1, a1, 28 +; RV64-NEXT: vsetvli zero, a0, e64, m8, tu, mu +; RV64-NEXT: vmerge.vxm v8, v16, a1, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmerge.nxv8i64.i64( + %0, + %1, + i64 68719476735, + %2, + iXLen %3) + + ret %a +} + +declare @llvm.riscv.vfmerge.nxv8f64.f64( + , + , + double, + , + iXLen); + +define @intrinsic_vfmerge_vfm_nxv8f64_nxv8f64_f64( %0, %1, double %2, %3, iXLen %4) nounwind { +; RV32-LABEL: intrinsic_vfmerge_vfm_nxv8f64_nxv8f64_f64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e64, m8, tu, mu +; RV32-NEXT: vfmerge.vfm v8, v16, fa0, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmerge_vfm_nxv8f64_nxv8f64_f64: +; RV64: # %bb.0: # %entry +; 
RV64-NEXT: vsetvli zero, a0, e64, m8, tu, mu +; RV64-NEXT: vfmerge.vfm v8, v16, fa0, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmerge.nxv8f64.f64( + %0, + %1, + double %2, + %3, + iXLen %4) + + ret %a +} + +declare @llvm.riscv.vfmerge.nxv1f16.nxv1f16( + , + , + , + , + iXLen); + +define @intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, %3, iXLen %4) nounwind { +; RV32-LABEL: intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; RV32-NEXT: vmerge.vvm v8, v9, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; RV64-NEXT: vmerge.vvm v8, v9, v10, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmerge.nxv1f16.nxv1f16( + %0, + %1, + %2, + %3, + iXLen %4) + + ret %a +} + +declare @llvm.riscv.vfmerge.nxv1f16.f16( + , + , + half, + , + iXLen); + +define @intrinsic_vfmerge_vzm_nxv1f16_nxv1f16_f16( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfmerge_vzm_nxv1f16_nxv1f16_f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; RV32-NEXT: vmerge.vim v8, v9, 0, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmerge_vzm_nxv1f16_nxv1f16_f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; RV64-NEXT: vmerge.vim v8, v9, 0, v0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmerge.nxv1f16.f16( + %0, + %1, + half zeroinitializer, + %2, + iXLen %3) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vadc-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vadc-rv32.ll index 94c1f4dd52cc9..a6b389134a4d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadc-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadc-rv32.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vadc.nxv1i8.nxv1i8( + , , , , @@ -15,6 +16,7 @@ define 
@intrinsic_vadc_vvm_nxv1i8_nxv1i8_nxv1i8( @llvm.riscv.vadc.nxv1i8.nxv1i8( + undef, %0, %1, %2, @@ -24,6 +26,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i8.nxv2i8( + , , , , @@ -37,6 +40,7 @@ define @intrinsic_vadc_vvm_nxv2i8_nxv2i8_nxv2i8( @llvm.riscv.vadc.nxv2i8.nxv2i8( + undef, %0, %1, %2, @@ -46,6 +50,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i8.nxv4i8( + , , , , @@ -59,6 +64,7 @@ define @intrinsic_vadc_vvm_nxv4i8_nxv4i8_nxv4i8( @llvm.riscv.vadc.nxv4i8.nxv4i8( + undef, %0, %1, %2, @@ -68,6 +74,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i8.nxv8i8( + , , , , @@ -81,6 +88,7 @@ define @intrinsic_vadc_vvm_nxv8i8_nxv8i8_nxv8i8( @llvm.riscv.vadc.nxv8i8.nxv8i8( + undef, %0, %1, %2, @@ -90,6 +98,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i8.nxv16i8( + , , , , @@ -103,6 +112,7 @@ define @intrinsic_vadc_vvm_nxv16i8_nxv16i8_nxv16i8( @llvm.riscv.vadc.nxv16i8.nxv16i8( + undef, %0, %1, %2, @@ -112,6 +122,7 @@ entry: } declare @llvm.riscv.vadc.nxv32i8.nxv32i8( + , , , , @@ -125,6 +136,7 @@ define @intrinsic_vadc_vvm_nxv32i8_nxv32i8_nxv32i8( @llvm.riscv.vadc.nxv32i8.nxv32i8( + undef, %0, %1, %2, @@ -134,6 +146,7 @@ entry: } declare @llvm.riscv.vadc.nxv64i8.nxv64i8( + , , , , @@ -147,6 +160,7 @@ define @intrinsic_vadc_vvm_nxv64i8_nxv64i8_nxv64i8( @llvm.riscv.vadc.nxv64i8.nxv64i8( + undef, %0, %1, %2, @@ -156,6 +170,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i16.nxv1i16( + , , , , @@ -169,6 +184,7 @@ define @intrinsic_vadc_vvm_nxv1i16_nxv1i16_nxv1i16( @llvm.riscv.vadc.nxv1i16.nxv1i16( + undef, %0, %1, %2, @@ -178,6 +194,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i16.nxv2i16( + , , , , @@ -191,6 +208,7 @@ define @intrinsic_vadc_vvm_nxv2i16_nxv2i16_nxv2i16( @llvm.riscv.vadc.nxv2i16.nxv2i16( + undef, %0, %1, %2, @@ -200,6 +218,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i16.nxv4i16( + , , , , @@ -213,6 +232,7 @@ define @intrinsic_vadc_vvm_nxv4i16_nxv4i16_nxv4i16( @llvm.riscv.vadc.nxv4i16.nxv4i16( + undef, %0, %1, %2, @@ -222,6 +242,7 @@ entry: } declare 
@llvm.riscv.vadc.nxv8i16.nxv8i16( + , , , , @@ -235,6 +256,7 @@ define @intrinsic_vadc_vvm_nxv8i16_nxv8i16_nxv8i16( @llvm.riscv.vadc.nxv8i16.nxv8i16( + undef, %0, %1, %2, @@ -244,6 +266,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i16.nxv16i16( + , , , , @@ -257,6 +280,7 @@ define @intrinsic_vadc_vvm_nxv16i16_nxv16i16_nxv16i16( @llvm.riscv.vadc.nxv16i16.nxv16i16( + undef, %0, %1, %2, @@ -266,6 +290,7 @@ entry: } declare @llvm.riscv.vadc.nxv32i16.nxv32i16( + , , , , @@ -279,6 +304,7 @@ define @intrinsic_vadc_vvm_nxv32i16_nxv32i16_nxv32i16( @llvm.riscv.vadc.nxv32i16.nxv32i16( + undef, %0, %1, %2, @@ -288,6 +314,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i32.nxv1i32( + , , , , @@ -301,6 +328,7 @@ define @intrinsic_vadc_vvm_nxv1i32_nxv1i32_nxv1i32( @llvm.riscv.vadc.nxv1i32.nxv1i32( + undef, %0, %1, %2, @@ -310,6 +338,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i32.nxv2i32( + , , , , @@ -323,6 +352,7 @@ define @intrinsic_vadc_vvm_nxv2i32_nxv2i32_nxv2i32( @llvm.riscv.vadc.nxv2i32.nxv2i32( + undef, %0, %1, %2, @@ -332,6 +362,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i32.nxv4i32( + , , , , @@ -345,6 +376,7 @@ define @intrinsic_vadc_vvm_nxv4i32_nxv4i32_nxv4i32( @llvm.riscv.vadc.nxv4i32.nxv4i32( + undef, %0, %1, %2, @@ -354,6 +386,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i32.nxv8i32( + , , , , @@ -367,6 +400,7 @@ define @intrinsic_vadc_vvm_nxv8i32_nxv8i32_nxv8i32( @llvm.riscv.vadc.nxv8i32.nxv8i32( + undef, %0, %1, %2, @@ -376,6 +410,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i32.nxv16i32( + , , , , @@ -389,6 +424,7 @@ define @intrinsic_vadc_vvm_nxv16i32_nxv16i32_nxv16i32( @llvm.riscv.vadc.nxv16i32.nxv16i32( + undef, %0, %1, %2, @@ -398,6 +434,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i64.nxv1i64( + , , , , @@ -411,6 +448,7 @@ define @intrinsic_vadc_vvm_nxv1i64_nxv1i64_nxv1i64( @llvm.riscv.vadc.nxv1i64.nxv1i64( + undef, %0, %1, %2, @@ -420,6 +458,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i64.nxv2i64( + , , , , @@ -433,6 +472,7 @@ define 
@intrinsic_vadc_vvm_nxv2i64_nxv2i64_nxv2i64( @llvm.riscv.vadc.nxv2i64.nxv2i64( + undef, %0, %1, %2, @@ -442,6 +482,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i64.nxv4i64( + , , , , @@ -455,6 +496,7 @@ define @intrinsic_vadc_vvm_nxv4i64_nxv4i64_nxv4i64( @llvm.riscv.vadc.nxv4i64.nxv4i64( + undef, %0, %1, %2, @@ -464,6 +506,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i64.nxv8i64( + , , , , @@ -477,6 +520,7 @@ define @intrinsic_vadc_vvm_nxv8i64_nxv8i64_nxv8i64( @llvm.riscv.vadc.nxv8i64.nxv8i64( + undef, %0, %1, %2, @@ -486,6 +530,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i8.i8( + , , i8, , @@ -499,6 +544,7 @@ define @intrinsic_vadc_vxm_nxv1i8_nxv1i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv1i8.i8( + undef, %0, i8 %1, %2, @@ -508,6 +554,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i8.i8( + , , i8, , @@ -521,6 +568,7 @@ define @intrinsic_vadc_vxm_nxv2i8_nxv2i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv2i8.i8( + undef, %0, i8 %1, %2, @@ -530,6 +578,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i8.i8( + , , i8, , @@ -543,6 +592,7 @@ define @intrinsic_vadc_vxm_nxv4i8_nxv4i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv4i8.i8( + undef, %0, i8 %1, %2, @@ -552,6 +602,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i8.i8( + , , i8, , @@ -565,6 +616,7 @@ define @intrinsic_vadc_vxm_nxv8i8_nxv8i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv8i8.i8( + undef, %0, i8 %1, %2, @@ -574,6 +626,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i8.i8( + , , i8, , @@ -587,6 +640,7 @@ define @intrinsic_vadc_vxm_nxv16i8_nxv16i8_i8( @llvm.riscv.vadc.nxv16i8.i8( + undef, %0, i8 %1, %2, @@ -596,6 +650,7 @@ entry: } declare @llvm.riscv.vadc.nxv32i8.i8( + , , i8, , @@ -609,6 +664,7 @@ define @intrinsic_vadc_vxm_nxv32i8_nxv32i8_i8( @llvm.riscv.vadc.nxv32i8.i8( + undef, %0, i8 %1, %2, @@ -618,6 +674,7 @@ entry: } declare @llvm.riscv.vadc.nxv64i8.i8( + , , i8, , @@ -631,6 +688,7 @@ define @intrinsic_vadc_vxm_nxv64i8_nxv64i8_i8( 
@llvm.riscv.vadc.nxv64i8.i8( + undef, %0, i8 %1, %2, @@ -640,6 +698,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i16.i16( + , , i16, , @@ -653,6 +712,7 @@ define @intrinsic_vadc_vxm_nxv1i16_nxv1i16_i16( @llvm.riscv.vadc.nxv1i16.i16( + undef, %0, i16 %1, %2, @@ -662,6 +722,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i16.i16( + , , i16, , @@ -675,6 +736,7 @@ define @intrinsic_vadc_vxm_nxv2i16_nxv2i16_i16( @llvm.riscv.vadc.nxv2i16.i16( + undef, %0, i16 %1, %2, @@ -684,6 +746,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i16.i16( + , , i16, , @@ -697,6 +760,7 @@ define @intrinsic_vadc_vxm_nxv4i16_nxv4i16_i16( @llvm.riscv.vadc.nxv4i16.i16( + undef, %0, i16 %1, %2, @@ -706,6 +770,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i16.i16( + , , i16, , @@ -719,6 +784,7 @@ define @intrinsic_vadc_vxm_nxv8i16_nxv8i16_i16( @llvm.riscv.vadc.nxv8i16.i16( + undef, %0, i16 %1, %2, @@ -728,6 +794,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i16.i16( + , , i16, , @@ -741,6 +808,7 @@ define @intrinsic_vadc_vxm_nxv16i16_nxv16i16_i16( @llvm.riscv.vadc.nxv16i16.i16( + undef, %0, i16 %1, %2, @@ -750,6 +818,7 @@ entry: } declare @llvm.riscv.vadc.nxv32i16.i16( + , , i16, , @@ -763,6 +832,7 @@ define @intrinsic_vadc_vxm_nxv32i16_nxv32i16_i16( @llvm.riscv.vadc.nxv32i16.i16( + undef, %0, i16 %1, %2, @@ -772,6 +842,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i32.i32( + , , i32, , @@ -785,6 +856,7 @@ define @intrinsic_vadc_vxm_nxv1i32_nxv1i32_i32( @llvm.riscv.vadc.nxv1i32.i32( + undef, %0, i32 %1, %2, @@ -794,6 +866,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i32.i32( + , , i32, , @@ -807,6 +880,7 @@ define @intrinsic_vadc_vxm_nxv2i32_nxv2i32_i32( @llvm.riscv.vadc.nxv2i32.i32( + undef, %0, i32 %1, %2, @@ -816,6 +890,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i32.i32( + , , i32, , @@ -829,6 +904,7 @@ define @intrinsic_vadc_vxm_nxv4i32_nxv4i32_i32( @llvm.riscv.vadc.nxv4i32.i32( + undef, %0, i32 %1, %2, @@ -838,6 +914,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i32.i32( + , , i32, , @@ -851,6 +928,7 @@ define 
@intrinsic_vadc_vxm_nxv8i32_nxv8i32_i32( @llvm.riscv.vadc.nxv8i32.i32( + undef, %0, i32 %1, %2, @@ -860,6 +938,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i32.i32( + , , i32, , @@ -873,6 +952,7 @@ define @intrinsic_vadc_vxm_nxv16i32_nxv16i32_i32( @llvm.riscv.vadc.nxv16i32.i32( + undef, %0, i32 %1, %2, @@ -882,6 +962,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i64.i64( + , , i64, , @@ -901,6 +982,7 @@ define @intrinsic_vadc_vxm_nxv1i64_nxv1i64_i64( @llvm.riscv.vadc.nxv1i64.i64( + undef, %0, i64 %1, %2, @@ -910,6 +992,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i64.i64( + , , i64, , @@ -929,6 +1012,7 @@ define @intrinsic_vadc_vxm_nxv2i64_nxv2i64_i64( @llvm.riscv.vadc.nxv2i64.i64( + undef, %0, i64 %1, %2, @@ -938,6 +1022,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i64.i64( + , , i64, , @@ -957,6 +1042,7 @@ define @intrinsic_vadc_vxm_nxv4i64_nxv4i64_i64( @llvm.riscv.vadc.nxv4i64.i64( + undef, %0, i64 %1, %2, @@ -966,6 +1052,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i64.i64( + , , i64, , @@ -985,6 +1072,7 @@ define @intrinsic_vadc_vxm_nxv8i64_nxv8i64_i64( @llvm.riscv.vadc.nxv8i64.i64( + undef, %0, i64 %1, %2, @@ -1001,6 +1089,7 @@ define @intrinsic_vadc_vim_nxv1i8_nxv1i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv1i8.i8( + undef, %0, i8 -9, %1, @@ -1017,6 +1106,7 @@ define @intrinsic_vadc_vim_nxv2i8_nxv2i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv2i8.i8( + undef, %0, i8 9, %1, @@ -1033,6 +1123,7 @@ define @intrinsic_vadc_vim_nxv4i8_nxv4i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv4i8.i8( + undef, %0, i8 -9, %1, @@ -1049,6 +1140,7 @@ define @intrinsic_vadc_vim_nxv8i8_nxv8i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv8i8.i8( + undef, %0, i8 9, %1, @@ -1065,6 +1157,7 @@ define @intrinsic_vadc_vim_nxv16i8_nxv16i8_i8( @llvm.riscv.vadc.nxv16i8.i8( + undef, %0, i8 -9, %1, @@ -1081,6 +1174,7 @@ define @intrinsic_vadc_vim_nxv32i8_nxv32i8_i8( @llvm.riscv.vadc.nxv32i8.i8( + undef, %0, i8 9, %1, @@ -1097,6 +1191,7 @@ 
define @intrinsic_vadc_vim_nxv64i8_nxv64i8_i8( @llvm.riscv.vadc.nxv64i8.i8( + undef, %0, i8 -9, %1, @@ -1113,6 +1208,7 @@ define @intrinsic_vadc_vim_nxv1i16_nxv1i16_i16( @llvm.riscv.vadc.nxv1i16.i16( + undef, %0, i16 9, %1, @@ -1129,6 +1225,7 @@ define @intrinsic_vadc_vim_nxv2i16_nxv2i16_i16( @llvm.riscv.vadc.nxv2i16.i16( + undef, %0, i16 -9, %1, @@ -1145,6 +1242,7 @@ define @intrinsic_vadc_vim_nxv4i16_nxv4i16_i16( @llvm.riscv.vadc.nxv4i16.i16( + undef, %0, i16 9, %1, @@ -1161,6 +1259,7 @@ define @intrinsic_vadc_vim_nxv8i16_nxv8i16_i16( @llvm.riscv.vadc.nxv8i16.i16( + undef, %0, i16 -9, %1, @@ -1177,6 +1276,7 @@ define @intrinsic_vadc_vim_nxv16i16_nxv16i16_i16( @llvm.riscv.vadc.nxv16i16.i16( + undef, %0, i16 9, %1, @@ -1193,6 +1293,7 @@ define @intrinsic_vadc_vim_nxv32i16_nxv32i16_i16( @llvm.riscv.vadc.nxv32i16.i16( + undef, %0, i16 -9, %1, @@ -1209,6 +1310,7 @@ define @intrinsic_vadc_vim_nxv1i32_nxv1i32_i32( @llvm.riscv.vadc.nxv1i32.i32( + undef, %0, i32 9, %1, @@ -1225,6 +1327,7 @@ define @intrinsic_vadc_vim_nxv2i32_nxv2i32_i32( @llvm.riscv.vadc.nxv2i32.i32( + undef, %0, i32 -9, %1, @@ -1241,6 +1344,7 @@ define @intrinsic_vadc_vim_nxv4i32_nxv4i32_i32( @llvm.riscv.vadc.nxv4i32.i32( + undef, %0, i32 9, %1, @@ -1257,6 +1361,7 @@ define @intrinsic_vadc_vim_nxv8i32_nxv8i32_i32( @llvm.riscv.vadc.nxv8i32.i32( + undef, %0, i32 -9, %1, @@ -1273,6 +1378,7 @@ define @intrinsic_vadc_vim_nxv16i32_nxv16i32_i32( @llvm.riscv.vadc.nxv16i32.i32( + undef, %0, i32 9, %1, @@ -1289,6 +1395,7 @@ define @intrinsic_vadc_vim_nxv1i64_nxv1i64_i64( @llvm.riscv.vadc.nxv1i64.i64( + undef, %0, i64 9, %1, @@ -1305,6 +1412,7 @@ define @intrinsic_vadc_vim_nxv2i64_nxv2i64_i64( @llvm.riscv.vadc.nxv2i64.i64( + undef, %0, i64 -9, %1, @@ -1321,6 +1429,7 @@ define @intrinsic_vadc_vim_nxv4i64_nxv4i64_i64( @llvm.riscv.vadc.nxv4i64.i64( + undef, %0, i64 9, %1, @@ -1337,6 +1446,7 @@ define @intrinsic_vadc_vim_nxv8i64_nxv8i64_i64( @llvm.riscv.vadc.nxv8i64.i64( + undef, %0, i64 -9, %1, diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vadc-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vadc-rv64.ll index 40c4fe51c1e9b..f85e62b414808 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vadc-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadc-rv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vadc.nxv1i8.nxv1i8( + , , , , @@ -15,6 +16,7 @@ define @intrinsic_vadc_vvm_nxv1i8_nxv1i8_nxv1i8( @llvm.riscv.vadc.nxv1i8.nxv1i8( + undef, %0, %1, %2, @@ -24,6 +26,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i8.nxv2i8( + , , , , @@ -37,6 +40,7 @@ define @intrinsic_vadc_vvm_nxv2i8_nxv2i8_nxv2i8( @llvm.riscv.vadc.nxv2i8.nxv2i8( + undef, %0, %1, %2, @@ -46,6 +50,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i8.nxv4i8( + , , , , @@ -59,6 +64,7 @@ define @intrinsic_vadc_vvm_nxv4i8_nxv4i8_nxv4i8( @llvm.riscv.vadc.nxv4i8.nxv4i8( + undef, %0, %1, %2, @@ -68,6 +74,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i8.nxv8i8( + , , , , @@ -81,6 +88,7 @@ define @intrinsic_vadc_vvm_nxv8i8_nxv8i8_nxv8i8( @llvm.riscv.vadc.nxv8i8.nxv8i8( + undef, %0, %1, %2, @@ -90,6 +98,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i8.nxv16i8( + , , , , @@ -103,6 +112,7 @@ define @intrinsic_vadc_vvm_nxv16i8_nxv16i8_nxv16i8( @llvm.riscv.vadc.nxv16i8.nxv16i8( + undef, %0, %1, %2, @@ -112,6 +122,7 @@ entry: } declare @llvm.riscv.vadc.nxv32i8.nxv32i8( + , , , , @@ -125,6 +136,7 @@ define @intrinsic_vadc_vvm_nxv32i8_nxv32i8_nxv32i8( @llvm.riscv.vadc.nxv32i8.nxv32i8( + undef, %0, %1, %2, @@ -134,6 +146,7 @@ entry: } declare @llvm.riscv.vadc.nxv64i8.nxv64i8( + , , , , @@ -147,6 +160,7 @@ define @intrinsic_vadc_vvm_nxv64i8_nxv64i8_nxv64i8( @llvm.riscv.vadc.nxv64i8.nxv64i8( + undef, %0, %1, %2, @@ -156,6 +170,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i16.nxv1i16( + , , , , @@ -169,6 +184,7 @@ define @intrinsic_vadc_vvm_nxv1i16_nxv1i16_nxv1i16( @llvm.riscv.vadc.nxv1i16.nxv1i16( + undef, %0, %1, %2, @@ -178,6 +194,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i16.nxv2i16( + , , , , 
@@ -191,6 +208,7 @@ define @intrinsic_vadc_vvm_nxv2i16_nxv2i16_nxv2i16( @llvm.riscv.vadc.nxv2i16.nxv2i16( + undef, %0, %1, %2, @@ -200,6 +218,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i16.nxv4i16( + , , , , @@ -213,6 +232,7 @@ define @intrinsic_vadc_vvm_nxv4i16_nxv4i16_nxv4i16( @llvm.riscv.vadc.nxv4i16.nxv4i16( + undef, %0, %1, %2, @@ -222,6 +242,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i16.nxv8i16( + , , , , @@ -235,6 +256,7 @@ define @intrinsic_vadc_vvm_nxv8i16_nxv8i16_nxv8i16( @llvm.riscv.vadc.nxv8i16.nxv8i16( + undef, %0, %1, %2, @@ -244,6 +266,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i16.nxv16i16( + , , , , @@ -257,6 +280,7 @@ define @intrinsic_vadc_vvm_nxv16i16_nxv16i16_nxv16i16( @llvm.riscv.vadc.nxv16i16.nxv16i16( + undef, %0, %1, %2, @@ -266,6 +290,7 @@ entry: } declare @llvm.riscv.vadc.nxv32i16.nxv32i16( + , , , , @@ -279,6 +304,7 @@ define @intrinsic_vadc_vvm_nxv32i16_nxv32i16_nxv32i16( @llvm.riscv.vadc.nxv32i16.nxv32i16( + undef, %0, %1, %2, @@ -288,6 +314,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i32.nxv1i32( + , , , , @@ -301,6 +328,7 @@ define @intrinsic_vadc_vvm_nxv1i32_nxv1i32_nxv1i32( @llvm.riscv.vadc.nxv1i32.nxv1i32( + undef, %0, %1, %2, @@ -310,6 +338,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i32.nxv2i32( + , , , , @@ -323,6 +352,7 @@ define @intrinsic_vadc_vvm_nxv2i32_nxv2i32_nxv2i32( @llvm.riscv.vadc.nxv2i32.nxv2i32( + undef, %0, %1, %2, @@ -332,6 +362,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i32.nxv4i32( + , , , , @@ -345,6 +376,7 @@ define @intrinsic_vadc_vvm_nxv4i32_nxv4i32_nxv4i32( @llvm.riscv.vadc.nxv4i32.nxv4i32( + undef, %0, %1, %2, @@ -354,6 +386,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i32.nxv8i32( + , , , , @@ -367,6 +400,7 @@ define @intrinsic_vadc_vvm_nxv8i32_nxv8i32_nxv8i32( @llvm.riscv.vadc.nxv8i32.nxv8i32( + undef, %0, %1, %2, @@ -376,6 +410,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i32.nxv16i32( + , , , , @@ -389,6 +424,7 @@ define @intrinsic_vadc_vvm_nxv16i32_nxv16i32_nxv16i32( @llvm.riscv.vadc.nxv16i32.nxv16i32( 
+ undef, %0, %1, %2, @@ -398,6 +434,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i64.nxv1i64( + , , , , @@ -411,6 +448,7 @@ define @intrinsic_vadc_vvm_nxv1i64_nxv1i64_nxv1i64( @llvm.riscv.vadc.nxv1i64.nxv1i64( + undef, %0, %1, %2, @@ -420,6 +458,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i64.nxv2i64( + , , , , @@ -433,6 +472,7 @@ define @intrinsic_vadc_vvm_nxv2i64_nxv2i64_nxv2i64( @llvm.riscv.vadc.nxv2i64.nxv2i64( + undef, %0, %1, %2, @@ -442,6 +482,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i64.nxv4i64( + , , , , @@ -455,6 +496,7 @@ define @intrinsic_vadc_vvm_nxv4i64_nxv4i64_nxv4i64( @llvm.riscv.vadc.nxv4i64.nxv4i64( + undef, %0, %1, %2, @@ -464,6 +506,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i64.nxv8i64( + , , , , @@ -477,6 +520,7 @@ define @intrinsic_vadc_vvm_nxv8i64_nxv8i64_nxv8i64( @llvm.riscv.vadc.nxv8i64.nxv8i64( + undef, %0, %1, %2, @@ -486,6 +530,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i8.i8( + , , i8, , @@ -499,6 +544,7 @@ define @intrinsic_vadc_vxm_nxv1i8_nxv1i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv1i8.i8( + undef, %0, i8 %1, %2, @@ -508,6 +554,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i8.i8( + , , i8, , @@ -521,6 +568,7 @@ define @intrinsic_vadc_vxm_nxv2i8_nxv2i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv2i8.i8( + undef, %0, i8 %1, %2, @@ -530,6 +578,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i8.i8( + , , i8, , @@ -543,6 +592,7 @@ define @intrinsic_vadc_vxm_nxv4i8_nxv4i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv4i8.i8( + undef, %0, i8 %1, %2, @@ -552,6 +602,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i8.i8( + , , i8, , @@ -565,6 +616,7 @@ define @intrinsic_vadc_vxm_nxv8i8_nxv8i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv8i8.i8( + undef, %0, i8 %1, %2, @@ -574,6 +626,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i8.i8( + , , i8, , @@ -587,6 +640,7 @@ define @intrinsic_vadc_vxm_nxv16i8_nxv16i8_i8( @llvm.riscv.vadc.nxv16i8.i8( + undef, %0, i8 %1, %2, @@ -596,6 +650,7 @@ entry: } 
declare @llvm.riscv.vadc.nxv32i8.i8( + , , i8, , @@ -609,6 +664,7 @@ define @intrinsic_vadc_vxm_nxv32i8_nxv32i8_i8( @llvm.riscv.vadc.nxv32i8.i8( + undef, %0, i8 %1, %2, @@ -618,6 +674,7 @@ entry: } declare @llvm.riscv.vadc.nxv64i8.i8( + , , i8, , @@ -631,6 +688,7 @@ define @intrinsic_vadc_vxm_nxv64i8_nxv64i8_i8( @llvm.riscv.vadc.nxv64i8.i8( + undef, %0, i8 %1, %2, @@ -640,6 +698,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i16.i16( + , , i16, , @@ -653,6 +712,7 @@ define @intrinsic_vadc_vxm_nxv1i16_nxv1i16_i16( @llvm.riscv.vadc.nxv1i16.i16( + undef, %0, i16 %1, %2, @@ -662,6 +722,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i16.i16( + , , i16, , @@ -675,6 +736,7 @@ define @intrinsic_vadc_vxm_nxv2i16_nxv2i16_i16( @llvm.riscv.vadc.nxv2i16.i16( + undef, %0, i16 %1, %2, @@ -684,6 +746,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i16.i16( + , , i16, , @@ -697,6 +760,7 @@ define @intrinsic_vadc_vxm_nxv4i16_nxv4i16_i16( @llvm.riscv.vadc.nxv4i16.i16( + undef, %0, i16 %1, %2, @@ -706,6 +770,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i16.i16( + , , i16, , @@ -719,6 +784,7 @@ define @intrinsic_vadc_vxm_nxv8i16_nxv8i16_i16( @llvm.riscv.vadc.nxv8i16.i16( + undef, %0, i16 %1, %2, @@ -728,6 +794,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i16.i16( + , , i16, , @@ -741,6 +808,7 @@ define @intrinsic_vadc_vxm_nxv16i16_nxv16i16_i16( @llvm.riscv.vadc.nxv16i16.i16( + undef, %0, i16 %1, %2, @@ -750,6 +818,7 @@ entry: } declare @llvm.riscv.vadc.nxv32i16.i16( + , , i16, , @@ -763,6 +832,7 @@ define @intrinsic_vadc_vxm_nxv32i16_nxv32i16_i16( @llvm.riscv.vadc.nxv32i16.i16( + undef, %0, i16 %1, %2, @@ -772,6 +842,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i32.i32( + , , i32, , @@ -785,6 +856,7 @@ define @intrinsic_vadc_vxm_nxv1i32_nxv1i32_i32( @llvm.riscv.vadc.nxv1i32.i32( + undef, %0, i32 %1, %2, @@ -794,6 +866,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i32.i32( + , , i32, , @@ -807,6 +880,7 @@ define @intrinsic_vadc_vxm_nxv2i32_nxv2i32_i32( @llvm.riscv.vadc.nxv2i32.i32( + undef, %0, i32 %1, %2, 
@@ -816,6 +890,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i32.i32( + , , i32, , @@ -829,6 +904,7 @@ define @intrinsic_vadc_vxm_nxv4i32_nxv4i32_i32( @llvm.riscv.vadc.nxv4i32.i32( + undef, %0, i32 %1, %2, @@ -838,6 +914,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i32.i32( + , , i32, , @@ -851,6 +928,7 @@ define @intrinsic_vadc_vxm_nxv8i32_nxv8i32_i32( @llvm.riscv.vadc.nxv8i32.i32( + undef, %0, i32 %1, %2, @@ -860,6 +938,7 @@ entry: } declare @llvm.riscv.vadc.nxv16i32.i32( + , , i32, , @@ -873,6 +952,7 @@ define @intrinsic_vadc_vxm_nxv16i32_nxv16i32_i32( @llvm.riscv.vadc.nxv16i32.i32( + undef, %0, i32 %1, %2, @@ -882,6 +962,7 @@ entry: } declare @llvm.riscv.vadc.nxv1i64.i64( + , , i64, , @@ -895,6 +976,7 @@ define @intrinsic_vadc_vxm_nxv1i64_nxv1i64_i64( @llvm.riscv.vadc.nxv1i64.i64( + undef, %0, i64 %1, %2, @@ -904,6 +986,7 @@ entry: } declare @llvm.riscv.vadc.nxv2i64.i64( + , , i64, , @@ -917,6 +1000,7 @@ define @intrinsic_vadc_vxm_nxv2i64_nxv2i64_i64( @llvm.riscv.vadc.nxv2i64.i64( + undef, %0, i64 %1, %2, @@ -926,6 +1010,7 @@ entry: } declare @llvm.riscv.vadc.nxv4i64.i64( + , , i64, , @@ -939,6 +1024,7 @@ define @intrinsic_vadc_vxm_nxv4i64_nxv4i64_i64( @llvm.riscv.vadc.nxv4i64.i64( + undef, %0, i64 %1, %2, @@ -948,6 +1034,7 @@ entry: } declare @llvm.riscv.vadc.nxv8i64.i64( + , , i64, , @@ -961,6 +1048,7 @@ define @intrinsic_vadc_vxm_nxv8i64_nxv8i64_i64( @llvm.riscv.vadc.nxv8i64.i64( + undef, %0, i64 %1, %2, @@ -977,6 +1065,7 @@ define @intrinsic_vadc_vim_nxv1i8_nxv1i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv1i8.i8( + undef, %0, i8 9, %1, @@ -993,6 +1082,7 @@ define @intrinsic_vadc_vim_nxv2i8_nxv2i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv2i8.i8( + undef, %0, i8 -9, %1, @@ -1009,6 +1099,7 @@ define @intrinsic_vadc_vim_nxv4i8_nxv4i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vadc.nxv4i8.i8( + undef, %0, i8 9, %1, @@ -1025,6 +1116,7 @@ define @intrinsic_vadc_vim_nxv8i8_nxv8i8_i8( ; CHECK-NEXT: ret entry: %a = call 
@llvm.riscv.vadc.nxv8i8.i8( + undef, %0, i8 -9, %1, @@ -1041,6 +1133,7 @@ define @intrinsic_vadc_vim_nxv16i8_nxv16i8_i8( @llvm.riscv.vadc.nxv16i8.i8( + undef, %0, i8 9, %1, @@ -1057,6 +1150,7 @@ define @intrinsic_vadc_vim_nxv32i8_nxv32i8_i8( @llvm.riscv.vadc.nxv32i8.i8( + undef, %0, i8 -9, %1, @@ -1073,6 +1167,7 @@ define @intrinsic_vadc_vim_nxv64i8_nxv64i8_i8( @llvm.riscv.vadc.nxv64i8.i8( + undef, %0, i8 9, %1, @@ -1089,6 +1184,7 @@ define @intrinsic_vadc_vim_nxv1i16_nxv1i16_i16( @llvm.riscv.vadc.nxv1i16.i16( + undef, %0, i16 -9, %1, @@ -1105,6 +1201,7 @@ define @intrinsic_vadc_vim_nxv2i16_nxv2i16_i16( @llvm.riscv.vadc.nxv2i16.i16( + undef, %0, i16 9, %1, @@ -1121,6 +1218,7 @@ define @intrinsic_vadc_vim_nxv4i16_nxv4i16_i16( @llvm.riscv.vadc.nxv4i16.i16( + undef, %0, i16 -9, %1, @@ -1137,6 +1235,7 @@ define @intrinsic_vadc_vim_nxv8i16_nxv8i16_i16( @llvm.riscv.vadc.nxv8i16.i16( + undef, %0, i16 9, %1, @@ -1153,6 +1252,7 @@ define @intrinsic_vadc_vim_nxv16i16_nxv16i16_i16( @llvm.riscv.vadc.nxv16i16.i16( + undef, %0, i16 -9, %1, @@ -1169,6 +1269,7 @@ define @intrinsic_vadc_vim_nxv32i16_nxv32i16_i16( @llvm.riscv.vadc.nxv32i16.i16( + undef, %0, i16 9, %1, @@ -1185,6 +1286,7 @@ define @intrinsic_vadc_vim_nxv1i32_nxv1i32_i32( @llvm.riscv.vadc.nxv1i32.i32( + undef, %0, i32 -9, %1, @@ -1201,6 +1303,7 @@ define @intrinsic_vadc_vim_nxv2i32_nxv2i32_i32( @llvm.riscv.vadc.nxv2i32.i32( + undef, %0, i32 9, %1, @@ -1217,6 +1320,7 @@ define @intrinsic_vadc_vim_nxv4i32_nxv4i32_i32( @llvm.riscv.vadc.nxv4i32.i32( + undef, %0, i32 -9, %1, @@ -1233,6 +1337,7 @@ define @intrinsic_vadc_vim_nxv8i32_nxv8i32_i32( @llvm.riscv.vadc.nxv8i32.i32( + undef, %0, i32 9, %1, @@ -1249,6 +1354,7 @@ define @intrinsic_vadc_vim_nxv16i32_nxv16i32_i32( @llvm.riscv.vadc.nxv16i32.i32( + undef, %0, i32 -9, %1, @@ -1265,6 +1371,7 @@ define @intrinsic_vadc_vim_nxv1i64_nxv1i64_i64( @llvm.riscv.vadc.nxv1i64.i64( + undef, %0, i64 9, %1, @@ -1281,6 +1388,7 @@ define @intrinsic_vadc_vim_nxv2i64_nxv2i64_i64( 
@llvm.riscv.vadc.nxv2i64.i64( + undef, %0, i64 -9, %1, @@ -1297,6 +1405,7 @@ define @intrinsic_vadc_vim_nxv4i64_nxv4i64_i64( @llvm.riscv.vadc.nxv4i64.i64( + undef, %0, i64 9, %1, @@ -1313,6 +1422,7 @@ define @intrinsic_vadc_vim_nxv8i64_nxv8i64_i64( @llvm.riscv.vadc.nxv8i64.i64( + undef, %0, i64 -9, %1, diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll index eb3efd1fa0373..499e72ce712e0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll @@ -4,6 +4,7 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmerge.nxv1f16.nxv1f16( + , , , , @@ -17,6 +18,7 @@ define @intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16( @llvm.riscv.vfmerge.nxv1f16.nxv1f16( + undef, %0, %1, %2, @@ -26,6 +28,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv1f16.f16( + , , half, , @@ -39,6 +42,7 @@ define @intrinsic_vfmerge_vfm_nxv1f16_nxv1f16_f16( @llvm.riscv.vfmerge.nxv1f16.f16( + undef, %0, half %1, %2, @@ -48,6 +52,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv2f16.nxv2f16( + , , , , @@ -61,6 +66,7 @@ define @intrinsic_vfmerge_vvm_nxv2f16_nxv2f16_nxv2f16( @llvm.riscv.vfmerge.nxv2f16.nxv2f16( + undef, %0, %1, %2, @@ -70,6 +76,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv2f16.f16( + , , half, , @@ -83,6 +90,7 @@ define @intrinsic_vfmerge_vfm_nxv2f16_nxv2f16_f16( @llvm.riscv.vfmerge.nxv2f16.f16( + undef, %0, half %1, %2, @@ -92,6 +100,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv4f16.nxv4f16( + , , , , @@ -105,6 +114,7 @@ define @intrinsic_vfmerge_vvm_nxv4f16_nxv4f16_nxv4f16( @llvm.riscv.vfmerge.nxv4f16.nxv4f16( + undef, %0, %1, %2, @@ -114,6 +124,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv4f16.f16( + , , half, , @@ -127,6 +138,7 @@ define @intrinsic_vfmerge_vfm_nxv4f16_nxv4f16_f16( @llvm.riscv.vfmerge.nxv4f16.f16( + undef, %0, half %1, %2, @@ -136,6 +148,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv8f16.nxv8f16( 
+ , , , , @@ -149,6 +162,7 @@ define @intrinsic_vfmerge_vvm_nxv8f16_nxv8f16_nxv8f16( @llvm.riscv.vfmerge.nxv8f16.nxv8f16( + undef, %0, %1, %2, @@ -158,6 +172,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv8f16.f16( + , , half, , @@ -171,6 +186,7 @@ define @intrinsic_vfmerge_vfm_nxv8f16_nxv8f16_f16( @llvm.riscv.vfmerge.nxv8f16.f16( + undef, %0, half %1, %2, @@ -180,6 +196,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv16f16.nxv16f16( + , , , , @@ -193,6 +210,7 @@ define @intrinsic_vfmerge_vvm_nxv16f16_nxv16f16_nxv16f16( @llvm.riscv.vfmerge.nxv16f16.nxv16f16( + undef, %0, %1, %2, @@ -202,6 +220,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv16f16.f16( + , , half, , @@ -215,6 +234,7 @@ define @intrinsic_vfmerge_vfm_nxv16f16_nxv16f16_f16( @llvm.riscv.vfmerge.nxv16f16.f16( + undef, %0, half %1, %2, @@ -224,6 +244,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv32f16.nxv32f16( + , , , , @@ -237,6 +258,7 @@ define @intrinsic_vfmerge_vvm_nxv32f16_nxv32f16_nxv32f16( @llvm.riscv.vfmerge.nxv32f16.nxv32f16( + undef, %0, %1, %2, @@ -246,6 +268,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv32f16.f16( + , , half, , @@ -259,6 +282,7 @@ define @intrinsic_vfmerge_vfm_nxv32f16_nxv32f16_f16( @llvm.riscv.vfmerge.nxv32f16.f16( + undef, %0, half %1, %2, @@ -268,6 +292,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv1f32.nxv1f32( + , , , , @@ -281,6 +306,7 @@ define @intrinsic_vfmerge_vvm_nxv1f32_nxv1f32_nxv1f32( @llvm.riscv.vfmerge.nxv1f32.nxv1f32( + undef, %0, %1, %2, @@ -290,6 +316,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv1f32.f32( + , , float, , @@ -303,6 +330,7 @@ define @intrinsic_vfmerge_vfm_nxv1f32_nxv1f32_f32( @llvm.riscv.vfmerge.nxv1f32.f32( + undef, %0, float %1, %2, @@ -312,6 +340,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv2f32.nxv2f32( + , , , , @@ -325,6 +354,7 @@ define @intrinsic_vfmerge_vvm_nxv2f32_nxv2f32_nxv2f32( @llvm.riscv.vfmerge.nxv2f32.nxv2f32( + undef, %0, %1, %2, @@ -334,6 +364,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv2f32.f32( + , , float, , @@ -347,6 +378,7 @@ 
define @intrinsic_vfmerge_vfm_nxv2f32_nxv2f32_f32( @llvm.riscv.vfmerge.nxv2f32.f32( + undef, %0, float %1, %2, @@ -356,6 +388,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv4f32.nxv4f32( + , , , , @@ -369,6 +402,7 @@ define @intrinsic_vfmerge_vvm_nxv4f32_nxv4f32_nxv4f32( @llvm.riscv.vfmerge.nxv4f32.nxv4f32( + undef, %0, %1, %2, @@ -378,6 +412,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv4f32.f32( + , , float, , @@ -391,6 +426,7 @@ define @intrinsic_vfmerge_vfm_nxv4f32_nxv4f32_f32( @llvm.riscv.vfmerge.nxv4f32.f32( + undef, %0, float %1, %2, @@ -400,6 +436,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv8f32.nxv8f32( + , , , , @@ -413,6 +450,7 @@ define @intrinsic_vfmerge_vvm_nxv8f32_nxv8f32_nxv8f32( @llvm.riscv.vfmerge.nxv8f32.nxv8f32( + undef, %0, %1, %2, @@ -422,6 +460,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv8f32.f32( + , , float, , @@ -435,6 +474,7 @@ define @intrinsic_vfmerge_vfm_nxv8f32_nxv8f32_f32( @llvm.riscv.vfmerge.nxv8f32.f32( + undef, %0, float %1, %2, @@ -444,6 +484,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv16f32.nxv16f32( + , , , , @@ -457,6 +498,7 @@ define @intrinsic_vfmerge_vvm_nxv16f32_nxv16f32_nxv16f32(< ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmerge.nxv16f32.nxv16f32( + undef, %0, %1, %2, @@ -466,6 +508,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv16f32.f32( + , , float, , @@ -479,6 +522,7 @@ define @intrinsic_vfmerge_vfm_nxv16f32_nxv16f32_f32( @llvm.riscv.vfmerge.nxv16f32.f32( + undef, %0, float %1, %2, @@ -488,6 +532,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv1f64.nxv1f64( + , , , , @@ -501,6 +546,7 @@ define @intrinsic_vfmerge_vvm_nxv1f64_nxv1f64_nxv1f64( @llvm.riscv.vfmerge.nxv1f64.nxv1f64( + undef, %0, %1, %2, @@ -510,6 +556,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv1f64.f64( + , , double, , @@ -523,6 +570,7 @@ define @intrinsic_vfmerge_vfm_nxv1f64_nxv1f64_f64( @llvm.riscv.vfmerge.nxv1f64.f64( + undef, %0, double %1, %2, @@ -532,6 +580,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv2f64.nxv2f64( + , , , , @@ -545,6 +594,7 @@ 
define @intrinsic_vfmerge_vvm_nxv2f64_nxv2f64_nxv2f64( @llvm.riscv.vfmerge.nxv2f64.nxv2f64( + undef, %0, %1, %2, @@ -554,6 +604,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv2f64.f64( + , , double, , @@ -567,6 +618,7 @@ define @intrinsic_vfmerge_vfm_nxv2f64_nxv2f64_f64( @llvm.riscv.vfmerge.nxv2f64.f64( + undef, %0, double %1, %2, @@ -576,6 +628,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv4f64.nxv4f64( + , , , , @@ -589,6 +642,7 @@ define @intrinsic_vfmerge_vvm_nxv4f64_nxv4f64_nxv4f64( @llvm.riscv.vfmerge.nxv4f64.nxv4f64( + undef, %0, %1, %2, @@ -598,6 +652,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv4f64.f64( + , , double, , @@ -611,6 +666,7 @@ define @intrinsic_vfmerge_vfm_nxv4f64_nxv4f64_f64( @llvm.riscv.vfmerge.nxv4f64.f64( + undef, %0, double %1, %2, @@ -620,6 +676,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv8f64.nxv8f64( + , , , , @@ -633,6 +690,7 @@ define @intrinsic_vfmerge_vvm_nxv8f64_nxv8f64_nxv8f64( @llvm.riscv.vfmerge.nxv8f64.nxv8f64( + undef, %0, %1, %2, @@ -642,6 +700,7 @@ entry: } declare @llvm.riscv.vfmerge.nxv8f64.f64( + , , double, , @@ -655,6 +714,7 @@ define @intrinsic_vfmerge_vfm_nxv8f64_nxv8f64_f64( @llvm.riscv.vfmerge.nxv8f64.f64( + undef, %0, double %1, %2, @@ -671,6 +731,7 @@ define @intrinsic_vfmerge_vzm_nxv1f16_nxv1f16_f16( @llvm.riscv.vfmerge.nxv1f16.f16( + undef, %0, half zeroinitializer, %1, @@ -687,6 +748,7 @@ define @intrinsic_vfmerge_vzm_nxv2f16_nxv2f16_f16( @llvm.riscv.vfmerge.nxv2f16.f16( + undef, %0, half zeroinitializer, %1, @@ -703,6 +765,7 @@ define @intrinsic_vfmerge_vzm_nxv4f16_nxv4f16_f16( @llvm.riscv.vfmerge.nxv4f16.f16( + undef, %0, half zeroinitializer, %1, @@ -719,6 +782,7 @@ define @intrinsic_vfmerge_vzm_nxv8f16_nxv8f16_f16( @llvm.riscv.vfmerge.nxv8f16.f16( + undef, %0, half zeroinitializer, %1, @@ -735,6 +799,7 @@ define @intrinsic_vfmerge_vzm_nxv16f16_nxv16f16_f16( @llvm.riscv.vfmerge.nxv16f16.f16( + undef, %0, half zeroinitializer, %1, @@ -751,6 +816,7 @@ define @intrinsic_vfmerge_vzm_nxv32f16_nxv32f16_f16( 
@llvm.riscv.vfmerge.nxv32f16.f16( + undef, %0, half zeroinitializer, %1, @@ -767,6 +833,7 @@ define @intrinsic_vfmerge_vzm_nxv1f32_nxv1f32_f32( @llvm.riscv.vfmerge.nxv1f32.f32( + undef, %0, float zeroinitializer, %1, @@ -783,6 +850,7 @@ define @intrinsic_vfmerge_vzm_nxv2f32_nxv2f32_f32( @llvm.riscv.vfmerge.nxv2f32.f32( + undef, %0, float zeroinitializer, %1, @@ -799,6 +867,7 @@ define @intrinsic_vfmerge_vzm_nxv4f32_nxv4f32_f32( @llvm.riscv.vfmerge.nxv4f32.f32( + undef, %0, float zeroinitializer, %1, @@ -815,6 +884,7 @@ define @intrinsic_vfmerge_vzm_nxv8f32_nxv8f32_f32( @llvm.riscv.vfmerge.nxv8f32.f32( + undef, %0, float zeroinitializer, %1, @@ -831,6 +901,7 @@ define @intrinsic_vfmerge_vzm_nxv16f32_nxv16f32_f32( @llvm.riscv.vfmerge.nxv16f32.f32( + undef, %0, float zeroinitializer, %1, @@ -847,6 +918,7 @@ define @intrinsic_vfmerge_vzm_nxv1f64_nxv1f64_f64( @llvm.riscv.vfmerge.nxv1f64.f64( + undef, %0, double zeroinitializer, %1, @@ -863,6 +935,7 @@ define @intrinsic_vfmerge_vzm_nxv2f64_nxv2f64_f64( @llvm.riscv.vfmerge.nxv2f64.f64( + undef, %0, double zeroinitializer, %1, @@ -879,6 +952,7 @@ define @intrinsic_vfmerge_vzm_nxv4f64_nxv4f64_f64( @llvm.riscv.vfmerge.nxv4f64.f64( + undef, %0, double zeroinitializer, %1, @@ -895,6 +969,7 @@ define @intrinsic_vfmerge_vzm_nxv8f64_nxv8f64_f64( @llvm.riscv.vfmerge.nxv8f64.f64( + undef, %0, double zeroinitializer, %1, diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmerge-rv32.ll index 25ca3b0631c08..11e92e65dbc9f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmerge-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmerge-rv32.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vmerge.nxv1i8.nxv1i8( + , , , , @@ -15,6 +16,7 @@ define @intrinsic_vmerge_vvm_nxv1i8_nxv1i8_nxv1i8( @llvm.riscv.vmerge.nxv1i8.nxv1i8( + undef, %0, %1, %2, @@ -24,6 +26,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i8.nxv2i8( + , , , , @@ -37,6 +40,7 
@@ define @intrinsic_vmerge_vvm_nxv2i8_nxv2i8_nxv2i8( @llvm.riscv.vmerge.nxv2i8.nxv2i8( + undef, %0, %1, %2, @@ -46,6 +50,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i8.nxv4i8( + , , , , @@ -59,6 +64,7 @@ define @intrinsic_vmerge_vvm_nxv4i8_nxv4i8_nxv4i8( @llvm.riscv.vmerge.nxv4i8.nxv4i8( + undef, %0, %1, %2, @@ -68,6 +74,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i8.nxv8i8( + , , , , @@ -81,6 +88,7 @@ define @intrinsic_vmerge_vvm_nxv8i8_nxv8i8_nxv8i8( @llvm.riscv.vmerge.nxv8i8.nxv8i8( + undef, %0, %1, %2, @@ -90,6 +98,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i8.nxv16i8( + , , , , @@ -103,6 +112,7 @@ define @intrinsic_vmerge_vvm_nxv16i8_nxv16i8_nxv16i8( @llvm.riscv.vmerge.nxv16i8.nxv16i8( + undef, %0, %1, %2, @@ -112,6 +122,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32i8.nxv32i8( + , , , , @@ -125,6 +136,7 @@ define @intrinsic_vmerge_vvm_nxv32i8_nxv32i8_nxv32i8( @llvm.riscv.vmerge.nxv32i8.nxv32i8( + undef, %0, %1, %2, @@ -134,6 +146,7 @@ entry: } declare @llvm.riscv.vmerge.nxv64i8.nxv64i8( + , , , , @@ -147,6 +160,7 @@ define @intrinsic_vmerge_vvm_nxv64i8_nxv64i8_nxv64i8( @llvm.riscv.vmerge.nxv64i8.nxv64i8( + undef, %0, %1, %2, @@ -156,6 +170,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i16.nxv1i16( + , , , , @@ -169,6 +184,7 @@ define @intrinsic_vmerge_vvm_nxv1i16_nxv1i16_nxv1i16( @llvm.riscv.vmerge.nxv1i16.nxv1i16( + undef, %0, %1, %2, @@ -178,6 +194,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i16.nxv2i16( + , , , , @@ -191,6 +208,7 @@ define @intrinsic_vmerge_vvm_nxv2i16_nxv2i16_nxv2i16( @llvm.riscv.vmerge.nxv2i16.nxv2i16( + undef, %0, %1, %2, @@ -200,6 +218,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i16.nxv4i16( + , , , , @@ -213,6 +232,7 @@ define @intrinsic_vmerge_vvm_nxv4i16_nxv4i16_nxv4i16( @llvm.riscv.vmerge.nxv4i16.nxv4i16( + undef, %0, %1, %2, @@ -222,6 +242,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i16.nxv8i16( + , , , , @@ -235,6 +256,7 @@ define @intrinsic_vmerge_vvm_nxv8i16_nxv8i16_nxv8i16( @llvm.riscv.vmerge.nxv8i16.nxv8i16( + undef, 
%0, %1, %2, @@ -244,6 +266,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i16.nxv16i16( + , , , , @@ -257,6 +280,7 @@ define @intrinsic_vmerge_vvm_nxv16i16_nxv16i16_nxv16i16( @llvm.riscv.vmerge.nxv16i16.nxv16i16( + undef, %0, %1, %2, @@ -266,6 +290,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32i16.nxv32i16( + , , , , @@ -279,6 +304,7 @@ define @intrinsic_vmerge_vvm_nxv32i16_nxv32i16_nxv32i16( @llvm.riscv.vmerge.nxv32i16.nxv32i16( + undef, %0, %1, %2, @@ -288,6 +314,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i32.nxv1i32( + , , , , @@ -301,6 +328,7 @@ define @intrinsic_vmerge_vvm_nxv1i32_nxv1i32_nxv1i32( @llvm.riscv.vmerge.nxv1i32.nxv1i32( + undef, %0, %1, %2, @@ -310,6 +338,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i32.nxv2i32( + , , , , @@ -323,6 +352,7 @@ define @intrinsic_vmerge_vvm_nxv2i32_nxv2i32_nxv2i32( @llvm.riscv.vmerge.nxv2i32.nxv2i32( + undef, %0, %1, %2, @@ -332,6 +362,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i32.nxv4i32( + , , , , @@ -345,6 +376,7 @@ define @intrinsic_vmerge_vvm_nxv4i32_nxv4i32_nxv4i32( @llvm.riscv.vmerge.nxv4i32.nxv4i32( + undef, %0, %1, %2, @@ -354,6 +386,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i32.nxv8i32( + , , , , @@ -367,6 +400,7 @@ define @intrinsic_vmerge_vvm_nxv8i32_nxv8i32_nxv8i32( @llvm.riscv.vmerge.nxv8i32.nxv8i32( + undef, %0, %1, %2, @@ -376,6 +410,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i32.nxv16i32( + , , , , @@ -389,6 +424,7 @@ define @intrinsic_vmerge_vvm_nxv16i32_nxv16i32_nxv16i32( @llvm.riscv.vmerge.nxv16i32.nxv16i32( + undef, %0, %1, %2, @@ -398,6 +434,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i64.nxv1i64( + , , , , @@ -411,6 +448,7 @@ define @intrinsic_vmerge_vvm_nxv1i64_nxv1i64_nxv1i64( @llvm.riscv.vmerge.nxv1i64.nxv1i64( + undef, %0, %1, %2, @@ -420,6 +458,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i64.nxv2i64( + , , , , @@ -433,6 +472,7 @@ define @intrinsic_vmerge_vvm_nxv2i64_nxv2i64_nxv2i64( @llvm.riscv.vmerge.nxv2i64.nxv2i64( + undef, %0, %1, %2, @@ -442,6 +482,7 @@ entry: } declare 
@llvm.riscv.vmerge.nxv4i64.nxv4i64( + , , , , @@ -455,6 +496,7 @@ define @intrinsic_vmerge_vvm_nxv4i64_nxv4i64_nxv4i64( @llvm.riscv.vmerge.nxv4i64.nxv4i64( + undef, %0, %1, %2, @@ -464,6 +506,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i64.nxv8i64( + , , , , @@ -477,6 +520,7 @@ define @intrinsic_vmerge_vvm_nxv8i64_nxv8i64_nxv8i64( @llvm.riscv.vmerge.nxv8i64.nxv8i64( + undef, %0, %1, %2, @@ -486,6 +530,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i8.i8( + , , i8, , @@ -499,6 +544,7 @@ define @intrinsic_vmerge_vxm_nxv1i8_nxv1i8_i8( @llvm.riscv.vmerge.nxv1i8.i8( + undef, %0, i8 %1, %2, @@ -508,6 +554,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i8.i8( + , , i8, , @@ -521,6 +568,7 @@ define @intrinsic_vmerge_vxm_nxv2i8_nxv2i8_i8( @llvm.riscv.vmerge.nxv2i8.i8( + undef, %0, i8 %1, %2, @@ -530,6 +578,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i8.i8( + , , i8, , @@ -543,6 +592,7 @@ define @intrinsic_vmerge_vxm_nxv4i8_nxv4i8_i8( @llvm.riscv.vmerge.nxv4i8.i8( + undef, %0, i8 %1, %2, @@ -552,6 +602,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i8.i8( + , , i8, , @@ -565,6 +616,7 @@ define @intrinsic_vmerge_vxm_nxv8i8_nxv8i8_i8( @llvm.riscv.vmerge.nxv8i8.i8( + undef, %0, i8 %1, %2, @@ -574,6 +626,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i8.i8( + , , i8, , @@ -587,6 +640,7 @@ define @intrinsic_vmerge_vxm_nxv16i8_nxv16i8_i8( @llvm.riscv.vmerge.nxv16i8.i8( + undef, %0, i8 %1, %2, @@ -596,6 +650,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32i8.i8( + , , i8, , @@ -609,6 +664,7 @@ define @intrinsic_vmerge_vxm_nxv32i8_nxv32i8_i8( @llvm.riscv.vmerge.nxv32i8.i8( + undef, %0, i8 %1, %2, @@ -618,6 +674,7 @@ entry: } declare @llvm.riscv.vmerge.nxv64i8.i8( + , , i8, , @@ -631,6 +688,7 @@ define @intrinsic_vmerge_vxm_nxv64i8_nxv64i8_i8( @llvm.riscv.vmerge.nxv64i8.i8( + undef, %0, i8 %1, %2, @@ -640,6 +698,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i16.i16( + , , i16, , @@ -653,6 +712,7 @@ define @intrinsic_vmerge_vxm_nxv1i16_nxv1i16_i16( @llvm.riscv.vmerge.nxv1i16.i16( + undef, %0, 
i16 %1, %2, @@ -662,6 +722,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i16.i16( + , , i16, , @@ -675,6 +736,7 @@ define @intrinsic_vmerge_vxm_nxv2i16_nxv2i16_i16( @llvm.riscv.vmerge.nxv2i16.i16( + undef, %0, i16 %1, %2, @@ -684,6 +746,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i16.i16( + , , i16, , @@ -697,6 +760,7 @@ define @intrinsic_vmerge_vxm_nxv4i16_nxv4i16_i16( @llvm.riscv.vmerge.nxv4i16.i16( + undef, %0, i16 %1, %2, @@ -706,6 +770,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i16.i16( + , , i16, , @@ -719,6 +784,7 @@ define @intrinsic_vmerge_vxm_nxv8i16_nxv8i16_i16( @llvm.riscv.vmerge.nxv8i16.i16( + undef, %0, i16 %1, %2, @@ -728,6 +794,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i16.i16( + , , i16, , @@ -741,6 +808,7 @@ define @intrinsic_vmerge_vxm_nxv16i16_nxv16i16_i16( @llvm.riscv.vmerge.nxv16i16.i16( + undef, %0, i16 %1, %2, @@ -750,6 +818,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32i16.i16( + , , i16, , @@ -763,6 +832,7 @@ define @intrinsic_vmerge_vxm_nxv32i16_nxv32i16_i16( @llvm.riscv.vmerge.nxv32i16.i16( + undef, %0, i16 %1, %2, @@ -772,6 +842,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i32.i32( + , , i32, , @@ -785,6 +856,7 @@ define @intrinsic_vmerge_vxm_nxv1i32_nxv1i32_i32( @llvm.riscv.vmerge.nxv1i32.i32( + undef, %0, i32 %1, %2, @@ -794,6 +866,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i32.i32( + , , i32, , @@ -807,6 +880,7 @@ define @intrinsic_vmerge_vxm_nxv2i32_nxv2i32_i32( @llvm.riscv.vmerge.nxv2i32.i32( + undef, %0, i32 %1, %2, @@ -816,6 +890,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i32.i32( + , , i32, , @@ -829,6 +904,7 @@ define @intrinsic_vmerge_vxm_nxv4i32_nxv4i32_i32( @llvm.riscv.vmerge.nxv4i32.i32( + undef, %0, i32 %1, %2, @@ -838,6 +914,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i32.i32( + , , i32, , @@ -851,6 +928,7 @@ define @intrinsic_vmerge_vxm_nxv8i32_nxv8i32_i32( @llvm.riscv.vmerge.nxv8i32.i32( + undef, %0, i32 %1, %2, @@ -860,6 +938,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i32.i32( + , , i32, , @@ -873,6 
+952,7 @@ define @intrinsic_vmerge_vxm_nxv16i32_nxv16i32_i32( @llvm.riscv.vmerge.nxv16i32.i32( + undef, %0, i32 %1, %2, @@ -882,6 +962,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i64.i64( + , , i64, , @@ -901,6 +982,7 @@ define @intrinsic_vmerge_vxm_nxv1i64_nxv1i64_i64( @llvm.riscv.vmerge.nxv1i64.i64( + undef, %0, i64 %1, %2, @@ -910,6 +992,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i64.i64( + , , i64, , @@ -929,6 +1012,7 @@ define @intrinsic_vmerge_vxm_nxv2i64_nxv2i64_i64( @llvm.riscv.vmerge.nxv2i64.i64( + undef, %0, i64 %1, %2, @@ -938,6 +1022,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i64.i64( + , , i64, , @@ -957,6 +1042,7 @@ define @intrinsic_vmerge_vxm_nxv4i64_nxv4i64_i64( @llvm.riscv.vmerge.nxv4i64.i64( + undef, %0, i64 %1, %2, @@ -966,6 +1052,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i64.i64( + , , i64, , @@ -985,6 +1072,7 @@ define @intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64( @llvm.riscv.vmerge.nxv8i64.i64( + undef, %0, i64 %1, %2, @@ -1001,6 +1089,7 @@ define @intrinsic_vmerge_vim_nxv1i8_nxv1i8_i8( @llvm.riscv.vmerge.nxv1i8.i8( + undef, %0, i8 9, %1, @@ -1017,6 +1106,7 @@ define @intrinsic_vmerge_vim_nxv2i8_nxv2i8_i8( @llvm.riscv.vmerge.nxv2i8.i8( + undef, %0, i8 9, %1, @@ -1033,6 +1123,7 @@ define @intrinsic_vmerge_vim_nxv4i8_nxv4i8_i8( @llvm.riscv.vmerge.nxv4i8.i8( + undef, %0, i8 9, %1, @@ -1049,6 +1140,7 @@ define @intrinsic_vmerge_vim_nxv8i8_nxv8i8_i8( @llvm.riscv.vmerge.nxv8i8.i8( + undef, %0, i8 9, %1, @@ -1065,6 +1157,7 @@ define @intrinsic_vmerge_vim_nxv16i8_nxv16i8_i8( @llvm.riscv.vmerge.nxv16i8.i8( + undef, %0, i8 9, %1, @@ -1081,6 +1174,7 @@ define @intrinsic_vmerge_vim_nxv32i8_nxv32i8_i8( @llvm.riscv.vmerge.nxv32i8.i8( + undef, %0, i8 9, %1, @@ -1097,6 +1191,7 @@ define @intrinsic_vmerge_vim_nxv64i8_nxv64i8_i8( @llvm.riscv.vmerge.nxv64i8.i8( + undef, %0, i8 9, %1, @@ -1113,6 +1208,7 @@ define @intrinsic_vmerge_vim_nxv1i16_nxv1i16_i16( @llvm.riscv.vmerge.nxv1i16.i16( + undef, %0, i16 9, %1, @@ -1129,6 +1225,7 @@ define 
@intrinsic_vmerge_vim_nxv2i16_nxv2i16_i16( @llvm.riscv.vmerge.nxv2i16.i16( + undef, %0, i16 9, %1, @@ -1145,6 +1242,7 @@ define @intrinsic_vmerge_vim_nxv4i16_nxv4i16_i16( @llvm.riscv.vmerge.nxv4i16.i16( + undef, %0, i16 9, %1, @@ -1161,6 +1259,7 @@ define @intrinsic_vmerge_vim_nxv8i16_nxv8i16_i16( @llvm.riscv.vmerge.nxv8i16.i16( + undef, %0, i16 9, %1, @@ -1177,6 +1276,7 @@ define @intrinsic_vmerge_vim_nxv16i16_nxv16i16_i16( @llvm.riscv.vmerge.nxv16i16.i16( + undef, %0, i16 9, %1, @@ -1193,6 +1293,7 @@ define @intrinsic_vmerge_vim_nxv32i16_nxv32i16_i16( @llvm.riscv.vmerge.nxv32i16.i16( + undef, %0, i16 9, %1, @@ -1209,6 +1310,7 @@ define @intrinsic_vmerge_vim_nxv1i32_nxv1i32_i32( @llvm.riscv.vmerge.nxv1i32.i32( + undef, %0, i32 9, %1, @@ -1225,6 +1327,7 @@ define @intrinsic_vmerge_vim_nxv2i32_nxv2i32_i32( @llvm.riscv.vmerge.nxv2i32.i32( + undef, %0, i32 9, %1, @@ -1241,6 +1344,7 @@ define @intrinsic_vmerge_vim_nxv4i32_nxv4i32_i32( @llvm.riscv.vmerge.nxv4i32.i32( + undef, %0, i32 9, %1, @@ -1257,6 +1361,7 @@ define @intrinsic_vmerge_vim_nxv8i32_nxv8i32_i32( @llvm.riscv.vmerge.nxv8i32.i32( + undef, %0, i32 9, %1, @@ -1273,6 +1378,7 @@ define @intrinsic_vmerge_vim_nxv16i32_nxv16i32_i32( @llvm.riscv.vmerge.nxv16i32.i32( + undef, %0, i32 9, %1, @@ -1289,6 +1395,7 @@ define @intrinsic_vmerge_vim_nxv1i64_nxv1i64_i64( @llvm.riscv.vmerge.nxv1i64.i64( + undef, %0, i64 9, %1, @@ -1305,6 +1412,7 @@ define @intrinsic_vmerge_vim_nxv2i64_nxv2i64_i64( @llvm.riscv.vmerge.nxv2i64.i64( + undef, %0, i64 9, %1, @@ -1321,6 +1429,7 @@ define @intrinsic_vmerge_vim_nxv4i64_nxv4i64_i64( @llvm.riscv.vmerge.nxv4i64.i64( + undef, %0, i64 9, %1, @@ -1337,6 +1446,7 @@ define @intrinsic_vmerge_vim_nxv8i64_nxv8i64_i64( @llvm.riscv.vmerge.nxv8i64.i64( + undef, %0, i64 9, %1, @@ -1346,6 +1456,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1f16.nxv1f16( + , , , , @@ -1359,6 +1470,7 @@ define @intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16( @llvm.riscv.vmerge.nxv1f16.nxv1f16( + undef, %0, %1, %2, @@ 
-1368,6 +1480,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2f16.nxv2f16( + , , , , @@ -1381,6 +1494,7 @@ define @intrinsic_vmerge_vvm_nxv2f16_nxv2f16_nxv2f16( @llvm.riscv.vmerge.nxv2f16.nxv2f16( + undef, %0, %1, %2, @@ -1390,6 +1504,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4f16.nxv4f16( + , , , , @@ -1403,6 +1518,7 @@ define @intrinsic_vmerge_vvm_nxv4f16_nxv4f16_nxv4f16( @llvm.riscv.vmerge.nxv4f16.nxv4f16( + undef, %0, %1, %2, @@ -1412,6 +1528,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8f16.nxv8f16( + , , , , @@ -1425,6 +1542,7 @@ define @intrinsic_vmerge_vvm_nxv8f16_nxv8f16_nxv8f16( @llvm.riscv.vmerge.nxv8f16.nxv8f16( + undef, %0, %1, %2, @@ -1434,6 +1552,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16f16.nxv16f16( + , , , , @@ -1447,6 +1566,7 @@ define @intrinsic_vmerge_vvm_nxv16f16_nxv16f16_nxv16f16( @llvm.riscv.vmerge.nxv16f16.nxv16f16( + undef, %0, %1, %2, @@ -1456,6 +1576,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32f16.nxv32f16( + , , , , @@ -1469,6 +1590,7 @@ define @intrinsic_vmerge_vvm_nxv32f16_nxv32f16_nxv32f16( @llvm.riscv.vmerge.nxv32f16.nxv32f16( + undef, %0, %1, %2, @@ -1478,6 +1600,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1f32.nxv1f32( + , , , , @@ -1491,6 +1614,7 @@ define @intrinsic_vmerge_vvm_nxv1f32_nxv1f32_nxv1f32( @llvm.riscv.vmerge.nxv1f32.nxv1f32( + undef, %0, %1, %2, @@ -1500,6 +1624,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2f32.nxv2f32( + , , , , @@ -1513,6 +1638,7 @@ define @intrinsic_vmerge_vvm_nxv2f32_nxv2f32_nxv2f32( @llvm.riscv.vmerge.nxv2f32.nxv2f32( + undef, %0, %1, %2, @@ -1522,6 +1648,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4f32.nxv4f32( + , , , , @@ -1535,6 +1662,7 @@ define @intrinsic_vmerge_vvm_nxv4f32_nxv4f32_nxv4f32( @llvm.riscv.vmerge.nxv4f32.nxv4f32( + undef, %0, %1, %2, @@ -1544,6 +1672,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8f32.nxv8f32( + , , , , @@ -1557,6 +1686,7 @@ define @intrinsic_vmerge_vvm_nxv8f32_nxv8f32_nxv8f32( @llvm.riscv.vmerge.nxv8f32.nxv8f32( + undef, %0, %1, %2, @@ -1566,6 +1696,7 @@ 
entry: } declare @llvm.riscv.vmerge.nxv16f32.nxv16f32( + , , , , @@ -1579,6 +1710,7 @@ define @intrinsic_vmerge_vvm_nxv16f32_nxv16f32_nxv16f32( @llvm.riscv.vmerge.nxv16f32.nxv16f32( + undef, %0, %1, %2, @@ -1588,6 +1720,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1f64.nxv1f64( + , , , , @@ -1601,6 +1734,7 @@ define @intrinsic_vmerge_vvm_nxv1f64_nxv1f64_nxv1f64( @llvm.riscv.vmerge.nxv1f64.nxv1f64( + undef, %0, %1, %2, @@ -1610,6 +1744,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2f64.nxv2f64( + , , , , @@ -1623,6 +1758,7 @@ define @intrinsic_vmerge_vvm_nxv2f64_nxv2f64_nxv2f64( @llvm.riscv.vmerge.nxv2f64.nxv2f64( + undef, %0, %1, %2, @@ -1632,6 +1768,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4f64.nxv4f64( + , , , , @@ -1645,6 +1782,7 @@ define @intrinsic_vmerge_vvm_nxv4f64_nxv4f64_nxv4f64( @llvm.riscv.vmerge.nxv4f64.nxv4f64( + undef, %0, %1, %2, @@ -1654,6 +1792,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8f64.nxv8f64( + , , , , @@ -1667,6 +1806,7 @@ define @intrinsic_vmerge_vvm_nxv8f64_nxv8f64_nxv8f64( @llvm.riscv.vmerge.nxv8f64.nxv8f64( + undef, %0, %1, %2, diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmerge-rv64.ll index 2360985310ec8..60c56b6f0ef82 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmerge-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmerge-rv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vmerge.nxv1i8.nxv1i8( + , , , , @@ -15,6 +16,7 @@ define @intrinsic_vmerge_vvm_nxv1i8_nxv1i8_nxv1i8( @llvm.riscv.vmerge.nxv1i8.nxv1i8( + undef, %0, %1, %2, @@ -24,6 +26,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i8.nxv2i8( + , , , , @@ -37,6 +40,7 @@ define @intrinsic_vmerge_vvm_nxv2i8_nxv2i8_nxv2i8( @llvm.riscv.vmerge.nxv2i8.nxv2i8( + undef, %0, %1, %2, @@ -46,6 +50,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i8.nxv4i8( + , , , , @@ -59,6 +64,7 @@ define @intrinsic_vmerge_vvm_nxv4i8_nxv4i8_nxv4i8( @llvm.riscv.vmerge.nxv4i8.nxv4i8( + undef, 
%0, %1, %2, @@ -68,6 +74,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i8.nxv8i8( + , , , , @@ -81,6 +88,7 @@ define @intrinsic_vmerge_vvm_nxv8i8_nxv8i8_nxv8i8( @llvm.riscv.vmerge.nxv8i8.nxv8i8( + undef, %0, %1, %2, @@ -90,6 +98,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i8.nxv16i8( + , , , , @@ -103,6 +112,7 @@ define @intrinsic_vmerge_vvm_nxv16i8_nxv16i8_nxv16i8( @llvm.riscv.vmerge.nxv16i8.nxv16i8( + undef, %0, %1, %2, @@ -112,6 +122,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32i8.nxv32i8( + , , , , @@ -125,6 +136,7 @@ define @intrinsic_vmerge_vvm_nxv32i8_nxv32i8_nxv32i8( @llvm.riscv.vmerge.nxv32i8.nxv32i8( + undef, %0, %1, %2, @@ -134,6 +146,7 @@ entry: } declare @llvm.riscv.vmerge.nxv64i8.nxv64i8( + , , , , @@ -147,6 +160,7 @@ define @intrinsic_vmerge_vvm_nxv64i8_nxv64i8_nxv64i8( @llvm.riscv.vmerge.nxv64i8.nxv64i8( + undef, %0, %1, %2, @@ -156,6 +170,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i16.nxv1i16( + , , , , @@ -169,6 +184,7 @@ define @intrinsic_vmerge_vvm_nxv1i16_nxv1i16_nxv1i16( @llvm.riscv.vmerge.nxv1i16.nxv1i16( + undef, %0, %1, %2, @@ -178,6 +194,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i16.nxv2i16( + , , , , @@ -191,6 +208,7 @@ define @intrinsic_vmerge_vvm_nxv2i16_nxv2i16_nxv2i16( @llvm.riscv.vmerge.nxv2i16.nxv2i16( + undef, %0, %1, %2, @@ -200,6 +218,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i16.nxv4i16( + , , , , @@ -213,6 +232,7 @@ define @intrinsic_vmerge_vvm_nxv4i16_nxv4i16_nxv4i16( @llvm.riscv.vmerge.nxv4i16.nxv4i16( + undef, %0, %1, %2, @@ -222,6 +242,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i16.nxv8i16( + , , , , @@ -235,6 +256,7 @@ define @intrinsic_vmerge_vvm_nxv8i16_nxv8i16_nxv8i16( @llvm.riscv.vmerge.nxv8i16.nxv8i16( + undef, %0, %1, %2, @@ -244,6 +266,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i16.nxv16i16( + , , , , @@ -257,6 +280,7 @@ define @intrinsic_vmerge_vvm_nxv16i16_nxv16i16_nxv16i16( @llvm.riscv.vmerge.nxv16i16.nxv16i16( + undef, %0, %1, %2, @@ -266,6 +290,7 @@ entry: } declare 
@llvm.riscv.vmerge.nxv32i16.nxv32i16( + , , , , @@ -279,6 +304,7 @@ define @intrinsic_vmerge_vvm_nxv32i16_nxv32i16_nxv32i16( @llvm.riscv.vmerge.nxv32i16.nxv32i16( + undef, %0, %1, %2, @@ -288,6 +314,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i32.nxv1i32( + , , , , @@ -301,6 +328,7 @@ define @intrinsic_vmerge_vvm_nxv1i32_nxv1i32_nxv1i32( @llvm.riscv.vmerge.nxv1i32.nxv1i32( + undef, %0, %1, %2, @@ -310,6 +338,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i32.nxv2i32( + , , , , @@ -323,6 +352,7 @@ define @intrinsic_vmerge_vvm_nxv2i32_nxv2i32_nxv2i32( @llvm.riscv.vmerge.nxv2i32.nxv2i32( + undef, %0, %1, %2, @@ -332,6 +362,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i32.nxv4i32( + , , , , @@ -345,6 +376,7 @@ define @intrinsic_vmerge_vvm_nxv4i32_nxv4i32_nxv4i32( @llvm.riscv.vmerge.nxv4i32.nxv4i32( + undef, %0, %1, %2, @@ -354,6 +386,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i32.nxv8i32( + , , , , @@ -367,6 +400,7 @@ define @intrinsic_vmerge_vvm_nxv8i32_nxv8i32_nxv8i32( @llvm.riscv.vmerge.nxv8i32.nxv8i32( + undef, %0, %1, %2, @@ -376,6 +410,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i32.nxv16i32( + , , , , @@ -389,6 +424,7 @@ define @intrinsic_vmerge_vvm_nxv16i32_nxv16i32_nxv16i32( @llvm.riscv.vmerge.nxv16i32.nxv16i32( + undef, %0, %1, %2, @@ -398,6 +434,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i64.nxv1i64( + , , , , @@ -411,6 +448,7 @@ define @intrinsic_vmerge_vvm_nxv1i64_nxv1i64_nxv1i64( @llvm.riscv.vmerge.nxv1i64.nxv1i64( + undef, %0, %1, %2, @@ -420,6 +458,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i64.nxv2i64( + , , , , @@ -433,6 +472,7 @@ define @intrinsic_vmerge_vvm_nxv2i64_nxv2i64_nxv2i64( @llvm.riscv.vmerge.nxv2i64.nxv2i64( + undef, %0, %1, %2, @@ -442,6 +482,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i64.nxv4i64( + , , , , @@ -455,6 +496,7 @@ define @intrinsic_vmerge_vvm_nxv4i64_nxv4i64_nxv4i64( @llvm.riscv.vmerge.nxv4i64.nxv4i64( + undef, %0, %1, %2, @@ -464,6 +506,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i64.nxv8i64( + , , , , @@ -477,6 
+520,7 @@ define @intrinsic_vmerge_vvm_nxv8i64_nxv8i64_nxv8i64( @llvm.riscv.vmerge.nxv8i64.nxv8i64( + undef, %0, %1, %2, @@ -486,6 +530,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i8.i8( + , , i8, , @@ -499,6 +544,7 @@ define @intrinsic_vmerge_vxm_nxv1i8_nxv1i8_i8( @llvm.riscv.vmerge.nxv1i8.i8( + undef, %0, i8 %1, %2, @@ -508,6 +554,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i8.i8( + , , i8, , @@ -521,6 +568,7 @@ define @intrinsic_vmerge_vxm_nxv2i8_nxv2i8_i8( @llvm.riscv.vmerge.nxv2i8.i8( + undef, %0, i8 %1, %2, @@ -530,6 +578,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i8.i8( + , , i8, , @@ -543,6 +592,7 @@ define @intrinsic_vmerge_vxm_nxv4i8_nxv4i8_i8( @llvm.riscv.vmerge.nxv4i8.i8( + undef, %0, i8 %1, %2, @@ -552,6 +602,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i8.i8( + , , i8, , @@ -565,6 +616,7 @@ define @intrinsic_vmerge_vxm_nxv8i8_nxv8i8_i8( @llvm.riscv.vmerge.nxv8i8.i8( + undef, %0, i8 %1, %2, @@ -574,6 +626,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i8.i8( + , , i8, , @@ -587,6 +640,7 @@ define @intrinsic_vmerge_vxm_nxv16i8_nxv16i8_i8( @llvm.riscv.vmerge.nxv16i8.i8( + undef, %0, i8 %1, %2, @@ -596,6 +650,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32i8.i8( + , , i8, , @@ -609,6 +664,7 @@ define @intrinsic_vmerge_vxm_nxv32i8_nxv32i8_i8( @llvm.riscv.vmerge.nxv32i8.i8( + undef, %0, i8 %1, %2, @@ -618,6 +674,7 @@ entry: } declare @llvm.riscv.vmerge.nxv64i8.i8( + , , i8, , @@ -631,6 +688,7 @@ define @intrinsic_vmerge_vxm_nxv64i8_nxv64i8_i8( @llvm.riscv.vmerge.nxv64i8.i8( + undef, %0, i8 %1, %2, @@ -640,6 +698,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i16.i16( + , , i16, , @@ -653,6 +712,7 @@ define @intrinsic_vmerge_vxm_nxv1i16_nxv1i16_i16( @llvm.riscv.vmerge.nxv1i16.i16( + undef, %0, i16 %1, %2, @@ -662,6 +722,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i16.i16( + , , i16, , @@ -675,6 +736,7 @@ define @intrinsic_vmerge_vxm_nxv2i16_nxv2i16_i16( @llvm.riscv.vmerge.nxv2i16.i16( + undef, %0, i16 %1, %2, @@ -684,6 +746,7 @@ entry: } declare 
@llvm.riscv.vmerge.nxv4i16.i16( + , , i16, , @@ -697,6 +760,7 @@ define @intrinsic_vmerge_vxm_nxv4i16_nxv4i16_i16( @llvm.riscv.vmerge.nxv4i16.i16( + undef, %0, i16 %1, %2, @@ -706,6 +770,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i16.i16( + , , i16, , @@ -719,6 +784,7 @@ define @intrinsic_vmerge_vxm_nxv8i16_nxv8i16_i16( @llvm.riscv.vmerge.nxv8i16.i16( + undef, %0, i16 %1, %2, @@ -728,6 +794,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i16.i16( + , , i16, , @@ -741,6 +808,7 @@ define @intrinsic_vmerge_vxm_nxv16i16_nxv16i16_i16( @llvm.riscv.vmerge.nxv16i16.i16( + undef, %0, i16 %1, %2, @@ -750,6 +818,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32i16.i16( + , , i16, , @@ -763,6 +832,7 @@ define @intrinsic_vmerge_vxm_nxv32i16_nxv32i16_i16( @llvm.riscv.vmerge.nxv32i16.i16( + undef, %0, i16 %1, %2, @@ -772,6 +842,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i32.i32( + , , i32, , @@ -785,6 +856,7 @@ define @intrinsic_vmerge_vxm_nxv1i32_nxv1i32_i32( @llvm.riscv.vmerge.nxv1i32.i32( + undef, %0, i32 %1, %2, @@ -794,6 +866,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i32.i32( + , , i32, , @@ -807,6 +880,7 @@ define @intrinsic_vmerge_vxm_nxv2i32_nxv2i32_i32( @llvm.riscv.vmerge.nxv2i32.i32( + undef, %0, i32 %1, %2, @@ -816,6 +890,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i32.i32( + , , i32, , @@ -829,6 +904,7 @@ define @intrinsic_vmerge_vxm_nxv4i32_nxv4i32_i32( @llvm.riscv.vmerge.nxv4i32.i32( + undef, %0, i32 %1, %2, @@ -838,6 +914,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i32.i32( + , , i32, , @@ -851,6 +928,7 @@ define @intrinsic_vmerge_vxm_nxv8i32_nxv8i32_i32( @llvm.riscv.vmerge.nxv8i32.i32( + undef, %0, i32 %1, %2, @@ -860,6 +938,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16i32.i32( + , , i32, , @@ -873,6 +952,7 @@ define @intrinsic_vmerge_vxm_nxv16i32_nxv16i32_i32( @llvm.riscv.vmerge.nxv16i32.i32( + undef, %0, i32 %1, %2, @@ -882,6 +962,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1i64.i64( + , , i64, , @@ -895,6 +976,7 @@ define 
@intrinsic_vmerge_vxm_nxv1i64_nxv1i64_i64( @llvm.riscv.vmerge.nxv1i64.i64( + undef, %0, i64 %1, %2, @@ -904,6 +986,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2i64.i64( + , , i64, , @@ -917,6 +1000,7 @@ define @intrinsic_vmerge_vxm_nxv2i64_nxv2i64_i64( @llvm.riscv.vmerge.nxv2i64.i64( + undef, %0, i64 %1, %2, @@ -926,6 +1010,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4i64.i64( + , , i64, , @@ -939,6 +1024,7 @@ define @intrinsic_vmerge_vxm_nxv4i64_nxv4i64_i64( @llvm.riscv.vmerge.nxv4i64.i64( + undef, %0, i64 %1, %2, @@ -948,6 +1034,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8i64.i64( + , , i64, , @@ -961,6 +1048,7 @@ define @intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64( @llvm.riscv.vmerge.nxv8i64.i64( + undef, %0, i64 %1, %2, @@ -977,6 +1065,7 @@ define @intrinsic_vmerge_vim_nxv1i8_nxv1i8_i8( @llvm.riscv.vmerge.nxv1i8.i8( + undef, %0, i8 9, %1, @@ -993,6 +1082,7 @@ define @intrinsic_vmerge_vim_nxv2i8_nxv2i8_i8( @llvm.riscv.vmerge.nxv2i8.i8( + undef, %0, i8 9, %1, @@ -1009,6 +1099,7 @@ define @intrinsic_vmerge_vim_nxv4i8_nxv4i8_i8( @llvm.riscv.vmerge.nxv4i8.i8( + undef, %0, i8 9, %1, @@ -1025,6 +1116,7 @@ define @intrinsic_vmerge_vim_nxv8i8_nxv8i8_i8( @llvm.riscv.vmerge.nxv8i8.i8( + undef, %0, i8 9, %1, @@ -1041,6 +1133,7 @@ define @intrinsic_vmerge_vim_nxv16i8_nxv16i8_i8( @llvm.riscv.vmerge.nxv16i8.i8( + undef, %0, i8 9, %1, @@ -1057,6 +1150,7 @@ define @intrinsic_vmerge_vim_nxv32i8_nxv32i8_i8( @llvm.riscv.vmerge.nxv32i8.i8( + undef, %0, i8 9, %1, @@ -1073,6 +1167,7 @@ define @intrinsic_vmerge_vim_nxv64i8_nxv64i8_i8( @llvm.riscv.vmerge.nxv64i8.i8( + undef, %0, i8 9, %1, @@ -1089,6 +1184,7 @@ define @intrinsic_vmerge_vim_nxv1i16_nxv1i16_i16( @llvm.riscv.vmerge.nxv1i16.i16( + undef, %0, i16 9, %1, @@ -1105,6 +1201,7 @@ define @intrinsic_vmerge_vim_nxv2i16_nxv2i16_i16( @llvm.riscv.vmerge.nxv2i16.i16( + undef, %0, i16 9, %1, @@ -1121,6 +1218,7 @@ define @intrinsic_vmerge_vim_nxv4i16_nxv4i16_i16( @llvm.riscv.vmerge.nxv4i16.i16( + undef, %0, i16 9, %1, @@ -1137,6 +1235,7 @@ 
define @intrinsic_vmerge_vim_nxv8i16_nxv8i16_i16( @llvm.riscv.vmerge.nxv8i16.i16( + undef, %0, i16 9, %1, @@ -1153,6 +1252,7 @@ define @intrinsic_vmerge_vim_nxv16i16_nxv16i16_i16( @llvm.riscv.vmerge.nxv16i16.i16( + undef, %0, i16 9, %1, @@ -1169,6 +1269,7 @@ define @intrinsic_vmerge_vim_nxv32i16_nxv32i16_i16( @llvm.riscv.vmerge.nxv32i16.i16( + undef, %0, i16 9, %1, @@ -1185,6 +1286,7 @@ define @intrinsic_vmerge_vim_nxv1i32_nxv1i32_i32( @llvm.riscv.vmerge.nxv1i32.i32( + undef, %0, i32 9, %1, @@ -1201,6 +1303,7 @@ define @intrinsic_vmerge_vim_nxv2i32_nxv2i32_i32( @llvm.riscv.vmerge.nxv2i32.i32( + undef, %0, i32 9, %1, @@ -1217,6 +1320,7 @@ define @intrinsic_vmerge_vim_nxv4i32_nxv4i32_i32( @llvm.riscv.vmerge.nxv4i32.i32( + undef, %0, i32 9, %1, @@ -1233,6 +1337,7 @@ define @intrinsic_vmerge_vim_nxv8i32_nxv8i32_i32( @llvm.riscv.vmerge.nxv8i32.i32( + undef, %0, i32 9, %1, @@ -1249,6 +1354,7 @@ define @intrinsic_vmerge_vim_nxv16i32_nxv16i32_i32( @llvm.riscv.vmerge.nxv16i32.i32( + undef, %0, i32 9, %1, @@ -1265,6 +1371,7 @@ define @intrinsic_vmerge_vim_nxv1i64_nxv1i64_i64( @llvm.riscv.vmerge.nxv1i64.i64( + undef, %0, i64 9, %1, @@ -1281,6 +1388,7 @@ define @intrinsic_vmerge_vim_nxv2i64_nxv2i64_i64( @llvm.riscv.vmerge.nxv2i64.i64( + undef, %0, i64 9, %1, @@ -1297,6 +1405,7 @@ define @intrinsic_vmerge_vim_nxv4i64_nxv4i64_i64( @llvm.riscv.vmerge.nxv4i64.i64( + undef, %0, i64 9, %1, @@ -1313,6 +1422,7 @@ define @intrinsic_vmerge_vim_nxv8i64_nxv8i64_i64( @llvm.riscv.vmerge.nxv8i64.i64( + undef, %0, i64 9, %1, @@ -1322,6 +1432,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1f16.nxv1f16( + , , , , @@ -1335,6 +1446,7 @@ define @intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16( @llvm.riscv.vmerge.nxv1f16.nxv1f16( + undef, %0, %1, %2, @@ -1344,6 +1456,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2f16.nxv2f16( + , , , , @@ -1357,6 +1470,7 @@ define @intrinsic_vmerge_vvm_nxv2f16_nxv2f16_nxv2f16( @llvm.riscv.vmerge.nxv2f16.nxv2f16( + undef, %0, %1, %2, @@ -1366,6 +1480,7 @@ entry: } declare 
@llvm.riscv.vmerge.nxv4f16.nxv4f16( + , , , , @@ -1379,6 +1494,7 @@ define @intrinsic_vmerge_vvm_nxv4f16_nxv4f16_nxv4f16( @llvm.riscv.vmerge.nxv4f16.nxv4f16( + undef, %0, %1, %2, @@ -1388,6 +1504,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8f16.nxv8f16( + , , , , @@ -1401,6 +1518,7 @@ define @intrinsic_vmerge_vvm_nxv8f16_nxv8f16_nxv8f16( @llvm.riscv.vmerge.nxv8f16.nxv8f16( + undef, %0, %1, %2, @@ -1410,6 +1528,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16f16.nxv16f16( + , , , , @@ -1423,6 +1542,7 @@ define @intrinsic_vmerge_vvm_nxv16f16_nxv16f16_nxv16f16( @llvm.riscv.vmerge.nxv16f16.nxv16f16( + undef, %0, %1, %2, @@ -1432,6 +1552,7 @@ entry: } declare @llvm.riscv.vmerge.nxv32f16.nxv32f16( + , , , , @@ -1445,6 +1566,7 @@ define @intrinsic_vmerge_vvm_nxv32f16_nxv32f16_nxv32f16( @llvm.riscv.vmerge.nxv32f16.nxv32f16( + undef, %0, %1, %2, @@ -1454,6 +1576,7 @@ entry: } declare @llvm.riscv.vmerge.nxv1f32.nxv1f32( + , , , , @@ -1467,6 +1590,7 @@ define @intrinsic_vmerge_vvm_nxv1f32_nxv1f32_nxv1f32( @llvm.riscv.vmerge.nxv1f32.nxv1f32( + undef, %0, %1, %2, @@ -1476,6 +1600,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2f32.nxv2f32( + , , , , @@ -1489,6 +1614,7 @@ define @intrinsic_vmerge_vvm_nxv2f32_nxv2f32_nxv2f32( @llvm.riscv.vmerge.nxv2f32.nxv2f32( + undef, %0, %1, %2, @@ -1498,6 +1624,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4f32.nxv4f32( + , , , , @@ -1511,6 +1638,7 @@ define @intrinsic_vmerge_vvm_nxv4f32_nxv4f32_nxv4f32( @llvm.riscv.vmerge.nxv4f32.nxv4f32( + undef, %0, %1, %2, @@ -1520,6 +1648,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8f32.nxv8f32( + , , , , @@ -1533,6 +1662,7 @@ define @intrinsic_vmerge_vvm_nxv8f32_nxv8f32_nxv8f32( @llvm.riscv.vmerge.nxv8f32.nxv8f32( + undef, %0, %1, %2, @@ -1542,6 +1672,7 @@ entry: } declare @llvm.riscv.vmerge.nxv16f32.nxv16f32( + , , , , @@ -1555,6 +1686,7 @@ define @intrinsic_vmerge_vvm_nxv16f32_nxv16f32_nxv16f32( @llvm.riscv.vmerge.nxv16f32.nxv16f32( + undef, %0, %1, %2, @@ -1564,6 +1696,7 @@ entry: } declare 
@llvm.riscv.vmerge.nxv1f64.nxv1f64( + , , , , @@ -1577,6 +1710,7 @@ define @intrinsic_vmerge_vvm_nxv1f64_nxv1f64_nxv1f64( @llvm.riscv.vmerge.nxv1f64.nxv1f64( + undef, %0, %1, %2, @@ -1586,6 +1720,7 @@ entry: } declare @llvm.riscv.vmerge.nxv2f64.nxv2f64( + , , , , @@ -1599,6 +1734,7 @@ define @intrinsic_vmerge_vvm_nxv2f64_nxv2f64_nxv2f64( @llvm.riscv.vmerge.nxv2f64.nxv2f64( + undef, %0, %1, %2, @@ -1608,6 +1744,7 @@ entry: } declare @llvm.riscv.vmerge.nxv4f64.nxv4f64( + , , , , @@ -1621,6 +1758,7 @@ define @intrinsic_vmerge_vvm_nxv4f64_nxv4f64_nxv4f64( @llvm.riscv.vmerge.nxv4f64.nxv4f64( + undef, %0, %1, %2, @@ -1630,6 +1768,7 @@ entry: } declare @llvm.riscv.vmerge.nxv8f64.nxv8f64( + , , , , @@ -1643,6 +1782,7 @@ define @intrinsic_vmerge_vvm_nxv8f64_nxv8f64_nxv8f64( @llvm.riscv.vmerge.nxv8f64.nxv8f64( + undef, %0, %1, %2, diff --git a/llvm/test/CodeGen/RISCV/rvv/vsbc-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsbc-rv32.ll index 2c5ef91cf9c20..b8818b2fc97ba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsbc-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsbc-rv32.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vsbc.nxv1i8.nxv1i8( + , , , , @@ -15,6 +16,7 @@ define @intrinsic_vsbc_vvm_nxv1i8_nxv1i8_nxv1i8( @llvm.riscv.vsbc.nxv1i8.nxv1i8( + undef, %0, %1, %2, @@ -24,6 +26,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i8.nxv2i8( + , , , , @@ -37,6 +40,7 @@ define @intrinsic_vsbc_vvm_nxv2i8_nxv2i8_nxv2i8( @llvm.riscv.vsbc.nxv2i8.nxv2i8( + undef, %0, %1, %2, @@ -46,6 +50,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i8.nxv4i8( + , , , , @@ -59,6 +64,7 @@ define @intrinsic_vsbc_vvm_nxv4i8_nxv4i8_nxv4i8( @llvm.riscv.vsbc.nxv4i8.nxv4i8( + undef, %0, %1, %2, @@ -68,6 +74,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i8.nxv8i8( + , , , , @@ -81,6 +88,7 @@ define @intrinsic_vsbc_vvm_nxv8i8_nxv8i8_nxv8i8( @llvm.riscv.vsbc.nxv8i8.nxv8i8( + undef, %0, %1, %2, @@ -90,6 +98,7 @@ entry: } declare 
@llvm.riscv.vsbc.nxv16i8.nxv16i8( + , , , , @@ -103,6 +112,7 @@ define @intrinsic_vsbc_vvm_nxv16i8_nxv16i8_nxv16i8( @llvm.riscv.vsbc.nxv16i8.nxv16i8( + undef, %0, %1, %2, @@ -112,6 +122,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i8.nxv32i8( + , , , , @@ -125,6 +136,7 @@ define @intrinsic_vsbc_vvm_nxv32i8_nxv32i8_nxv32i8( @llvm.riscv.vsbc.nxv32i8.nxv32i8( + undef, %0, %1, %2, @@ -134,6 +146,7 @@ entry: } declare @llvm.riscv.vsbc.nxv64i8.nxv64i8( + , , , , @@ -147,6 +160,7 @@ define @intrinsic_vsbc_vvm_nxv64i8_nxv64i8_nxv64i8( @llvm.riscv.vsbc.nxv64i8.nxv64i8( + undef, %0, %1, %2, @@ -156,6 +170,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i16.nxv1i16( + , , , , @@ -169,6 +184,7 @@ define @intrinsic_vsbc_vvm_nxv1i16_nxv1i16_nxv1i16( @llvm.riscv.vsbc.nxv1i16.nxv1i16( + undef, %0, %1, %2, @@ -178,6 +194,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i16.nxv2i16( + , , , , @@ -191,6 +208,7 @@ define @intrinsic_vsbc_vvm_nxv2i16_nxv2i16_nxv2i16( @llvm.riscv.vsbc.nxv2i16.nxv2i16( + undef, %0, %1, %2, @@ -200,6 +218,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i16.nxv4i16( + , , , , @@ -213,6 +232,7 @@ define @intrinsic_vsbc_vvm_nxv4i16_nxv4i16_nxv4i16( @llvm.riscv.vsbc.nxv4i16.nxv4i16( + undef, %0, %1, %2, @@ -222,6 +242,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i16.nxv8i16( + , , , , @@ -235,6 +256,7 @@ define @intrinsic_vsbc_vvm_nxv8i16_nxv8i16_nxv8i16( @llvm.riscv.vsbc.nxv8i16.nxv8i16( + undef, %0, %1, %2, @@ -244,6 +266,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i16.nxv16i16( + , , , , @@ -257,6 +280,7 @@ define @intrinsic_vsbc_vvm_nxv16i16_nxv16i16_nxv16i16( @llvm.riscv.vsbc.nxv16i16.nxv16i16( + undef, %0, %1, %2, @@ -266,6 +290,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i16.nxv32i16( + , , , , @@ -279,6 +304,7 @@ define @intrinsic_vsbc_vvm_nxv32i16_nxv32i16_nxv32i16( @llvm.riscv.vsbc.nxv32i16.nxv32i16( + undef, %0, %1, %2, @@ -288,6 +314,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i32.nxv1i32( + , , , , @@ -301,6 +328,7 @@ define 
@intrinsic_vsbc_vvm_nxv1i32_nxv1i32_nxv1i32( @llvm.riscv.vsbc.nxv1i32.nxv1i32( + undef, %0, %1, %2, @@ -310,6 +338,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i32.nxv2i32( + , , , , @@ -323,6 +352,7 @@ define @intrinsic_vsbc_vvm_nxv2i32_nxv2i32_nxv2i32( @llvm.riscv.vsbc.nxv2i32.nxv2i32( + undef, %0, %1, %2, @@ -332,6 +362,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i32.nxv4i32( + , , , , @@ -345,6 +376,7 @@ define @intrinsic_vsbc_vvm_nxv4i32_nxv4i32_nxv4i32( @llvm.riscv.vsbc.nxv4i32.nxv4i32( + undef, %0, %1, %2, @@ -354,6 +386,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i32.nxv8i32( + , , , , @@ -367,6 +400,7 @@ define @intrinsic_vsbc_vvm_nxv8i32_nxv8i32_nxv8i32( @llvm.riscv.vsbc.nxv8i32.nxv8i32( + undef, %0, %1, %2, @@ -376,6 +410,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i32.nxv16i32( + , , , , @@ -389,6 +424,7 @@ define @intrinsic_vsbc_vvm_nxv16i32_nxv16i32_nxv16i32( @llvm.riscv.vsbc.nxv16i32.nxv16i32( + undef, %0, %1, %2, @@ -398,6 +434,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i64.nxv1i64( + , , , , @@ -411,6 +448,7 @@ define @intrinsic_vsbc_vvm_nxv1i64_nxv1i64_nxv1i64( @llvm.riscv.vsbc.nxv1i64.nxv1i64( + undef, %0, %1, %2, @@ -420,6 +458,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i64.nxv2i64( + , , , , @@ -433,6 +472,7 @@ define @intrinsic_vsbc_vvm_nxv2i64_nxv2i64_nxv2i64( @llvm.riscv.vsbc.nxv2i64.nxv2i64( + undef, %0, %1, %2, @@ -442,6 +482,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i64.nxv4i64( + , , , , @@ -455,6 +496,7 @@ define @intrinsic_vsbc_vvm_nxv4i64_nxv4i64_nxv4i64( @llvm.riscv.vsbc.nxv4i64.nxv4i64( + undef, %0, %1, %2, @@ -464,6 +506,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i64.nxv8i64( + , , , , @@ -477,6 +520,7 @@ define @intrinsic_vsbc_vvm_nxv8i64_nxv8i64_nxv8i64( @llvm.riscv.vsbc.nxv8i64.nxv8i64( + undef, %0, %1, %2, @@ -486,6 +530,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i8.i8( + , , i8, , @@ -499,6 +544,7 @@ define @intrinsic_vsbc_vxm_nxv1i8_nxv1i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv1i8.i8( + undef, %0, i8 %1, %2, 
@@ -508,6 +554,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i8.i8( + , , i8, , @@ -521,6 +568,7 @@ define @intrinsic_vsbc_vxm_nxv2i8_nxv2i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv2i8.i8( + undef, %0, i8 %1, %2, @@ -530,6 +578,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i8.i8( + , , i8, , @@ -543,6 +592,7 @@ define @intrinsic_vsbc_vxm_nxv4i8_nxv4i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv4i8.i8( + undef, %0, i8 %1, %2, @@ -552,6 +602,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i8.i8( + , , i8, , @@ -565,6 +616,7 @@ define @intrinsic_vsbc_vxm_nxv8i8_nxv8i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv8i8.i8( + undef, %0, i8 %1, %2, @@ -574,6 +626,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i8.i8( + , , i8, , @@ -587,6 +640,7 @@ define @intrinsic_vsbc_vxm_nxv16i8_nxv16i8_i8( @llvm.riscv.vsbc.nxv16i8.i8( + undef, %0, i8 %1, %2, @@ -596,6 +650,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i8.i8( + , , i8, , @@ -609,6 +664,7 @@ define @intrinsic_vsbc_vxm_nxv32i8_nxv32i8_i8( @llvm.riscv.vsbc.nxv32i8.i8( + undef, %0, i8 %1, %2, @@ -618,6 +674,7 @@ entry: } declare @llvm.riscv.vsbc.nxv64i8.i8( + , , i8, , @@ -631,6 +688,7 @@ define @intrinsic_vsbc_vxm_nxv64i8_nxv64i8_i8( @llvm.riscv.vsbc.nxv64i8.i8( + undef, %0, i8 %1, %2, @@ -640,6 +698,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i16.i16( + , , i16, , @@ -653,6 +712,7 @@ define @intrinsic_vsbc_vxm_nxv1i16_nxv1i16_i16( @llvm.riscv.vsbc.nxv1i16.i16( + undef, %0, i16 %1, %2, @@ -662,6 +722,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i16.i16( + , , i16, , @@ -675,6 +736,7 @@ define @intrinsic_vsbc_vxm_nxv2i16_nxv2i16_i16( @llvm.riscv.vsbc.nxv2i16.i16( + undef, %0, i16 %1, %2, @@ -684,6 +746,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i16.i16( + , , i16, , @@ -697,6 +760,7 @@ define @intrinsic_vsbc_vxm_nxv4i16_nxv4i16_i16( @llvm.riscv.vsbc.nxv4i16.i16( + undef, %0, i16 %1, %2, @@ -706,6 +770,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i16.i16( + , , i16, , @@ -719,6 +784,7 @@ define 
@intrinsic_vsbc_vxm_nxv8i16_nxv8i16_i16( @llvm.riscv.vsbc.nxv8i16.i16( + undef, %0, i16 %1, %2, @@ -728,6 +794,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i16.i16( + , , i16, , @@ -741,6 +808,7 @@ define @intrinsic_vsbc_vxm_nxv16i16_nxv16i16_i16( @llvm.riscv.vsbc.nxv16i16.i16( + undef, %0, i16 %1, %2, @@ -750,6 +818,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i16.i16( + , , i16, , @@ -763,6 +832,7 @@ define @intrinsic_vsbc_vxm_nxv32i16_nxv32i16_i16( @llvm.riscv.vsbc.nxv32i16.i16( + undef, %0, i16 %1, %2, @@ -772,6 +842,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i32.i32( + , , i32, , @@ -785,6 +856,7 @@ define @intrinsic_vsbc_vxm_nxv1i32_nxv1i32_i32( @llvm.riscv.vsbc.nxv1i32.i32( + undef, %0, i32 %1, %2, @@ -794,6 +866,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i32.i32( + , , i32, , @@ -807,6 +880,7 @@ define @intrinsic_vsbc_vxm_nxv2i32_nxv2i32_i32( @llvm.riscv.vsbc.nxv2i32.i32( + undef, %0, i32 %1, %2, @@ -816,6 +890,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i32.i32( + , , i32, , @@ -829,6 +904,7 @@ define @intrinsic_vsbc_vxm_nxv4i32_nxv4i32_i32( @llvm.riscv.vsbc.nxv4i32.i32( + undef, %0, i32 %1, %2, @@ -838,6 +914,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i32.i32( + , , i32, , @@ -851,6 +928,7 @@ define @intrinsic_vsbc_vxm_nxv8i32_nxv8i32_i32( @llvm.riscv.vsbc.nxv8i32.i32( + undef, %0, i32 %1, %2, @@ -860,6 +938,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i32.i32( + , , i32, , @@ -873,6 +952,7 @@ define @intrinsic_vsbc_vxm_nxv16i32_nxv16i32_i32( @llvm.riscv.vsbc.nxv16i32.i32( + undef, %0, i32 %1, %2, @@ -882,6 +962,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i64.i64( + , , i64, , @@ -901,6 +982,7 @@ define @intrinsic_vsbc_vxm_nxv1i64_nxv1i64_i64( @llvm.riscv.vsbc.nxv1i64.i64( + undef, %0, i64 %1, %2, @@ -910,6 +992,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i64.i64( + , , i64, , @@ -929,6 +1012,7 @@ define @intrinsic_vsbc_vxm_nxv2i64_nxv2i64_i64( @llvm.riscv.vsbc.nxv2i64.i64( + undef, %0, i64 %1, %2, @@ -938,6 +1022,7 @@ entry: } declare 
@llvm.riscv.vsbc.nxv4i64.i64( + , , i64, , @@ -957,6 +1042,7 @@ define @intrinsic_vsbc_vxm_nxv4i64_nxv4i64_i64( @llvm.riscv.vsbc.nxv4i64.i64( + undef, %0, i64 %1, %2, @@ -966,6 +1052,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i64.i64( + , , i64, , @@ -985,6 +1072,7 @@ define @intrinsic_vsbc_vxm_nxv8i64_nxv8i64_i64( @llvm.riscv.vsbc.nxv8i64.i64( + undef, %0, i64 %1, %2, diff --git a/llvm/test/CodeGen/RISCV/rvv/vsbc-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsbc-rv64.ll index 555c74c918551..5fe8ee59657da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsbc-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsbc-rv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vsbc.nxv1i8.nxv1i8( + , , , , @@ -15,6 +16,7 @@ define @intrinsic_vsbc_vvm_nxv1i8_nxv1i8_nxv1i8( @llvm.riscv.vsbc.nxv1i8.nxv1i8( + undef, %0, %1, %2, @@ -24,6 +26,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i8.nxv2i8( + , , , , @@ -37,6 +40,7 @@ define @intrinsic_vsbc_vvm_nxv2i8_nxv2i8_nxv2i8( @llvm.riscv.vsbc.nxv2i8.nxv2i8( + undef, %0, %1, %2, @@ -46,6 +50,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i8.nxv4i8( + , , , , @@ -59,6 +64,7 @@ define @intrinsic_vsbc_vvm_nxv4i8_nxv4i8_nxv4i8( @llvm.riscv.vsbc.nxv4i8.nxv4i8( + undef, %0, %1, %2, @@ -68,6 +74,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i8.nxv8i8( + , , , , @@ -81,6 +88,7 @@ define @intrinsic_vsbc_vvm_nxv8i8_nxv8i8_nxv8i8( @llvm.riscv.vsbc.nxv8i8.nxv8i8( + undef, %0, %1, %2, @@ -90,6 +98,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i8.nxv16i8( + , , , , @@ -103,6 +112,7 @@ define @intrinsic_vsbc_vvm_nxv16i8_nxv16i8_nxv16i8( @llvm.riscv.vsbc.nxv16i8.nxv16i8( + undef, %0, %1, %2, @@ -112,6 +122,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i8.nxv32i8( + , , , , @@ -125,6 +136,7 @@ define @intrinsic_vsbc_vvm_nxv32i8_nxv32i8_nxv32i8( @llvm.riscv.vsbc.nxv32i8.nxv32i8( + undef, %0, %1, %2, @@ -134,6 +146,7 @@ entry: } declare @llvm.riscv.vsbc.nxv64i8.nxv64i8( + , , , , @@ -147,6 +160,7 @@ define 
@intrinsic_vsbc_vvm_nxv64i8_nxv64i8_nxv64i8( @llvm.riscv.vsbc.nxv64i8.nxv64i8( + undef, %0, %1, %2, @@ -156,6 +170,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i16.nxv1i16( + , , , , @@ -169,6 +184,7 @@ define @intrinsic_vsbc_vvm_nxv1i16_nxv1i16_nxv1i16( @llvm.riscv.vsbc.nxv1i16.nxv1i16( + undef, %0, %1, %2, @@ -178,6 +194,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i16.nxv2i16( + , , , , @@ -191,6 +208,7 @@ define @intrinsic_vsbc_vvm_nxv2i16_nxv2i16_nxv2i16( @llvm.riscv.vsbc.nxv2i16.nxv2i16( + undef, %0, %1, %2, @@ -200,6 +218,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i16.nxv4i16( + , , , , @@ -213,6 +232,7 @@ define @intrinsic_vsbc_vvm_nxv4i16_nxv4i16_nxv4i16( @llvm.riscv.vsbc.nxv4i16.nxv4i16( + undef, %0, %1, %2, @@ -222,6 +242,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i16.nxv8i16( + , , , , @@ -235,6 +256,7 @@ define @intrinsic_vsbc_vvm_nxv8i16_nxv8i16_nxv8i16( @llvm.riscv.vsbc.nxv8i16.nxv8i16( + undef, %0, %1, %2, @@ -244,6 +266,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i16.nxv16i16( + , , , , @@ -257,6 +280,7 @@ define @intrinsic_vsbc_vvm_nxv16i16_nxv16i16_nxv16i16( @llvm.riscv.vsbc.nxv16i16.nxv16i16( + undef, %0, %1, %2, @@ -266,6 +290,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i16.nxv32i16( + , , , , @@ -279,6 +304,7 @@ define @intrinsic_vsbc_vvm_nxv32i16_nxv32i16_nxv32i16( @llvm.riscv.vsbc.nxv32i16.nxv32i16( + undef, %0, %1, %2, @@ -288,6 +314,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i32.nxv1i32( + , , , , @@ -301,6 +328,7 @@ define @intrinsic_vsbc_vvm_nxv1i32_nxv1i32_nxv1i32( @llvm.riscv.vsbc.nxv1i32.nxv1i32( + undef, %0, %1, %2, @@ -310,6 +338,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i32.nxv2i32( + , , , , @@ -323,6 +352,7 @@ define @intrinsic_vsbc_vvm_nxv2i32_nxv2i32_nxv2i32( @llvm.riscv.vsbc.nxv2i32.nxv2i32( + undef, %0, %1, %2, @@ -332,6 +362,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i32.nxv4i32( + , , , , @@ -345,6 +376,7 @@ define @intrinsic_vsbc_vvm_nxv4i32_nxv4i32_nxv4i32( @llvm.riscv.vsbc.nxv4i32.nxv4i32( + undef, %0, %1, %2, @@ -354,6 
+386,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i32.nxv8i32( + , , , , @@ -367,6 +400,7 @@ define @intrinsic_vsbc_vvm_nxv8i32_nxv8i32_nxv8i32( @llvm.riscv.vsbc.nxv8i32.nxv8i32( + undef, %0, %1, %2, @@ -376,6 +410,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i32.nxv16i32( + , , , , @@ -389,6 +424,7 @@ define @intrinsic_vsbc_vvm_nxv16i32_nxv16i32_nxv16i32( @llvm.riscv.vsbc.nxv16i32.nxv16i32( + undef, %0, %1, %2, @@ -398,6 +434,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i64.nxv1i64( + , , , , @@ -411,6 +448,7 @@ define @intrinsic_vsbc_vvm_nxv1i64_nxv1i64_nxv1i64( @llvm.riscv.vsbc.nxv1i64.nxv1i64( + undef, %0, %1, %2, @@ -420,6 +458,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i64.nxv2i64( + , , , , @@ -433,6 +472,7 @@ define @intrinsic_vsbc_vvm_nxv2i64_nxv2i64_nxv2i64( @llvm.riscv.vsbc.nxv2i64.nxv2i64( + undef, %0, %1, %2, @@ -442,6 +482,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i64.nxv4i64( + , , , , @@ -455,6 +496,7 @@ define @intrinsic_vsbc_vvm_nxv4i64_nxv4i64_nxv4i64( @llvm.riscv.vsbc.nxv4i64.nxv4i64( + undef, %0, %1, %2, @@ -464,6 +506,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i64.nxv8i64( + , , , , @@ -477,6 +520,7 @@ define @intrinsic_vsbc_vvm_nxv8i64_nxv8i64_nxv8i64( @llvm.riscv.vsbc.nxv8i64.nxv8i64( + undef, %0, %1, %2, @@ -486,6 +530,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i8.i8( + , , i8, , @@ -499,6 +544,7 @@ define @intrinsic_vsbc_vxm_nxv1i8_nxv1i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv1i8.i8( + undef, %0, i8 %1, %2, @@ -508,6 +554,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i8.i8( + , , i8, , @@ -521,6 +568,7 @@ define @intrinsic_vsbc_vxm_nxv2i8_nxv2i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv2i8.i8( + undef, %0, i8 %1, %2, @@ -530,6 +578,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i8.i8( + , , i8, , @@ -543,6 +592,7 @@ define @intrinsic_vsbc_vxm_nxv4i8_nxv4i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv4i8.i8( + undef, %0, i8 %1, %2, @@ -552,6 +602,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i8.i8( + 
, , i8, , @@ -565,6 +616,7 @@ define @intrinsic_vsbc_vxm_nxv8i8_nxv8i8_i8( ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vsbc.nxv8i8.i8( + undef, %0, i8 %1, %2, @@ -574,6 +626,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i8.i8( + , , i8, , @@ -587,6 +640,7 @@ define @intrinsic_vsbc_vxm_nxv16i8_nxv16i8_i8( @llvm.riscv.vsbc.nxv16i8.i8( + undef, %0, i8 %1, %2, @@ -596,6 +650,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i8.i8( + , , i8, , @@ -609,6 +664,7 @@ define @intrinsic_vsbc_vxm_nxv32i8_nxv32i8_i8( @llvm.riscv.vsbc.nxv32i8.i8( + undef, %0, i8 %1, %2, @@ -618,6 +674,7 @@ entry: } declare @llvm.riscv.vsbc.nxv64i8.i8( + , , i8, , @@ -631,6 +688,7 @@ define @intrinsic_vsbc_vxm_nxv64i8_nxv64i8_i8( @llvm.riscv.vsbc.nxv64i8.i8( + undef, %0, i8 %1, %2, @@ -640,6 +698,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i16.i16( + , , i16, , @@ -653,6 +712,7 @@ define @intrinsic_vsbc_vxm_nxv1i16_nxv1i16_i16( @llvm.riscv.vsbc.nxv1i16.i16( + undef, %0, i16 %1, %2, @@ -662,6 +722,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i16.i16( + , , i16, , @@ -675,6 +736,7 @@ define @intrinsic_vsbc_vxm_nxv2i16_nxv2i16_i16( @llvm.riscv.vsbc.nxv2i16.i16( + undef, %0, i16 %1, %2, @@ -684,6 +746,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i16.i16( + , , i16, , @@ -697,6 +760,7 @@ define @intrinsic_vsbc_vxm_nxv4i16_nxv4i16_i16( @llvm.riscv.vsbc.nxv4i16.i16( + undef, %0, i16 %1, %2, @@ -706,6 +770,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i16.i16( + , , i16, , @@ -719,6 +784,7 @@ define @intrinsic_vsbc_vxm_nxv8i16_nxv8i16_i16( @llvm.riscv.vsbc.nxv8i16.i16( + undef, %0, i16 %1, %2, @@ -728,6 +794,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i16.i16( + , , i16, , @@ -741,6 +808,7 @@ define @intrinsic_vsbc_vxm_nxv16i16_nxv16i16_i16( @llvm.riscv.vsbc.nxv16i16.i16( + undef, %0, i16 %1, %2, @@ -750,6 +818,7 @@ entry: } declare @llvm.riscv.vsbc.nxv32i16.i16( + , , i16, , @@ -763,6 +832,7 @@ define @intrinsic_vsbc_vxm_nxv32i16_nxv32i16_i16( @llvm.riscv.vsbc.nxv32i16.i16( + undef, %0, i16 %1, %2, @@ -772,6 +842,7 
@@ entry: } declare @llvm.riscv.vsbc.nxv1i32.i32( + , , i32, , @@ -785,6 +856,7 @@ define @intrinsic_vsbc_vxm_nxv1i32_nxv1i32_i32( @llvm.riscv.vsbc.nxv1i32.i32( + undef, %0, i32 %1, %2, @@ -794,6 +866,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i32.i32( + , , i32, , @@ -807,6 +880,7 @@ define @intrinsic_vsbc_vxm_nxv2i32_nxv2i32_i32( @llvm.riscv.vsbc.nxv2i32.i32( + undef, %0, i32 %1, %2, @@ -816,6 +890,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i32.i32( + , , i32, , @@ -829,6 +904,7 @@ define @intrinsic_vsbc_vxm_nxv4i32_nxv4i32_i32( @llvm.riscv.vsbc.nxv4i32.i32( + undef, %0, i32 %1, %2, @@ -838,6 +914,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i32.i32( + , , i32, , @@ -851,6 +928,7 @@ define @intrinsic_vsbc_vxm_nxv8i32_nxv8i32_i32( @llvm.riscv.vsbc.nxv8i32.i32( + undef, %0, i32 %1, %2, @@ -860,6 +938,7 @@ entry: } declare @llvm.riscv.vsbc.nxv16i32.i32( + , , i32, , @@ -873,6 +952,7 @@ define @intrinsic_vsbc_vxm_nxv16i32_nxv16i32_i32( @llvm.riscv.vsbc.nxv16i32.i32( + undef, %0, i32 %1, %2, @@ -882,6 +962,7 @@ entry: } declare @llvm.riscv.vsbc.nxv1i64.i64( + , , i64, , @@ -895,6 +976,7 @@ define @intrinsic_vsbc_vxm_nxv1i64_nxv1i64_i64( @llvm.riscv.vsbc.nxv1i64.i64( + undef, %0, i64 %1, %2, @@ -904,6 +986,7 @@ entry: } declare @llvm.riscv.vsbc.nxv2i64.i64( + , , i64, , @@ -917,6 +1000,7 @@ define @intrinsic_vsbc_vxm_nxv2i64_nxv2i64_i64( @llvm.riscv.vsbc.nxv2i64.i64( + undef, %0, i64 %1, %2, @@ -926,6 +1010,7 @@ entry: } declare @llvm.riscv.vsbc.nxv4i64.i64( + , , i64, , @@ -939,6 +1024,7 @@ define @intrinsic_vsbc_vxm_nxv4i64_nxv4i64_i64( @llvm.riscv.vsbc.nxv4i64.i64( + undef, %0, i64 %1, %2, @@ -948,6 +1034,7 @@ entry: } declare @llvm.riscv.vsbc.nxv8i64.i64( + , , i64, , @@ -961,6 +1048,7 @@ define @intrinsic_vsbc_vxm_nxv8i64_nxv8i64_i64( @llvm.riscv.vsbc.nxv8i64.i64( + undef, %0, i64 %1, %2, From 5065076698cf32b5ad3b6f88b5f3b84d68948589 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 16 Feb 2022 16:38:11 +0100 Subject: [PATCH 064/748] [CodeGen] Rename deprecated 
Address constructor To make uses of the deprecated constructor easier to spot, and to ensure that no new uses are introduced, rename it to Address::deprecated(). While doing the rename, I've filled in element types in cases where it was relatively obvious, but we're still left with 135 calls to the deprecated constructor. --- clang/include/clang/Basic/riscv_vector.td | 24 +-- clang/lib/CodeGen/Address.h | 7 +- clang/lib/CodeGen/CGAtomic.cpp | 13 +- clang/lib/CodeGen/CGBlocks.cpp | 42 +++--- clang/lib/CodeGen/CGBuilder.h | 4 +- clang/lib/CodeGen/CGBuiltin.cpp | 16 +- clang/lib/CodeGen/CGCUDANV.cpp | 7 +- clang/lib/CodeGen/CGCXXABI.h | 2 +- clang/lib/CodeGen/CGCall.cpp | 20 +-- clang/lib/CodeGen/CGClass.cpp | 6 +- clang/lib/CodeGen/CGCleanup.cpp | 11 +- clang/lib/CodeGen/CGDecl.cpp | 9 +- clang/lib/CodeGen/CGException.cpp | 5 +- clang/lib/CodeGen/CGExpr.cpp | 10 +- clang/lib/CodeGen/CGExprCXX.cpp | 21 +-- clang/lib/CodeGen/CGExprScalar.cpp | 4 +- clang/lib/CodeGen/CGNonTrivialStruct.cpp | 11 +- clang/lib/CodeGen/CGObjC.cpp | 15 +- clang/lib/CodeGen/CGObjCGNU.cpp | 4 +- clang/lib/CodeGen/CGObjCMac.cpp | 6 +- clang/lib/CodeGen/CGObjCRuntime.cpp | 2 +- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 173 ++++++++++++---------- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 62 ++++---- clang/lib/CodeGen/CGStmtOpenMP.cpp | 73 ++++----- clang/lib/CodeGen/CGVTables.cpp | 8 +- clang/lib/CodeGen/CodeGenFunction.cpp | 12 +- clang/lib/CodeGen/CodeGenModule.cpp | 3 +- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 17 ++- clang/lib/CodeGen/TargetInfo.cpp | 133 +++++++++-------- clang/utils/TableGen/MveEmitter.cpp | 4 +- 30 files changed, 388 insertions(+), 336 deletions(-) diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index efc074aba246a..a497f85705c72 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -630,7 +630,7 @@ multiclass RVVVLEFFBuiltin types> { clang::CharUnits Align = 
CGM.getNaturalPointeeTypeAlignment(E->getArg(1)->getType()); Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {1}), - Address(NewVL, Align)); + Address::deprecated(NewVL, Align)); return V; } }], @@ -650,7 +650,7 @@ multiclass RVVVLEFFBuiltin types> { clang::CharUnits Align = CGM.getNaturalPointeeTypeAlignment(E->getArg(3)->getType()); Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {1}), - Address(NewVL, Align)); + Address::deprecated(NewVL, Align)); return V; } }] in { @@ -868,7 +868,7 @@ multiclass RVVUnitStridedSegLoad { llvm::Value *V; for (unsigned I = 0; I < NF; ++I) { V = Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } return V; } @@ -894,7 +894,7 @@ multiclass RVVUnitStridedSegLoad { llvm::Value *V; for (unsigned I = 0; I < NF; ++I) { V = Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } return V; } @@ -939,11 +939,11 @@ multiclass RVVUnitStridedSegLoadFF { CGM.getNaturalPointeeTypeAlignment(E->getArg(0)->getType()); for (unsigned I = 0; I < NF; ++I) { Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } // Store new_vl. return Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {NF}), - Address(NewVL, Align)); + Address::deprecated(NewVL, Align)); } }], ManualCodegenMask = [{ @@ -967,11 +967,11 @@ multiclass RVVUnitStridedSegLoadFF { CGM.getNaturalPointeeTypeAlignment(E->getArg(0)->getType()); for (unsigned I = 0; I < NF; ++I) { Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } // Store new_vl. 
return Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {NF}), - Address(NewVL, Align)); + Address::deprecated(NewVL, Align)); } }] in { defvar PV = PVString.S; @@ -1014,7 +1014,7 @@ multiclass RVVStridedSegLoad { llvm::Value *V; for (unsigned I = 0; I < NF; ++I) { V = Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } return V; } @@ -1041,7 +1041,7 @@ multiclass RVVStridedSegLoad { llvm::Value *V; for (unsigned I = 0; I < NF; ++I) { V = Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } return V; } @@ -1081,7 +1081,7 @@ multiclass RVVIndexedSegLoad { llvm::Value *V; for (unsigned I = 0; I < NF; ++I) { V = Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } return V; } @@ -1108,7 +1108,7 @@ multiclass RVVIndexedSegLoad { llvm::Value *V; for (unsigned I = 0; I < NF; ++I) { V = Builder.CreateStore(Builder.CreateExtractValue(LoadValue, {I}), - Address(Ops[I], Align)); + Address::deprecated(Ops[I], Align)); } return V; } diff --git a/clang/lib/CodeGen/Address.h b/clang/lib/CodeGen/Address.h index 3ac0f4f0d7e56..e1cec561653f6 100644 --- a/clang/lib/CodeGen/Address.h +++ b/clang/lib/CodeGen/Address.h @@ -88,9 +88,10 @@ class Address { } // Deprecated: Use constructor with explicit element type instead. 
- Address(llvm::Value *Pointer, CharUnits Alignment) - : Address(Pointer, Pointer->getType()->getPointerElementType(), - Alignment) {} + static Address deprecated(llvm::Value *Pointer, CharUnits Alignment) { + return Address(Pointer, Pointer->getType()->getPointerElementType(), + Alignment); + } static Address invalid() { return Address(nullptr); } bool isValid() const { return A.getPointer() != nullptr; } diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index e17fd156229cd..3eefe753bbe6a 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -86,15 +86,14 @@ namespace { lvalue.getAlignment(); VoidPtrAddr = CGF.Builder.CreateConstGEP1_64( CGF.Int8Ty, VoidPtrAddr, OffsetInChars.getQuantity()); + llvm::Type *IntTy = CGF.Builder.getIntNTy(AtomicSizeInBits); auto Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - VoidPtrAddr, - CGF.Builder.getIntNTy(AtomicSizeInBits)->getPointerTo(), - "atomic_bitfield_base"); + VoidPtrAddr, IntTy->getPointerTo(), "atomic_bitfield_base"); BFI = OrigBFI; BFI.Offset = Offset; BFI.StorageSize = AtomicSizeInBits; BFI.StorageOffset += OffsetInChars; - LVal = LValue::MakeBitfield(Address(Addr, lvalue.getAlignment()), + LVal = LValue::MakeBitfield(Address(Addr, IntTy, lvalue.getAlignment()), BFI, lvalue.getType(), lvalue.getBaseInfo(), lvalue.getTBAAInfo()); AtomicTy = C.getIntTypeForBitwidth(AtomicSizeInBits, OrigBFI.IsSigned); @@ -788,9 +787,9 @@ AddDirectArgument(CodeGenFunction &CGF, CallArgList &Args, int64_t SizeInBits = CGF.getContext().toBits(SizeInChars); ValTy = CGF.getContext().getIntTypeForBitwidth(SizeInBits, /*Signed=*/false); - llvm::Type *IPtrTy = llvm::IntegerType::get(CGF.getLLVMContext(), - SizeInBits)->getPointerTo(); - Address Ptr = Address(CGF.Builder.CreateBitCast(Val, IPtrTy), Align); + llvm::Type *ITy = llvm::IntegerType::get(CGF.getLLVMContext(), SizeInBits); + Address Ptr = Address(CGF.Builder.CreateBitCast(Val, ITy->getPointerTo()), + ITy, Align); Val 
= CGF.EmitLoadOfScalar(Ptr, false, CGF.getContext().getPointerType(ValTy), Loc); diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index 9696c9d3e4d2a..5af61e52b9be5 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -1263,7 +1263,8 @@ Address CodeGenFunction::GetAddrOfBlockDecl(const VarDecl *variable) { // to byref*. auto &byrefInfo = getBlockByrefInfo(variable); - addr = Address(Builder.CreateLoad(addr), byrefInfo.ByrefAlignment); + addr = + Address::deprecated(Builder.CreateLoad(addr), byrefInfo.ByrefAlignment); addr = Builder.CreateElementBitCast(addr, byrefInfo.Type, "byref.addr"); @@ -1440,15 +1441,12 @@ void CodeGenFunction::setBlockContextParameter(const ImplicitParamDecl *D, Address CodeGenFunction::LoadBlockStruct() { assert(BlockInfo && "not in a block invocation function!"); assert(BlockPointer && "no block pointer set!"); - return Address(BlockPointer, BlockInfo->BlockAlign); + return Address::deprecated(BlockPointer, BlockInfo->BlockAlign); } -llvm::Function * -CodeGenFunction::GenerateBlockFunction(GlobalDecl GD, - const CGBlockInfo &blockInfo, - const DeclMapTy &ldm, - bool IsLambdaConversionToBlock, - bool BuildGlobalBlock) { +llvm::Function *CodeGenFunction::GenerateBlockFunction( + GlobalDecl GD, const CGBlockInfo &blockInfo, const DeclMapTy &ldm, + bool IsLambdaConversionToBlock, bool BuildGlobalBlock) { const BlockDecl *blockDecl = blockInfo.getBlockDecl(); CurGD = GD; @@ -1940,12 +1938,12 @@ CodeGenFunction::GenerateCopyHelperFunction(const CGBlockInfo &blockInfo) { auto AL = ApplyDebugLocation::CreateArtificial(*this); Address src = GetAddrOfLocalVar(&SrcDecl); - src = Address(Builder.CreateLoad(src), blockInfo.BlockAlign); + src = Address::deprecated(Builder.CreateLoad(src), blockInfo.BlockAlign); src = Builder.CreateElementBitCast(src, blockInfo.StructureType, "block.source"); Address dst = GetAddrOfLocalVar(&DstDecl); - dst = Address(Builder.CreateLoad(dst), 
blockInfo.BlockAlign); + dst = Address::deprecated(Builder.CreateLoad(dst), blockInfo.BlockAlign); dst = Builder.CreateElementBitCast(dst, blockInfo.StructureType, "block.dest"); @@ -2130,7 +2128,7 @@ CodeGenFunction::GenerateDestroyHelperFunction(const CGBlockInfo &blockInfo) { auto AL = ApplyDebugLocation::CreateArtificial(*this); Address src = GetAddrOfLocalVar(&SrcDecl); - src = Address(Builder.CreateLoad(src), blockInfo.BlockAlign); + src = Address::deprecated(Builder.CreateLoad(src), blockInfo.BlockAlign); src = Builder.CreateElementBitCast(src, blockInfo.StructureType, "block"); CodeGenFunction::RunCleanupsScope cleanups(*this); @@ -2375,19 +2373,19 @@ generateByrefCopyHelper(CodeGenFunction &CGF, const BlockByrefInfo &byrefInfo, if (generator.needsCopy()) { // dst->x Address destField = CGF.GetAddrOfLocalVar(&Dst); - destField = Address(CGF.Builder.CreateLoad(destField), - byrefInfo.ByrefAlignment); + destField = Address::deprecated(CGF.Builder.CreateLoad(destField), + byrefInfo.ByrefAlignment); destField = CGF.Builder.CreateElementBitCast(destField, byrefInfo.Type); - destField = CGF.emitBlockByrefAddress(destField, byrefInfo, false, - "dest-object"); + destField = + CGF.emitBlockByrefAddress(destField, byrefInfo, false, "dest-object"); // src->x Address srcField = CGF.GetAddrOfLocalVar(&Src); - srcField = Address(CGF.Builder.CreateLoad(srcField), - byrefInfo.ByrefAlignment); + srcField = Address::deprecated(CGF.Builder.CreateLoad(srcField), + byrefInfo.ByrefAlignment); srcField = CGF.Builder.CreateElementBitCast(srcField, byrefInfo.Type); - srcField = CGF.emitBlockByrefAddress(srcField, byrefInfo, false, - "src-object"); + srcField = + CGF.emitBlockByrefAddress(srcField, byrefInfo, false, "src-object"); generator.emitCopy(CGF, destField, srcField); } @@ -2441,7 +2439,8 @@ generateByrefDisposeHelper(CodeGenFunction &CGF, if (generator.needsDispose()) { Address addr = CGF.GetAddrOfLocalVar(&Src); - addr = Address(CGF.Builder.CreateLoad(addr), 
byrefInfo.ByrefAlignment); + addr = Address::deprecated(CGF.Builder.CreateLoad(addr), + byrefInfo.ByrefAlignment); addr = CGF.Builder.CreateElementBitCast(addr, byrefInfo.Type); addr = CGF.emitBlockByrefAddress(addr, byrefInfo, false, "object"); @@ -2588,7 +2587,8 @@ Address CodeGenFunction::emitBlockByrefAddress(Address baseAddr, // Chase the forwarding address if requested. if (followForward) { Address forwardingAddr = Builder.CreateStructGEP(baseAddr, 1, "forwarding"); - baseAddr = Address(Builder.CreateLoad(forwardingAddr), info.ByrefAlignment); + baseAddr = Address::deprecated(Builder.CreateLoad(forwardingAddr), + info.ByrefAlignment); } return Builder.CreateStructGEP(baseAddr, info.FieldIndex, name); diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h index eaf563a436ca1..27ced0cec041b 100644 --- a/clang/lib/CodeGen/CGBuilder.h +++ b/clang/lib/CodeGen/CGBuilder.h @@ -173,8 +173,8 @@ class CGBuilderTy : public CGBuilderBaseTy { Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name = "") { llvm::Value *Ptr = - CreatePointerBitCastOrAddrSpaceCast(Addr.getPointer(), Ty, Name); - return Address(Ptr, Addr.getAlignment()); + CreatePointerBitCastOrAddrSpaceCast(Addr.getPointer(), Ty, Name); + return Address::deprecated(Ptr, Addr.getAlignment()); } /// Given diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 427cb6954f56c..43993f092988e 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -373,7 +373,7 @@ static Value *EmitAtomicCmpXchg128ForMSIntrin(CodeGenFunction &CGF, llvm::Type *Int128PtrTy = Int128Ty->getPointerTo(); Destination = CGF.Builder.CreateBitCast(Destination, Int128PtrTy); Address ComparandResult(CGF.Builder.CreateBitCast(ComparandPtr, Int128PtrTy), - CGF.getContext().toCharUnitsFromBits(128)); + Int128Ty, CGF.getContext().toCharUnitsFromBits(128)); // (((i128)hi) << 64) | ((i128)lo) ExchangeHigh = 
CGF.Builder.CreateZExt(ExchangeHigh, Int128Ty); @@ -961,7 +961,7 @@ static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF, Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy); Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8, ByteIndex, "bittest.byteaddr"), - CharUnits::One()); + CGF.Int8Ty, CharUnits::One()); Value *PosLow = CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty), llvm::ConstantInt::get(CGF.Int8Ty, 0x7)); @@ -1778,8 +1778,8 @@ llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction( auto AL = ApplyDebugLocation::CreateArtificial(*this); CharUnits Offset; - Address BufAddr(Builder.CreateLoad(GetAddrOfLocalVar(Args[0]), "buf"), - BufferAlignment); + Address BufAddr = Address::deprecated( + Builder.CreateLoad(GetAddrOfLocalVar(Args[0]), "buf"), BufferAlignment); Builder.CreateStore(Builder.getInt8(Layout.getSummaryByte()), Builder.CreateConstByteGEP(BufAddr, Offset++, "summary")); Builder.CreateStore(Builder.getInt8(Layout.getNumArgsByte()), @@ -2108,7 +2108,7 @@ static llvm::Value *dumpRecord(CodeGenFunction &CGF, QualType RType, ? 
Types[Context.VoidPtrTy] : Types[CanonicalType]; - Address FieldAddress = Address(FieldPtr, Align); + Address FieldAddress = Address::deprecated(FieldPtr, Align); FieldPtr = CGF.Builder.CreateLoad(FieldAddress); // FIXME Need to handle bitfield here @@ -9597,7 +9597,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, for (size_t i = 0; i < 8; i++) { llvm::Value *ValOffsetPtr = Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i)); - Address Addr(ValOffsetPtr, CharUnits::fromQuantity(8)); + Address Addr = + Address::deprecated(ValOffsetPtr, CharUnits::fromQuantity(8)); ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr); } return ToRet; @@ -9609,7 +9610,8 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, for (size_t i = 0; i < 8; i++) { llvm::Value *ValOffsetPtr = Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i)); - Address Addr(ValOffsetPtr, CharUnits::fromQuantity(8)); + Address Addr = + Address::deprecated(ValOffsetPtr, CharUnits::fromQuantity(8)); Args.push_back(Builder.CreateLoad(Addr)); } diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index c4e3f7f54f4f2..293bdf99d272f 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -822,7 +822,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { if (Linkage != llvm::GlobalValue::InternalLinkage) GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); Address GpuBinaryAddr( - GpuBinaryHandle, + GpuBinaryHandle, VoidPtrPtrTy, CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); { auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); @@ -958,8 +958,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { CGBuilderTy DtorBuilder(CGM, Context); DtorBuilder.SetInsertPoint(DtorEntryBB); - Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( - GpuBinaryHandle->getAlignment())); + Address GpuBinaryAddr = Address::deprecated( + GpuBinaryHandle, + 
CharUnits::fromQuantity( GpuBinaryHandle->getAlignment())); auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); // There is only one HIP fat binary per linked module, however there are // multiple destructor functions. Make sure the fat binary is unregistered diff --git a/clang/lib/CodeGen/CGCXXABI.h b/clang/lib/CodeGen/CGCXXABI.h index b96222b3ce280..ba073b3ff4e52 100644 --- a/clang/lib/CodeGen/CGCXXABI.h +++ b/clang/lib/CodeGen/CGCXXABI.h @@ -56,7 +56,7 @@ class CGCXXABI { return CGF.CXXABIThisValue; } Address getThisAddress(CodeGenFunction &CGF) { - return Address(CGF.CXXABIThisValue, CGF.CXXABIThisAlignment); + return Address::deprecated(CGF.CXXABIThisValue, CGF.CXXABIThisAlignment); } /// Issue a diagnostic about unsupported features in the ABI. diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 871b94e45a20a..f1eb26e498225 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1016,7 +1016,7 @@ static void forConstantArrayExpansion(CodeGenFunction &CGF, for (int i = 0, n = CAE->NumElts; i < n; i++) { llvm::Value *EltAddr = CGF.Builder.CreateConstGEP2_32( BaseAddr.getElementType(), BaseAddr.getPointer(), 0, i); - Fn(Address(EltAddr, EltAlign)); + Fn(Address::deprecated(EltAddr, EltAlign)); } } @@ -2685,8 +2685,9 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, // parameter, which is a pointer to the complete memory area. 
Address ArgStruct = Address::invalid(); if (IRFunctionArgs.hasInallocaArg()) { - ArgStruct = Address(Fn->getArg(IRFunctionArgs.getInallocaArgNo()), - FI.getArgStructAlignment()); + ArgStruct = Address::deprecated( + Fn->getArg(IRFunctionArgs.getInallocaArgNo()), + FI.getArgStructAlignment()); assert(ArgStruct.getType() == FI.getArgStruct()->getPointerTo()); } @@ -2736,8 +2737,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, Address V = Builder.CreateStructGEP(ArgStruct, FieldIndex, Arg->getName()); if (ArgI.getInAllocaIndirect()) - V = Address(Builder.CreateLoad(V), - getContext().getTypeAlignInChars(Ty)); + V = Address::deprecated(Builder.CreateLoad(V), + getContext().getTypeAlignInChars(Ty)); ArgVals.push_back(ParamValue::forIndirect(V)); break; } @@ -2885,7 +2886,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, assert(pointeeTy->isPointerType()); Address temp = CreateMemTemp(pointeeTy, getPointerAlign(), "swifterror.temp"); - Address arg = Address(V, getContext().getTypeAlignInChars(pointeeTy)); + Address arg = Address::deprecated( + V, getContext().getTypeAlignInChars(pointeeTy)); llvm::Value *incomingErrorValue = Builder.CreateLoad(arg); Builder.CreateStore(incomingErrorValue, temp); V = temp.getPointer(); @@ -3751,7 +3753,7 @@ static AggValueSlot createPlaceholderSlot(CodeGenFunction &CGF, CharUnits Align = CharUnits::fromQuantity(4); Placeholder = CGF.Builder.CreateAlignedLoad(IRPtrTy, Placeholder, Align); - return AggValueSlot::forAddr(Address(Placeholder, Align), + return AggValueSlot::forAddr(Address(Placeholder, IRTy, Align), Ty.getQualifiers(), AggValueSlot::IsNotDestructed, AggValueSlot::DoesNotNeedGCBarriers, @@ -4741,7 +4743,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, AI->setAlignment(Align.getAsAlign()); AI->setUsedWithInAlloca(true); assert(AI->isUsedWithInAlloca() && !AI->isStaticAlloca()); - ArgMemory = Address(AI, Align); + ArgMemory = Address(AI, ArgStruct, Align); } 
ClangToLLVMArgMapping IRFunctionArgs(CGM.getContext(), CallInfo); @@ -4966,7 +4968,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, QualType pointeeTy = I->Ty->getPointeeType(); swiftErrorArg = - Address(V, getContext().getTypeAlignInChars(pointeeTy)); + Address::deprecated(V, getContext().getTypeAlignInChars(pointeeTy)); swiftErrorTemp = CreateMemTemp(pointeeTy, getPointerAlign(), "swifterror.temp"); diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 09951d6db2db6..612209ef8fe8f 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2148,8 +2148,8 @@ void CodeGenFunction::EmitCXXConstructorCall(const CXXConstructorDecl *D, assert(Args.size() == 2 && "unexpected argcount for trivial ctor"); QualType SrcTy = D->getParamDecl(0)->getType().getNonReferenceType(); - Address Src(Args[1].getRValue(*this).getScalarVal(), - CGM.getNaturalTypeAlignment(SrcTy)); + Address Src = Address::deprecated(Args[1].getRValue(*this).getScalarVal(), + CGM.getNaturalTypeAlignment(SrcTy)); LValue SrcLVal = MakeAddrLValue(Src, SrcTy); QualType DestTy = getContext().getTypeDeclType(ClassDecl); LValue DestLVal = MakeAddrLValue(This, DestTy); @@ -2727,7 +2727,7 @@ void CodeGenFunction::EmitVTablePtrCheckForCast(QualType T, llvm::Value *VTable; std::tie(VTable, ClassDecl) = CGM.getCXXABI().LoadVTablePtr( - *this, Address(Derived, getPointerAlign()), ClassDecl); + *this, Address::deprecated(Derived, getPointerAlign()), ClassDecl); EmitVTablePtrCheck(ClassDecl, VTable, TCK, Loc); diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp index 86d1a7589e2aa..d84bddb8b22f3 100644 --- a/clang/lib/CodeGen/CGCleanup.cpp +++ b/clang/lib/CodeGen/CGCleanup.cpp @@ -75,8 +75,9 @@ DominatingValue::saved_type::save(CodeGenFunction &CGF, RValue rv) { /// point. 
RValue DominatingValue::saved_type::restore(CodeGenFunction &CGF) { auto getSavingAddress = [&](llvm::Value *value) { - auto alignment = cast(value)->getAlignment(); - return Address(value, CharUnits::fromQuantity(alignment)); + auto *AI = cast(value); + return Address(value, AI->getAllocatedType(), + CharUnits::fromQuantity(AI->getAlignment())); }; switch (K) { case ScalarLiteral: @@ -84,10 +85,12 @@ RValue DominatingValue::saved_type::restore(CodeGenFunction &CGF) { case ScalarAddress: return RValue::get(CGF.Builder.CreateLoad(getSavingAddress(Value))); case AggregateLiteral: - return RValue::getAggregate(Address(Value, CharUnits::fromQuantity(Align))); + return RValue::getAggregate( + Address::deprecated(Value, CharUnits::fromQuantity(Align))); case AggregateAddress: { auto addr = CGF.Builder.CreateLoad(getSavingAddress(Value)); - return RValue::getAggregate(Address(addr, CharUnits::fromQuantity(Align))); + return RValue::getAggregate( + Address::deprecated(addr, CharUnits::fromQuantity(Align))); } case ComplexAddress: { Address address = getSavingAddress(Value); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index cd9051c161526..a32411774312c 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1782,7 +1782,7 @@ void CodeGenFunction::emitZeroOrPatternForAutoVarInit(QualType type, Cur->addIncoming(Begin.getPointer(), OriginBB); CharUnits CurAlign = Loc.getAlignment().alignmentOfArrayElement(EltSize); auto *I = - Builder.CreateMemCpy(Address(Cur, CurAlign), + Builder.CreateMemCpy(Address(Cur, Int8Ty, CurAlign), createUnnamedGlobalForMemcpyFrom( CGM, D, Builder, Constant, ConstantAlign), BaseSizeInChars, isVolatile); @@ -2474,9 +2474,10 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg, CGM.getDataLayout().getAllocaAddrSpace()); auto DestAS = getContext().getTargetAddressSpace(DestLangAS); auto *T = V->getType()->getPointerElementType()->getPointerTo(DestAS); - DeclPtr = 
Address(getTargetHooks().performAddrSpaceCast( - *this, V, SrcLangAS, DestLangAS, T, true), - DeclPtr.getAlignment()); + DeclPtr = + Address::deprecated(getTargetHooks().performAddrSpaceCast( + *this, V, SrcLangAS, DestLangAS, T, true), + DeclPtr.getAlignment()); } // Push a destructor cleanup for this parameter if the ABI requires it. diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index 91ecbecc843f3..98be1e2ff338f 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -1845,7 +1845,7 @@ Address CodeGenFunction::recoverAddrOfEscapedLocal(CodeGenFunction &ParentCGF, llvm::Value *ChildVar = Builder.CreateBitCast(RecoverCall, ParentVar.getType()); ChildVar->setName(ParentVar.getName()); - return Address(ChildVar, ParentVar.getAlignment()); + return Address::deprecated(ChildVar, ParentVar.getAlignment()); } void CodeGenFunction::EmitCapturedLocals(CodeGenFunction &ParentCGF, @@ -1931,7 +1931,8 @@ void CodeGenFunction::EmitCapturedLocals(CodeGenFunction &ParentCGF, FrameRecoverFn, {ParentI8Fn, ParentFP, llvm::ConstantInt::get(Int32Ty, FrameEscapeIdx)}); ParentFP = Builder.CreateBitCast(ParentFP, CGM.VoidPtrPtrTy); - ParentFP = Builder.CreateLoad(Address(ParentFP, getPointerAlign())); + ParentFP = Builder.CreateLoad( + Address(ParentFP, CGM.VoidPtrTy, getPointerAlign())); } } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 7fc4c4f357969..37f9a79c71325 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -823,7 +823,8 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, // Load the vptr, and compute hash_16_bytes(TypeHash, vptr). 
llvm::Value *Low = llvm::ConstantInt::get(Int64Ty, TypeHash); llvm::Type *VPtrTy = llvm::PointerType::get(IntPtrTy, 0); - Address VPtrAddr(Builder.CreateBitCast(Ptr, VPtrTy), getPointerAlign()); + Address VPtrAddr(Builder.CreateBitCast(Ptr, VPtrTy), IntPtrTy, + getPointerAlign()); llvm::Value *VPtrVal = Builder.CreateLoad(VPtrAddr); llvm::Value *High = Builder.CreateZExt(VPtrVal, Int64Ty); @@ -2502,9 +2503,10 @@ Address CodeGenFunction::EmitLoadOfPointer(Address Ptr, LValueBaseInfo *BaseInfo, TBAAAccessInfo *TBAAInfo) { llvm::Value *Addr = Builder.CreateLoad(Ptr); - return Address(Addr, CGM.getNaturalTypeAlignment(PtrTy->getPointeeType(), - BaseInfo, TBAAInfo, - /*forPointeeType=*/true)); + return Address::deprecated( + Addr, + CGM.getNaturalTypeAlignment(PtrTy->getPointeeType(), BaseInfo, TBAAInfo, + /*forPointeeType=*/true)); } LValue CodeGenFunction::EmitLoadOfPointerLValue(Address PtrAddr, diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index f00e315a152ac..9596ed34e5e9d 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -548,11 +548,12 @@ static void EmitNullBaseClassInitialization(CodeGenFunction &CGF, /*isConstant=*/true, llvm::GlobalVariable::PrivateLinkage, NullConstantForBase, Twine()); - CharUnits Align = std::max(Layout.getNonVirtualAlignment(), - DestPtr.getAlignment()); + CharUnits Align = + std::max(Layout.getNonVirtualAlignment(), DestPtr.getAlignment()); NullVariable->setAlignment(Align.getAsAlign()); - Address SrcPtr = Address(CGF.EmitCastToVoidPtr(NullVariable), Align); + Address SrcPtr = + Address(CGF.EmitCastToVoidPtr(NullVariable), CGF.Int8Ty, Align); // Get and call the appropriate llvm.memcpy overload. for (std::pair Store : Stores) { @@ -1244,10 +1245,10 @@ void CodeGenFunction::EmitNewArrayInitializer( // Set up the current-element phi. 
llvm::PHINode *CurPtrPhi = - Builder.CreatePHI(CurPtr.getType(), 2, "array.cur"); + Builder.CreatePHI(CurPtr.getType(), 2, "array.cur"); CurPtrPhi->addIncoming(CurPtr.getPointer(), EntryBB); - CurPtr = Address(CurPtrPhi, ElementAlign); + CurPtr = Address(CurPtrPhi, CurPtr.getElementType(), ElementAlign); // Store the new Cleanup position for irregular Cleanups. if (EndOfInit.isValid()) @@ -1796,7 +1797,8 @@ void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, CharUnits Align = CGM.getNaturalTypeAlignment(DDTag); DestroyingDeleteTag = CreateTempAlloca(Ty, "destroying.delete.tag"); DestroyingDeleteTag->setAlignment(Align.getAsAlign()); - DeleteArgs.add(RValue::getAggregate(Address(DestroyingDeleteTag, Align)), DDTag); + DeleteArgs.add( + RValue::getAggregate(Address(DestroyingDeleteTag, Ty, Align)), DDTag); } // Pass the size if the delete function has a size_t parameter. @@ -2099,9 +2101,10 @@ void CodeGenFunction::EmitCXXDeleteExpr(const CXXDeleteExpr *E) { GEP.push_back(Zero); } - Ptr = Address(Builder.CreateInBoundsGEP(Ptr.getElementType(), - Ptr.getPointer(), GEP, "del.first"), - Ptr.getAlignment()); + Ptr = Address::deprecated(Builder.CreateInBoundsGEP(Ptr.getElementType(), + Ptr.getPointer(), GEP, + "del.first"), + Ptr.getAlignment()); } assert(ConvertTypeForMem(DeleteTy) == Ptr.getElementType()); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index a235639054935..d3db63f9917fe 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -420,7 +420,7 @@ class ScalarExprEmitter if (Value *Result = ConstantEmitter(CGF).tryEmitConstantExpr(E)) { if (E->isGLValue()) - return CGF.Builder.CreateLoad(Address( + return CGF.Builder.CreateLoad(Address::deprecated( Result, CGF.getContext().getTypeAlignInChars(E->getType()))); return Result; } @@ -4896,7 +4896,7 @@ LValue CodeGenFunction::EmitObjCIsaExpr(const ObjCIsaExpr *E) { Expr *BaseExpr = E->getBase(); Address Addr = 
Address::invalid(); if (BaseExpr->isPRValue()) { - Addr = Address(EmitScalarExpr(BaseExpr), getPointerAlign()); + Addr = Address::deprecated(EmitScalarExpr(BaseExpr), getPointerAlign()); } else { Addr = EmitLValue(BaseExpr).getAddress(*this); } diff --git a/clang/lib/CodeGen/CGNonTrivialStruct.cpp b/clang/lib/CodeGen/CGNonTrivialStruct.cpp index e3b0e069b8301..eff768ead9580 100644 --- a/clang/lib/CodeGen/CGNonTrivialStruct.cpp +++ b/clang/lib/CodeGen/CGNonTrivialStruct.cpp @@ -326,9 +326,9 @@ static std::array getParamAddrs(std::index_sequence IntSeq, std::array Alignments, FunctionArgList Args, CodeGenFunction *CGF) { - return std::array{{ - Address(CGF->Builder.CreateLoad(CGF->GetAddrOfLocalVar(Args[Ints])), - Alignments[Ints])...}}; + return std::array{{Address::deprecated( + CGF->Builder.CreateLoad(CGF->GetAddrOfLocalVar(Args[Ints])), + Alignments[Ints])...}}; } // Template classes that are used as bases for classes that emit special @@ -400,8 +400,9 @@ template struct GenFuncBase { std::array NewAddrs = Addrs; for (unsigned I = 0; I < N; ++I) - NewAddrs[I] = Address( - PHIs[I], StartAddrs[I].getAlignment().alignmentAtOffset(EltSize)); + NewAddrs[I] = + Address(PHIs[I], CGF.Int8PtrTy, + StartAddrs[I].getAlignment().alignmentAtOffset(EltSize)); EltQT = IsVolatile ? 
EltQT.withVolatile() : EltQT; this->asDerived().visitWithKind(FK, EltQT, nullptr, CharUnits::Zero(), diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp index d78275831ded0..51343ca3dbe30 100644 --- a/clang/lib/CodeGen/CGObjC.cpp +++ b/clang/lib/CodeGen/CGObjC.cpp @@ -3845,15 +3845,14 @@ CodeGenFunction::GenerateObjCAtomicGetterCopyHelperFunction( SourceLocation()); RValue DV = EmitAnyExpr(&DstExpr); - CharUnits Alignment - = getContext().getTypeAlignInChars(TheCXXConstructExpr->getType()); + CharUnits Alignment = + getContext().getTypeAlignInChars(TheCXXConstructExpr->getType()); EmitAggExpr(TheCXXConstructExpr, - AggValueSlot::forAddr(Address(DV.getScalarVal(), Alignment), - Qualifiers(), - AggValueSlot::IsDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, - AggValueSlot::DoesNotOverlap)); + AggValueSlot::forAddr( + Address::deprecated(DV.getScalarVal(), Alignment), + Qualifiers(), AggValueSlot::IsDestructed, + AggValueSlot::DoesNotNeedGCBarriers, + AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap)); FinishFunction(); HelperFn = llvm::ConstantExpr::getBitCast(Fn, VoidPtrTy); diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index aee751e88c395..a0f9e32d4f81d 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1264,8 +1264,8 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { llvm::Value *GetClassNamed(CodeGenFunction &CGF, const std::string &Name, bool isWeak) override { - return CGF.Builder.CreateLoad(Address(GetClassVar(Name, isWeak), - CGM.getPointerAlign())); + return CGF.Builder.CreateLoad( + Address::deprecated(GetClassVar(Name, isWeak), CGM.getPointerAlign())); } int32_t FlagsForOwnership(Qualifiers::ObjCLifetime Ownership) { // typedef enum { diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index bf6b2175bbe92..7e4176e63ecd0 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp 
@@ -5272,7 +5272,7 @@ Address CGObjCMac::EmitSelectorAddr(Selector Sel) { Entry->setExternallyInitialized(true); } - return Address(Entry, Align); + return Address::deprecated(Entry, Align); } llvm::Constant *CGObjCCommonMac::GetClassName(StringRef RuntimeName) { @@ -7349,7 +7349,7 @@ CGObjCNonFragileABIMac::EmitVTableMessageSend(CodeGenFunction &CGF, Address mref = Address(CGF.Builder.CreateBitCast(messageRef, ObjCTypes.MessageRefPtrTy), - CGF.getPointerAlign()); + ObjCTypes.MessageRefTy, CGF.getPointerAlign()); // Update the message ref argument. args[1].setRValue(RValue::get(mref.getPointer())); @@ -7643,7 +7643,7 @@ Address CGObjCNonFragileABIMac::EmitSelectorAddr(Selector Sel) { CGM.addCompilerUsedGlobal(Entry); } - return Address(Entry, Align); + return Address::deprecated(Entry, Align); } /// EmitObjCIvarAssign - Code gen for assigning to a __strong object. diff --git a/clang/lib/CodeGen/CGObjCRuntime.cpp b/clang/lib/CodeGen/CGObjCRuntime.cpp index 33ae3c7c2b28a..80cabbe930017 100644 --- a/clang/lib/CodeGen/CGObjCRuntime.cpp +++ b/clang/lib/CodeGen/CGObjCRuntime.cpp @@ -106,7 +106,7 @@ LValue CGObjCRuntime::EmitValueForIvarAtOffset(CodeGen::CodeGenFunction &CGF, CGF.CGM.getContext().toBits(StorageSize), CharUnits::fromQuantity(0))); - Address Addr(V, Alignment); + Address Addr = Address::deprecated(V, Alignment); Addr = CGF.Builder.CreateElementBitCast(Addr, llvm::Type::getIntNTy(CGF.getLLVMContext(), Info->StorageSize)); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 0d611c2e343d4..a77658060be72 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -719,14 +719,14 @@ static void EmitOMPAggregateInit(CodeGenFunction &CGF, Address DestAddr, "omp.arraycpy.srcElementPast"); SrcElementPHI->addIncoming(SrcBegin, EntryBB); SrcElementCurrent = - Address(SrcElementPHI, + Address(SrcElementPHI, SrcAddr.getElementType(), 
SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize)); } llvm::PHINode *DestElementPHI = CGF.Builder.CreatePHI( DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); DestElementPHI->addIncoming(DestBegin, EntryBB); Address DestElementCurrent = - Address(DestElementPHI, + Address(DestElementPHI, DestAddr.getElementType(), DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); // Emit copy. @@ -973,7 +973,7 @@ static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, CGF.Builder.CreateStore(Addr, Tmp); return MostTopTmp; } - return Address(Addr, BaseLVAlignment); + return Address::deprecated(Addr, BaseLVAlignment); } static const VarDecl *getBaseDecl(const Expr *Ref, const DeclRefExpr *&DE) { @@ -1715,7 +1715,7 @@ Address CGOpenMPRuntime::getAddrOfDeclareTargetVar(const VarDecl *VD) { GV->setInitializer(CGM.GetAddrOfGlobal(VD)); registerTargetGlobalVariable(VD, cast(Ptr)); } - return Address(Ptr, CGM.getContext().getDeclAlign(VD)); + return Address::deprecated(Ptr, CGM.getContext().getDeclAlign(VD)); } return Address::invalid(); } @@ -1739,16 +1739,17 @@ Address CGOpenMPRuntime::getAddrOfThreadPrivate(CodeGenFunction &CGF, return VDAddr; llvm::Type *VarTy = VDAddr.getElementType(); - llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), - CGF.Builder.CreatePointerCast(VDAddr.getPointer(), - CGM.Int8PtrTy), - CGM.getSize(CGM.GetTargetTypeStoreSize(VarTy)), - getOrCreateThreadPrivateCache(VD)}; - return Address(CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_threadprivate_cached), - Args), - VDAddr.getAlignment()); + llvm::Value *Args[] = { + emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), + CGF.Builder.CreatePointerCast(VDAddr.getPointer(), CGM.Int8PtrTy), + CGM.getSize(CGM.GetTargetTypeStoreSize(VarTy)), + getOrCreateThreadPrivateCache(VD)}; + return Address::deprecated( + CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), 
OMPRTL___kmpc_threadprivate_cached), + Args), + VDAddr.getAlignment()); } void CGOpenMPRuntime::emitThreadPrivateVarInit( @@ -1805,7 +1806,7 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition( llvm::Value *ArgVal = CtorCGF.EmitLoadOfScalar( CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation()); - Address Arg = Address(ArgVal, VDAddr.getAlignment()); + Address Arg = Address::deprecated(ArgVal, VDAddr.getAlignment()); Arg = CtorCGF.Builder.CreateElementBitCast( Arg, CtorCGF.ConvertTypeForMem(ASTTy)); CtorCGF.EmitAnyExprToMem(Init, Arg, Init->getType().getQualifiers(), @@ -1841,8 +1842,8 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition( llvm::Value *ArgVal = DtorCGF.EmitLoadOfScalar( DtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation()); - DtorCGF.emitDestroy(Address(ArgVal, VDAddr.getAlignment()), ASTTy, - DtorCGF.getDestroyer(ASTTy.isDestructedType()), + DtorCGF.emitDestroy(Address::deprecated(ArgVal, VDAddr.getAlignment()), + ASTTy, DtorCGF.getDestroyer(ASTTy.isDestructedType()), DtorCGF.needsEHCleanup(ASTTy.isDestructedType())); DtorCGF.FinishFunction(); Dtor = Fn; @@ -1943,10 +1944,10 @@ bool CGOpenMPRuntime::emitDeclareTargetVarDefinition(const VarDecl *VD, CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI, FunctionArgList(), Loc, Loc); auto AL = ApplyDebugLocation::CreateArtificial(CtorCGF); - CtorCGF.EmitAnyExprToMem(Init, - Address(Addr, CGM.getContext().getDeclAlign(VD)), - Init->getType().getQualifiers(), - /*IsInitializer=*/true); + CtorCGF.EmitAnyExprToMem( + Init, Address::deprecated(Addr, CGM.getContext().getDeclAlign(VD)), + Init->getType().getQualifiers(), + /*IsInitializer=*/true); CtorCGF.FinishFunction(); Ctor = Fn; ID = llvm::ConstantExpr::getBitCast(Fn, CGM.Int8PtrTy); @@ -1983,9 +1984,10 @@ bool CGOpenMPRuntime::emitDeclareTargetVarDefinition(const VarDecl *VD, // Create a scope with an artificial 
location for the body of this // function. auto AL = ApplyDebugLocation::CreateArtificial(DtorCGF); - DtorCGF.emitDestroy(Address(Addr, CGM.getContext().getDeclAlign(VD)), - ASTTy, DtorCGF.getDestroyer(ASTTy.isDestructedType()), - DtorCGF.needsEHCleanup(ASTTy.isDestructedType())); + DtorCGF.emitDestroy( + Address::deprecated(Addr, CGM.getContext().getDeclAlign(VD)), ASTTy, + DtorCGF.getDestroyer(ASTTy.isDestructedType()), + DtorCGF.needsEHCleanup(ASTTy.isDestructedType())); DtorCGF.FinishFunction(); Dtor = Fn; ID = llvm::ConstantExpr::getBitCast(Fn, CGM.Int8PtrTy); @@ -2035,7 +2037,7 @@ Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF, CGM.getModule(), OMPRTL___kmpc_threadprivate_cached), Args), VarLVType->getPointerTo(/*AddrSpace=*/0)), - CGM.getContext().getTypeAlignInChars(VarType)); + VarLVType, CGM.getContext().getTypeAlignInChars(VarType)); } void CGOpenMPRuntime::emitIfClause(CodeGenFunction &CGF, const Expr *Cond, @@ -2367,7 +2369,7 @@ static Address emitAddrOfVarFromArray(CodeGenFunction &CGF, Address Array, Address PtrAddr = CGF.Builder.CreateConstArrayGEP(Array, Index); llvm::Value *Ptr = CGF.Builder.CreateLoad(PtrAddr); - Address Addr = Address(Ptr, CGF.getContext().getDeclAlign(Var)); + Address Addr = Address::deprecated(Ptr, CGF.getContext().getDeclAlign(Var)); Addr = CGF.Builder.CreateElementBitCast( Addr, CGF.ConvertTypeForMem(Var->getType())); return Addr; @@ -2400,12 +2402,14 @@ static llvm::Value *emitCopyprivateCopyFunction( CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); // Dest = (void*[n])(LHSArg); // Src = (void*[n])(RHSArg); - Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)), - ArgsType), CGF.getPointerAlign()); - Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)), - ArgsType), CGF.getPointerAlign()); + Address LHS = Address::deprecated( + 
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)), ArgsType), + CGF.getPointerAlign()); + Address RHS = Address::deprecated( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)), ArgsType), + CGF.getPointerAlign()); // *(Type0*)Dst[0] = *(Type0*)Src[0]; // *(Type1*)Dst[1] = *(Type1*)Src[1]; // ... @@ -3903,8 +3907,8 @@ static void emitPrivatesInit(CodeGenFunction &CGF, } else if (ForDup) { SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField); SharedRefLValue = CGF.MakeAddrLValue( - Address(SharedRefLValue.getPointer(CGF), - C.getDeclAlign(OriginalVD)), + SharedRefLValue.getAddress(CGF).withAlignment( + C.getDeclAlign(OriginalVD)), SharedRefLValue.getType(), LValueBaseInfo(AlignmentSource::Decl), SharedRefLValue.getTBAAInfo()); } else if (CGF.LambdaCaptureFields.count( @@ -4046,7 +4050,7 @@ emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc, KmpTaskTWithPrivatesPtrQTy->castAs()); LValue Base = CGF.EmitLValueForField( TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin()); - KmpTaskSharedsPtr = Address( + KmpTaskSharedsPtr = Address::deprecated( CGF.EmitLoadOfScalar(CGF.EmitLValueForField( Base, *std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds)), @@ -4534,13 +4538,13 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc, // Copy shareds if there are any. 
Address KmpTaskSharedsPtr = Address::invalid(); if (!SharedsTy->getAsStructureType()->getDecl()->field_empty()) { - KmpTaskSharedsPtr = - Address(CGF.EmitLoadOfScalar( - CGF.EmitLValueForField( - TDBase, *std::next(KmpTaskTQTyRD->field_begin(), - KmpTaskTShareds)), - Loc), - CGM.getNaturalTypeAlignment(SharedsTy)); + KmpTaskSharedsPtr = Address::deprecated( + CGF.EmitLoadOfScalar( + CGF.EmitLValueForField( + TDBase, + *std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds)), + Loc), + CGM.getNaturalTypeAlignment(SharedsTy)); LValue Dest = CGF.MakeAddrLValue(KmpTaskSharedsPtr, SharedsTy); LValue Src = CGF.MakeAddrLValue(Shareds, SharedsTy); CGF.EmitAggregateCopy(Dest, Src, SharedsTy, AggValueSlot::DoesNotOverlap); @@ -5024,7 +5028,7 @@ Address CGOpenMPRuntime::emitDepobjDependClause( Args, ".dep.arr.addr"); Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( Addr, CGF.ConvertTypeForMem(KmpDependInfoTy)->getPointerTo()); - DependenciesArray = Address(Addr, Align); + DependenciesArray = Address::deprecated(Addr, Align); // Write number of elements in the first element of array for depobj. 
LValue Base = CGF.MakeAddrLValue(DependenciesArray, KmpDependInfoTy); // deps[i].base_addr = NumDependencies; @@ -5102,7 +5106,7 @@ void CGOpenMPRuntime::emitUpdateClause(CodeGenFunction &CGF, LValue DepobjLVal, llvm::PHINode *ElementPHI = CGF.Builder.CreatePHI(Begin.getType(), 2, "omp.elementPast"); ElementPHI->addIncoming(Begin.getPointer(), EntryBB); - Begin = Address(ElementPHI, Begin.getAlignment()); + Begin = Begin.withPointer(ElementPHI); Base = CGF.MakeAddrLValue(Begin, KmpDependInfoTy, Base.getBaseInfo(), Base.getTBAAInfo()); // deps[i].flags = NewDepKind; @@ -5374,16 +5378,16 @@ static void EmitOMPAggregateReduction( llvm::PHINode *RHSElementPHI = CGF.Builder.CreatePHI( RHSBegin->getType(), 2, "omp.arraycpy.srcElementPast"); RHSElementPHI->addIncoming(RHSBegin, EntryBB); - Address RHSElementCurrent = - Address(RHSElementPHI, - RHSAddr.getAlignment().alignmentOfArrayElement(ElementSize)); + Address RHSElementCurrent = Address::deprecated( + RHSElementPHI, + RHSAddr.getAlignment().alignmentOfArrayElement(ElementSize)); llvm::PHINode *LHSElementPHI = CGF.Builder.CreatePHI( LHSBegin->getType(), 2, "omp.arraycpy.destElementPast"); LHSElementPHI->addIncoming(LHSBegin, EntryBB); - Address LHSElementCurrent = - Address(LHSElementPHI, - LHSAddr.getAlignment().alignmentOfArrayElement(ElementSize)); + Address LHSElementCurrent = Address::deprecated( + LHSElementPHI, + LHSAddr.getAlignment().alignmentOfArrayElement(ElementSize)); // Emit copy. 
CodeGenFunction::OMPPrivateScope Scope(CGF); @@ -5459,12 +5463,14 @@ llvm::Function *CGOpenMPRuntime::emitReductionFunction( // Dst = (void*[n])(LHSArg); // Src = (void*[n])(RHSArg); - Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)), - ArgsType), CGF.getPointerAlign()); - Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)), - ArgsType), CGF.getPointerAlign()); + Address LHS = Address::deprecated( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)), ArgsType), + CGF.getPointerAlign()); + Address RHS = Address::deprecated( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)), ArgsType), + CGF.getPointerAlign()); // ... // *(Type*)lhs[i] = RedOp(*(Type*)lhs[i], *(Type*)rhs[i]); @@ -6236,7 +6242,7 @@ Address CGOpenMPRuntime::getTaskReductionItem(CodeGenFunction &CGF, ReductionsPtr, CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( SharedLVal.getPointer(CGF), CGM.VoidPtrTy)}; - return Address( + return Address::deprecated( CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_task_reduction_get_th_data), @@ -7875,7 +7881,7 @@ class MappableExprsHandler { BP = CGF.EmitOMPSharedLValue(AssocExpr).getAddress(CGF); } else if (OAShE && isa(OAShE->getBase()->IgnoreParenCasts())) { - BP = Address( + BP = Address::deprecated( CGF.EmitScalarExpr(OAShE->getBase()), CGF.getContext().getTypeAlignInChars(OAShE->getBase()->getType())); } else { @@ -8046,9 +8052,10 @@ class MappableExprsHandler { return BaseLV; }; if (OAShE) { - LowestElem = LB = Address(CGF.EmitScalarExpr(OAShE->getBase()), - CGF.getContext().getTypeAlignInChars( - OAShE->getBase()->getType())); + LowestElem = LB = + Address::deprecated(CGF.EmitScalarExpr(OAShE->getBase()), + CGF.getContext().getTypeAlignInChars( + 
OAShE->getBase()->getType())); } else if (IsMemberReference) { const auto *ME = cast(I->getAssociatedExpression()); LValue BaseLVal = EmitMemberExprBase(CGF, ME); @@ -9047,7 +9054,8 @@ class MappableExprsHandler { ->getAsCXXRecordDecl(); if (!RD || !RD->isLambda()) return; - Address VDAddr = Address(Arg, CGF.getContext().getDeclAlign(VD)); + Address VDAddr = + Address::deprecated(Arg, CGF.getContext().getDeclAlign(VD)); LValue VDLVal = CGF.MakeAddrLValue( VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); llvm::DenseMap Captures; @@ -9508,7 +9516,7 @@ static void emitNonContiguousDescriptor( llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32( llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs), Info.PointersArray, 0, I); - Address PAddr(P, CGF.getPointerAlign()); + Address PAddr = Address::deprecated(P, CGF.getPointerAlign()); CGF.Builder.CreateStore(DAddr.getPointer(), PAddr); ++L; } @@ -9687,7 +9695,8 @@ static void emitOffloadingArrays( Info.BasePointersArray, 0, I); BP = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( BP, BPVal->getType()->getPointerTo(/*AddrSpace=*/0)); - Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy)); + Address BPAddr = + Address::deprecated(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy)); CGF.Builder.CreateStore(BPVal, BPAddr); if (Info.requiresDevicePointerInfo()) @@ -9701,7 +9710,8 @@ static void emitOffloadingArrays( Info.PointersArray, 0, I); P = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( P, PVal->getType()->getPointerTo(/*AddrSpace=*/0)); - Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy)); + Address PAddr = + Address::deprecated(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy)); CGF.Builder.CreateStore(PVal, PAddr); if (hasRuntimeEvaluationCaptureSize) { @@ -9710,7 +9720,8 @@ static void emitOffloadingArrays( Info.SizesArray, /*Idx0=*/0, /*Idx1=*/I); - Address SAddr(S, Ctx.getTypeAlignInChars(Int64Ty)); + Address SAddr = + Address::deprecated(S, Ctx.getTypeAlignInChars(Int64Ty)); 
CGF.Builder.CreateStore(CGF.Builder.CreateIntCast(CombinedInfo.Sizes[I], CGM.Int64Ty, /*isSigned=*/true), @@ -10049,9 +10060,9 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D, PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent"); PtrPHI->addIncoming(PtrBegin, EntryBB); Address PtrCurrent = - Address(PtrPHI, MapperCGF.GetAddrOfLocalVar(&BeginArg) - .getAlignment() - .alignmentOfArrayElement(ElementSize)); + Address::deprecated(PtrPHI, MapperCGF.GetAddrOfLocalVar(&BeginArg) + .getAlignment() + .alignmentOfArrayElement(ElementSize)); // Privatize the declared variable of mapper to be the current array element. CodeGenFunction::OMPPrivateScope Scope(MapperCGF); Scope.addPrivate(MapperVarDecl, [PtrCurrent]() { return PtrCurrent; }); @@ -10591,11 +10602,13 @@ void CGOpenMPRuntime::emitTargetCall( InputInfo.NumberOfTargetItems = Info.NumberOfPtrs; InputInfo.BasePointersArray = - Address(Info.BasePointersArray, CGM.getPointerAlign()); + Address::deprecated(Info.BasePointersArray, CGM.getPointerAlign()); InputInfo.PointersArray = - Address(Info.PointersArray, CGM.getPointerAlign()); - InputInfo.SizesArray = Address(Info.SizesArray, CGM.getPointerAlign()); - InputInfo.MappersArray = Address(Info.MappersArray, CGM.getPointerAlign()); + Address::deprecated(Info.PointersArray, CGM.getPointerAlign()); + InputInfo.SizesArray = + Address::deprecated(Info.SizesArray, CGM.getPointerAlign()); + InputInfo.MappersArray = + Address::deprecated(Info.MappersArray, CGM.getPointerAlign()); MapTypesArray = Info.MapTypesArray; MapNamesArray = Info.MapNamesArray; if (RequiresOuterTask) @@ -11472,12 +11485,13 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall( {/*ForEndCall=*/false}); InputInfo.NumberOfTargetItems = Info.NumberOfPtrs; InputInfo.BasePointersArray = - Address(Info.BasePointersArray, CGM.getPointerAlign()); + Address::deprecated(Info.BasePointersArray, CGM.getPointerAlign()); InputInfo.PointersArray = - Address(Info.PointersArray, 
CGM.getPointerAlign()); + Address::deprecated(Info.PointersArray, CGM.getPointerAlign()); InputInfo.SizesArray = - Address(Info.SizesArray, CGM.getPointerAlign()); - InputInfo.MappersArray = Address(Info.MappersArray, CGM.getPointerAlign()); + Address::deprecated(Info.SizesArray, CGM.getPointerAlign()); + InputInfo.MappersArray = + Address::deprecated(Info.MappersArray, CGM.getPointerAlign()); MapTypesArray = Info.MapTypesArray; MapNamesArray = Info.MapNamesArray; if (RequiresOuterTask) @@ -12327,8 +12341,9 @@ Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF, CGF.EmitRuntimeCall(RTLFn, Args); } }; - Address VDAddr = - UntiedRealAddr.isValid() ? UntiedRealAddr : Address(Addr, Align); + Address VDAddr = UntiedRealAddr.isValid() + ? UntiedRealAddr + : Address::deprecated(Addr, Align); CGF.EHStack.pushCleanup( NormalAndEHCleanup, FiniRTLFn, CVD->getLocation().getRawEncoding(), VDAddr, Allocator); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 5f2aea518acae..bb6847ab87319 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1800,8 +1800,9 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, llvm::PHINode *PhiDest = Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2); PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB); - Ptr = Address(PhiSrc, Ptr.getAlignment()); - ElemPtr = Address(PhiDest, ElemPtr.getAlignment()); + Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment()); + ElemPtr = + Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment()); llvm::Value *PtrDiff = Bld.CreatePtrDiff( CGF.Int8Ty, PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(), @@ -1943,7 +1944,7 @@ static void emitReductionListCopy( Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset); ScratchPadElemAbsolutePtrVal = Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); - DestElementAddr = 
Address(ScratchPadElemAbsolutePtrVal, + DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty, C.getTypeAlignInChars(Private->getType())); IncrScratchpadDest = true; break; @@ -1958,7 +1959,7 @@ static void emitReductionListCopy( Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset); ScratchPadElemAbsolutePtrVal = Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); - SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, + SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty, C.getTypeAlignInChars(Private->getType())); IncrScratchpadSrc = true; @@ -2051,9 +2052,10 @@ static void emitReductionListCopy( llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); if (IncrScratchpadDest) - DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); + DestBase = + Address::deprecated(ScratchpadBasePtr, CGF.getPointerAlign()); else /* IncrScratchpadSrc = true */ - SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); + SrcBase = Address::deprecated(ScratchpadBasePtr, CGF.getPointerAlign()); } ++Idx; @@ -2137,13 +2139,14 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, llvm::Value *WarpID = getNVPTXWarpID(CGF); Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLoadOfScalar( AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); + ElemTy->getPointerTo()), + ElemTy, CGF.getPointerAlign()); unsigned Idx = 0; for (const Expr *Private : Privates) { @@ -2201,7 +2204,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); // elemptr = ((CopyType*)(elemptrptr)) + I - Address ElemPtr = 
Address(ElemPtrPtr, Align); + Address ElemPtr = Address::deprecated(ElemPtrPtr, Align); ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType); if (NumIters > 1) ElemPtr = Bld.CreateGEP(ElemPtr, Cnt); @@ -2211,7 +2214,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( TransferMedium->getValueType(), TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); - Address MediumPtr(MediumPtrVal, Align); + Address MediumPtr = Address::deprecated(MediumPtrVal, Align); // Casting to actual data type. // MediumPtr = (CopyType*)MediumPtrAddr; MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType); @@ -2260,7 +2263,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( TransferMedium->getValueType(), TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); - Address SrcMediumPtr(SrcMediumPtrVal, Align); + Address SrcMediumPtr = Address::deprecated(SrcMediumPtrVal, Align); // SrcMediumVal = *SrcMediumPtr; SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType); @@ -2268,7 +2271,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); - Address TargetElemPtr = Address(TargetElemPtrVal, Align); + Address TargetElemPtr = Address::deprecated(TargetElemPtrVal, Align); TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType); if (NumIters > 1) TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt); @@ -2404,12 +2407,13 @@ static llvm::Function *emitShuffleAndReduceFunction( CGBuilderTy &Bld = CGF.Builder; Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( 
CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); + ElemTy->getPointerTo()), + ElemTy, CGF.getPointerAlign()); Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( @@ -2560,12 +2564,13 @@ static llvm::Value *emitListToGlobalCopyFunction( Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); + llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); + ElemTy->getPointerTo()), + ElemTy, CGF.getPointerAlign()); QualType StaticTy = C.getRecordType(TeamReductionRec); llvm::Type *LLVMReductionsBufferTy = CGM.getTypes().ConvertTypeForMem(StaticTy); @@ -2583,10 +2588,11 @@ static llvm::Value *emitListToGlobalCopyFunction( llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); // elemptr = ((CopyType*)(elemptrptr)) + I + ElemTy = CGF.ConvertTypeForMem(Private->getType()); ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); + ElemPtrPtr, ElemTy->getPointerTo()); Address ElemPtr = - Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); + Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType())); const ValueDecl *VD = cast(Private)->getDecl(); // Global = Buffer.VD[Idx]; const FieldDecl *FD = VarFieldMap.lookup(VD); @@ -2595,7 +2601,8 @@ static llvm::Value *emitListToGlobalCopyFunction( Address GlobAddr = GlobLVal.getAddress(CGF); llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); - 
GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); + GlobLVal.setAddress( + Address::deprecated(BufferPtr, GlobAddr.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { llvm::Value *V = CGF.EmitLoadOfScalar( @@ -2765,12 +2772,13 @@ static llvm::Value *emitGlobalToListCopyFunction( Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); + llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc), - CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), - CGF.getPointerAlign()); + ElemTy->getPointerTo()), + ElemTy, CGF.getPointerAlign()); QualType StaticTy = C.getRecordType(TeamReductionRec); llvm::Type *LLVMReductionsBufferTy = CGM.getTypes().ConvertTypeForMem(StaticTy); @@ -2789,10 +2797,11 @@ static llvm::Value *emitGlobalToListCopyFunction( llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); // elemptr = ((CopyType*)(elemptrptr)) + I + ElemTy = CGF.ConvertTypeForMem(Private->getType()); ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( - ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); + ElemPtrPtr, ElemTy->getPointerTo()); Address ElemPtr = - Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); + Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType())); const ValueDecl *VD = cast(Private)->getDecl(); // Global = Buffer.VD[Idx]; const FieldDecl *FD = VarFieldMap.lookup(VD); @@ -2801,7 +2810,8 @@ static llvm::Value *emitGlobalToListCopyFunction( Address GlobAddr = GlobLVal.getAddress(CGF); llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); - GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); + GlobLVal.setAddress( 
+ Address::deprecated(BufferPtr, GlobAddr.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); @@ -3283,7 +3293,7 @@ void CGOpenMPRuntimeGPU::emitReduction( "_openmp_teams_reductions_buffer_$_$ptr"); } llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( - Address(KernelTeamsReductionPtr, CGM.getPointerAlign()), + Address::deprecated(KernelTeamsReductionPtr, CGM.getPointerAlign()), /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); @@ -3690,7 +3700,7 @@ Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace( VD->getType().getAddressSpace()))), - Align); + VarTy, Align); } if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 0fbedb9e6ea77..b491642871ced 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -156,10 +156,11 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) { (void)PreCondVars.setVarAddr( CGF, OrigVD, - Address(llvm::UndefValue::get(CGF.ConvertTypeForMem( - CGF.getContext().getPointerType( - OrigVD->getType().getNonReferenceType()))), - CGF.getContext().getDeclAlign(OrigVD))); + Address::deprecated( + llvm::UndefValue::get( + CGF.ConvertTypeForMem(CGF.getContext().getPointerType( + OrigVD->getType().getNonReferenceType()))), + CGF.getContext().getDeclAlign(OrigVD))); } } } @@ -578,8 +579,7 @@ static llvm::Function *emitOutlinedFunctionPrologue( } if (!FO.RegisterCastedArgsOnly) { LocalAddrs.insert( - {Args[Cnt], - {Var, Address(ArgAddr.getPointer(), Ctx.getDeclAlign(Var))}}); + 
{Args[Cnt], {Var, ArgAddr.withAlignment(Ctx.getDeclAlign(Var))}}); } } else if (I->capturesVariableByCopy()) { assert(!FD->getType()->isAnyPointerType() && @@ -726,14 +726,14 @@ void CodeGenFunction::EmitOMPAggregateAssign( Builder.CreatePHI(SrcBegin->getType(), 2, "omp.arraycpy.srcElementPast"); SrcElementPHI->addIncoming(SrcBegin, EntryBB); Address SrcElementCurrent = - Address(SrcElementPHI, + Address(SrcElementPHI, SrcAddr.getElementType(), SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize)); llvm::PHINode *DestElementPHI = Builder.CreatePHI( DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); DestElementPHI->addIncoming(DestBegin, EntryBB); Address DestElementCurrent = - Address(DestElementPHI, + Address(DestElementPHI, DestAddr.getElementType(), DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); // Emit copy. @@ -1007,10 +1007,10 @@ bool CodeGenFunction::EmitOMPCopyinClause(const OMPExecutableDirective &D) { MasterAddr = EmitLValue(&DRE).getAddress(*this); LocalDeclMap.erase(VD); } else { - MasterAddr = - Address(VD->isStaticLocal() ? CGM.getStaticLocalDeclAddress(VD) - : CGM.GetAddrOfGlobal(VD), - getContext().getDeclAlign(VD)); + MasterAddr = Address::deprecated( + VD->isStaticLocal() ? CGM.getStaticLocalDeclAddress(VD) + : CGM.GetAddrOfGlobal(VD), + getContext().getDeclAlign(VD)); } // Get the address of the threadprivate variable. Address PrivateAddr = EmitLValue(*IRef).getAddress(*this); @@ -1182,9 +1182,9 @@ void CodeGenFunction::EmitOMPLastprivateClauseFinal( // Get the address of the private variable. Address PrivateAddr = GetAddrOfLocalVar(PrivateVD); if (const auto *RefTy = PrivateVD->getType()->getAs()) - PrivateAddr = - Address(Builder.CreateLoad(PrivateAddr), - CGM.getNaturalTypeAlignment(RefTy->getPointeeType())); + PrivateAddr = Address::deprecated( + Builder.CreateLoad(PrivateAddr), + CGM.getNaturalTypeAlignment(RefTy->getPointeeType())); // Store the last value to the private copy in the last iteration. 
if (C->getKind() == OMPC_LASTPRIVATE_conditional) CGM.getOpenMPRuntime().emitLastprivateConditionalFinalUpdate( @@ -1659,7 +1659,7 @@ Address CodeGenFunction::OMPBuilderCBHelpers::getAddressOfLocalVariable( Addr, CGF.ConvertTypeForMem(CGM.getContext().getPointerType(CVD->getType())), getNameWithSeparators({CVD->getName(), ".addr"}, ".", ".")); - return Address(Addr, Align); + return Address::deprecated(Addr, Align); } Address CodeGenFunction::OMPBuilderCBHelpers::getAddrOfThreadPrivate( @@ -1682,7 +1682,7 @@ Address CodeGenFunction::OMPBuilderCBHelpers::getAddrOfThreadPrivate( llvm::CallInst *ThreadPrivateCacheCall = OMPBuilder.createCachedThreadPrivate(CGF.Builder, Data, Size, CacheName); - return Address(ThreadPrivateCacheCall, VDAddr.getAlignment()); + return Address::deprecated(ThreadPrivateCacheCall, VDAddr.getAlignment()); } std::string CodeGenFunction::OMPBuilderCBHelpers::getNameWithSeparators( @@ -4618,8 +4618,9 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( }); } for (const auto &Pair : PrivatePtrs) { - Address Replacement(CGF.Builder.CreateLoad(Pair.second), - CGF.getContext().getDeclAlign(Pair.first)); + Address Replacement = + Address::deprecated(CGF.Builder.CreateLoad(Pair.second), + CGF.getContext().getDeclAlign(Pair.first)); Scope.addPrivate(Pair.first, [Replacement]() { return Replacement; }); if (auto *DI = CGF.getDebugInfo()) if (CGF.CGM.getCodeGenOpts().hasReducedDebugInfo()) @@ -4632,14 +4633,16 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( for (auto &Pair : UntiedLocalVars) { if (isAllocatableDecl(Pair.first)) { llvm::Value *Ptr = CGF.Builder.CreateLoad(Pair.second.first); - Address Replacement(Ptr, CGF.getPointerAlign()); + Address Replacement = Address::deprecated(Ptr, CGF.getPointerAlign()); Pair.second.first = Replacement; Ptr = CGF.Builder.CreateLoad(Replacement); - Replacement = Address(Ptr, CGF.getContext().getDeclAlign(Pair.first)); + Replacement = Address::deprecated( + Ptr, CGF.getContext().getDeclAlign(Pair.first)); 
Pair.second.second = Replacement; } else { llvm::Value *Ptr = CGF.Builder.CreateLoad(Pair.second.first); - Address Replacement(Ptr, CGF.getContext().getDeclAlign(Pair.first)); + Address Replacement = Address::deprecated( + Ptr, CGF.getContext().getDeclAlign(Pair.first)); Pair.second.first = Replacement; } } @@ -4647,8 +4650,9 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( if (Data.Reductions) { OMPPrivateScope FirstprivateScope(CGF); for (const auto &Pair : FirstprivatePtrs) { - Address Replacement(CGF.Builder.CreateLoad(Pair.second), - CGF.getContext().getDeclAlign(Pair.first)); + Address Replacement = + Address::deprecated(CGF.Builder.CreateLoad(Pair.second), + CGF.getContext().getDeclAlign(Pair.first)); FirstprivateScope.addPrivate(Pair.first, [Replacement]() { return Replacement; }); } @@ -4668,13 +4672,13 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( RedCG, Cnt); Address Replacement = CGF.CGM.getOpenMPRuntime().getTaskReductionItem( CGF, S.getBeginLoc(), ReductionsPtr, RedCG.getSharedLValue(Cnt)); - Replacement = - Address(CGF.EmitScalarConversion( - Replacement.getPointer(), CGF.getContext().VoidPtrTy, - CGF.getContext().getPointerType( - Data.ReductionCopies[Cnt]->getType()), - Data.ReductionCopies[Cnt]->getExprLoc()), - Replacement.getAlignment()); + Replacement = Address::deprecated( + CGF.EmitScalarConversion(Replacement.getPointer(), + CGF.getContext().VoidPtrTy, + CGF.getContext().getPointerType( + Data.ReductionCopies[Cnt]->getType()), + Data.ReductionCopies[Cnt]->getExprLoc()), + Replacement.getAlignment()); Replacement = RedCG.adjustPrivateAddress(CGF, Cnt, Replacement); Scope.addPrivate(RedCG.getBaseDecl(Cnt), [Replacement]() { return Replacement; }); @@ -4724,7 +4728,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( } Address Replacement = CGF.CGM.getOpenMPRuntime().getTaskReductionItem( CGF, S.getBeginLoc(), ReductionsPtr, RedCG.getSharedLValue(Cnt)); - Replacement = Address( + Replacement = Address::deprecated( 
CGF.EmitScalarConversion( Replacement.getPointer(), CGF.getContext().VoidPtrTy, CGF.getContext().getPointerType(InRedPrivs[Cnt]->getType()), @@ -4883,8 +4887,9 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective( CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall( CGF, S.getBeginLoc(), {CopyFnTy, CopyFn}, CallArgs); for (const auto &Pair : PrivatePtrs) { - Address Replacement(CGF.Builder.CreateLoad(Pair.second), - CGF.getContext().getDeclAlign(Pair.first)); + Address Replacement = + Address::deprecated(CGF.Builder.CreateLoad(Pair.second), + CGF.getContext().getDeclAlign(Pair.first)); Scope.addPrivate(Pair.first, [Replacement]() { return Replacement; }); } } diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index c839376880c49..34df7da7985b4 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -90,9 +90,8 @@ static RValue PerformReturnAdjustment(CodeGenFunction &CGF, auto ClassDecl = ResultType->getPointeeType()->getAsCXXRecordDecl(); auto ClassAlign = CGF.CGM.getClassPointerAlignment(ClassDecl); - ReturnValue = CGF.CGM.getCXXABI().performReturnAdjustment(CGF, - Address(ReturnValue, ClassAlign), - Thunk.Return); + ReturnValue = CGF.CGM.getCXXABI().performReturnAdjustment( + CGF, Address::deprecated(ReturnValue, ClassAlign), Thunk.Return); if (NullCheckValue) { CGF.Builder.CreateBr(AdjustEnd); @@ -198,7 +197,8 @@ CodeGenFunction::GenerateVarArgsThunk(llvm::Function *Fn, // Find the first store of "this", which will be to the alloca associated // with "this". 
- Address ThisPtr(&*AI, CGM.getClassPointerAlignment(MD->getParent())); + Address ThisPtr = + Address::deprecated(&*AI, CGM.getClassPointerAlignment(MD->getParent())); llvm::BasicBlock *EntryBB = &Fn->front(); llvm::BasicBlock::iterator ThisStore = llvm::find_if(*EntryBB, [&](llvm::Instruction &I) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index a7f62b4a4e30a..95091edd9ecb7 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -593,7 +593,7 @@ CodeGenFunction::DecodeAddrUsedInPrologue(llvm::Value *F, auto *GOTAddr = Builder.CreateIntToPtr(GOTAsInt, Int8PtrPtrTy, "global_addr"); // Load the original pointer through the global. - return Builder.CreateLoad(Address(GOTAddr, getPointerAlign()), + return Builder.CreateLoad(Address(GOTAddr, Int8PtrTy, getPointerAlign()), "decoded_addr"); } @@ -1102,9 +1102,9 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, EI->getType()->getPointerElementType(), &*EI, Idx); llvm::Type *Ty = cast(Addr)->getResultElementType(); - ReturnValuePointer = Address(Addr, getPointerAlign()); + ReturnValuePointer = Address::deprecated(Addr, getPointerAlign()); Addr = Builder.CreateAlignedLoad(Ty, Addr, getPointerAlign(), "agg.result"); - ReturnValue = Address(Addr, CGM.getNaturalTypeAlignment(RetTy)); + ReturnValue = Address::deprecated(Addr, CGM.getNaturalTypeAlignment(RetTy)); } else { ReturnValue = CreateIRTemp(RetTy, "retval"); @@ -1929,7 +1929,7 @@ static void emitNonZeroVLAInit(CodeGenFunction &CGF, QualType baseType, dest.getAlignment().alignmentOfArrayElement(baseSize); // memcpy the individual element bit-pattern. - Builder.CreateMemCpy(Address(cur, curAlign), src, baseSizeInChars, + Builder.CreateMemCpy(Address(cur, CGF.Int8Ty, curAlign), src, baseSizeInChars, /*volatile*/ false); // Go to the next element. 
@@ -2002,7 +2002,7 @@ CodeGenFunction::EmitNullInitialization(Address DestPtr, QualType Ty) { CharUnits NullAlign = DestPtr.getAlignment(); NullVariable->setAlignment(NullAlign.getAsAlign()); Address SrcPtr(Builder.CreateBitCast(NullVariable, Builder.getInt8PtrTy()), - NullAlign); + Builder.getInt8Ty(), NullAlign); if (vla) return emitNonZeroVLAInit(*this, Ty, DestPtr, SrcPtr, SizeVal); @@ -2480,7 +2480,7 @@ Address CodeGenFunction::EmitFieldAnnotations(const FieldDecl *D, V = Builder.CreateBitCast(V, VTy); } - return Address(V, Addr.getAlignment()); + return Address::deprecated(V, Addr.getAlignment()); } CodeGenFunction::CGCapturedStmtInfo::~CGCapturedStmtInfo() { } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index c99fd899ac932..37c0b793134f7 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -6431,7 +6431,8 @@ void CodeGenModule::EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) { !VD->getAnyInitializer()->isConstantInitializer(getContext(), /*ForRef=*/false); - Address Addr(GetAddrOfGlobalVar(VD), getContext().getDeclAlign(VD)); + Address Addr = Address::deprecated(GetAddrOfGlobalVar(VD), + getContext().getDeclAlign(VD)); if (auto InitFunction = getOpenMPRuntime().emitThreadPrivateVarDefinition( VD, Addr, RefExpr->getBeginLoc(), PerformInit)) CXXGlobalInits.push_back(InitFunction); diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index e00ff2b68719b..c1156d643e82d 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -948,7 +948,8 @@ MicrosoftCXXABI::performBaseAdjustment(CodeGenFunction &CGF, Address Value, Value.getElementType(), Value.getPointer(), Offset); CharUnits VBaseAlign = CGF.CGM.getVBaseAlignment(Value.getAlignment(), SrcDecl, PolymorphicBase); - return std::make_tuple(Address(Ptr, VBaseAlign), Offset, PolymorphicBase); + return std::make_tuple(Address(Ptr, CGF.Int8Ty, 
VBaseAlign), Offset, + PolymorphicBase); } bool MicrosoftCXXABI::shouldTypeidBeNullChecked(bool IsDeref, @@ -1470,7 +1471,7 @@ Address MicrosoftCXXABI::adjustThisArgumentForVirtualFunctionCall( Result.getElementType(), Result.getPointer(), VBaseOffset); CharUnits VBaseAlign = CGF.CGM.getVBaseAlignment(Result.getAlignment(), Derived, VBase); - Result = Address(VBasePtr, VBaseAlign); + Result = Address(VBasePtr, CGF.Int8Ty, VBaseAlign); } if (!StaticOffset.isZero()) { assert(StaticOffset.isPositive()); @@ -2217,10 +2218,10 @@ llvm::Value *MicrosoftCXXABI::performThisAdjustment(CodeGenFunction &CGF, assert(TA.Virtual.Microsoft.VBPtrOffset > 0); assert(TA.Virtual.Microsoft.VBOffsetOffset >= 0); llvm::Value *VBPtr; - llvm::Value *VBaseOffset = - GetVBaseOffsetFromVBPtr(CGF, Address(V, CGF.getPointerAlign()), - -TA.Virtual.Microsoft.VBPtrOffset, - TA.Virtual.Microsoft.VBOffsetOffset, &VBPtr); + llvm::Value *VBaseOffset = GetVBaseOffsetFromVBPtr( + CGF, Address(V, CGF.Int8Ty, CGF.getPointerAlign()), + -TA.Virtual.Microsoft.VBPtrOffset, + TA.Virtual.Microsoft.VBOffsetOffset, &VBPtr); V = CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, VBPtr, VBaseOffset); } } @@ -2432,7 +2433,7 @@ static void emitTlsGuardCheck(CodeGenFunction &CGF, llvm::GlobalValue *TlsGuard, llvm::BasicBlock *DynInitBB, llvm::BasicBlock *ContinueBB) { llvm::LoadInst *TlsGuardValue = - CGF.Builder.CreateLoad(Address(TlsGuard, CharUnits::One())); + CGF.Builder.CreateLoad(Address(TlsGuard, CGF.Int8Ty, CharUnits::One())); llvm::Value *CmpResult = CGF.Builder.CreateICmpEQ(TlsGuardValue, CGF.Builder.getInt8(0)); CGF.Builder.CreateCondBr(CmpResult, DynInitBB, ContinueBB); @@ -2483,7 +2484,7 @@ LValue MicrosoftCXXABI::EmitThreadLocalVarDeclLValue(CodeGenFunction &CGF, V = CGF.Builder.CreateBitCast(V, RealVarTy->getPointerTo(AS)); CharUnits Alignment = CGF.getContext().getDeclAlign(VD); - Address Addr(V, Alignment); + Address Addr(V, RealVarTy, Alignment); LValue LV = VD->getType()->isReferenceType() ? 
CGF.EmitLoadOfReferenceLValue(Addr, VD->getType(), diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index a8aa7b1f17b29..3e1df744b2ad7 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -323,10 +323,10 @@ static Address emitVoidPtrDirectVAArg(CodeGenFunction &CGF, // If the CC aligns values higher than the slot size, do so if needed. Address Addr = Address::invalid(); if (AllowHigherAlign && DirectAlign > SlotSize) { - Addr = Address(emitRoundPointerUpToAlignment(CGF, Ptr, DirectAlign), - DirectAlign); + Addr = Address::deprecated( + emitRoundPointerUpToAlignment(CGF, Ptr, DirectAlign), DirectAlign); } else { - Addr = Address(Ptr, SlotSize); + Addr = Address::deprecated(Ptr, SlotSize); } // Advance the pointer past the argument, then store that back. @@ -379,17 +379,15 @@ static Address emitVoidPtrVAArg(CodeGenFunction &CGF, Address VAListAddr, if (IsIndirect) DirectTy = DirectTy->getPointerTo(0); - Address Addr = emitVoidPtrDirectVAArg(CGF, VAListAddr, DirectTy, - DirectSize, DirectAlign, - SlotSizeAndAlign, - AllowHigherAlign); + Address Addr = + emitVoidPtrDirectVAArg(CGF, VAListAddr, DirectTy, DirectSize, DirectAlign, + SlotSizeAndAlign, AllowHigherAlign); if (IsIndirect) { - Addr = Address(CGF.Builder.CreateLoad(Addr), ValueInfo.Align); + Addr = Address::deprecated(CGF.Builder.CreateLoad(Addr), ValueInfo.Align); } return Addr; - } static Address complexTempStructure(CodeGenFunction &CGF, Address VAListAddr, @@ -694,7 +692,7 @@ Address EmitVAArgInstr(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, llvm::PointerType::getUnqual(CGF.ConvertTypeForMem(Ty)); llvm::Value *Addr = CGF.Builder.CreateVAArg(VAListAddr.getPointer(), BaseTy); - return Address(Addr, TyAlignForABI); + return Address::deprecated(Addr, TyAlignForABI); } else { assert((AI.isDirect() || AI.isExtend()) && "Unexpected ArgInfo Kind in generic VAArg emitter!"); @@ -4837,8 +4835,8 @@ Address 
PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, CGF.EmitBlock(UsingRegs); Address RegSaveAreaPtr = Builder.CreateStructGEP(VAList, 4); - RegAddr = Address(Builder.CreateLoad(RegSaveAreaPtr), - CharUnits::fromQuantity(8)); + RegAddr = Address::deprecated(Builder.CreateLoad(RegSaveAreaPtr), + CharUnits::fromQuantity(8)); assert(RegAddr.getElementType() == CGF.Int8Ty); // Floating-point registers start after the general-purpose registers. @@ -4851,10 +4849,10 @@ Address PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, // registers we've used by the number of CharUnits RegSize = CharUnits::fromQuantity((isInt || IsSoftFloatABI) ? 4 : 8); llvm::Value *RegOffset = - Builder.CreateMul(NumRegs, Builder.getInt8(RegSize.getQuantity())); - RegAddr = Address(Builder.CreateInBoundsGEP(CGF.Int8Ty, - RegAddr.getPointer(), RegOffset), - RegAddr.getAlignment().alignmentOfArrayElement(RegSize)); + Builder.CreateMul(NumRegs, Builder.getInt8(RegSize.getQuantity())); + RegAddr = Address( + Builder.CreateInBoundsGEP(CGF.Int8Ty, RegAddr.getPointer(), RegOffset), + CGF.Int8Ty, RegAddr.getAlignment().alignmentOfArrayElement(RegSize)); RegAddr = Builder.CreateElementBitCast(RegAddr, DirectTy); // Increase the used-register count. 
@@ -4885,14 +4883,14 @@ Address PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, } Address OverflowAreaAddr = Builder.CreateStructGEP(VAList, 3); - Address OverflowArea(Builder.CreateLoad(OverflowAreaAddr, "argp.cur"), - OverflowAreaAlign); + Address OverflowArea = Address::deprecated( + Builder.CreateLoad(OverflowAreaAddr, "argp.cur"), OverflowAreaAlign); // Round up address of argument to alignment CharUnits Align = CGF.getContext().getTypeAlignInChars(Ty); if (Align > OverflowAreaAlign) { llvm::Value *Ptr = OverflowArea.getPointer(); - OverflowArea = Address(emitRoundPointerUpToAlignment(CGF, Ptr, Align), - Align); + OverflowArea = Address::deprecated( + emitRoundPointerUpToAlignment(CGF, Ptr, Align), Align); } MemAddr = Builder.CreateElementBitCast(OverflowArea, DirectTy); @@ -4911,8 +4909,8 @@ Address PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, // Load the pointer if the argument was passed indirectly. if (isIndirect) { - Result = Address(Builder.CreateLoad(Result, "aggr"), - getContext().getTypeAlignInChars(Ty)); + Result = Address::deprecated(Builder.CreateLoad(Result, "aggr"), + getContext().getTypeAlignInChars(Ty)); } return Result; @@ -6060,7 +6058,7 @@ Address AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty, CGF.Builder.CreateStructGEP(VAListAddr, reg_top_index, "reg_top_p"); reg_top = CGF.Builder.CreateLoad(reg_top_p, "reg_top"); Address BaseAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, reg_top, reg_offs), - CharUnits::fromQuantity(IsFPR ? 16 : 8)); + CGF.Int8Ty, CharUnits::fromQuantity(IsFPR ? 
16 : 8)); Address RegAddr = Address::invalid(); llvm::Type *MemTy = CGF.ConvertTypeForMem(Ty); @@ -6145,8 +6143,8 @@ Address AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty, OnStackPtr = CGF.Builder.CreateIntToPtr(OnStackPtr, CGF.Int8PtrTy); } - Address OnStackAddr(OnStackPtr, - std::max(CharUnits::fromQuantity(8), TyAlign)); + Address OnStackAddr = Address::deprecated( + OnStackPtr, std::max(CharUnits::fromQuantity(8), TyAlign)); // All stack slots are multiples of 8 bytes. CharUnits StackSlotSize = CharUnits::fromQuantity(8); @@ -6178,12 +6176,12 @@ Address AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty, //======================================= CGF.EmitBlock(ContBlock); - Address ResAddr = emitMergePHI(CGF, RegAddr, InRegBlock, - OnStackAddr, OnStackBlock, "vaargs.addr"); + Address ResAddr = emitMergePHI(CGF, RegAddr, InRegBlock, OnStackAddr, + OnStackBlock, "vaargs.addr"); if (IsIndirect) - return Address(CGF.Builder.CreateLoad(ResAddr, "vaarg.addr"), - TyAlign); + return Address::deprecated(CGF.Builder.CreateLoad(ResAddr, "vaarg.addr"), + TyAlign); return ResAddr; } @@ -6201,7 +6199,8 @@ Address AArch64ABIInfo::EmitDarwinVAArg(Address VAListAddr, QualType Ty, // Empty records are ignored for parameter passing purposes. if (isEmptyRecord(getContext(), Ty, true)) { - Address Addr(CGF.Builder.CreateLoad(VAListAddr, "ap.cur"), SlotSize); + Address Addr = Address::deprecated( + CGF.Builder.CreateLoad(VAListAddr, "ap.cur"), SlotSize); Addr = CGF.Builder.CreateElementBitCast(Addr, CGF.ConvertTypeForMem(Ty)); return Addr; } @@ -6989,7 +6988,8 @@ Address ARMABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // Empty records are ignored for parameter passing purposes. 
if (isEmptyRecord(getContext(), Ty, true)) { - Address Addr(CGF.Builder.CreateLoad(VAListAddr), SlotSize); + Address Addr = + Address::deprecated(CGF.Builder.CreateLoad(VAListAddr), SlotSize); Addr = CGF.Builder.CreateElementBitCast(Addr, CGF.ConvertTypeForMem(Ty)); return Addr; } @@ -7562,17 +7562,16 @@ Address SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // single (8 byte) or double (16 byte) stack slot. Address OverflowArgAreaPtr = CGF.Builder.CreateStructGEP(VAListAddr, 2, "overflow_arg_area_ptr"); - Address OverflowArgArea = - Address(CGF.Builder.CreateLoad(OverflowArgAreaPtr, "overflow_arg_area"), - TyInfo.Align); + Address OverflowArgArea = Address::deprecated( + CGF.Builder.CreateLoad(OverflowArgAreaPtr, "overflow_arg_area"), + TyInfo.Align); Address MemAddr = - CGF.Builder.CreateElementBitCast(OverflowArgArea, DirectTy, "mem_addr"); + CGF.Builder.CreateElementBitCast(OverflowArgArea, DirectTy, "mem_addr"); // Update overflow_arg_area_ptr pointer - llvm::Value *NewOverflowArgArea = - CGF.Builder.CreateGEP(OverflowArgArea.getElementType(), - OverflowArgArea.getPointer(), PaddedSizeV, - "overflow_arg_area"); + llvm::Value *NewOverflowArgArea = CGF.Builder.CreateGEP( + OverflowArgArea.getElementType(), OverflowArgArea.getPointer(), + PaddedSizeV, "overflow_arg_area"); CGF.Builder.CreateStore(NewOverflowArgArea, OverflowArgAreaPtr); return MemAddr; @@ -7620,12 +7619,12 @@ Address SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, Address RegSaveAreaPtr = CGF.Builder.CreateStructGEP(VAListAddr, 3, "reg_save_area_ptr"); llvm::Value *RegSaveArea = - CGF.Builder.CreateLoad(RegSaveAreaPtr, "reg_save_area"); - Address RawRegAddr(CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, RegOffset, - "raw_reg_addr"), - PaddedSize); + CGF.Builder.CreateLoad(RegSaveAreaPtr, "reg_save_area"); + Address RawRegAddr( + CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, RegOffset, "raw_reg_addr"), + CGF.Int8Ty, PaddedSize); Address RegAddr = - 
CGF.Builder.CreateElementBitCast(RawRegAddr, DirectTy, "reg_addr"); + CGF.Builder.CreateElementBitCast(RawRegAddr, DirectTy, "reg_addr"); // Update the register count llvm::Value *One = llvm::ConstantInt::get(IndexTy, 1); @@ -7640,11 +7639,11 @@ Address SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // Work out the address of a stack argument. Address OverflowArgAreaPtr = CGF.Builder.CreateStructGEP(VAListAddr, 2, "overflow_arg_area_ptr"); - Address OverflowArgArea = - Address(CGF.Builder.CreateLoad(OverflowArgAreaPtr, "overflow_arg_area"), - PaddedSize); + Address OverflowArgArea = Address::deprecated( + CGF.Builder.CreateLoad(OverflowArgAreaPtr, "overflow_arg_area"), + PaddedSize); Address RawMemAddr = - CGF.Builder.CreateConstByteGEP(OverflowArgArea, Padding, "raw_mem_addr"); + CGF.Builder.CreateConstByteGEP(OverflowArgArea, Padding, "raw_mem_addr"); Address MemAddr = CGF.Builder.CreateElementBitCast(RawMemAddr, DirectTy, "mem_addr"); @@ -7658,12 +7657,12 @@ Address SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // Return the appropriate result. CGF.EmitBlock(ContBlock); - Address ResAddr = emitMergePHI(CGF, RegAddr, InRegBlock, - MemAddr, InMemBlock, "va_arg.addr"); + Address ResAddr = emitMergePHI(CGF, RegAddr, InRegBlock, MemAddr, InMemBlock, + "va_arg.addr"); if (IsIndirect) - ResAddr = Address(CGF.Builder.CreateLoad(ResAddr, "indirect_arg"), - TyInfo.Align); + ResAddr = Address::deprecated( + CGF.Builder.CreateLoad(ResAddr, "indirect_arg"), TyInfo.Align); return ResAddr; } @@ -8601,7 +8600,9 @@ Address HexagonABIInfo::EmitVAArgFromMemory(CodeGenFunction &CGF, // overflow area pointer to the argument type. 
llvm::Type *PTy = CGF.ConvertTypeForMem(Ty); Address AddrTyped = CGF.Builder.CreateElementBitCast( - Address(__overflow_area_pointer, CharUnits::fromQuantity(Align)), PTy); + Address::deprecated(__overflow_area_pointer, + CharUnits::fromQuantity(Align)), + PTy); // Round up to the minimum stack alignment for varargs which is 4 bytes. uint64_t Offset = llvm::alignTo(CGF.getContext().getTypeSize(Ty) / 8, 4); @@ -8633,7 +8634,8 @@ Address HexagonABIInfo::EmitVAArgForHexagon(CodeGenFunction &CGF, Addr = Builder.CreateIntToPtr(AddrAsInt, BP); } Address AddrTyped = Builder.CreateElementBitCast( - Address(Addr, CharUnits::fromQuantity(TyAlign)), CGF.ConvertType(Ty)); + Address::deprecated(Addr, CharUnits::fromQuantity(TyAlign)), + CGF.ConvertType(Ty)); uint64_t Offset = llvm::alignTo(CGF.getContext().getTypeSize(Ty) / 8, 4); llvm::Value *NextAddr = Builder.CreateGEP( @@ -8786,12 +8788,13 @@ Address HexagonABIInfo::EmitVAArgForHexagonLinux(CodeGenFunction &CGF, // Implement the ContBlock CGF.EmitBlock(ContBlock); - llvm::Type *MemPTy = llvm::PointerType::getUnqual(CGF.ConvertTypeForMem(Ty)); + llvm::Type *MemTy = CGF.ConvertTypeForMem(Ty); + llvm::Type *MemPTy = llvm::PointerType::getUnqual(MemTy); llvm::PHINode *ArgAddr = CGF.Builder.CreatePHI(MemPTy, 2, "vaarg.addr"); ArgAddr->addIncoming(__saved_reg_area_p, InRegBlock); ArgAddr->addIncoming(__overflow_area_p, OnStackBlock); - return Address(ArgAddr, CharUnits::fromQuantity(ArgAlign)); + return Address(ArgAddr, MemTy, CharUnits::fromQuantity(ArgAlign)); } Address HexagonABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, @@ -9704,7 +9707,8 @@ Address SparcV9ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, CharUnits SlotSize = CharUnits::fromQuantity(8); CGBuilderTy &Builder = CGF.Builder; - Address Addr(Builder.CreateLoad(VAListAddr, "ap.cur"), SlotSize); + Address Addr = + Address::deprecated(Builder.CreateLoad(VAListAddr, "ap.cur"), SlotSize); llvm::Type *ArgPtrTy = 
llvm::PointerType::getUnqual(ArgTy); auto TypeInfo = getContext().getTypeInfoInChars(Ty); @@ -9735,12 +9739,12 @@ Address SparcV9ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, case ABIArgInfo::IndirectAliased: Stride = SlotSize; ArgAddr = Builder.CreateElementBitCast(Addr, ArgPtrTy, "indirect"); - ArgAddr = Address(Builder.CreateLoad(ArgAddr, "indirect.arg"), - TypeInfo.Align); + ArgAddr = Address::deprecated(Builder.CreateLoad(ArgAddr, "indirect.arg"), + TypeInfo.Align); break; case ABIArgInfo::Ignore: - return Address(llvm::UndefValue::get(ArgPtrTy), TypeInfo.Align); + return Address(llvm::UndefValue::get(ArgPtrTy), ArgTy, TypeInfo.Align); } // Update VAList. @@ -10081,7 +10085,7 @@ Address XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // Get the VAList. CharUnits SlotSize = CharUnits::fromQuantity(4); - Address AP(Builder.CreateLoad(VAListAddr), SlotSize); + Address AP = Address::deprecated(Builder.CreateLoad(VAListAddr), SlotSize); // Handle the argument. ABIArgInfo AI = classifyArgumentType(Ty); @@ -10099,7 +10103,7 @@ Address XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, case ABIArgInfo::InAlloca: llvm_unreachable("Unsupported ABI kind for va_arg"); case ABIArgInfo::Ignore: - Val = Address(llvm::UndefValue::get(ArgPtrTy), TypeAlign); + Val = Address(llvm::UndefValue::get(ArgPtrTy), ArgTy, TypeAlign); ArgSize = CharUnits::Zero(); break; case ABIArgInfo::Extend: @@ -10112,7 +10116,7 @@ Address XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, case ABIArgInfo::Indirect: case ABIArgInfo::IndirectAliased: Val = Builder.CreateElementBitCast(AP, ArgPtrTy); - Val = Address(Builder.CreateLoad(Val), TypeAlign); + Val = Address::deprecated(Builder.CreateLoad(Val), TypeAlign); ArgSize = SlotSize; break; } @@ -11126,7 +11130,8 @@ Address RISCVABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // Empty records are ignored for parameter passing purposes. 
if (isEmptyRecord(getContext(), Ty, true)) { - Address Addr(CGF.Builder.CreateLoad(VAListAddr), SlotSize); + Address Addr = + Address::deprecated(CGF.Builder.CreateLoad(VAListAddr), SlotSize); Addr = CGF.Builder.CreateElementBitCast(Addr, CGF.ConvertTypeForMem(Ty)); return Addr; } diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index cc830c1d5416c..5b0c03f43e5e4 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -706,8 +706,8 @@ class AddressResult : public Result { AddressResult(Ptr Arg, unsigned Align) : Arg(Arg), Align(Align) {} void genCode(raw_ostream &OS, CodeGenParamAllocator &ParamAlloc) const override { - OS << "Address(" << Arg->varname() << ", CharUnits::fromQuantity(" - << Align << "))"; + OS << "Address::deprecated(" << Arg->varname() + << ", CharUnits::fromQuantity(" << Align << "))"; } std::string typeName() const override { return "Address"; From f3809b20f2d97d0abdcdc5628c7093758aefc530 Mon Sep 17 00:00:00 2001 From: Pavel Kosov Date: Thu, 17 Feb 2022 13:41:05 +0300 Subject: [PATCH 065/748] [AArch64][SchedModels] Handle virtual registers in FP/NEON predicates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current implementation of Check[HSDQ]Form predicates doesn’t handle virtual registers and therefore isn’t useful for pre-RA scheduling. Patch fixes this implementing two function predicates: CheckQForm for checking that instruction writes 128-bit NEON register and CheckFpOrNEON which checks that instruction writes FP register (any width). The latter supersedes Check[HSD]Form predicates which are not used individually. OS Laboratory. Huawei Russian Research Institute. 
Saint-Petersburg Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D114642 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 45 ++++++ llvm/lib/Target/AArch64/AArch64InstrInfo.h | 6 + .../Target/AArch64/AArch64SchedPredExynos.td | 5 +- .../Target/AArch64/AArch64SchedPredicates.td | 147 ++---------------- .../MCTargetDesc/AArch64MCTargetDesc.cpp | 25 +++ .../MCTargetDesc/AArch64MCTargetDesc.h | 3 + .../AArch64/misched-predicate-virtreg.mir | 36 +++++ 7 files changed, 125 insertions(+), 142 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 153b176ad4189..b2689e900f625 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3105,6 +3105,51 @@ bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { return isPreLd(MI) || isPreSt(MI); } +static const TargetRegisterClass *getRegClass(const MachineInstr &MI, + Register Reg) { + if (MI.getParent() == nullptr) + return nullptr; + const MachineFunction *MF = MI.getParent()->getParent(); + return MF ? 
MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; +} + +bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { + auto IsQFPR = [&](const MachineOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + if (Reg.isPhysical()) + return AArch64::FPR128RegClass.contains(Reg); + const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); + return TRC == &AArch64::FPR128RegClass || + TRC == &AArch64::FPR128_loRegClass; + }; + return llvm::any_of(MI.operands(), IsQFPR); +} + +bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { + auto IsFPR = [&](const MachineOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + if (Reg.isPhysical()) + return AArch64::FPR128RegClass.contains(Reg) || + AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR32RegClass.contains(Reg) || + AArch64::FPR16RegClass.contains(Reg) || + AArch64::FPR8RegClass.contains(Reg); + + const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); + return TRC == &AArch64::FPR128RegClass || + TRC == &AArch64::FPR128_loRegClass || + TRC == &AArch64::FPR64RegClass || + TRC == &AArch64::FPR64_loRegClass || + TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || + TRC == &AArch64::FPR8RegClass; + }; + return llvm::any_of(MI.operands(), IsFPR); +} + // Scale the unscaled offsets. Returns false if the unscaled offset can't be // scaled. static bool scaleOffset(unsigned Opc, int64_t &Offset) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 75d2eb0169ea9..b522230496d25 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -103,6 +103,12 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { /// Returns whether the instruction is a pre-indexed load/store. static bool isPreLdSt(const MachineInstr &MI); + /// Returns whether the instruction is FP or NEON. 
+ static bool isFpOrNEON(const MachineInstr &MI); + + /// Returns whether the instruction is in Q form (128 bit operands) + static bool isQForm(const MachineInstr &MI); + /// Returns the index for the immediate for a given instruction. static unsigned getLoadStoreImmIdx(unsigned Opc); diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td index fcda2394bacf7..ee7cc1f5095b5 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredExynos.td @@ -109,10 +109,7 @@ def ExynosScaledIdxFn : TIIPredicate<"isExynosScaledAddr", def ExynosScaledIdxPred : MCSchedPredicate; // Identify FP instructions. -def ExynosFPPred : MCSchedPredicate>; +def ExynosFPPred : MCSchedPredicate; // Identify 128-bit NEON instructions. def ExynosQFormPred : MCSchedPredicate; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td index fc13b23b4cf89..5402b8bf09e56 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td @@ -59,146 +59,17 @@ foreach I = {0-3, 8} in { } // Generic predicates. - -// Identify whether an instruction is the 16-bit NEON form based on its result. 
-def CheckHForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, H0>, - CheckRegOperand<0, H1>, - CheckRegOperand<0, H2>, - CheckRegOperand<0, H3>, - CheckRegOperand<0, H4>, - CheckRegOperand<0, H5>, - CheckRegOperand<0, H6>, - CheckRegOperand<0, H7>, - CheckRegOperand<0, H8>, - CheckRegOperand<0, H9>, - CheckRegOperand<0, H10>, - CheckRegOperand<0, H11>, - CheckRegOperand<0, H12>, - CheckRegOperand<0, H13>, - CheckRegOperand<0, H14>, - CheckRegOperand<0, H15>, - CheckRegOperand<0, H16>, - CheckRegOperand<0, H17>, - CheckRegOperand<0, H18>, - CheckRegOperand<0, H19>, - CheckRegOperand<0, H20>, - CheckRegOperand<0, H21>, - CheckRegOperand<0, H22>, - CheckRegOperand<0, H23>, - CheckRegOperand<0, H24>, - CheckRegOperand<0, H25>, - CheckRegOperand<0, H26>, - CheckRegOperand<0, H27>, - CheckRegOperand<0, H28>, - CheckRegOperand<0, H29>, - CheckRegOperand<0, H30>, - CheckRegOperand<0, H31>]>]>; - -// Identify whether an instruction is the 32-bit NEON form based on its result. -def CheckSForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, S0>, - CheckRegOperand<0, S1>, - CheckRegOperand<0, S2>, - CheckRegOperand<0, S3>, - CheckRegOperand<0, S4>, - CheckRegOperand<0, S5>, - CheckRegOperand<0, S6>, - CheckRegOperand<0, S7>, - CheckRegOperand<0, S8>, - CheckRegOperand<0, S9>, - CheckRegOperand<0, S10>, - CheckRegOperand<0, S11>, - CheckRegOperand<0, S12>, - CheckRegOperand<0, S13>, - CheckRegOperand<0, S14>, - CheckRegOperand<0, S15>, - CheckRegOperand<0, S16>, - CheckRegOperand<0, S17>, - CheckRegOperand<0, S18>, - CheckRegOperand<0, S19>, - CheckRegOperand<0, S20>, - CheckRegOperand<0, S21>, - CheckRegOperand<0, S22>, - CheckRegOperand<0, S23>, - CheckRegOperand<0, S24>, - CheckRegOperand<0, S25>, - CheckRegOperand<0, S26>, - CheckRegOperand<0, S27>, - CheckRegOperand<0, S28>, - CheckRegOperand<0, S29>, - CheckRegOperand<0, S30>, - CheckRegOperand<0, S31>]>]>; - -// Identify whether an instruction is the 64-bit NEON form based on its 
result. -def CheckDForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, D0>, - CheckRegOperand<0, D1>, - CheckRegOperand<0, D2>, - CheckRegOperand<0, D3>, - CheckRegOperand<0, D4>, - CheckRegOperand<0, D5>, - CheckRegOperand<0, D6>, - CheckRegOperand<0, D7>, - CheckRegOperand<0, D8>, - CheckRegOperand<0, D9>, - CheckRegOperand<0, D10>, - CheckRegOperand<0, D11>, - CheckRegOperand<0, D12>, - CheckRegOperand<0, D13>, - CheckRegOperand<0, D14>, - CheckRegOperand<0, D15>, - CheckRegOperand<0, D16>, - CheckRegOperand<0, D17>, - CheckRegOperand<0, D18>, - CheckRegOperand<0, D19>, - CheckRegOperand<0, D20>, - CheckRegOperand<0, D21>, - CheckRegOperand<0, D22>, - CheckRegOperand<0, D23>, - CheckRegOperand<0, D24>, - CheckRegOperand<0, D25>, - CheckRegOperand<0, D26>, - CheckRegOperand<0, D27>, - CheckRegOperand<0, D28>, - CheckRegOperand<0, D29>, - CheckRegOperand<0, D30>, - CheckRegOperand<0, D31>]>]>; +// Identify whether an instruction is NEON or floating point +def CheckFpOrNEON : CheckFunctionPredicateWithTII< + "AArch64_MC::isFpOrNEON", + "AArch64InstrInfo::isFpOrNEON" +>; // Identify whether an instruction is the 128-bit NEON form based on its result. 
-def CheckQForm : CheckAll<[CheckIsRegOperand<0>, - CheckAny<[CheckRegOperand<0, Q0>, - CheckRegOperand<0, Q1>, - CheckRegOperand<0, Q2>, - CheckRegOperand<0, Q3>, - CheckRegOperand<0, Q4>, - CheckRegOperand<0, Q5>, - CheckRegOperand<0, Q6>, - CheckRegOperand<0, Q7>, - CheckRegOperand<0, Q8>, - CheckRegOperand<0, Q9>, - CheckRegOperand<0, Q10>, - CheckRegOperand<0, Q11>, - CheckRegOperand<0, Q12>, - CheckRegOperand<0, Q13>, - CheckRegOperand<0, Q14>, - CheckRegOperand<0, Q15>, - CheckRegOperand<0, Q16>, - CheckRegOperand<0, Q17>, - CheckRegOperand<0, Q18>, - CheckRegOperand<0, Q19>, - CheckRegOperand<0, Q20>, - CheckRegOperand<0, Q21>, - CheckRegOperand<0, Q22>, - CheckRegOperand<0, Q23>, - CheckRegOperand<0, Q24>, - CheckRegOperand<0, Q25>, - CheckRegOperand<0, Q26>, - CheckRegOperand<0, Q27>, - CheckRegOperand<0, Q28>, - CheckRegOperand<0, Q29>, - CheckRegOperand<0, Q30>, - CheckRegOperand<0, Q31>]>]>; +def CheckQForm : CheckFunctionPredicateWithTII< + "AArch64_MC::isQForm", + "AArch64InstrInfo::isQForm" +>; // Identify arithmetic instructions with extend. 
def IsArithExtOp : CheckOpcode<[ADDWrx, ADDXrx, ADDSWrx, ADDSXrx, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index c1186ae804d2f..61ec1de55b9cb 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -243,6 +243,31 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { MRI->mapLLVMRegToCVReg(I.Reg, static_cast(I.CVReg)); } +bool AArch64_MC::isQForm(const MCInst &MI, const MCInstrInfo *MCII) { + const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; + return llvm::any_of(MI, [](const MCOperand &Op) { + return Op.isReg() && FPR128.contains(Op.getReg()); + }); +} + +bool AArch64_MC::isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII) { + const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; + const auto &FPR64 = AArch64MCRegisterClasses[AArch64::FPR64RegClassID]; + const auto &FPR32 = AArch64MCRegisterClasses[AArch64::FPR32RegClassID]; + const auto &FPR16 = AArch64MCRegisterClasses[AArch64::FPR16RegClassID]; + const auto &FPR8 = AArch64MCRegisterClasses[AArch64::FPR8RegClassID]; + + auto IsFPR = [&](const MCOperand &Op) { + if (!Op.isReg()) + return false; + auto Reg = Op.getReg(); + return FPR128.contains(Reg) || FPR64.contains(Reg) || FPR32.contains(Reg) || + FPR16.contains(Reg) || FPR8.contains(Reg); + }; + + return llvm::any_of(MI, IsFPR); +} + static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) { MCRegisterInfo *X = new MCRegisterInfo(); InitAArch64MCRegisterInfo(X, AArch64::LR); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 436648b67f8b7..ad2dc1027a5ee 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -22,6 +22,7 @@ class 
formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; +class MCInst; class MCInstrInfo; class MCInstPrinter; class MCRegisterInfo; @@ -59,6 +60,8 @@ MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, namespace AArch64_MC { void initLLVMToCVRegMapping(MCRegisterInfo *MRI); +bool isQForm(const MCInst &MI, const MCInstrInfo *MCII); +bool isFpOrNEON(const MCInst &MI, const MCInstrInfo *MCII); } } // End llvm namespace diff --git a/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir new file mode 100644 index 0000000000000..95e58a48b424b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir @@ -0,0 +1,36 @@ +# RUN: llc -mcpu=exynos-m5 -mtriple=aarch64 -enable-misched -run-pass=machine-scheduler -debug-only=machine-scheduler %s -o /dev/null 2>&1 | FileCheck %s + +# CHECK-LABEL: ********** MI Scheduling ********** +# CHECK: SU(0): %0:fpr128 = COPY $q1 +# CHECK-NEXT: # preds left : 0 +# CHECK-NEXT: # succs left : 1 +# CHECK-NEXT: # rdefs left : 0 +# CHECK-NEXT: Latency : 2 +# CHECK-NEXT: Depth : 0 +# CHECK-NEXT: Height : 12 +# CHECK-NEXT: Successors: +# CHECK-NEXT: SU(1): Data Latency=2 Reg=%0 +# CHECK-NEXT: Single Issue : false; +# CHECK-NEXT: SU(1): %1:fpr32 = FMINVv4i32v %0:fpr128 +# CHECK-NEXT: # preds left : 1 +# CHECK-NEXT: # succs left : 1 +# CHECK-NEXT: # rdefs left : 0 +# CHECK-NEXT: Latency : 8 +# CHECK-NEXT: Depth : 2 +# CHECK-NEXT: Height : 10 +# CHECK-NEXT: Predecessors: +# CHECK-NEXT: SU(0): Data Latency=2 Reg=%0 +# CHECK-NEXT: Successors: +# CHECK-NEXT: SU(2): Data Latency=8 Reg=%1 +# CHECK-NEXT: Single Issue : false; + +name: test_qform_virtreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $s0, $q1 + %0:fpr128 = COPY $q1 + %1:fpr32 = FMINVv4i32v %0:fpr128 + $s0 = COPY %1 + RET_ReallyLR implicit $s0 + From 37fa99eda0f5e6d5b15f6bb726d3bcbeeed30c50 Mon Sep 17 00:00:00 2001 From: Pavel Kosov Date: Thu, 17 Feb 2022 13:41:57 +0300 
Subject: [PATCH 066/748] [SchedModels][CortexA55] Add ASIMD integer instructions Depends on D114642 Original review https://reviews.llvm.org/D112201 OS Laboratory. Huawei Russian Research Institute. Saint-Petersburg Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D117003 --- llvm/lib/Target/AArch64/AArch64SchedA55.td | 134 ++- .../CostModel/AArch64/vector-select.ll | 2 +- .../AArch64/GlobalISel/combine-udiv.ll | 176 ++-- llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 4 +- llvm/test/CodeGen/AArch64/active_lane_mask.ll | 26 +- .../AArch64/addsub-constant-folding.ll | 4 +- .../CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll | 8 +- llvm/test/CodeGen/AArch64/arm64-fcopysign.ll | 8 +- .../test/CodeGen/AArch64/arm64-sli-sri-opt.ll | 4 +- .../CodeGen/AArch64/arm64-subvector-extend.ll | 224 ++--- llvm/test/CodeGen/AArch64/arm64-vhadd.ll | 8 +- llvm/test/CodeGen/AArch64/cmp-select-sign.ll | 18 +- llvm/test/CodeGen/AArch64/dag-numsignbits.ll | 6 +- .../div-rem-pair-recomposition-signed.ll | 6 +- .../div-rem-pair-recomposition-unsigned.ll | 6 +- .../test/CodeGen/AArch64/expand-vector-rot.ll | 12 +- llvm/test/CodeGen/AArch64/f16-instructions.ll | 12 +- llvm/test/CodeGen/AArch64/fcopysign.ll | 4 +- .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 52 +- .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 16 +- llvm/test/CodeGen/AArch64/funnel-shift-rot.ll | 4 +- .../insert-subvector-res-legalization.ll | 24 +- llvm/test/CodeGen/AArch64/lowerMUL-newload.ll | 8 +- llvm/test/CodeGen/AArch64/minmax-of-minmax.ll | 64 +- llvm/test/CodeGen/AArch64/minmax.ll | 8 +- .../CodeGen/AArch64/overeager_mla_fusing.ll | 2 +- .../AArch64/ragreedy-local-interval-cost.ll | 4 +- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 28 +- llvm/test/CodeGen/AArch64/sat-add.ll | 8 +- .../CodeGen/AArch64/selectcc-to-shiftand.ll | 4 +- llvm/test/CodeGen/AArch64/signbit-shift.ll | 8 +- .../CodeGen/AArch64/sink-addsub-of-const.ll | 48 +- llvm/test/CodeGen/AArch64/sinksplat.ll | 4 +- 
.../CodeGen/AArch64/sitofp-fixed-legal.ll | 6 +- .../AArch64/srem-seteq-illegal-types.ll | 62 +- .../AArch64/srem-seteq-vec-nonsplat.ll | 176 ++-- .../CodeGen/AArch64/srem-seteq-vec-splat.ll | 66 +- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 28 +- .../AArch64/sve-fixed-length-int-div.ll | 16 +- .../AArch64/sve-fixed-length-int-mulh.ll | 8 +- .../AArch64/sve-fixed-length-int-rem.ll | 16 +- .../sve-fixed-length-masked-scatter.ll | 21 +- llvm/test/CodeGen/AArch64/sve-vscale-attr.ll | 8 +- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 24 +- .../AArch64/urem-seteq-illegal-types.ll | 20 +- .../AArch64/urem-seteq-vec-nonsplat.ll | 204 ++-- .../CodeGen/AArch64/urem-seteq-vec-nonzero.ll | 18 +- .../CodeGen/AArch64/urem-seteq-vec-splat.ll | 58 +- .../AArch64/urem-seteq-vec-tautological.ll | 6 +- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 24 +- llvm/test/CodeGen/AArch64/vec_cttz.ll | 4 +- llvm/test/CodeGen/AArch64/vec_uaddo.ll | 74 +- llvm/test/CodeGen/AArch64/vec_umulo.ll | 118 ++- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 162 +-- .../AArch64/vecreduce-and-legalization.ll | 6 +- .../AArch64/vecreduce-fmax-legalization.ll | 6 +- .../AArch64/vecreduce-fmin-legalization.ll | 6 +- llvm/test/CodeGen/AArch64/vector-fcopysign.ll | 74 +- .../test/CodeGen/AArch64/vselect-constants.ll | 30 +- .../AArch64/Cortex/A55-neon-instructions.s | 950 +++++++++--------- 60 files changed, 1630 insertions(+), 1505 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td index 009219ce3c54b..3543ff3ddfc3b 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA55.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td @@ -149,8 +149,36 @@ def : WriteRes { let Latency = 3; } def : WriteRes { let Latency = 4; } def : WriteRes { let Latency = 3; } def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; let BeginGroup = 1; } + +// NEON +class CortexA55WriteVd : SchedWriteRes<[res]> { + let Latency = n; 
+} +class CortexA55WriteVq : SchedWriteRes<[res, res]> { + let Latency = n; + let BeginGroup = 1; +} +class CortexA55WriteVqL : SchedWriteRes<[res, res, res, res]> { + let Latency = n; + let BeginGroup = 1; +} +def CortexA55WriteDotScVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteDotVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteDotVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaLVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaIxVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteMlaVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>; +def CortexA55WriteAluVd_3 : CortexA55WriteVd<3, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_3 : CortexA55WriteVq<3, CortexA55UnitFPALU>; +def CortexA55WriteAluVd_2 : CortexA55WriteVd<2, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_2 : CortexA55WriteVq<2, CortexA55UnitFPALU>; +def CortexA55WriteAluVd_1 : CortexA55WriteVd<1, CortexA55UnitFPALU>; +def CortexA55WriteAluVq_1 : CortexA55WriteVq<1, CortexA55UnitFPALU>; +def CortexA55WriteAluVqL_4 : CortexA55WriteVqL<4, CortexA55UnitFPALU>; +def : SchedAlias>; +def : SchedAlias>; // FP ALU specific new schedwrite definitions def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;} @@ -229,6 +257,13 @@ def : ReadAdvance; +// NEON ALU/MAC forwarding paths +def CortexA55ReadMla : SchedReadAdvance<3, [CortexA55WriteMlaVd_4, CortexA55WriteMlaVq_4]>; +def CortexA55ReadMlaIx : SchedReadAdvance<3, [CortexA55WriteMlaIxVq_4]>; +def CortexA55ReadMlaL : SchedReadAdvance<3, [CortexA55WriteMlaLVq_4]>; +def CortexA55ReadDot : SchedReadAdvance<3, [CortexA55WriteDotVd_4, CortexA55WriteDotVq_4]>; +def CortexA55ReadDotSc : SchedReadAdvance<3, [CortexA55WriteDotScVq_4]>; + 
//===----------------------------------------------------------------------===// // Subtarget-specific InstRWs. @@ -358,4 +393,99 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +// 4.15. Advanced SIMD integer instructions +// ASIMD absolute diff +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]ABDv(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDv(16i8|4i32|8i16)")>; +// ASIMD absolute diff accum +def : InstRW<[CortexA55WriteAluVqL_4], (instregex "[SU]ABAL?v")>; +// ASIMD absolute diff long +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDLv")>; +// ASIMD arith #1 +def : InstRW<[CortexA55WriteAluVd_2], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)", + "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)", + "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>; +// ASIMD arith #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "ABSv(1i64|2i32|4i16|8i8)$", + "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$", + "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$", + "ADDPv(2i32|4i16|8i8)$")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "ABSv(2i64|4i32|8i16|16i8)$", + "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$", + "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$", + "ADDPv(16i8|2i64|4i32|8i16)$")>; +// ASIMD arith #3 +def : InstRW<[CortexA55WriteAluVq_3], (instregex "SADDLv", "UADDLv", "SADDWv", + "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>; +// ASIMD arith #5 +def : InstRW<[CortexA55WriteAluVqL_4], (instregex "RADDHNv", "RSUBHNv")>; +// ASIMD arith, reduce +def : InstRW<[CortexA55WriteAluVq_3], (instregex "ADDVv", "SADDLVv", "UADDLVv")>; +// ASIMD compare #1 +def : 
InstRW<[CortexA55WriteAluVd_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>; +// ASIMD compare #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>; +// ASIMD logical $1 +def : InstRW<[CortexA55WriteAluVd_1], (instregex "(AND|EOR|NOT|ORN)v8i8", + "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>; +def : InstRW<[CortexA55WriteAluVq_1], (instregex "(AND|EOR|NOT|ORN)v16i8", + "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>; +// ASIMD max/min, basic +def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; +// SIMD max/min, reduce +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU](MAX|MIN)Vv")>; +// ASIMD multiply, by element +def : InstRW<[CortexA55WriteAluVq_4], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$", + "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>; +// ASIMD multiply +def : InstRW<[CortexA55WriteAluVd_3], (instrs PMULv8i8)>; +def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULv16i8)>; +// ASIMD multiply accumulate +def : InstRW<[CortexA55WriteMlaVd_4, CortexA55ReadMla], (instregex "ML[AS]v(2i32|4i16|8i8)$")>; +def : InstRW<[CortexA55WriteMlaVq_4, CortexA55ReadMla], (instregex "ML[AS]v(16i8|4i32|8i16)$")>; +def : InstRW<[CortexA55WriteMlaIxVq_4, CortexA55ReadMlaIx], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>; +// ASIMD multiply accumulate half +def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQRDML[AS]H[vi]")>; +// ASIMD multiply accumulate long +def : InstRW<[CortexA55WriteMlaLVq_4, CortexA55ReadMlaL], (instregex "[SU]ML[AS]Lv")>; +// ASIMD multiply accumulate long #2 +def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQDML[AS]L[iv]")>; +// ASIMD dot product +def : 
InstRW<[CortexA55WriteDotVd_4, CortexA55ReadDot], (instregex "[SU]DOTv8i8")>; +def : InstRW<[CortexA55WriteDotVq_4, CortexA55ReadDot], (instregex "[SU]DOTv16i8")>; +// ASIMD dot product, by scalar +def : InstRW<[CortexA55WriteDotScVq_4, CortexA55ReadDotSc], (instregex "[SU]DOTlanev")>; +// ASIMD multiply long +def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]MULLv", "SQDMULL[iv]")>; +// ASIMD polynomial (8x8) multiply long +def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULLv8i8, PMULLv16i8)>; +// ASIMD pairwise add and accumulate +def : InstRW<[CortexA55WriteAluVqL_4], (instregex "[SU]ADALPv")>; +// ASIMD shift accumulate +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; +// ASIMD shift accumulate #2 +def : InstRW<[CortexA55WriteAluVqL_4], (instregex "[SU]RSRA[vd]")>; +// ASIMD shift by immed +def : InstRW<[CortexA55WriteAluVd_2], (instregex "SHLd$", "SHLv", + "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>; +// ASIMD shift by immed +// SXTL and UXTL are aliases for SHLL +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[US]?SHLLv")>; +// ASIMD shift by immed #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)", + "RSHRNv(2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHRv(16i8|2i64|4i32|8i16)", + "RSHRNv(16i8|4i32|8i16)")>; +// ASIMD shift by register +def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>; +// ASIMD shift by register #2 +def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>; +def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>; + } diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll index e2d718c62d881..d43b82a8ea13d 100644 --- 
a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll @@ -121,11 +121,11 @@ define <2 x i64> @v2i64_select_sle(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CODE-LABEL: v3i64_select_sle ; CODE: bb.0 ; CODE: mov -; CODE: ldr ; CODE: mov ; CODE: mov ; CODE: cmge ; CODE: cmge +; CODE: ldr ; CODE: bif ; CODE: bif ; CODE: ext diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll index 80222e5d5b631..da06d82f24301 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll @@ -35,11 +35,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; SDAG-LABEL: combine_vec_udiv_nonuniform: ; SDAG: // %bb.0: ; SDAG-NEXT: adrp x8, .LCPI1_0 +; SDAG-NEXT: adrp x9, .LCPI1_1 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; SDAG-NEXT: adrp x8, .LCPI1_1 -; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h -; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] ; SDAG-NEXT: adrp x8, .LCPI1_2 +; SDAG-NEXT: ldr q2, [x9, :lo12:.LCPI1_1] +; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h ; SDAG-NEXT: umull2 v3.4s, v1.8h, v2.8h ; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h ; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_2] @@ -48,41 +48,41 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h ; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h ; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h +; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_3] ; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h ; SDAG-NEXT: add v0.8h, v0.8h, v1.8h -; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_3] -; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h +; SDAG-NEXT: ushl v0.8h, v0.8h, v2.8h ; SDAG-NEXT: ret ; ; GISEL-LABEL: combine_vec_udiv_nonuniform: ; GISEL: // %bb.0: ; GISEL-NEXT: adrp x8, .LCPI1_4 -; GISEL-NEXT: adrp x10, .LCPI1_0 -; GISEL-NEXT: adrp x9, .LCPI1_1 +; GISEL-NEXT: adrp x9, .LCPI1_0 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_4] ; GISEL-NEXT: adrp x8, .LCPI1_3 -; 
GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI1_0] -; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI1_1] +; GISEL-NEXT: ldr q5, [x9, :lo12:.LCPI1_0] ; GISEL-NEXT: neg v1.8h, v1.8h ; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_3] ; GISEL-NEXT: adrp x8, .LCPI1_2 ; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h ; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h ; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h -; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h -; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_2] ; GISEL-NEXT: adrp x8, .LCPI1_5 -; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h -; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h -; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h +; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h +; GISEL-NEXT: sub v3.8h, v0.8h, v1.8h +; GISEL-NEXT: umull2 v4.4s, v3.8h, v2.8h +; GISEL-NEXT: umull v2.4s, v3.4h, v2.4h ; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_5] +; GISEL-NEXT: adrp x8, .LCPI1_1 ; GISEL-NEXT: cmeq v3.8h, v3.8h, v5.8h ; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h -; GISEL-NEXT: neg v4.8h, v6.8h +; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI1_1] +; GISEL-NEXT: shl v3.8h, v3.8h, #15 ; GISEL-NEXT: add v1.8h, v2.8h, v1.8h -; GISEL-NEXT: shl v2.8h, v3.8h, #15 -; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h -; GISEL-NEXT: sshr v2.8h, v2.8h, #15 +; GISEL-NEXT: neg v2.8h, v4.8h +; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h +; GISEL-NEXT: sshr v2.8h, v3.8h, #15 ; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b ; GISEL-NEXT: ret %1 = udiv <8 x i16> %x, @@ -93,15 +93,15 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) { ; SDAG-LABEL: combine_vec_udiv_nonuniform2: ; SDAG: // %bb.0: ; SDAG-NEXT: adrp x8, .LCPI2_0 +; SDAG-NEXT: adrp x9, .LCPI2_1 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; SDAG-NEXT: adrp x8, .LCPI2_1 -; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h -; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] ; SDAG-NEXT: adrp x8, .LCPI2_2 -; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h -; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h +; SDAG-NEXT: ldr q2, [x9, :lo12:.LCPI2_1] +; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h +; SDAG-NEXT: umull2 v1.4s, 
v0.8h, v2.8h +; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h +; SDAG-NEXT: uzp2 v0.8h, v0.8h, v1.8h ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_2] -; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h ; SDAG-NEXT: ret ; @@ -112,21 +112,21 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) { ; GISEL-NEXT: adrp x10, .LCPI2_0 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_3] ; GISEL-NEXT: adrp x8, .LCPI2_2 -; GISEL-NEXT: ldr q3, [x9, :lo12:.LCPI2_4] ; GISEL-NEXT: ldr q4, [x10, :lo12:.LCPI2_0] ; GISEL-NEXT: neg v1.8h, v1.8h ; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_2] ; GISEL-NEXT: adrp x8, .LCPI2_1 -; GISEL-NEXT: cmeq v3.8h, v3.8h, v4.8h ; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h -; GISEL-NEXT: shl v3.8h, v3.8h, #15 -; GISEL-NEXT: umull2 v5.4s, v1.8h, v2.8h +; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h +; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI2_1] ; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h -; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] -; GISEL-NEXT: neg v2.8h, v2.8h -; GISEL-NEXT: uzp2 v1.8h, v1.8h, v5.8h -; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h -; GISEL-NEXT: sshr v2.8h, v3.8h, #15 +; GISEL-NEXT: ldr q2, [x9, :lo12:.LCPI2_4] +; GISEL-NEXT: cmeq v2.8h, v2.8h, v4.8h +; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h +; GISEL-NEXT: neg v3.8h, v5.8h +; GISEL-NEXT: shl v2.8h, v2.8h, #15 +; GISEL-NEXT: ushl v1.8h, v1.8h, v3.8h +; GISEL-NEXT: sshr v2.8h, v2.8h, #15 ; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b ; GISEL-NEXT: ret %1 = udiv <8 x i16> %x, @@ -151,21 +151,21 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) { ; GISEL-LABEL: combine_vec_udiv_nonuniform3: ; GISEL: // %bb.0: ; GISEL-NEXT: adrp x8, .LCPI3_2 -; GISEL-NEXT: adrp x10, .LCPI3_0 -; GISEL-NEXT: adrp x9, .LCPI3_1 +; GISEL-NEXT: adrp x9, .LCPI3_0 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_2] ; GISEL-NEXT: adrp x8, .LCPI3_3 -; GISEL-NEXT: ldr q3, [x10, :lo12:.LCPI3_0] -; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI3_1] +; GISEL-NEXT: ldr q3, [x9, :lo12:.LCPI3_0] ; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h ; GISEL-NEXT: umull 
v1.4s, v0.4h, v1.4h ; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h ; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] +; GISEL-NEXT: adrp x8, .LCPI3_1 ; GISEL-NEXT: cmeq v2.8h, v2.8h, v3.8h -; GISEL-NEXT: sub v5.8h, v0.8h, v1.8h -; GISEL-NEXT: neg v3.8h, v4.8h +; GISEL-NEXT: sub v4.8h, v0.8h, v1.8h +; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI3_1] ; GISEL-NEXT: shl v2.8h, v2.8h, #15 -; GISEL-NEXT: usra v1.8h, v5.8h, #1 +; GISEL-NEXT: usra v1.8h, v4.8h, #1 +; GISEL-NEXT: neg v3.8h, v3.8h ; GISEL-NEXT: sshr v2.8h, v2.8h, #15 ; GISEL-NEXT: ushl v1.8h, v1.8h, v3.8h ; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b @@ -178,41 +178,41 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SDAG-LABEL: combine_vec_udiv_nonuniform4: ; SDAG: // %bb.0: ; SDAG-NEXT: adrp x8, .LCPI4_0 -; SDAG-NEXT: adrp x9, .LCPI4_3 +; SDAG-NEXT: adrp x9, .LCPI4_2 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] ; SDAG-NEXT: adrp x8, .LCPI4_1 -; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_3] +; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_2] ; SDAG-NEXT: umull2 v2.8h, v0.16b, v1.16b ; SDAG-NEXT: umull v1.8h, v0.8b, v1.8b -; SDAG-NEXT: and v0.16b, v0.16b, v3.16b ; SDAG-NEXT: uzp2 v1.16b, v1.16b, v2.16b ; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] -; SDAG-NEXT: adrp x8, .LCPI4_2 +; SDAG-NEXT: adrp x8, .LCPI4_3 ; SDAG-NEXT: ushl v1.16b, v1.16b, v2.16b -; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_2] -; SDAG-NEXT: and v1.16b, v1.16b, v2.16b +; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] +; SDAG-NEXT: and v1.16b, v1.16b, v3.16b +; SDAG-NEXT: and v0.16b, v0.16b, v2.16b ; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b ; SDAG-NEXT: ret ; ; GISEL-LABEL: combine_vec_udiv_nonuniform4: ; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, .LCPI4_2 +; GISEL-NEXT: adrp x9, .LCPI4_0 +; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_2] ; GISEL-NEXT: adrp x8, .LCPI4_3 -; GISEL-NEXT: adrp x9, .LCPI4_2 -; GISEL-NEXT: adrp x10, .LCPI4_1 -; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_3] -; GISEL-NEXT: adrp x8, .LCPI4_0 -; GISEL-NEXT: ldr q2, [x9, :lo12:.LCPI4_2] -; GISEL-NEXT: ldr q3, [x10, 
:lo12:.LCPI4_1] -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_0] -; GISEL-NEXT: umull2 v5.8h, v0.16b, v2.16b -; GISEL-NEXT: umull v2.8h, v0.8b, v2.8b -; GISEL-NEXT: cmeq v1.16b, v1.16b, v4.16b -; GISEL-NEXT: neg v3.16b, v3.16b -; GISEL-NEXT: uzp2 v2.16b, v2.16b, v5.16b -; GISEL-NEXT: shl v1.16b, v1.16b, #7 -; GISEL-NEXT: ushl v2.16b, v2.16b, v3.16b -; GISEL-NEXT: sshr v1.16b, v1.16b, #7 -; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b +; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI4_0] +; GISEL-NEXT: umull2 v2.8h, v0.16b, v1.16b +; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_3] +; GISEL-NEXT: umull v1.8h, v0.8b, v1.8b +; GISEL-NEXT: adrp x8, .LCPI4_1 +; GISEL-NEXT: cmeq v3.16b, v3.16b, v4.16b +; GISEL-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] +; GISEL-NEXT: shl v3.16b, v3.16b, #7 +; GISEL-NEXT: neg v2.16b, v2.16b +; GISEL-NEXT: ushl v1.16b, v1.16b, v2.16b +; GISEL-NEXT: sshr v2.16b, v3.16b, #7 +; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b ; GISEL-NEXT: ret %div = udiv <16 x i8> %x, ret <16 x i8> %div @@ -222,54 +222,54 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; SDAG-LABEL: pr38477: ; SDAG: // %bb.0: ; SDAG-NEXT: adrp x8, .LCPI5_0 -; SDAG-NEXT: adrp x9, .LCPI5_4 +; SDAG-NEXT: adrp x9, .LCPI5_3 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] ; SDAG-NEXT: adrp x8, .LCPI5_1 ; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h -; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_1] ; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h -; SDAG-NEXT: adrp x8, .LCPI5_2 ; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; SDAG-NEXT: sub v2.8h, v0.8h, v1.8h -; SDAG-NEXT: umull2 v4.4s, v2.8h, v3.8h -; SDAG-NEXT: umull v2.4s, v2.4h, v3.4h -; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI5_4] -; SDAG-NEXT: and v0.16b, v0.16b, v3.16b +; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] +; SDAG-NEXT: adrp x8, .LCPI5_2 +; SDAG-NEXT: sub v3.8h, v0.8h, v1.8h +; SDAG-NEXT: umull2 v4.4s, v3.8h, v2.8h +; SDAG-NEXT: umull v2.4s, v3.4h, v2.4h +; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] +; SDAG-NEXT: adrp x8, .LCPI5_4 ; SDAG-NEXT: uzp2 v2.8h, v2.8h, v4.8h 
+; SDAG-NEXT: ldr q4, [x9, :lo12:.LCPI5_3] ; SDAG-NEXT: add v1.8h, v2.8h, v1.8h -; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_2] -; SDAG-NEXT: adrp x8, .LCPI5_3 -; SDAG-NEXT: ushl v1.8h, v1.8h, v2.8h -; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_3] -; SDAG-NEXT: and v1.16b, v1.16b, v2.16b +; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_4] +; SDAG-NEXT: ushl v1.8h, v1.8h, v3.8h +; SDAG-NEXT: and v0.16b, v0.16b, v2.16b +; SDAG-NEXT: and v1.16b, v1.16b, v4.16b ; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b ; SDAG-NEXT: ret ; ; GISEL-LABEL: pr38477: ; GISEL: // %bb.0: ; GISEL-NEXT: adrp x8, .LCPI5_3 -; GISEL-NEXT: adrp x10, .LCPI5_0 -; GISEL-NEXT: adrp x9, .LCPI5_1 +; GISEL-NEXT: adrp x9, .LCPI5_0 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_3] ; GISEL-NEXT: adrp x8, .LCPI5_2 -; GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI5_0] -; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI5_1] +; GISEL-NEXT: ldr q5, [x9, :lo12:.LCPI5_0] ; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h -; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] ; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h -; GISEL-NEXT: adrp x8, .LCPI5_4 ; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h -; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h -; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI5_2] +; GISEL-NEXT: adrp x8, .LCPI5_4 +; GISEL-NEXT: sub v3.8h, v0.8h, v1.8h +; GISEL-NEXT: umull2 v4.4s, v3.8h, v2.8h +; GISEL-NEXT: umull v2.4s, v3.4h, v2.4h ; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_4] +; GISEL-NEXT: adrp x8, .LCPI5_1 ; GISEL-NEXT: cmeq v3.8h, v3.8h, v5.8h ; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h -; GISEL-NEXT: neg v4.8h, v6.8h +; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1] +; GISEL-NEXT: shl v3.8h, v3.8h, #15 ; GISEL-NEXT: add v1.8h, v2.8h, v1.8h -; GISEL-NEXT: shl v2.8h, v3.8h, #15 -; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h -; GISEL-NEXT: sshr v2.8h, v2.8h, #15 +; GISEL-NEXT: neg v2.8h, v4.8h +; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h +; GISEL-NEXT: sshr v2.8h, v3.8h, #15 ; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b ; GISEL-NEXT: ret %1 = 
udiv <8 x i16> %a0, diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index f7247be3c0bf7..bc31d41a55f43 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -98,10 +98,10 @@ entry: define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) { ; CHECK-LABEL: dupsext_v2i8_v2i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: dup v1.2s, w8 ; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index 7b3bcacc4f3d0..11bcb783cb5cd 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -428,10 +428,10 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI24_0 ; CHECK-NEXT: dup v0.8b, w0 +; CHECK-NEXT: dup v2.8b, w1 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b -; CHECK-NEXT: dup v1.8b, w1 -; CHECK-NEXT: cmhi v0.8b, v1.8b, v0.8b +; CHECK-NEXT: cmhi v0.8b, v2.8b, v0.8b ; CHECK-NEXT: ret %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8 %index, i8 %TC) ret <8 x i1> %active.lane.mask @@ -440,16 +440,16 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) { define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_v4i1_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v0.4h, w0 ; CHECK-NEXT: adrp x8, .LCPI25_0 -; CHECK-NEXT: dup v2.4h, w1 +; CHECK-NEXT: dup v0.4h, w0 +; CHECK-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-NEXT: dup v3.4h, w1 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI25_0] ; CHECK-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-NEXT: bic v2.4h, #255, lsl #8 +; CHECK-NEXT: bic v3.4h, #255, lsl #8 ; 
CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: movi d1, #0xff00ff00ff00ff -; CHECK-NEXT: umin v0.4h, v0.4h, v1.4h -; CHECK-NEXT: cmhi v0.4h, v2.4h, v0.4h +; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h +; CHECK-NEXT: cmhi v0.4h, v3.4h, v0.4h ; CHECK-NEXT: ret %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC) ret <4 x i1> %active.lane.mask @@ -458,16 +458,16 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_v2i1_i8: ; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: movi d0, #0x0000ff000000ff ; CHECK-NEXT: dup v1.2s, w0 -; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: dup v3.2s, w1 -; CHECK-NEXT: and v1.8b, v1.8b, v0.8b ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: and v1.8b, v1.8b, v0.8b ; CHECK-NEXT: add v1.2s, v1.2s, v2.2s -; CHECK-NEXT: and v2.8b, v3.8b, v0.8b -; CHECK-NEXT: umin v0.2s, v1.2s, v0.2s -; CHECK-NEXT: cmhi v0.2s, v2.2s, v0.2s +; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s +; CHECK-NEXT: and v0.8b, v3.8b, v0.8b +; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC) ret <2 x i1> %active.lane.mask diff --git a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll index ee7be0f48a7e5..81b7991a7f86e 100644 --- a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll +++ b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll @@ -213,9 +213,9 @@ define <4 x i32> @vec_add_const_const_sub_extrause(<4 x i32> %arg) { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: bl vec_use -; CHECK-NEXT: mvni v0.4s, #5 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: mvni v0.4s, #5 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: 
ret @@ -290,9 +290,9 @@ define <4 x i32> @vec_sub_const_add_const_extrause(<4 x i32> %arg) { ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: bl vec_use -; CHECK-NEXT: mvni v0.4s, #5 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: mvni v0.4s, #5 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll index 7934e39b2b69f..0b1b581d77925 100644 --- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll +++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -68,16 +68,16 @@ define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { ; CHECK-LABEL: add_sub_su64: ; CHECK: // %bb.0: +; CHECK-NEXT: fmov d2, xzr ; CHECK-NEXT: add d0, d1, d0 -; CHECK-NEXT: fmov d1, xzr -; CHECK-NEXT: sub d0, d1, d0 +; CHECK-NEXT: sub d0, d2, d0 ; CHECK-NEXT: ret ; ; GENERIC-LABEL: add_sub_su64: ; GENERIC: // %bb.0: +; GENERIC-NEXT: fmov d2, xzr ; GENERIC-NEXT: add d0, d1, d0 -; GENERIC-NEXT: fmov d1, xzr -; GENERIC-NEXT: sub d0, d1, d0 +; GENERIC-NEXT: sub d0, d2, d0 ; GENERIC-NEXT: ret %vecext = extractelement <2 x i64> %a, i32 0 %vecext1 = extractelement <2 x i64> %b, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll index 4c90f93b235dd..17d937d1f3940 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll @@ -6,8 +6,8 @@ define float @test1(float %x, float %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-NEXT: ; kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-NEXT: ; kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: bif.16b 
v0, v1, v2 ; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0 @@ -55,10 +55,10 @@ define float @test4() nounwind { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; CHECK-NEXT: bl _bar -; CHECK-NEXT: mvni.4s v1, #128, lsl #24 ; CHECK-NEXT: fcvt s0, d0 -; CHECK-NEXT: fmov s2, #0.50000000 -; CHECK-NEXT: bsl.16b v1, v2, v0 +; CHECK-NEXT: fmov s1, #0.50000000 +; CHECK-NEXT: mvni.4s v2, #128, lsl #24 +; CHECK-NEXT: bif.16b v1, v0, v2 ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll index 764a6b307b179..870190807b0d9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll @@ -374,8 +374,8 @@ define void @testLeftBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) ; CHECK-LABEL: testLeftBad2x64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #10 -; CHECK-NEXT: movk x8, #1, lsl #48 ; CHECK-NEXT: shl.2d v1, v1, #48 +; CHECK-NEXT: movk x8, #1, lsl #48 ; CHECK-NEXT: dup.2d v2, x8 ; CHECK-NEXT: and.16b v0, v0, v2 ; CHECK-NEXT: orr.16b v0, v0, v1 @@ -405,8 +405,8 @@ define void @testRightBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest ; CHECK-LABEL: testRightBad2x64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #10 -; CHECK-NEXT: movk x8, #1, lsl #48 ; CHECK-NEXT: ushr.2d v1, v1, #48 +; CHECK-NEXT: movk x8, #1, lsl #48 ; CHECK-NEXT: dup.2d v2, x8 ; CHECK-NEXT: and.16b v0, v0, v2 ; CHECK-NEXT: orr.16b v0, v0, v1 diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll index 38d574213b9dc..50dda82c904b4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -177,12 +177,12 @@ define <4 x i64> @sext_v4i8_to_v4i64(<4 x i8> %v0) nounwind { ; CHECK-LABEL: sext_v4i8_to_v4i64: ; 
CHECK: // %bb.0: ; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: ushll2.2d v1, v0, #0 -; CHECK-NEXT: ushll.2d v0, v0, #0 -; CHECK-NEXT: shl.2d v1, v1, #56 +; CHECK-NEXT: ushll.2d v1, v0, #0 +; CHECK-NEXT: ushll2.2d v0, v0, #0 +; CHECK-NEXT: shl.2d v2, v1, #56 ; CHECK-NEXT: shl.2d v0, v0, #56 -; CHECK-NEXT: sshr.2d v1, v1, #56 -; CHECK-NEXT: sshr.2d v0, v0, #56 +; CHECK-NEXT: sshr.2d v1, v0, #56 +; CHECK-NEXT: sshr.2d v0, v2, #56 ; CHECK-NEXT: ret %r = sext <4 x i8> %v0 to <4 x i64> ret <4 x i64> %r @@ -192,12 +192,12 @@ define <8 x i64> @zext_v8i8_to_v8i64(<8 x i8> %v0) nounwind { ; CHECK-LABEL: zext_v8i8_to_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ushll.4s v2, v0, #0 -; CHECK-NEXT: ushll2.4s v4, v0, #0 -; CHECK-NEXT: ushll2.2d v1, v2, #0 -; CHECK-NEXT: ushll.2d v0, v2, #0 -; CHECK-NEXT: ushll2.2d v3, v4, #0 -; CHECK-NEXT: ushll.2d v2, v4, #0 +; CHECK-NEXT: ushll2.4s v2, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: ushll2.2d v3, v2, #0 +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 +; CHECK-NEXT: ushll.2d v2, v2, #0 ; CHECK-NEXT: ret %r = zext <8 x i8> %v0 to <8 x i64> ret <8 x i64> %r @@ -207,12 +207,12 @@ define <8 x i64> @sext_v8i8_to_v8i64(<8 x i8> %v0) nounwind { ; CHECK-LABEL: sext_v8i8_to_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll.8h v0, v0, #0 -; CHECK-NEXT: sshll.4s v2, v0, #0 -; CHECK-NEXT: sshll2.4s v4, v0, #0 -; CHECK-NEXT: sshll2.2d v1, v2, #0 -; CHECK-NEXT: sshll.2d v0, v2, #0 -; CHECK-NEXT: sshll2.2d v3, v4, #0 -; CHECK-NEXT: sshll.2d v2, v4, #0 +; CHECK-NEXT: sshll2.4s v2, v0, #0 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: sshll2.2d v3, v2, #0 +; CHECK-NEXT: sshll2.2d v1, v0, #0 +; CHECK-NEXT: sshll.2d v0, v0, #0 +; CHECK-NEXT: sshll.2d v2, v2, #0 ; CHECK-NEXT: ret %r = sext <8 x i8> %v0 to <8 x i64> ret <8 x i64> %r @@ -496,129 +496,129 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) { ; CHECK-NEXT: ldr w9, [sp, #64] ; CHECK-NEXT: ldr w10, [sp, #192] ; CHECK-NEXT: fmov 
s0, w8 -; CHECK-NEXT: ldr w8, [sp, #72] -; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: ldr w9, [sp, #200] -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: ldr w10, [sp, #328] +; CHECK-NEXT: ldr w8, [sp, #328] +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: ldr w9, [sp, #72] +; CHECK-NEXT: fmov s2, w10 +; CHECK-NEXT: ldr w10, [sp, #80] +; CHECK-NEXT: mov.b v0[1], w8 +; CHECK-NEXT: ldr w8, [sp, #200] +; CHECK-NEXT: mov.b v1[1], w9 +; CHECK-NEXT: ldr w9, [sp, #336] ; CHECK-NEXT: mov.b v3[1], w1 -; CHECK-NEXT: ldr w11, [sp, #344] +; CHECK-NEXT: ldr w11, [sp, #88] ; CHECK-NEXT: mov.b v2[1], w8 -; CHECK-NEXT: ldr w8, [sp, #336] -; CHECK-NEXT: mov.b v1[1], w9 -; CHECK-NEXT: ldr w9, [sp, #80] -; CHECK-NEXT: mov.b v0[1], w10 -; CHECK-NEXT: ldr w10, [sp, #208] +; CHECK-NEXT: ldr w8, [sp, #344] +; CHECK-NEXT: mov.b v0[2], w9 +; CHECK-NEXT: ldr w9, [sp, #208] +; CHECK-NEXT: mov.b v1[2], w10 +; CHECK-NEXT: ldr w10, [sp, #352] ; CHECK-NEXT: mov.b v3[2], w2 -; CHECK-NEXT: ldr w12, [sp, #360] +; CHECK-NEXT: ldr w12, [sp, #96] ; CHECK-NEXT: mov.b v2[2], w9 -; CHECK-NEXT: ldr w9, [sp, #352] -; CHECK-NEXT: mov.b v1[2], w10 -; CHECK-NEXT: ldr w10, [sp, #88] -; CHECK-NEXT: mov.b v0[2], w8 +; CHECK-NEXT: ldr w9, [sp, #360] +; CHECK-NEXT: mov.b v0[3], w8 ; CHECK-NEXT: ldr w8, [sp, #216] +; CHECK-NEXT: mov.b v1[3], w11 +; CHECK-NEXT: ldr w13, [sp, #104] ; CHECK-NEXT: mov.b v3[3], w3 -; CHECK-NEXT: ldr w13, [sp, #376] -; CHECK-NEXT: mov.b v2[3], w10 -; CHECK-NEXT: ldr w10, [sp, #368] -; CHECK-NEXT: mov.b v1[3], w8 -; CHECK-NEXT: ldr w8, [sp, #96] -; CHECK-NEXT: mov.b v0[3], w11 -; CHECK-NEXT: ldr w11, [sp, #224] +; CHECK-NEXT: ldr w11, [sp, #368] +; CHECK-NEXT: mov.b v2[3], w8 +; CHECK-NEXT: ldr w14, [sp, #112] +; CHECK-NEXT: mov.b v0[4], w10 +; CHECK-NEXT: ldr w10, [sp, #224] +; CHECK-NEXT: mov.b v1[4], w12 +; CHECK-NEXT: ldr w8, [sp, #376] ; CHECK-NEXT: mov.b v3[4], w4 -; CHECK-NEXT: ldr w14, [sp, #392] -; CHECK-NEXT: mov.b v2[4], w8 -; CHECK-NEXT: ldr w8, [sp, #384] -; CHECK-NEXT: mov.b v1[4], w11 -; 
CHECK-NEXT: ldr w11, [sp, #104] -; CHECK-NEXT: mov.b v0[4], w9 +; CHECK-NEXT: ldr w15, [sp, #120] +; CHECK-NEXT: mov.b v2[4], w10 +; CHECK-NEXT: ldr w12, [sp, #384] +; CHECK-NEXT: mov.b v0[5], w9 ; CHECK-NEXT: ldr w9, [sp, #232] +; CHECK-NEXT: mov.b v1[5], w13 +; CHECK-NEXT: ldr w16, [sp, #128] ; CHECK-NEXT: mov.b v3[5], w5 -; CHECK-NEXT: ldr w15, [sp, #408] -; CHECK-NEXT: mov.b v2[5], w11 -; CHECK-NEXT: ldr w11, [sp, #400] -; CHECK-NEXT: mov.b v1[5], w9 -; CHECK-NEXT: ldr w9, [sp, #112] -; CHECK-NEXT: mov.b v0[5], w12 -; CHECK-NEXT: ldr w12, [sp, #240] +; CHECK-NEXT: ldr w10, [sp, #392] +; CHECK-NEXT: mov.b v2[5], w9 +; CHECK-NEXT: ldr w13, [sp, #400] +; CHECK-NEXT: mov.b v0[6], w11 +; CHECK-NEXT: ldr w11, [sp, #240] +; CHECK-NEXT: mov.b v1[6], w14 +; CHECK-NEXT: ldr w9, [sp, #408] ; CHECK-NEXT: mov.b v3[6], w6 -; CHECK-NEXT: ldr w16, [sp, #424] -; CHECK-NEXT: mov.b v2[6], w9 -; CHECK-NEXT: ldr w9, [sp, #416] -; CHECK-NEXT: mov.b v1[6], w12 -; CHECK-NEXT: ldr w12, [sp, #120] -; CHECK-NEXT: mov.b v0[6], w10 -; CHECK-NEXT: ldr w10, [sp, #248] +; CHECK-NEXT: ldr w14, [sp, #416] +; CHECK-NEXT: mov.b v2[6], w11 +; CHECK-NEXT: ldr w11, [sp, #424] +; CHECK-NEXT: mov.b v0[7], w8 +; CHECK-NEXT: ldr w8, [sp, #248] +; CHECK-NEXT: mov.b v1[7], w15 +; CHECK-NEXT: ldr w15, [sp, #432] ; CHECK-NEXT: mov.b v3[7], w7 -; CHECK-NEXT: mov.b v2[7], w12 -; CHECK-NEXT: ldr w12, [sp] -; CHECK-NEXT: mov.b v1[7], w10 -; CHECK-NEXT: ldr w10, [sp, #128] -; CHECK-NEXT: mov.b v0[7], w13 -; CHECK-NEXT: ldr w13, [sp, #256] -; CHECK-NEXT: mov.b v3[8], w12 -; CHECK-NEXT: ldr w12, [sp, #432] -; CHECK-NEXT: mov.b v2[8], w10 -; CHECK-NEXT: ldr w10, [sp, #8] -; CHECK-NEXT: mov.b v1[8], w13 -; CHECK-NEXT: ldr w13, [sp, #136] -; CHECK-NEXT: mov.b v0[8], w8 -; CHECK-NEXT: ldr w8, [sp, #264] -; CHECK-NEXT: mov.b v3[9], w10 -; CHECK-NEXT: ldr w10, [sp, #440] -; CHECK-NEXT: mov.b v2[9], w13 -; CHECK-NEXT: ldr w13, [sp, #16] +; CHECK-NEXT: mov.b v2[7], w8 +; CHECK-NEXT: ldr w8, [sp] +; CHECK-NEXT: mov.b 
v0[8], w12 +; CHECK-NEXT: ldr w12, [sp, #256] +; CHECK-NEXT: mov.b v1[8], w16 +; CHECK-NEXT: ldr w16, [sp, #440] +; CHECK-NEXT: mov.b v3[8], w8 +; CHECK-NEXT: ldr w8, [sp, #136] +; CHECK-NEXT: mov.b v2[8], w12 +; CHECK-NEXT: ldr w12, [sp, #8] +; CHECK-NEXT: mov.b v0[9], w10 +; CHECK-NEXT: ldr w10, [sp, #264] ; CHECK-NEXT: mov.b v1[9], w8 -; CHECK-NEXT: ldr w8, [sp, #144] -; CHECK-NEXT: mov.b v0[9], w14 -; CHECK-NEXT: ldr w14, [sp, #272] -; CHECK-NEXT: mov.b v3[10], w13 +; CHECK-NEXT: ldr w8, [sp, #272] +; CHECK-NEXT: mov.b v3[9], w12 +; CHECK-NEXT: ldr w12, [sp, #144] +; CHECK-NEXT: mov.b v2[9], w10 +; CHECK-NEXT: ldr w10, [sp, #16] +; CHECK-NEXT: mov.b v0[10], w13 ; CHECK-NEXT: ldr w13, [sp, #280] +; CHECK-NEXT: mov.b v1[10], w12 +; CHECK-NEXT: ldr w12, [sp, #152] +; CHECK-NEXT: mov.b v3[10], w10 +; CHECK-NEXT: ldr w10, [sp, #160] ; CHECK-NEXT: mov.b v2[10], w8 ; CHECK-NEXT: ldr w8, [sp, #24] -; CHECK-NEXT: mov.b v1[10], w14 -; CHECK-NEXT: ldr w14, [sp, #152] -; CHECK-NEXT: mov.b v0[10], w11 -; CHECK-NEXT: ldr w11, [sp, #288] +; CHECK-NEXT: mov.b v0[11], w9 +; CHECK-NEXT: ldr w9, [sp, #288] +; CHECK-NEXT: mov.b v1[11], w12 +; CHECK-NEXT: ldr w12, [sp, #296] ; CHECK-NEXT: mov.b v3[11], w8 ; CHECK-NEXT: ldr w8, [sp, #32] -; CHECK-NEXT: mov.b v2[11], w14 -; CHECK-NEXT: ldr w14, [sp, #296] -; CHECK-NEXT: mov.b v1[11], w13 -; CHECK-NEXT: ldr w13, [sp, #160] -; CHECK-NEXT: mov.b v0[11], w15 +; CHECK-NEXT: mov.b v2[11], w13 +; CHECK-NEXT: mov.b v0[12], w14 +; CHECK-NEXT: mov.b v1[12], w10 +; CHECK-NEXT: ldr w10, [sp, #168] ; CHECK-NEXT: mov.b v3[12], w8 ; CHECK-NEXT: ldr w8, [sp, #40] -; CHECK-NEXT: mov.b v2[12], w13 -; CHECK-NEXT: ldr w13, [sp, #312] -; CHECK-NEXT: mov.b v1[12], w11 -; CHECK-NEXT: ldr w11, [sp, #168] -; CHECK-NEXT: mov.b v0[12], w9 +; CHECK-NEXT: mov.b v2[12], w9 ; CHECK-NEXT: ldr w9, [sp, #304] +; CHECK-NEXT: mov.b v0[13], w11 +; CHECK-NEXT: ldr w11, [sp, #312] +; CHECK-NEXT: mov.b v1[13], w10 +; CHECK-NEXT: ldr w10, [sp, #176] ; CHECK-NEXT: mov.b 
v3[13], w8 ; CHECK-NEXT: ldr w8, [sp, #48] -; CHECK-NEXT: mov.b v2[13], w11 -; CHECK-NEXT: ldr w11, [sp, #176] -; CHECK-NEXT: mov.b v1[13], w14 -; CHECK-NEXT: mov.b v0[13], w16 +; CHECK-NEXT: mov.b v2[13], w12 +; CHECK-NEXT: mov.b v0[14], w15 +; CHECK-NEXT: mov.b v1[14], w10 +; CHECK-NEXT: ldr w10, [sp, #184] ; CHECK-NEXT: mov.b v3[14], w8 ; CHECK-NEXT: ldr w8, [sp, #56] -; CHECK-NEXT: mov.b v2[14], w11 -; CHECK-NEXT: mov.b v1[14], w9 -; CHECK-NEXT: ldr w9, [sp, #184] -; CHECK-NEXT: mov.b v0[14], w12 +; CHECK-NEXT: mov.b v2[14], w9 +; CHECK-NEXT: mov.b v0[15], w16 +; CHECK-NEXT: mov.b v1[15], w10 ; CHECK-NEXT: mov.b v3[15], w8 -; CHECK-NEXT: mov.b v2[15], w9 -; CHECK-NEXT: mov.b v1[15], w13 -; CHECK-NEXT: mov.b v0[15], w10 +; CHECK-NEXT: mov.b v2[15], w11 +; CHECK-NEXT: shl.16b v4, v0, #7 +; CHECK-NEXT: shl.16b v1, v1, #7 ; CHECK-NEXT: shl.16b v3, v3, #7 ; CHECK-NEXT: shl.16b v2, v2, #7 -; CHECK-NEXT: shl.16b v4, v1, #7 -; CHECK-NEXT: shl.16b v5, v0, #7 ; CHECK-NEXT: cmlt.16b v0, v3, #0 -; CHECK-NEXT: cmlt.16b v1, v2, #0 -; CHECK-NEXT: cmlt.16b v2, v4, #0 -; CHECK-NEXT: cmlt.16b v3, v5, #0 +; CHECK-NEXT: cmlt.16b v1, v1, #0 +; CHECK-NEXT: cmlt.16b v2, v2, #0 +; CHECK-NEXT: cmlt.16b v3, v4, #0 ; CHECK-NEXT: ret %res = sext <64 x i1> %arg to <64 x i8> ret <64 x i8> %res diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll index 4f365171de05d..396d9efe4566d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -787,10 +787,10 @@ define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind { define <4 x i64> @hadd32_sext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind { ; CHECK-LABEL: hadd32_sext_lsr: ; CHECK: // %bb.0: -; CHECK-NEXT: saddl2.2d v2, v0, v1 -; CHECK-NEXT: saddl.2d v0, v0, v1 -; CHECK-NEXT: ushr.2d v1, v2, #1 -; CHECK-NEXT: ushr.2d v0, v0, #1 +; CHECK-NEXT: saddl.2d v2, v0, v1 +; CHECK-NEXT: saddl2.2d v0, v0, v1 +; CHECK-NEXT: ushr.2d v1, v0, #1 +; 
CHECK-NEXT: ushr.2d v0, v2, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i32> %src1 to <4 x i64> %zextsrc2 = sext <4 x i32> %src2 to <4 x i64> diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll index 44d0eed3d7234..abf9469c45ef5 100644 --- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll +++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll @@ -178,10 +178,10 @@ define <4 x i32> @sign_4xi32_multi_use(<4 x i32> %a) { ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff ; CHECK-NEXT: cmlt v2.4s, v0.4s, #0 -; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s ; CHECK-NEXT: orr v2.4s, #1 -; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: cmgt v1.4s, v0.4s, v1.4s ; CHECK-NEXT: str q2, [sp] // 16-byte Folded Spill +; CHECK-NEXT: xtn v0.4h, v1.4s ; CHECK-NEXT: bl use_4xi1 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload @@ -198,10 +198,10 @@ define <4 x i32> @not_sign_4xi32(<4 x i32> %a) { ; CHECK-LABEL: not_sign_4xi32: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v1.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v1.16b, v0.16b, v2.16b ; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %c = icmp sgt <4 x i32> %a, @@ -229,10 +229,10 @@ define <4 x i32> @not_sign_4xi32_3(<4 x i32> %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff ; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %c = icmp sgt <4 x i32> %a, %res = 
select <4 x i1> %c, <4 x i32> , <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll index e4f13f5c98a17..3ac8a18772721 100644 --- a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll +++ b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll @@ -8,15 +8,15 @@ define void @signbits_vXi1(<4 x i16> %a1) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w1, wzr +; CHECK-NEXT: movi v2.4h, #1 ; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: mov w1, wzr ; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: adrp x8, .LCPI0_1 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: movi v1.4h, #1 -; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: cmgt v0.4h, v2.4h, v0.4h ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: shl v0.4h, v0.4h, #15 ; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll index b278e9cd06da7..72e9a1e710f18 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst ; ALL-NEXT: sdiv x9, x9, x8 ; ALL-NEXT: mul x8, x9, x8 ; ALL-NEXT: sdiv x11, x11, x10 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 ; ALL-NEXT: mov v1.d[1], x10 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, <2 x i64>* %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll 
b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll index af57819567919..c514cc99f014d 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst ; ALL-NEXT: udiv x9, x9, x8 ; ALL-NEXT: mul x8, x9, x8 ; ALL-NEXT: udiv x11, x11, x10 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 ; ALL-NEXT: mov v1.d[1], x10 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = udiv <2 x i64> %x, %y store <2 x i64> %div, <2 x i64>* %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll index b75913dabadff..de9a0fe9b23ae 100644 --- a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll +++ b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll @@ -7,14 +7,14 @@ define <2 x i16> @rotlv2_16(<2 x i16> %vec2_16, <2 x i16> %shift) { ; CHECK-LABEL: rotlv2_16: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2s, #15 -; CHECK-NEXT: neg v3.2s, v1.2s -; CHECK-NEXT: movi d4, #0x00ffff0000ffff -; CHECK-NEXT: and v3.8b, v3.8b, v2.8b +; CHECK-NEXT: movi d3, #0x00ffff0000ffff +; CHECK-NEXT: neg v4.2s, v1.2s +; CHECK-NEXT: and v4.8b, v4.8b, v2.8b +; CHECK-NEXT: and v3.8b, v0.8b, v3.8b +; CHECK-NEXT: neg v4.2s, v4.2s ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b -; CHECK-NEXT: and v4.8b, v0.8b, v4.8b -; CHECK-NEXT: neg v3.2s, v3.2s ; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushl v2.2s, v4.2s, v3.2s +; CHECK-NEXT: ushl v2.2s, v3.2s, v4.2s ; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-NEXT: ret %1 = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %vec2_16, <2 x i16> %vec2_16, <2 x i16> %shift) diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll 
b/llvm/test/CodeGen/AArch64/f16-instructions.ll index 7faa4f8ec6fa9..f50f566703a78 100644 --- a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -1101,9 +1101,9 @@ define half @test_maxnum(half %a, half %b) #0 { } ; CHECK-CVT-LABEL: test_copysign: -; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret @@ -1119,15 +1119,15 @@ define half @test_copysign(half %a, half %b) #0 { } ; CHECK-CVT-LABEL: test_copysign_f32: -; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f32: -; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: fcvt h1, s1 +; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: bif.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret @@ -1138,16 +1138,16 @@ define half @test_copysign_f32(half %a, float %b) #0 { } ; CHECK-CVT-LABEL: test_copysign_f64: -; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f64: -; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: fcvt h1, d1 +; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-FP16-NEXT: bif.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret @@ -1161,9 +1161,9 @@ define half @test_copysign_f64(half %a, double %b) #0 { ; away the (fpext (fp_round )) here. 
; CHECK-CVT-LABEL: test_copysign_extended: -; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bif.16b v0, v1, v2 ; CHECK-CVT-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll index b012b6493901d..ff93ff77a1a0a 100644 --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -95,8 +95,8 @@ entry: define float @copysign32(float %a, float %b) { ; CHECK-LABEL: copysign32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -142,9 +142,9 @@ entry: define half @copysign16(half %a, half %b) { ; CHECK-LABEL: copysign16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-NEXT: fcvt s1, h1 ; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 3625bd6011fbb..ad3f2b3963b57 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -295,11 +295,11 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) { ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: str q1, [sp, #16] // 16-byte 
Folded Spill ; CHECK-NEXT: bl __getf2 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload @@ -364,12 +364,12 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) { ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill ; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill ; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: bl __getf2 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload @@ -454,11 +454,11 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) { ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: stp q2, q3, [sp, #64] // 32-byte Folded Spill ; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: stp q2, q3, [sp, #64] // 32-byte Folded Spill ; CHECK-NEXT: mov v2.16b, v1.16b -; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill @@ -697,9 +697,9 @@ define <2 x i1> @test_signed_v2f32_v2i1(<2 x float> %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: fcvtzs v0.2s, v0.2s +; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret %x = call <2 x i1> @llvm.fptosi.sat.v2f32.v2i1(<2 x float> %f) ret <2 x i1> %x @@ -1628,9 +1628,9 @@ define <4 x i1> 
@test_signed_v4f16_v4i1(<4 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -1674,10 +1674,10 @@ define <4 x i13> @test_signed_v4f16_v4i13(<4 x half> %f) { ; ; CHECK-FP16-LABEL: test_signed_v4f16_v4i13: ; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: mvni v1.4h, #240, lsl #8 ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h -; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: movi v1.4h, #240, lsl #8 +; CHECK-FP16-NEXT: mvni v2.4h, #240, lsl #8 +; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i13> @llvm.fptosi.sat.v4f16.v4i13(<4 x half> %f) @@ -2129,9 +2129,9 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h +; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f) @@ -2278,10 +2278,10 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) { ; ; CHECK-FP16-LABEL: test_signed_v8f16_v8i13: ; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: mvni v1.8h, #240, lsl #8 ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h -; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: movi v1.8h, #240, lsl #8 +; CHECK-FP16-NEXT: mvni v2.8h, #240, lsl #8 +; CHECK-FP16-NEXT: smin v0.8h, 
v0.8h, v2.8h ; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i13> @llvm.fptosi.sat.v8f16.v8i13(<8 x half> %f) @@ -2366,21 +2366,21 @@ define <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) { ; CHECK-NEXT: fcvtl v2.4s, v0.4h ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: movi v1.4s, #3, msl #16 -; CHECK-NEXT: mvni v3.4s, #3, msl #16 ; CHECK-NEXT: fcvtzs v2.4s, v2.4s ; CHECK-NEXT: fcvtzs v0.4s, v0.4s ; CHECK-NEXT: smin v2.4s, v2.4s, v1.4s ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s -; CHECK-NEXT: smax v1.4s, v2.4s, v3.4s -; CHECK-NEXT: smax v0.4s, v0.4s, v3.4s -; CHECK-NEXT: mov w1, v1.s[1] -; CHECK-NEXT: mov w2, v1.s[2] +; CHECK-NEXT: mvni v1.4s, #3, msl #16 +; CHECK-NEXT: smax v2.4s, v2.4s, v1.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w1, v2.s[1] +; CHECK-NEXT: mov w2, v2.s[2] ; CHECK-NEXT: mov w5, v0.s[1] -; CHECK-NEXT: mov w3, v1.s[3] +; CHECK-NEXT: mov w3, v2.s[3] ; CHECK-NEXT: mov w6, v0.s[2] ; CHECK-NEXT: mov w7, v0.s[3] ; CHECK-NEXT: fmov w4, s0 -; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: fmov w0, s2 ; CHECK-NEXT: ret %x = call <8 x i19> @llvm.fptosi.sat.v8f16.v8i19(<8 x half> %f) ret <8 x i19> %x @@ -2995,11 +2995,11 @@ define <8 x i8> @test_signed_v8f32_v8i8(<8 x float> %f) { ; CHECK-NEXT: movi v2.4s, #127 ; CHECK-NEXT: fcvtzs v1.4s, v1.4s ; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: mvni v3.4s, #127 ; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s -; CHECK-NEXT: smax v1.4s, v1.4s, v3.4s -; CHECK-NEXT: smax v0.4s, v0.4s, v3.4s +; CHECK-NEXT: mvni v2.4s, #127 +; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s +; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s ; CHECK-NEXT: xtn v1.4h, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index ace5196842156..cbb8b8a51126f 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ 
b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -285,11 +285,11 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) { ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: bl __getf2 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload @@ -338,12 +338,12 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) { ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill ; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill ; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: bl __getf2 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -406,13 +406,13 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) { ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: stp q0, q2, [sp, #16] // 32-byte Folded Spill ; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: stp q0, q2, [sp, #16] // 32-byte Folded Spill ; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: str q3, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: str q1, [sp, #64] // 16-byte Folded Spill ; 
CHECK-NEXT: bl __getf2 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload @@ -1424,8 +1424,8 @@ define <4 x i13> @test_unsigned_v4f16_v4i13(<4 x half> %f) { ; ; CHECK-FP16-LABEL: test_unsigned_v4f16_v4i13: ; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: mvni v1.4h, #224, lsl #8 ; CHECK-FP16-NEXT: fcvtzu v0.4h, v0.4h +; CHECK-FP16-NEXT: mvni v1.4h, #224, lsl #8 ; CHECK-FP16-NEXT: umin v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i13> @llvm.fptoui.sat.v4f16.v4i13(<4 x half> %f) @@ -1910,8 +1910,8 @@ define <8 x i13> @test_unsigned_v8f16_v8i13(<8 x half> %f) { ; ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i13: ; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: mvni v1.8h, #224, lsl #8 ; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h +; CHECK-FP16-NEXT: mvni v1.8h, #224, lsl #8 ; CHECK-FP16-NEXT: umin v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i13> @llvm.fptoui.sat.v8f16.v8i13(<8 x half> %f) diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll index e62f84f746716..bb37cc81a7ab1 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll @@ -81,8 +81,8 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK-NEXT: neg v3.4s, v1.4s ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret @@ -170,8 +170,8 @@ define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK-NEXT: movi v2.4s, #31 ; CHECK-NEXT: neg v3.4s, v1.4s ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b ; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: and v2.16b, v3.16b, v2.16b ; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b diff --git 
a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll index 3445968721c87..63102a3d146e9 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll @@ -92,17 +92,17 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i8( @vec_scalable_subvec_fixed_idx_nonzero_i16( @vec_scalable_subvec_fixed_idx_nonzero_i32( @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) ; CHECK-LABEL: mlai16_and: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff +; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret entry: %v0 = sext <4 x i16> %vec0 to <4 x i32> @@ -158,9 +158,9 @@ define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) ; CHECK-LABEL: mlai32_and: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s -; CHECK-NEXT: movi v3.2d, #0x000000ffffffff +; CHECK-NEXT: movi v1.2d, #0x000000ffffffff ; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret entry: %v0 = sext <2 x i32> %vec0 to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll index af6475732a81e..3dad36acdf636 100644 --- a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll +++ b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll @@ -1079,8 +1079,8 @@ define <4 x i32> @notted_smin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_smin_bc_ab: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v0.4s, 
v1.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1101,8 +1101,8 @@ define <4 x i32> @notted_smin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_smin_bc_ba: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1167,8 +1167,8 @@ define <4 x i32> @notted_smin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_smin_bc_ab_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1189,8 +1189,8 @@ define <4 x i32> @notted_smin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_smin_bc_ba_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1255,8 +1255,8 @@ define <4 x i32> @notted_smin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32 ; CHECK-LABEL: notted_smin_bc_ab_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1277,8 +1277,8 @@ define <4 x i32> @notted_smin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32 ; CHECK-LABEL: notted_smin_bc_ba_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; 
CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1343,8 +1343,8 @@ define <4 x i32> @notted_smin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_smin_bc_ab_eq_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1365,8 +1365,8 @@ define <4 x i32> @notted_smin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_smin_bc_ba_eq_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s @@ -1431,8 +1431,8 @@ define <4 x i32> @notted_smax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_smax_bc_ab: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1453,8 +1453,8 @@ define <4 x i32> @notted_smax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_smax_bc_ba: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1519,8 +1519,8 @@ define <4 x i32> @notted_smax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_smax_bc_ab_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; 
CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1541,8 +1541,8 @@ define <4 x i32> @notted_smax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_smax_bc_ba_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1607,8 +1607,8 @@ define <4 x i32> @notted_smax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32 ; CHECK-LABEL: notted_smax_bc_ab_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1629,8 +1629,8 @@ define <4 x i32> @notted_smax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32 ; CHECK-LABEL: notted_smax_bc_ba_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1695,8 +1695,8 @@ define <4 x i32> @notted_smax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_smax_bc_ab_eq_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1717,8 +1717,8 @@ define <4 x i32> @notted_smax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_smax_bc_ba_eq_swap_pred: ; 
CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s @@ -1783,8 +1783,8 @@ define <4 x i32> @notted_umin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_umin_bc_ab: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -1805,8 +1805,8 @@ define <4 x i32> @notted_umin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_umin_bc_ba: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -1871,8 +1871,8 @@ define <4 x i32> @notted_umin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_umin_bc_ab_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -1893,8 +1893,8 @@ define <4 x i32> @notted_umin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_umin_bc_ba_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -1959,8 +1959,8 @@ define <4 x i32> @notted_umin_bc_ab_eq_pred(<4 x i32> %x, <4 x 
i32> %y, <4 x i32 ; CHECK-LABEL: notted_umin_bc_ab_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -1981,8 +1981,8 @@ define <4 x i32> @notted_umin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32 ; CHECK-LABEL: notted_umin_bc_ba_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -2047,8 +2047,8 @@ define <4 x i32> @notted_umin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_umin_bc_ab_eq_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -2069,8 +2069,8 @@ define <4 x i32> @notted_umin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_umin_bc_ba_eq_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s @@ -2135,8 +2135,8 @@ define <4 x i32> @notted_umax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_umax_bc_ab: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s @@ -2157,8 
+2157,8 @@ define <4 x i32> @notted_umax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: notted_umax_bc_ba: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s @@ -2223,8 +2223,8 @@ define <4 x i32> @notted_umax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_umax_bc_ab_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s @@ -2245,8 +2245,8 @@ define <4 x i32> @notted_umax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i ; CHECK-LABEL: notted_umax_bc_ba_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s @@ -2311,8 +2311,8 @@ define <4 x i32> @notted_umax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32 ; CHECK-LABEL: notted_umax_bc_ab_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s @@ -2333,8 +2333,8 @@ define <4 x i32> @notted_umax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32 ; CHECK-LABEL: notted_umax_bc_ba_eq_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax 
v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s @@ -2399,8 +2399,8 @@ define <4 x i32> @notted_umax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_umax_bc_ab_eq_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s @@ -2421,8 +2421,8 @@ define <4 x i32> @notted_umax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 ; CHECK-LABEL: notted_umax_bc_ba_eq_swap_pred: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mvn v2.16b, v2.16b ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s ; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s diff --git a/llvm/test/CodeGen/AArch64/minmax.ll b/llvm/test/CodeGen/AArch64/minmax.ll index 74f6b894eef83..59faf0efc35dd 100644 --- a/llvm/test/CodeGen/AArch64/minmax.ll +++ b/llvm/test/CodeGen/AArch64/minmax.ll @@ -122,10 +122,10 @@ define <16 x i32> @t11(<16 x i32> %a, <16 x i32> %b) { define <16 x i8> @t12(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: t12: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.16b, #1 -; CHECK-NEXT: cmhi v3.16b, v1.16b, v0.16b -; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b -; CHECK-NEXT: and v1.16b, v3.16b, v2.16b +; CHECK-NEXT: cmhi v2.16b, v1.16b, v0.16b +; CHECK-NEXT: movi v3.16b, #1 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: and v1.16b, v2.16b, v3.16b ; CHECK-NEXT: add v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %t1 = icmp ugt <16 x i8> %b, %a diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll index 2bcb7ca696d12..8cd45160fcf44 100644 --- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll +++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll @@ -13,9 +13,9 @@ define dso_local void 
@jsimd_idct_ifast_neon_intrinsic(i8* nocapture readonly %d ; CHECK-NEXT: mul v0.8h, v2.8h, v0.8h ; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h ; CHECK-NEXT: add v2.8h, v0.8h, v1.8h -; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: str q2, [x9, x8] ; CHECK-NEXT: ldr x9, [x2, #56] +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h ; CHECK-NEXT: str q0, [x9, x8] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index 9f9e459b73548..75c0355965ebc 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -119,12 +119,12 @@ define dso_local void @run_test() local_unnamed_addr #0 { ; CHECK-NEXT: add v0.2d, v0.2d, v15.2d ; CHECK-NEXT: add v11.2d, v11.2d, v14.2d ; CHECK-NEXT: fmov d14, x3 -; CHECK-NEXT: add v9.2d, v9.2d, v1.2d ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: add v9.2d, v9.2d, v1.2d ; CHECK-NEXT: mov v14.d[1], x15 -; CHECK-NEXT: add v31.2d, v31.2d, v1.2d ; CHECK-NEXT: mov v0.d[1], x12 +; CHECK-NEXT: add v31.2d, v31.2d, v1.2d ; CHECK-NEXT: add v26.2d, v26.2d, v1.2d ; CHECK-NEXT: add v23.2d, v23.2d, v1.2d ; CHECK-NEXT: add v21.2d, v21.2d, v1.2d diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index ecc94ccc2f795..917f146890266 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -97,9 +97,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { ; CHECK-LABEL: v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sqadd v0.8b, v1.8b, v0.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ 
-158,9 +158,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -224,9 +224,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { ; CHECK-LABEL: v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr b0, [x0] -; CHECK-NEXT: ldr b1, [x1] -; CHECK-NEXT: sqadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr b0, [x1] +; CHECK-NEXT: ldr b1, [x0] +; CHECK-NEXT: sqadd v0.8b, v1.8b, v0.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret %x = load <1 x i8>, <1 x i8>* %px @@ -239,9 +239,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { ; CHECK-LABEL: v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ldr h1, [x1] -; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr h0, [x1] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px @@ -254,10 +254,10 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 -; CHECK-NEXT: sshr v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: sshr v0.16b, v0.16b, #4 +; CHECK-NEXT: sshr v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: sqadd 
v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index 99ff626d8dd80..06fc023d927d4 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -346,9 +346,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_min(<16 x i8> %x) { ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.16b, #213 +; CHECK-NEXT: movi v2.16b, #42 ; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b -; CHECK-NEXT: movi v1.16b, #42 -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: add v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = icmp ult <16 x i8> %x, %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> @@ -383,9 +383,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) { define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) { ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mvni v1.8h, #42 -; CHECK-NEXT: umin v0.8h, v0.8h, v1.8h ; CHECK-NEXT: movi v1.8h, #42 +; CHECK-NEXT: mvni v2.8h, #42 +; CHECK-NEXT: umin v0.8h, v0.8h, v2.8h ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %c = icmp ult <8 x i16> %x, diff --git a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll index e473bbe72ceff..56082bcb4c1bc 100644 --- a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll +++ b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll @@ -165,8 +165,8 @@ define i64 @sel_shift_bool_i64(i1 %t) { define <16 x i8> @sel_shift_bool_v16i8(<16 x i1> %t) { ; CHECK-LABEL: sel_shift_bool_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: movi v1.16b, #128 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -205,8 +205,8 @@ define <2 x i64> @sel_shift_bool_v2i64(<2 x i1> %t) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; 
CHECK-NEXT: mov w8, #65536 -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: shl v0.2d, v0.2d, #63 +; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll index 7991c797ff27d..cb758f8a6202b 100644 --- a/llvm/test/CodeGen/AArch64/signbit-shift.ll +++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll @@ -30,9 +30,9 @@ define <4 x i32> @add_zext_ifpos_vec_splat(<4 x i32> %x) { ; CHECK-LABEL: add_zext_ifpos_vec_splat: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: movi v2.4s, #41 ; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #41 -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ret %c = icmp sgt <4 x i32> %x, %e = zext <4 x i1> %c to <4 x i32> @@ -79,9 +79,9 @@ define <4 x i32> @add_sext_ifpos_vec_splat(<4 x i32> %x) { ; CHECK-LABEL: add_sext_ifpos_vec_splat: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: movi v2.4s, #42 ; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %c = icmp sgt <4 x i32> %x, %e = sext <4 x i1> %c to <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll index d0f8d08a65266..0c1e61ff06401 100644 --- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll @@ -160,8 +160,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = add <4 
x i32> %a, ; constant always on RHS %r = add <4 x i32> %t0, %b @@ -172,8 +172,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI13_0 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = add <4 x i32> %b, %t0 @@ -188,8 +188,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI14_0 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = add <4 x i32> %t0, %b @@ -200,8 +200,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = add <4 x i32> %b, %t0 @@ -216,8 +216,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = add <4 x i32> %t0, %b @@ -228,8 +228,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: ldr 
q1, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = add <4 x i32> %b, %t0 @@ -244,8 +244,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = sub <4 x i32> %t0, %b @@ -256,8 +256,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI19_0 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = sub <4 x i32> %b, %t0 @@ -272,8 +272,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI20_0 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = sub <4 x i32> %t0, %b @@ -284,8 +284,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = sub <4 x i32> %b, %t0 @@ -300,8 +300,8 @@ define <4 x 
i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = sub <4 x i32> %t0, %b @@ -312,8 +312,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = sub <4 x i32> %b, %t0 diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll index 98b5420b94752..4c8b0ab87e9c4 100644 --- a/llvm/test/CodeGen/AArch64/sinksplat.ll +++ b/llvm/test/CodeGen/AArch64/sinksplat.ll @@ -305,10 +305,10 @@ define <4 x float> @fma(<4 x float> %x, <4 x float> *%y) { ; CHECK-NEXT: dup v1.4s, v1.s[3] ; CHECK-NEXT: .LBB9_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: ldr q3, [x0] -; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: subs w8, w8, #1 +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: fmla v0.4s, v2.4s, v3.4s ; CHECK-NEXT: b.eq .LBB9_1 ; CHECK-NEXT: // %bb.2: // %l2 diff --git a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll index 69cd4ee69733e..7d041d4c5e75d 100644 --- a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll +++ b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll @@ -5,20 +5,20 @@ define <16 x double> @test_sitofp_fixed(<16 x i32> %in) { ; CHECK-LABEL: test_sitofp_fixed: ; CHECK: ; %bb.0: ; CHECK-NEXT: sshll2.2d v4, v2, #0 -; CHECK-NEXT: sshll.2d v16, v1, #0 ; 
CHECK-NEXT: sshll2.2d v5, v0, #0 ; CHECK-NEXT: sshll2.2d v6, v1, #0 ; CHECK-NEXT: sshll2.2d v7, v3, #0 ; CHECK-NEXT: sshll.2d v0, v0, #0 +; CHECK-NEXT: sshll.2d v16, v1, #0 ; CHECK-NEXT: sshll.2d v17, v2, #0 ; CHECK-NEXT: sshll.2d v18, v3, #0 ; CHECK-NEXT: scvtf.2d v1, v5, #6 +; CHECK-NEXT: scvtf.2d v0, v0, #6 ; CHECK-NEXT: scvtf.2d v3, v6, #6 ; CHECK-NEXT: scvtf.2d v2, v16, #6 ; CHECK-NEXT: scvtf.2d v5, v4, #6 -; CHECK-NEXT: scvtf.2d v0, v0, #6 -; CHECK-NEXT: scvtf.2d v7, v7, #6 ; CHECK-NEXT: scvtf.2d v4, v17, #6 +; CHECK-NEXT: scvtf.2d v7, v7, #6 ; CHECK-NEXT: scvtf.2d v6, v18, #6 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll index 43ce12809203c..84e0979f6551a 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll @@ -59,47 +59,47 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; CHECK-LABEL: test_srem_vec: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x11, #7282 -; CHECK-NEXT: sbfx x10, x0, #0, #33 +; CHECK-NEXT: mov x8, #7282 +; CHECK-NEXT: sbfx x9, x0, #0, #33 +; CHECK-NEXT: movk x8, #29127, lsl #16 +; CHECK-NEXT: mov x11, #7281 +; CHECK-NEXT: movk x8, #50972, lsl #32 ; CHECK-NEXT: movk x11, #29127, lsl #16 -; CHECK-NEXT: mov x9, #7281 +; CHECK-NEXT: movk x8, #7281, lsl #48 ; CHECK-NEXT: movk x11, #50972, lsl #32 -; CHECK-NEXT: movk x9, #29127, lsl #16 +; CHECK-NEXT: sbfx x12, x1, #0, #33 +; CHECK-NEXT: sbfx x10, x2, #0, #33 +; CHECK-NEXT: smulh x13, x9, x8 ; CHECK-NEXT: movk x11, #7281, lsl #48 -; CHECK-NEXT: movk x9, #50972, lsl #32 -; CHECK-NEXT: sbfx x13, x1, #0, #33 -; CHECK-NEXT: sbfx x8, x2, #0, #33 -; CHECK-NEXT: smulh x12, x10, x11 -; CHECK-NEXT: movk x9, #7281, lsl #48 -; CHECK-NEXT: smulh x11, x13, x11 -; CHECK-NEXT: smulh x9, x8, x9 -; CHECK-NEXT: add x12, x12, x12, lsr #63 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: add x11, x11, 
x11, lsr #63 -; CHECK-NEXT: add x12, x12, x12, lsl #3 -; CHECK-NEXT: asr x14, x9, #3 -; CHECK-NEXT: sub x10, x10, x12 -; CHECK-NEXT: add x9, x14, x9, lsr #63 +; CHECK-NEXT: smulh x8, x12, x8 +; CHECK-NEXT: smulh x11, x10, x11 +; CHECK-NEXT: add x13, x13, x13, lsr #63 +; CHECK-NEXT: sub x11, x11, x10 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: add x13, x13, x13, lsl #3 +; CHECK-NEXT: asr x14, x11, #3 +; CHECK-NEXT: sub x9, x9, x13 +; CHECK-NEXT: add x11, x14, x11, lsr #63 +; CHECK-NEXT: add x8, x8, x8, lsl #3 +; CHECK-NEXT: sub x8, x12, x8 ; CHECK-NEXT: add x11, x11, x11, lsl #3 -; CHECK-NEXT: sub x11, x13, x11 -; CHECK-NEXT: add x9, x9, x9, lsl #3 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: add x10, x10, x11 ; CHECK-NEXT: mov x9, #8589934591 -; CHECK-NEXT: mov v0.d[1], x11 -; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: adrp x11, .LCPI3_0 +; CHECK-NEXT: adrp x12, .LCPI3_1 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: fmov d1, x10 ; CHECK-NEXT: dup v2.2d, x9 -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: adrp x9, .LCPI3_1 +; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr q4, [x12, :lo12:.LCPI3_1] ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1] -; CHECK-NEXT: cmeq v0.2d, v0.2d, v2.2d -; CHECK-NEXT: cmeq v1.2d, v1.2d, v3.2d +; CHECK-NEXT: cmeq v0.2d, v0.2d, v3.2d +; CHECK-NEXT: cmeq v1.2d, v1.2d, v4.2d ; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: mvn v1.16b, v1.16b ; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: mvn v1.16b, v1.16b ; CHECK-NEXT: xtn v1.2s, v1.2d ; CHECK-NEXT: mov w1, v0.s[1] ; CHECK-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll index 56f66a127d87a..bd6145d1bca66 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll +++ 
b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll @@ -7,6 +7,7 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: adrp x8, .LCPI0_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] @@ -17,11 +18,10 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI0_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -39,12 +39,12 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #39321 ; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: adrp x10, .LCPI1_0 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI1_0] ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] ; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -60,12 +60,12 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #39321 ; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: adrp x10, .LCPI2_0 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI2_0] ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] 
; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -83,17 +83,17 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #9362 ; CHECK-NEXT: movk w8, #46811, lsl #16 ; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v0.4s, v2.4s, #31 ; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -107,17 +107,17 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #9362 ; CHECK-NEXT: movk w8, #46811, lsl #16 ; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v0.4s, v2.4s, #31 ; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -131,6 +131,7 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI5_0 ; CHECK-NEXT: adrp x9, .LCPI5_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] ; 
CHECK-NEXT: adrp x8, .LCPI5_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1] @@ -141,11 +142,10 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI5_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -157,6 +157,7 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 ; CHECK-NEXT: adrp x9, .LCPI6_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] ; CHECK-NEXT: adrp x8, .LCPI6_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_1] @@ -167,11 +168,10 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI6_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_4] -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -187,6 +187,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI7_0 ; CHECK-NEXT: adrp x9, .LCPI7_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: adrp x8, .LCPI7_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_1] @@ -197,11 +198,10 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x 
i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI7_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -215,6 +215,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 ; CHECK-NEXT: adrp x9, .LCPI8_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: adrp x8, .LCPI8_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] @@ -225,11 +226,10 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI8_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -243,6 +243,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI9_0 ; CHECK-NEXT: adrp x9, .LCPI9_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] ; CHECK-NEXT: adrp x8, .LCPI9_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_1] @@ -253,11 +254,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI9_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr 
q2, [x8, :lo12:.LCPI9_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -275,12 +275,12 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #39321 ; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: adrp x10, .LCPI10_0 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI10_0] ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -298,17 +298,17 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #9362 ; CHECK-NEXT: movk w8, #46811, lsl #16 ; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI11_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v0.4s, v2.4s, #31 ; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -322,6 +322,7 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 ; CHECK-NEXT: adrp x9, .LCPI12_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, 
:lo12:.LCPI12_0] ; CHECK-NEXT: adrp x8, .LCPI12_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_1] @@ -332,11 +333,10 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI12_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -441,6 +441,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 ; CHECK-NEXT: adrp x9, .LCPI16_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: adrp x8, .LCPI16_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_1] @@ -451,11 +452,10 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI16_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -469,6 +469,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 ; CHECK-NEXT: adrp x9, .LCPI17_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: adrp x8, .LCPI17_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_1] @@ -479,11 
+480,10 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI17_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -497,6 +497,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 ; CHECK-NEXT: adrp x9, .LCPI18_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: adrp x8, .LCPI18_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_1] @@ -507,11 +508,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-NEXT: adrp x8, .LCPI18_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -529,12 +529,12 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #39321 ; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: adrp x10, .LCPI19_0 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: adrp x8, .LCPI19_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI19_0] ; CHECK-NEXT: movi v1.4s, #1 +; 
CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -552,17 +552,17 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #9362 ; CHECK-NEXT: movk w8, #46811, lsl #16 ; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI20_0 ; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v0.4s, v2.4s, #31 ; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -576,6 +576,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 ; CHECK-NEXT: adrp x9, .LCPI21_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: adrp x8, .LCPI21_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_1] @@ -586,11 +587,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI21_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -606,6 +606,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> 
%X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 ; CHECK-NEXT: adrp x9, .LCPI22_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: adrp x8, .LCPI22_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_1] @@ -616,11 +617,10 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI22_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -634,6 +634,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 ; CHECK-NEXT: adrp x9, .LCPI23_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: adrp x8, .LCPI23_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_1] @@ -644,11 +645,10 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI23_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -662,6 +662,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI24_0 ; CHECK-NEXT: adrp x9, .LCPI24_1 +; 
CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: adrp x8, .LCPI24_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_1] @@ -672,11 +673,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI24_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -691,6 +691,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI25_0 ; CHECK-NEXT: adrp x9, .LCPI25_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] ; CHECK-NEXT: adrp x8, .LCPI25_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_1] @@ -701,11 +702,10 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-NEXT: adrp x8, .LCPI25_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -718,6 +718,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: adrp x9, .LCPI26_1 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] ; CHECK-NEXT: adrp x8, 
.LCPI26_2 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_1] @@ -728,11 +729,10 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-NEXT: adrp x8, .LCPI26_4 ; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_4] ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_4] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll index c37e5450160f7..fc033bc741c19 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -33,6 +33,7 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #47184 ; CHECK-NEXT: movk w8, #49807, lsl #16 ; CHECK-NEXT: movk w9, #1310, lsl #16 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: mov w8, #23592 @@ -40,11 +41,10 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-NEXT: movk w8, #655, lsl #16 ; CHECK-NEXT: shl v0.4s, v2.4s, #30 ; CHECK-NEXT: ushr v1.4s, v2.4s, #2 +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -86,6 +86,7 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w9, #47184 ; CHECK-NEXT: movk w8, #49807, lsl #16 ; CHECK-NEXT: movk w9, #1310, lsl #16 +; 
CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: mov w8, #23592 @@ -93,11 +94,10 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-NEXT: movk w8, #655, lsl #16 ; CHECK-NEXT: shl v0.4s, v2.4s, #30 ; CHECK-NEXT: ushr v1.4s, v2.4s, #2 +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -114,15 +114,15 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 ; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: movi v1.4s, #25 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sshr v3.4s, v2.4s, #3 -; CHECK-NEXT: usra v3.4s, v2.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s +; CHECK-NEXT: movi v3.4s, #25 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #3 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -137,15 +137,15 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 ; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: movi v1.4s, #100 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sshr v3.4s, v2.4s, #5 -; CHECK-NEXT: usra v3.4s, v2.4s, #31 
-; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s +; CHECK-NEXT: movi v3.4s, #100 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -184,12 +184,12 @@ define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind { define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_pow2: ; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v2.4s, v0.4s, #0 -; CHECK-NEXT: mov v3.16b, v0.16b +; CHECK-NEXT: cmlt v3.4s, v0.4s, #0 +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: usra v2.4s, v3.4s, #28 ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: usra v3.4s, v2.4s, #28 -; CHECK-NEXT: bic v3.4s, #15 -; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s +; CHECK-NEXT: bic v2.4s, #15 +; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -204,10 +204,10 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_int_min: ; CHECK: // %bb.0: ; CHECK-NEXT: cmlt v2.4s, v0.4s, #0 -; CHECK-NEXT: mov v3.16b, v0.16b -; CHECK-NEXT: movi v1.4s, #128, lsl #24 -; CHECK-NEXT: usra v3.4s, v2.4s, #1 -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: movi v3.4s, #128, lsl #24 +; CHECK-NEXT: usra v1.4s, v2.4s, #1 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 95e56df351daa..74c7a55c8be73 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -98,9 +98,9 @@ define <32 x i16> @v32i16(<32 
x i16> %x, <32 x i16> %y) nounwind { define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { ; CHECK-LABEL: v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqsub v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sqsub v0.8b, v1.8b, v0.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ -159,9 +159,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -225,9 +225,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { ; CHECK-LABEL: v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr b0, [x0] -; CHECK-NEXT: ldr b1, [x1] -; CHECK-NEXT: sqsub v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr b0, [x1] +; CHECK-NEXT: ldr b1, [x0] +; CHECK-NEXT: sqsub v0.8b, v1.8b, v0.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret %x = load <1 x i8>, <1 x i8>* %px @@ -240,9 +240,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { ; CHECK-LABEL: v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ldr h1, [x1] -; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr h0, [x1] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px @@ -255,10 +255,10 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { 
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 -; CHECK-NEXT: sshr v1.16b, v1.16b, #4 +; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: sshr v0.16b, v0.16b, #4 +; CHECK-NEXT: sshr v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll index f95860d55a401..8b1bae5009a12 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -52,11 +52,11 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_EQ_128-LABEL: sdiv_v8i8: ; VBITS_EQ_128: sshll v1.8h, v1.8b, #0 -; VBITS_EQ_128-NEXT: sshll v0.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: sshll v0.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h -; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h ; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h ; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h ; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -350,8 +350,8 @@ define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; CHECK-LABEL: sdiv_v4i16: ; CHECK: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s ; CHECK-NEXT: mov w8, v1.s[1] ; CHECK-NEXT: mov w9, v1.s[2] @@ -364,8 +364,8 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_EQ_128-LABEL: sdiv_v4i16: ; VBITS_EQ_128: sshll v1.4s, v1.4h, #0 -; VBITS_EQ_128-NEXT: sshll v0.4s, v0.4h, #0 ; 
VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: sshll v0.4s, v0.4h, #0 ; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s ; VBITS_EQ_128-NEXT: ret @@ -744,11 +744,11 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_EQ_128-LABEL: udiv_v8i8: ; VBITS_EQ_128: ushll v1.8h, v1.8b, #0 -; VBITS_EQ_128-NEXT: ushll v0.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: ushll v0.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h -; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h ; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h +; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h ; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h ; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -1040,8 +1040,8 @@ define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; CHECK-LABEL: udiv_v4i16: ; CHECK: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s ; CHECK-NEXT: mov w8, v1.s[1] ; CHECK-NEXT: mov w9, v1.s[2] @@ -1054,8 +1054,8 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_EQ_128-LABEL: udiv_v4i16: ; VBITS_EQ_128: ushll v1.4s, v1.4h, #0 -; VBITS_EQ_128-NEXT: ushll v0.4s, v0.4h, #0 ; VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: ushll v0.4s, v0.4h, #0 ; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s ; VBITS_EQ_128-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll index 710575a54477c..c83e6ded4841f 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -288,16 +288,16 @@ define <2 x i32> @smulh_v2i32(<2 x 
i32> %op1, <2 x i32> %op2) #0 { ; CHECK-LABEL: smulh_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret ; VBITS_EQ_128-LABEL: smulh_v2i32: ; VBITS_EQ_128: sshll v0.2d, v0.2s, #0 -; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0 ; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0 ; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d ; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 ; VBITS_EQ_128-NEXT: ret @@ -785,16 +785,16 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { ; CHECK-LABEL: umulh_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: shrn v0.2s, v0.2d, #32 ; CHECK-NEXT: ret ; VBITS_EQ_128-LABEL: umulh_v2i32: ; VBITS_EQ_128: ushll v0.2d, v0.2s, #0 -; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0 ; VBITS_EQ_128-NEXT: ptrue p0.d, vl2 +; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0 ; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d ; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32 ; VBITS_EQ_128-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 3626aa915541e..1a7774bd1174d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -53,11 +53,11 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_EQ_128-LABEL: srem_v8i8: ; VBITS_EQ_128: sshll v2.8h, v1.8b, #0 -; VBITS_EQ_128-NEXT: sshll v3.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: sshll v3.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: sunpkhi z4.s, z2.h -; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h ; VBITS_EQ_128-NEXT: 
sunpklo z2.s, z2.h +; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h ; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h ; VBITS_EQ_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s @@ -364,8 +364,8 @@ define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; CHECK-LABEL: srem_v4i16: ; CHECK: sshll v2.4s, v1.4h, #0 -; CHECK-NEXT: sshll v3.4s, v0.4h, #0 ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 ; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s ; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1] ; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2] @@ -379,8 +379,8 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_EQ_128-LABEL: srem_v4i16: ; VBITS_EQ_128: sshll v2.4s, v1.4h, #0 -; VBITS_EQ_128-NEXT: sshll v3.4s, v0.4h, #0 ; VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: sshll v3.4s, v0.4h, #0 ; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s ; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h @@ -812,11 +812,11 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_EQ_128-LABEL: urem_v8i8: ; VBITS_EQ_128: ushll v2.8h, v1.8b, #0 -; VBITS_EQ_128-NEXT: ushll v3.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: ushll v3.8h, v0.8b, #0 ; VBITS_EQ_128-NEXT: uunpkhi z4.s, z2.h -; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h ; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h +; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h ; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h ; VBITS_EQ_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s @@ -1121,8 +1121,8 @@ define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; CHECK-LABEL: urem_v4i16: ; CHECK: ushll v2.4s, v1.4h, #0 -; CHECK-NEXT: ushll v3.4s, v0.4h, #0 ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; 
CHECK-NEXT: ushll v3.4s, v0.4h, #0 ; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s ; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1] ; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2] @@ -1136,8 +1136,8 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_EQ_128-LABEL: urem_v4i16: ; VBITS_EQ_128: ushll v2.4s, v1.4h, #0 -; VBITS_EQ_128-NEXT: ushll v3.4s, v0.4h, #0 ; VBITS_EQ_128-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_128-NEXT: ushll v3.4s, v0.4h, #0 ; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s ; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index a2dc244c848bd..44bb3674ee1c6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function masked_scatter_v8i8,masked_scatter_v8i16,masked_scatter_v8i32,masked_scatter_v8i64 --prefix VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK @@ -85,9 +86,9 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8 ; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8 ; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h ; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h ; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s @@ -99,7 +100,6 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 
x i8*>* %b) #0 { ; VBITS_EQ_256-NEXT: st1b { z1.d }, p1, [z4.d] ; VBITS_EQ_256-NEXT: st1b { z0.d }, p0, [z3.d] ; VBITS_EQ_256-NEXT: ret -; ; VBITS_GE_512-LABEL: masked_scatter_v8i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ldr d0, [x0] @@ -108,8 +108,8 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; VBITS_GE_512-NEXT: cmeq v2.8b, v0.8b, #0 ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: sunpklo z2.h, z2.b +; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h ; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0 @@ -131,8 +131,8 @@ define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { ; VBITS_GE_1024-NEXT: cmeq v2.16b, v0.16b, #0 ; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_1024-NEXT: sunpklo z2.h, z2.b +; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h ; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_1024-NEXT: cmpne p0.d, p0/z, z2.d, #0 @@ -226,8 +226,8 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: sunpklo z2.s, z1.h +; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s ; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 @@ -240,7 +240,6 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; VBITS_EQ_256-NEXT: uunpklo z1.d, z3.s ; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [z4.d] ; VBITS_EQ_256-NEXT: ret -; ; VBITS_GE_512-LABEL: masked_scatter_v8i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ldr q0, [x0] @@ -369,7 
+368,6 @@ define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { ; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [z3.d] ; VBITS_EQ_256-NEXT: st1w { z0.d }, p1, [z2.d] ; VBITS_EQ_256-NEXT: ret -; ; VBITS_GE_512-LABEL: masked_scatter_v8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 @@ -455,10 +453,10 @@ define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: cmeq v1.2d, v0.2d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z2.d] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: cmeq v2.2d, v0.2d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <2 x i64>, <2 x i64>* %a %ptrs = load <2 x i64*>, <2 x i64*>* %b @@ -498,7 +496,6 @@ define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { ; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [z3.d] ; VBITS_EQ_256-NEXT: st1d { z0.d }, p1, [z2.d] ; VBITS_EQ_256-NEXT: ret -; ; VBITS_GE_512-LABEL: masked_scatter_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll index 2c13eea4ca4fa..19ebd4265bd61 100644 --- a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll +++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll @@ -15,8 +15,8 @@ define void @func_vscale_none(<16 x i32>* %a, <16 x i32>* %b) #0 { ; CHECK-NOARG-NEXT: ldp q6, q4, [x1] ; CHECK-NOARG-NEXT: stp q0, q1, [x0, #32] ; CHECK-NOARG-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-NOARG-NEXT: add v3.4s, v3.4s, v4.4s -; CHECK-NOARG-NEXT: stp q2, q3, [x0] +; CHECK-NOARG-NEXT: add v0.4s, v3.4s, v4.4s +; CHECK-NOARG-NEXT: stp q2, q0, [x0] ; CHECK-NOARG-NEXT: ret ; ; CHECK-ARG-LABEL: func_vscale_none: @@ -47,8 +47,8 @@ define void @func_vscale1_1(<16 x i32>* %a, <16 x i32>* %b) #1 { ; CHECK-NEXT: ldp q6, q4, [x1] ; 
CHECK-NEXT: stp q0, q1, [x0, #32] ; CHECK-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-NEXT: add v3.4s, v3.4s, v4.4s -; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: add v0.4s, v3.4s, v4.4s +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index f8298485f35b5..46dd3db9e97fe 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -97,9 +97,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { ; CHECK-LABEL: v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: uqadd v0.8b, v1.8b, v0.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ -158,9 +158,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: uqadd v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -225,9 +225,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { ; CHECK-LABEL: v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr b0, [x0] -; CHECK-NEXT: ldr b1, [x1] -; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr b0, [x1] +; CHECK-NEXT: ldr b1, [x0] +; CHECK-NEXT: uqadd v0.8b, v1.8b, v0.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret %x = load <1 x i8>, <1 x i8>* 
%px @@ -240,9 +240,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { ; CHECK-LABEL: v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ldr h1, [x1] -; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr h0, [x1] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: uqadd v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll index 334744116a8e1..e7f7e13756879 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll @@ -67,25 +67,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: adrp x9, .LCPI4_1 ; CHECK-NEXT: mov v0.h[1], w1 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 ; CHECK-NEXT: mov v0.h[2], w2 ; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_1] -; CHECK-NEXT: adrp x8, .LCPI4_2 -; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-NEXT: movi d1, #0x0000000000ffff -; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2] ; CHECK-NEXT: adrp x8, .LCPI4_3 -; CHECK-NEXT: shl v2.4h, v0.4h, #1 +; CHECK-NEXT: shl v3.4h, v0.4h, #1 ; CHECK-NEXT: bic v0.4h, #248, lsl #8 -; CHECK-NEXT: ushl v2.4h, v2.4h, v3.4h ; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-NEXT: ushl v1.4h, v3.4h, v2.4h +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: bic v0.4h, #248, lsl #8 -; CHECK-NEXT: cmhi v0.4h, v0.4h, 
v1.4h +; CHECK-NEXT: cmhi v0.4h, v0.4h, v2.4h ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: umov w1, v0.h[1] ; CHECK-NEXT: umov w2, v0.h[2] diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll index 4fe59ff237717..a2a27a1508a19 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -7,6 +7,7 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: adrp x9, .LCPI0_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: adrp x8, .LCPI0_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_2] @@ -15,11 +16,10 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI0_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -34,13 +34,13 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] ; CHECK-NEXT: adrp x8, .LCPI1_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_1] ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -51,13 +51,13 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) 
nounwind { ; CHECK-LABEL: test_urem_odd_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: adrp x8, .LCPI2_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -71,6 +71,7 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: adrp x9, .LCPI3_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: adrp x8, .LCPI3_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_2] @@ -79,11 +80,10 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI3_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -95,6 +95,7 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: adrp x9, .LCPI4_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] ; CHECK-NEXT: adrp x8, .LCPI4_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_2] @@ -103,11 +104,10 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI4_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] 
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -121,6 +121,7 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI5_0 ; CHECK-NEXT: adrp x9, .LCPI5_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: adrp x8, .LCPI5_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_2] @@ -129,11 +130,10 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI5_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -145,6 +145,7 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 ; CHECK-NEXT: adrp x9, .LCPI6_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] ; CHECK-NEXT: adrp x8, .LCPI6_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_2] @@ -153,11 +154,10 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI6_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_3] -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: 
movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -173,6 +173,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI7_0 ; CHECK-NEXT: adrp x9, .LCPI7_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: adrp x8, .LCPI7_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_2] @@ -181,11 +182,10 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI7_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -199,6 +199,7 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 ; CHECK-NEXT: adrp x9, .LCPI8_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: adrp x8, .LCPI8_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_2] @@ -207,11 +208,10 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI8_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret 
%urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -225,6 +225,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI9_0 ; CHECK-NEXT: adrp x9, .LCPI9_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] ; CHECK-NEXT: adrp x8, .LCPI9_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_2] @@ -233,11 +234,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI9_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -252,14 +252,14 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_one: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: adrp x9, .LCPI10_0 ; CHECK-NEXT: movk w8, #52428, lsl #16 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI10_0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -273,16 +273,16 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #28087 ; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI11_0 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, 
[x8, :lo12:.LCPI11_0] ; CHECK-NEXT: shl v1.4s, v0.4s, #31 ; CHECK-NEXT: ushr v0.4s, v0.4s, #1 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -296,6 +296,7 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 ; CHECK-NEXT: adrp x9, .LCPI12_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: adrp x8, .LCPI12_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_2] @@ -304,11 +305,10 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI12_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -324,6 +324,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI13_0 ; CHECK-NEXT: adrp x9, .LCPI13_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: adrp x8, .LCPI13_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI13_2] @@ -332,11 +333,10 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI13_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr 
q1, [x8, :lo12:.LCPI13_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -350,6 +350,7 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI14_0 ; CHECK-NEXT: adrp x9, .LCPI14_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: adrp x8, .LCPI14_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_2] @@ -358,11 +359,10 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI14_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -376,6 +376,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: adrp x9, .LCPI15_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: adrp x8, .LCPI15_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI15_2] @@ -384,11 +385,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI15_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; 
CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -404,6 +404,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 ; CHECK-NEXT: adrp x9, .LCPI16_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: adrp x8, .LCPI16_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_2] @@ -412,11 +413,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI16_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -430,6 +430,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 ; CHECK-NEXT: adrp x9, .LCPI17_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: adrp x8, .LCPI17_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_2] @@ -438,11 +439,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI17_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: 
ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -456,6 +456,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 ; CHECK-NEXT: adrp x9, .LCPI18_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: adrp x8, .LCPI18_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_2] @@ -464,11 +465,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-NEXT: adrp x8, .LCPI18_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -483,13 +483,13 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: adrp x8, .LCPI19_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_1] ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -503,6 +503,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI20_0 ; CHECK-NEXT: adrp x9, .LCPI20_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: adrp x8, .LCPI20_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI20_2] @@ -511,11 +512,10 @@ define <4 x i32> 
@test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI20_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -529,6 +529,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 ; CHECK-NEXT: adrp x9, .LCPI21_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: adrp x8, .LCPI21_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_2] @@ -537,11 +538,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI21_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -557,6 +557,7 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 ; CHECK-NEXT: adrp x9, .LCPI22_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: adrp x8, .LCPI22_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_2] @@ -565,11 +566,10 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI22_3 ; CHECK-NEXT: ushl 
v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -583,6 +583,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 ; CHECK-NEXT: adrp x9, .LCPI23_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: adrp x8, .LCPI23_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_2] @@ -591,11 +592,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI23_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -609,6 +609,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI24_0 ; CHECK-NEXT: adrp x9, .LCPI24_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: adrp x8, .LCPI24_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_2] @@ -617,11 +618,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-NEXT: adrp x8, .LCPI24_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, 
:lo12:.LCPI24_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -636,6 +636,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI25_0 ; CHECK-NEXT: adrp x9, .LCPI25_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] ; CHECK-NEXT: adrp x8, .LCPI25_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_2] @@ -644,11 +645,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-NEXT: adrp x8, .LCPI25_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -661,6 +661,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: adrp x9, .LCPI26_2 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] ; CHECK-NEXT: adrp x8, .LCPI26_1 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_2] @@ -669,11 +670,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-NEXT: adrp x8, .LCPI26_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, 
:lo12:.LCPI26_3] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll index c59ff82586968..a989eaa37c11f 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -54,11 +54,11 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-NEXT: mov w8, #43690 ; CHECK-NEXT: movk w8, #10922, lsl #16 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: shl v1.4s, v0.4s, #31 ; CHECK-NEXT: ushr v0.4s, v0.4s, #1 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -70,18 +70,18 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind { ; CHECK-LABEL: t32_6_part1: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: mov w9, #43691 +; CHECK-NEXT: movk w9, #43690, lsl #16 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] ; CHECK-NEXT: shl v1.4s, v0.4s, #31 ; CHECK-NEXT: ushr v0.4s, v0.4s, #1 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 
x i32> %X, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll index e5b21fcd6553b..19c59975d7de9 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll @@ -7,14 +7,14 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #23593 ; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: mov w8, #28835 ; CHECK-NEXT: movk w8, #2621, lsl #16 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -28,17 +28,17 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #23593 ; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: mov w8, #23592 ; CHECK-NEXT: movk w8, #655, lsl #16 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: shl v1.4s, v0.4s, #30 ; CHECK-NEXT: ushr v0.4s, v0.4s, #2 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -53,13 +53,13 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_neg25: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: adrp x8, .LCPI2_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] ; 
CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -72,16 +72,16 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_neg100: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: adrp x8, .LCPI3_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] ; CHECK-NEXT: shl v1.4s, v0.4s, #30 ; CHECK-NEXT: ushr v0.4s, v0.4s, #2 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -98,13 +98,13 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 ; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: movi v1.4s, #25 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ushr v2.4s, v2.4s, #3 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #25 +; CHECK-NEXT: ushr v1.4s, v1.4s, #3 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -120,13 +120,13 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 ; CHECK-NEXT: movk w8, #20971, lsl #16 -; 
CHECK-NEXT: movi v1.4s, #100 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ushr v2.4s, v2.4s, #5 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #100 +; CHECK-NEXT: ushr v1.4s, v1.4s, #5 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -167,10 +167,10 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_pow2: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #15 +; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -182,8 +182,8 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind { define <4 x i32> @test_urem_int_min(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: bic v0.4s, #128, lsl #24 ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: bic v0.4s, #128, lsl #24 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll index 30574505998d0..be08ee4c893bd 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll @@ -5,11 +5,11 @@ define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind { ; CHECK-LABEL: t0_all_tautological: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_1 ; CHECK-NEXT: ldr q1, 
[x8, :lo12:.LCPI0_0] -; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_1] -; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index 5e7f7d350f33e..666e5a6134330 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -98,9 +98,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind { ; CHECK-LABEL: v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: uqsub v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: uqsub v0.8b, v1.8b, v0.8b ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <8 x i8>, <8 x i8>* %px @@ -155,9 +155,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind { ; CHECK-LABEL: v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: uqsub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str d0, [x2] ; CHECK-NEXT: ret %x = load <4 x i16>, <4 x i16>* %px @@ -220,9 +220,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { ; CHECK-LABEL: v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr b0, [x0] -; CHECK-NEXT: ldr b1, [x1] -; CHECK-NEXT: uqsub v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ldr b0, [x1] +; CHECK-NEXT: ldr b1, [x0] +; CHECK-NEXT: uqsub v0.8b, v1.8b, v0.8b ; CHECK-NEXT: st1 { v0.b }[0], [x2] ; CHECK-NEXT: ret 
%x = load <1 x i8>, <1 x i8>* %px @@ -235,9 +235,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind { define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind { ; CHECK-LABEL: v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ldr h1, [x1] -; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr h0, [x1] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: uqsub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: str h0, [x2] ; CHECK-NEXT: ret %x = load <1 x i16>, <1 x i16>* %px diff --git a/llvm/test/CodeGen/AArch64/vec_cttz.ll b/llvm/test/CodeGen/AArch64/vec_cttz.ll index 1cd7f34082337..231790fc21219 100644 --- a/llvm/test/CodeGen/AArch64/vec_cttz.ll +++ b/llvm/test/CodeGen/AArch64/vec_cttz.ll @@ -85,8 +85,8 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %a) nounwind { ; CHECK-NEXT: movi v1.8h, #1 ; CHECK-NEXT: sub v1.8h, v0.8h, v1.8h ; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b -; CHECK-NEXT: movi v1.8h, #16 ; CHECK-NEXT: clz v0.8h, v0.8h +; CHECK-NEXT: movi v1.8h, #16 ; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h ; CHECK-NEXT: ret %b = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) @@ -99,8 +99,8 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %a) nounwind { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s ; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b -; CHECK-NEXT: movi v1.4s, #32 ; CHECK-NEXT: clz v0.4s, v0.4s +; CHECK-NEXT: movi v1.4s, #32 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %b = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index f75d247e88c39..516f0297b462e 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -52,8 +52,8 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun ; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: st1 { v1.s }[2], [x8] ; 
CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: st1 { v1.s }[2], [x8] ; CHECK-NEXT: ret %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 @@ -84,27 +84,27 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun ; CHECK-NEXT: fmov s0, w6 ; CHECK-NEXT: fmov s1, w0 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: add x9, sp, #8 ; CHECK-NEXT: ldr s2, [sp, #16] -; CHECK-NEXT: fmov s3, w4 +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: add x10, sp, #8 ; CHECK-NEXT: mov v0.s[1], w7 +; CHECK-NEXT: fmov s3, w4 ; CHECK-NEXT: mov v1.s[1], w1 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] ; CHECK-NEXT: mov v3.s[1], w5 ; CHECK-NEXT: ld1 { v0.s }[2], [x8] -; CHECK-NEXT: add x8, sp, #24 ; CHECK-NEXT: mov v1.s[2], w2 -; CHECK-NEXT: ld1 { v2.s }[1], [x8] -; CHECK-NEXT: ld1 { v0.s }[3], [x9] -; CHECK-NEXT: mov v1.s[3], w3 ; CHECK-NEXT: ldr x8, [sp, #32] ; CHECK-NEXT: add v2.4s, v3.4s, v2.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s +; CHECK-NEXT: ld1 { v0.s }[3], [x10] +; CHECK-NEXT: mov v1.s[3], w3 ; CHECK-NEXT: str d2, [x8, #16] -; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s -; CHECK-NEXT: str q0, [x8] +; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s ; CHECK-NEXT: mov w5, v3.s[1] ; CHECK-NEXT: fmov w4, s3 +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s +; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: mov w1, v1.s[1] ; CHECK-NEXT: mov w2, v1.s[2] ; CHECK-NEXT: mov w3, v1.s[3] @@ -141,23 +141,23 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; CHECK-NEXT: add v4.16b, v0.16b, v1.16b ; CHECK-NEXT: cmhi v0.16b, v0.16b, v4.16b ; CHECK-NEXT: str q4, [x0] -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b -; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b +; CHECK-NEXT: zip2 v2.8b, v0.8b, v0.8b +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: 
ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b -; CHECK-NEXT: zip2 v1.8b, v1.8b, v0.8b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b +; CHECK-NEXT: zip2 v5.8b, v0.8b, v0.8b +; CHECK-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-NEXT: shl v2.4s, v2.4s, #31 +; CHECK-NEXT: cmlt v0.4s, v1.4s, #0 +; CHECK-NEXT: cmlt v1.4s, v2.4s, #0 +; CHECK-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-NEXT: ushll v3.4s, v5.4h, #0 ; CHECK-NEXT: shl v2.4s, v2.4s, #31 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: shl v5.4s, v0.4s, #31 -; CHECK-NEXT: cmlt v0.4s, v2.4s, #0 ; CHECK-NEXT: shl v3.4s, v3.4s, #31 -; CHECK-NEXT: shl v6.4s, v1.4s, #31 -; CHECK-NEXT: cmlt v1.4s, v5.4s, #0 -; CHECK-NEXT: cmlt v2.4s, v3.4s, #0 -; CHECK-NEXT: cmlt v3.4s, v6.4s, #0 +; CHECK-NEXT: cmlt v2.4s, v2.4s, #0 +; CHECK-NEXT: cmlt v3.4s, v3.4s, #0 ; CHECK-NEXT: ret %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 @@ -213,26 +213,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; CHECK-NEXT: bic v1.4s, #255, lsl #24 ; CHECK-NEXT: bic v0.4s, #255, lsl #24 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov w8, v0.s[3] ; CHECK-NEXT: mov w9, v0.s[2] ; CHECK-NEXT: mov w10, v0.s[1] ; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: bic v1.4s, #255, lsl #24 ; CHECK-NEXT: sturh w8, [x0, #9] ; CHECK-NEXT: lsr w8, w8, #16 +; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s ; CHECK-NEXT: strh w9, [x0, #6] ; CHECK-NEXT: sturh w10, [x0, #3] ; CHECK-NEXT: lsr w9, w9, #16 -; CHECK-NEXT: strh w11, [x0] -; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s +; CHECK-NEXT: lsr w10, w10, #16 ; CHECK-NEXT: strb w8, [x0, #11] -; CHECK-NEXT: lsr w8, w10, #16 -; CHECK-NEXT: lsr w10, w11, #16 -; CHECK-NEXT: strb w9, [x0, #8] +; CHECK-NEXT: lsr w8, w11, #16 +; 
CHECK-NEXT: strh w11, [x0] ; CHECK-NEXT: mvn v0.16b, v1.16b -; CHECK-NEXT: strb w8, [x0, #5] -; CHECK-NEXT: strb w10, [x0, #2] +; CHECK-NEXT: strb w9, [x0, #8] +; CHECK-NEXT: strb w10, [x0, #5] +; CHECK-NEXT: strb w8, [x0, #2] ; CHECK-NEXT: ret %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 @@ -249,20 +249,20 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: and v1.8b, v0.8b, v2.8b ; CHECK-NEXT: umov w8, v0.h[1] ; CHECK-NEXT: umov w9, v0.h[2] ; CHECK-NEXT: umov w10, v0.h[0] ; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: cmeq v1.4h, v1.4h, v0.4h +; CHECK-NEXT: and v1.8b, v0.8b, v2.8b +; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h ; CHECK-NEXT: and w8, w8, #0x1 ; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: bfi w10, w8, #1, #1 -; CHECK-NEXT: mvn v1.8b, v1.8b ; CHECK-NEXT: bfi w10, w9, #2, #1 ; CHECK-NEXT: bfi w10, w11, #3, #29 ; CHECK-NEXT: and w8, w10, #0xf -; CHECK-NEXT: sshll v0.4s, v1.4h, #0 ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll index d305b688f3af2..602e4c727ef12 100644 --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -22,8 +22,8 @@ define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) noun ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: shrn v0.2s, v1.2d, #32 ; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s +; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: ret %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) 
%val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 @@ -39,8 +39,8 @@ define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) noun ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: shrn v0.2s, v1.2d, #32 ; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s +; CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: ret %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -96,37 +96,37 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun ; CHECK-NEXT: fmov s0, w6 ; CHECK-NEXT: fmov s1, w0 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: add x9, sp, #8 ; CHECK-NEXT: ldr s2, [sp, #16] -; CHECK-NEXT: fmov s3, w4 +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: add x10, sp, #8 ; CHECK-NEXT: mov v0.s[1], w7 +; CHECK-NEXT: fmov s3, w4 ; CHECK-NEXT: mov v1.s[1], w1 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] ; CHECK-NEXT: mov v3.s[1], w5 ; CHECK-NEXT: ld1 { v0.s }[2], [x8] -; CHECK-NEXT: add x8, sp, #24 ; CHECK-NEXT: mov v1.s[2], w2 -; CHECK-NEXT: ld1 { v2.s }[1], [x8] -; CHECK-NEXT: ld1 { v0.s }[3], [x9] -; CHECK-NEXT: mov v1.s[3], w3 ; CHECK-NEXT: ldr x8, [sp, #32] -; CHECK-NEXT: umull2 v6.2d, v3.4s, v2.4s +; CHECK-NEXT: umull2 v4.2d, v3.4s, v2.4s +; CHECK-NEXT: ld1 { v0.s }[3], [x10] +; CHECK-NEXT: mov v1.s[3], w3 ; CHECK-NEXT: umull v7.2d, v3.2s, v2.2s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v0.4s -; CHECK-NEXT: umull v5.2d, v1.2s, v0.2s ; CHECK-NEXT: mul v2.4s, v3.4s, v2.4s -; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v4.4s, v5.4s, v4.4s -; CHECK-NEXT: uzp2 v5.4s, v7.4s, v6.4s +; CHECK-NEXT: umull2 v5.2d, v1.4s, v0.4s +; CHECK-NEXT: umull v6.2d, v1.2s, v0.2s +; CHECK-NEXT: uzp2 v4.4s, v7.4s, v4.4s ; CHECK-NEXT: str d2, [x8, #16] -; CHECK-NEXT: str q0, [x8] +; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v5.4s, v6.4s, v5.4s ; CHECK-NEXT: cmtst v4.4s, v4.4s, v4.4s +; CHECK-NEXT: str q0, [x8] ; 
CHECK-NEXT: cmtst v3.4s, v5.4s, v5.4s -; CHECK-NEXT: mov w1, v4.s[1] -; CHECK-NEXT: mov w2, v4.s[2] -; CHECK-NEXT: mov w3, v4.s[3] -; CHECK-NEXT: mov w5, v3.s[1] -; CHECK-NEXT: fmov w0, s4 -; CHECK-NEXT: fmov w4, s3 +; CHECK-NEXT: mov w5, v4.s[1] +; CHECK-NEXT: fmov w4, s4 +; CHECK-NEXT: mov w1, v3.s[1] +; CHECK-NEXT: mov w2, v3.s[2] +; CHECK-NEXT: mov w3, v3.s[3] +; CHECK-NEXT: fmov w0, s3 ; CHECK-NEXT: ret %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 @@ -166,29 +166,27 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b ; CHECK-NEXT: umull v3.8h, v0.8b, v1.8b +; CHECK-NEXT: mul v5.16b, v0.16b, v1.16b ; CHECK-NEXT: uzp2 v2.16b, v3.16b, v2.16b +; CHECK-NEXT: str q5, [x0] ; CHECK-NEXT: cmtst v2.16b, v2.16b, v2.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: zip1 v4.8b, v2.8b, v0.8b -; CHECK-NEXT: zip2 v2.8b, v2.8b, v0.8b -; CHECK-NEXT: zip1 v5.8b, v3.8b, v0.8b -; CHECK-NEXT: zip2 v3.8b, v3.8b, v0.8b -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll v5.4s, v5.4h, #0 -; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: shl v4.4s, v4.4s, #31 +; CHECK-NEXT: zip1 v3.8b, v2.8b, v0.8b +; CHECK-NEXT: zip2 v4.8b, v2.8b, v0.8b +; CHECK-NEXT: ext v0.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ushll v1.4s, v3.4h, #0 +; CHECK-NEXT: ushll v2.4s, v4.4h, #0 +; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b +; CHECK-NEXT: zip2 v4.8b, v0.8b, v0.8b +; CHECK-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-NEXT: shl v2.4s, v2.4s, #31 +; CHECK-NEXT: cmlt v0.4s, v1.4s, #0 +; CHECK-NEXT: cmlt v1.4s, v2.4s, #0 +; CHECK-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-NEXT: ushll v3.4s, v4.4h, #0 ; CHECK-NEXT: shl v2.4s, v2.4s, #31 -; CHECK-NEXT: shl v6.4s, v5.4s, #31 ; CHECK-NEXT: shl v3.4s, v3.4s, #31 -; CHECK-NEXT: cmlt v4.4s, v4.4s, #0 -; CHECK-NEXT: cmlt v5.4s, v2.4s, #0 -; 
CHECK-NEXT: cmlt v2.4s, v6.4s, #0 +; CHECK-NEXT: cmlt v2.4s, v2.4s, #0 ; CHECK-NEXT: cmlt v3.4s, v3.4s, #0 -; CHECK-NEXT: mul v6.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v4.16b -; CHECK-NEXT: mov v1.16b, v5.16b -; CHECK-NEXT: str q6, [x0] ; CHECK-NEXT: ret %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 @@ -262,30 +260,30 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun ; CHECK: // %bb.0: ; CHECK-NEXT: bic v1.4s, #255, lsl #24 ; CHECK-NEXT: bic v0.4s, #255, lsl #24 -; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v3.4s, v2.4s -; CHECK-NEXT: ushr v2.4s, v0.4s, #24 -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[1] -; CHECK-NEXT: cmeq v1.4s, v1.4s, #0 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s +; CHECK-NEXT: mul v2.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: mov w8, v2.s[3] +; CHECK-NEXT: mov w10, v2.s[2] +; CHECK-NEXT: mov w11, v2.s[1] +; CHECK-NEXT: ushr v1.4s, v2.4s, #24 +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: cmtst v1.4s, v1.4s, v1.4s ; CHECK-NEXT: sturh w8, [x0, #9] ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: strh w9, [x0, #6] -; CHECK-NEXT: lsr w9, w9, #16 -; CHECK-NEXT: sturh w10, [x0, #3] -; CHECK-NEXT: orn v0.16b, v2.16b, v1.16b +; CHECK-NEXT: strh w10, [x0, #6] +; CHECK-NEXT: lsr w10, w10, #16 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: sturh w11, [x0, #3] +; CHECK-NEXT: lsr w11, w11, #16 ; CHECK-NEXT: strb w8, [x0, #11] -; CHECK-NEXT: lsr w8, w10, #16 -; CHECK-NEXT: lsr w10, w11, #16 -; CHECK-NEXT: strh w11, [x0] -; CHECK-NEXT: strb w9, [x0, #8] -; CHECK-NEXT: strb w8, [x0, #5] -; CHECK-NEXT: strb w10, [x0, #2] +; CHECK-NEXT: lsr w8, w9, 
#16 +; CHECK-NEXT: strh w9, [x0] +; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b +; CHECK-NEXT: strb w10, [x0, #8] +; CHECK-NEXT: strb w11, [x0, #5] +; CHECK-NEXT: strb w8, [x0, #2] ; CHECK-NEXT: ret %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 3b946b8b2e092..82f45e56f8331 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -407,17 +407,17 @@ define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s -; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s -; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s -; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: add v1.2d, v5.2d, v4.2d -; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v1.4s +; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v2.4s +; CHECK-NEXT: uaddl v1.2d, v3.2s, v1.2s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: add v2.2d, v5.2d, v4.2d ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -432,17 +432,17 @@ define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0 -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; 
CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s -; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s -; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s -; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: add v1.2d, v5.2d, v4.2d -; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: saddl2 v4.2d, v3.4s, v1.4s +; CHECK-NEXT: saddl2 v5.2d, v0.4s, v2.4s +; CHECK-NEXT: saddl v1.2d, v3.2s, v1.2s +; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: add v2.2d, v5.2d, v4.2d ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret @@ -1029,17 +1029,17 @@ define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s -; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s -; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s -; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: add v1.2d, v5.2d, v4.2d -; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v1.4s +; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v2.4s +; CHECK-NEXT: uaddl v1.2d, v3.2s, v1.2s +; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: add v2.2d, v5.2d, v4.2d ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: add x0, x8, x0 @@ -1056,17 +1056,17 @@ define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0 -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: sshll2 v3.4s, 
v0.8h, #0 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s -; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s -; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s -; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: add v1.2d, v5.2d, v4.2d -; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: saddl2 v4.2d, v3.4s, v1.4s +; CHECK-NEXT: saddl2 v5.2d, v0.4s, v2.4s +; CHECK-NEXT: saddl v1.2d, v3.2s, v1.2s +; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s +; CHECK-NEXT: add v2.2d, v5.2d, v4.2d ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: add x0, x8, x0 @@ -1766,29 +1766,29 @@ define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: ushll2 v5.4s, v0.8h, #0 +; CHECK-NEXT: ushll v3.4s, v2.4h, #0 +; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v6.4s, v3.4h, #0 -; CHECK-NEXT: ushll v7.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: uaddl2 v16.2d, v5.4s, v4.4s -; CHECK-NEXT: uaddl v4.2d, v5.2s, v4.2s -; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v2.4s -; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s -; CHECK-NEXT: uaddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: uaddl2 v5.2d, v4.4s, v2.4s +; CHECK-NEXT: uaddl2 v6.2d, v0.4s, v3.4s +; CHECK-NEXT: ushll2 v7.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s +; CHECK-NEXT: add v4.2d, v6.2d, v5.2d +; CHECK-NEXT: uaddl v0.2d, v0.2s, v3.2s +; CHECK-NEXT: ushll v3.4s, v7.4h, #0 +; CHECK-NEXT: ushll2 v5.4s, v7.8h, #0 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, 
#0 +; CHECK-NEXT: uaddl2 v7.2d, v6.4s, v5.4s +; CHECK-NEXT: uaddl v5.2d, v6.2s, v5.2s +; CHECK-NEXT: uaddl2 v6.2d, v1.4s, v3.4s ; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s -; CHECK-NEXT: uaddl2 v3.2d, v7.4s, v6.4s -; CHECK-NEXT: uaddl v6.2d, v7.2s, v6.2s -; CHECK-NEXT: add v5.2d, v5.2d, v16.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: add v2.2d, v6.2d, v7.2d +; CHECK-NEXT: add v1.2d, v1.2d, v5.2d ; CHECK-NEXT: add v0.2d, v0.2d, v4.2d -; CHECK-NEXT: add v2.2d, v3.2d, v2.2d -; CHECK-NEXT: add v1.2d, v6.2d, v1.2d -; CHECK-NEXT: add v0.2d, v0.2d, v5.2d ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: addp d0, v0.2d @@ -1808,29 +1808,29 @@ define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v5.4s, v0.8h, #0 +; CHECK-NEXT: sshll v3.4s, v2.4h, #0 +; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: sshll v6.4s, v3.4h, #0 -; CHECK-NEXT: sshll v7.4s, v1.4h, #0 -; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: saddl2 v16.2d, v5.4s, v4.4s -; CHECK-NEXT: saddl v4.2d, v5.2s, v4.2s -; CHECK-NEXT: saddl2 v5.2d, v0.4s, v2.4s -; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s -; CHECK-NEXT: saddl2 v2.2d, v1.4s, v3.4s +; CHECK-NEXT: saddl2 v5.2d, v4.4s, v2.4s +; CHECK-NEXT: saddl2 v6.2d, v0.4s, v3.4s +; CHECK-NEXT: sshll2 v7.8h, v1.16b, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: saddl v2.2d, v4.2s, v2.2s +; CHECK-NEXT: add v4.2d, v6.2d, v5.2d +; CHECK-NEXT: saddl v0.2d, v0.2s, v3.2s +; CHECK-NEXT: sshll v3.4s, v7.4h, #0 +; CHECK-NEXT: sshll2 v5.4s, v7.8h, #0 +; CHECK-NEXT: sshll2 v6.4s, v1.8h, #0 +; CHECK-NEXT: sshll 
v1.4s, v1.4h, #0 +; CHECK-NEXT: saddl2 v7.2d, v6.4s, v5.4s +; CHECK-NEXT: saddl v5.2d, v6.2s, v5.2s +; CHECK-NEXT: saddl2 v6.2d, v1.4s, v3.4s ; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s -; CHECK-NEXT: saddl2 v3.2d, v7.4s, v6.4s -; CHECK-NEXT: saddl v6.2d, v7.2s, v6.2s -; CHECK-NEXT: add v5.2d, v5.2d, v16.2d +; CHECK-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-NEXT: add v2.2d, v6.2d, v7.2d +; CHECK-NEXT: add v1.2d, v1.2d, v5.2d ; CHECK-NEXT: add v0.2d, v0.2d, v4.2d -; CHECK-NEXT: add v2.2d, v3.2d, v2.2d -; CHECK-NEXT: add v1.2d, v6.2d, v1.2d -; CHECK-NEXT: add v0.2d, v0.2d, v5.2d ; CHECK-NEXT: add v1.2d, v1.2d, v2.2d ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: addp d0, v0.2d @@ -1925,21 +1925,21 @@ entry: define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: add_pair_v4i8_v4i64_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll v3.2d, v1.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v2.2d, v1.2s, #0 +; CHECK-NEXT: ushll v3.2d, v0.2s, #0 ; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-NEXT: shl v2.2d, v2.2d, #56 +; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 ; CHECK-NEXT: shl v3.2d, v3.2d, #56 +; CHECK-NEXT: shl v2.2d, v2.2d, #56 ; CHECK-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-NEXT: sshr v3.2d, v3.2d, #56 ; CHECK-NEXT: shl v1.2d, v1.2d, #56 ; CHECK-NEXT: sshr v2.2d, v2.2d, #56 -; CHECK-NEXT: sshr v3.2d, v3.2d, #56 -; CHECK-NEXT: ssra v2.2d, v0.2d, #56 -; CHECK-NEXT: ssra v3.2d, v1.2d, #56 -; CHECK-NEXT: add v0.2d, v2.2d, v3.2d +; CHECK-NEXT: ssra v3.2d, v0.2d, #56 +; CHECK-NEXT: ssra v2.2d, v1.2d, #56 +; CHECK-NEXT: add v0.2d, v3.2d, v2.2d ; CHECK-NEXT: addp d0, v0.2d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll index fc0f3a10f5b16..5d87506177e03 100644 --- 
a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -96,9 +96,9 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov w8, #-1 ; CHECK-NEXT: umov w12, v0.b[4] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.b[9], w8 ; CHECK-NEXT: mov v1.b[10], w8 ; CHECK-NEXT: mov v1.b[11], w8 @@ -129,8 +129,8 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: and v1.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index 1a946dd2ca05c..51b60332bf5ab 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -302,14 +302,14 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind { ; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7 ; CHECK-FP-NEXT: mov v0.h[1], v1.h[0] ; CHECK-FP-NEXT: mvni v1.8h, #4, lsl #8 -; CHECK-FP-NEXT: mov v0.h[2], v2.h[0] ; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8] ; CHECK-FP-NEXT: add x8, sp, #8 -; CHECK-FP-NEXT: mov v0.h[3], v3.h[0] +; CHECK-FP-NEXT: mov v0.h[2], v2.h[0] ; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8] ; CHECK-FP-NEXT: add x8, sp, #16 -; CHECK-FP-NEXT: mov v0.h[4], v4.h[0] +; CHECK-FP-NEXT: mov v0.h[3], v3.h[0] ; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8] +; CHECK-FP-NEXT: mov v0.h[4], v4.h[0] ; CHECK-FP-NEXT: mov v0.h[5], v5.h[0] ; CHECK-FP-NEXT: mov v0.h[6], v6.h[0] ; CHECK-FP-NEXT: mov v0.h[7], v7.h[0] diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll 
b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll index 06aa415b909f5..fade974b07dc7 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -302,14 +302,14 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind { ; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7 ; CHECK-FP-NEXT: mov v0.h[1], v1.h[0] ; CHECK-FP-NEXT: mvni v1.8h, #132, lsl #8 -; CHECK-FP-NEXT: mov v0.h[2], v2.h[0] ; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8] ; CHECK-FP-NEXT: add x8, sp, #8 -; CHECK-FP-NEXT: mov v0.h[3], v3.h[0] +; CHECK-FP-NEXT: mov v0.h[2], v2.h[0] ; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8] ; CHECK-FP-NEXT: add x8, sp, #16 -; CHECK-FP-NEXT: mov v0.h[4], v4.h[0] +; CHECK-FP-NEXT: mov v0.h[3], v3.h[0] ; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8] +; CHECK-FP-NEXT: mov v0.h[4], v4.h[0] ; CHECK-FP-NEXT: mov v0.h[5], v5.h[0] ; CHECK-FP-NEXT: mov v0.h[6], v6.h[0] ; CHECK-FP-NEXT: mov v0.h[7], v7.h[0] diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll index 3e9673acb0061..754d9e8eb7eda 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ -82,8 +82,8 @@ define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0 define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_copysign_v2f32_v2f64: ; CHECK: ; %bb.0: -; CHECK-NEXT: mvni.2s v2, #128, lsl #24 ; CHECK-NEXT: fcvtn v1.2s, v1.2d +; CHECK-NEXT: mvni.2s v2, #128, lsl #24 ; CHECK-NEXT: bif.8b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fptrunc <2 x double> %b to <2 x float> @@ -110,9 +110,9 @@ define <4 x float> @test_copysign_v4f32_v4f64(<4 x float> %a, <4 x double> %b) # ; CHECK-LABEL: test_copysign_v4f32_v4f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: mvni.4s v3, #128, lsl #24 ; CHECK-NEXT: fcvtn2 v1.4s, v2.2d -; CHECK-NEXT: bif.16b v0, v1, v3 +; CHECK-NEXT: 
mvni.4s v2, #128, lsl #24 +; CHECK-NEXT: bif.16b v0, v1, v2 ; CHECK-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x float> %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0) @@ -191,21 +191,21 @@ define <4 x half> @test_copysign_v4f16_v4f16(<4 x half> %a, <4 x half> %b) #0 { ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0 ; NOFP16-NEXT: mov h3, v1[1] ; NOFP16-NEXT: mov h4, v0[1] -; NOFP16-NEXT: mvni.4s v2, #128, lsl #24 ; NOFP16-NEXT: fcvt s5, h1 ; NOFP16-NEXT: fcvt s6, h0 ; NOFP16-NEXT: mov h7, v1[2] ; NOFP16-NEXT: mov h16, v0[2] +; NOFP16-NEXT: mvni.4s v2, #128, lsl #24 +; NOFP16-NEXT: mov h1, v1[3] ; NOFP16-NEXT: fcvt s3, h3 ; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: mov h1, v1[3] ; NOFP16-NEXT: bit.16b v5, v6, v2 ; NOFP16-NEXT: fcvt s6, h7 ; NOFP16-NEXT: fcvt s7, h16 +; NOFP16-NEXT: fcvt s1, h1 ; NOFP16-NEXT: bit.16b v3, v4, v2 ; NOFP16-NEXT: mov h4, v0[3] ; NOFP16-NEXT: fcvt h0, s5 -; NOFP16-NEXT: fcvt s1, h1 ; NOFP16-NEXT: bit.16b v6, v7, v2 ; NOFP16-NEXT: fcvt h3, s3 ; NOFP16-NEXT: fcvt s4, h4 @@ -233,9 +233,9 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 { ; NOFP16-NEXT: fcvtn v1.4h, v1.4s ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0 ; NOFP16-NEXT: mov h3, v0[1] -; NOFP16-NEXT: mvni.4s v2, #128, lsl #24 ; NOFP16-NEXT: fcvt s5, h0 ; NOFP16-NEXT: mov h7, v0[2] +; NOFP16-NEXT: mvni.4s v2, #128, lsl #24 ; NOFP16-NEXT: mov h4, v1[1] ; NOFP16-NEXT: fcvt s6, h1 ; NOFP16-NEXT: mov h16, v1[2] @@ -263,8 +263,8 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 { ; ; FP16-LABEL: test_copysign_v4f16_v4f32: ; FP16: ; %bb.0: -; FP16-NEXT: mvni.4h v2, #128, lsl #8 ; FP16-NEXT: fcvtn v1.4h, v1.4s +; FP16-NEXT: mvni.4h v2, #128, lsl #8 ; FP16-NEXT: bif.8b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x float> %b to <4 x half> @@ -278,28 +278,28 @@ define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0 ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def 
$q0 ; NOFP16-NEXT: mov d4, v1[1] ; NOFP16-NEXT: mov h5, v0[1] -; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: fcvt s1, d1 ; NOFP16-NEXT: fcvt s6, h0 ; NOFP16-NEXT: mov h7, v0[2] +; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: fcvt s4, d4 ; NOFP16-NEXT: fcvt s5, h5 ; NOFP16-NEXT: bit.16b v1, v6, v3 ; NOFP16-NEXT: fcvt s6, d2 ; NOFP16-NEXT: fcvt s7, h7 -; NOFP16-NEXT: bit.16b v4, v5, v3 ; NOFP16-NEXT: mov d2, v2[1] +; NOFP16-NEXT: bit.16b v4, v5, v3 ; NOFP16-NEXT: mov h5, v0[3] ; NOFP16-NEXT: fcvt h0, s1 ; NOFP16-NEXT: bit.16b v6, v7, v3 -; NOFP16-NEXT: fcvt h1, s4 ; NOFP16-NEXT: fcvt s2, d2 +; NOFP16-NEXT: fcvt h1, s4 ; NOFP16-NEXT: fcvt s4, h5 ; NOFP16-NEXT: fcvt h5, s6 ; NOFP16-NEXT: mov.h v0[1], v1[0] ; NOFP16-NEXT: mov.16b v1, v3 -; NOFP16-NEXT: mov.h v0[2], v5[0] ; NOFP16-NEXT: bsl.16b v1, v4, v2 +; NOFP16-NEXT: mov.h v0[2], v5[0] ; NOFP16-NEXT: fcvt h1, s1 ; NOFP16-NEXT: mov.h v0[3], v1[0] ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0 @@ -307,17 +307,17 @@ define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0 ; ; FP16-LABEL: test_copysign_v4f16_v4f64: ; FP16: ; %bb.0: -; FP16-NEXT: mov d4, v1[1] +; FP16-NEXT: mov d3, v1[1] ; FP16-NEXT: fcvt h1, d1 -; FP16-NEXT: mvni.4h v3, #128, lsl #8 -; FP16-NEXT: fcvt h4, d4 -; FP16-NEXT: mov.h v1[1], v4[0] -; FP16-NEXT: fcvt h4, d2 +; FP16-NEXT: fcvt h3, d3 +; FP16-NEXT: mov.h v1[1], v3[0] +; FP16-NEXT: fcvt h3, d2 ; FP16-NEXT: mov d2, v2[1] -; FP16-NEXT: mov.h v1[2], v4[0] +; FP16-NEXT: mov.h v1[2], v3[0] ; FP16-NEXT: fcvt h2, d2 ; FP16-NEXT: mov.h v1[3], v2[0] -; FP16-NEXT: bif.8b v0, v1, v3 +; FP16-NEXT: mvni.4h v2, #128, lsl #8 +; FP16-NEXT: bif.8b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x half> %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0) @@ -333,33 +333,33 @@ define <8 x half> @test_copysign_v8f16_v8f16(<8 x half> %a, <8 x half> %b) #0 { ; NOFP16: ; %bb.0: ; NOFP16-NEXT: mov h5, v1[1] ; NOFP16-NEXT: mov h6, 
v0[1] -; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: fcvt s2, h1 ; NOFP16-NEXT: fcvt s4, h0 ; NOFP16-NEXT: mov h7, v1[2] ; NOFP16-NEXT: mov h16, v0[2] +; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 +; NOFP16-NEXT: mov h17, v0[3] ; NOFP16-NEXT: fcvt s5, h5 ; NOFP16-NEXT: fcvt s6, h6 -; NOFP16-NEXT: mov h17, v0[3] ; NOFP16-NEXT: mov h18, v0[5] ; NOFP16-NEXT: bit.16b v2, v4, v3 ; NOFP16-NEXT: mov h4, v1[3] ; NOFP16-NEXT: fcvt s7, h7 ; NOFP16-NEXT: fcvt s16, h16 -; NOFP16-NEXT: bit.16b v5, v6, v3 ; NOFP16-NEXT: fcvt s17, h17 +; NOFP16-NEXT: bit.16b v5, v6, v3 ; NOFP16-NEXT: mov.16b v6, v3 ; NOFP16-NEXT: fcvt s4, h4 -; NOFP16-NEXT: fcvt h2, s2 -; NOFP16-NEXT: fcvt h5, s5 ; NOFP16-NEXT: bsl.16b v6, v16, v7 ; NOFP16-NEXT: mov h7, v1[4] ; NOFP16-NEXT: mov h16, v0[4] +; NOFP16-NEXT: fcvt h2, s2 +; NOFP16-NEXT: fcvt h5, s5 ; NOFP16-NEXT: bit.16b v4, v17, v3 ; NOFP16-NEXT: mov h17, v1[5] -; NOFP16-NEXT: mov.h v2[1], v5[0] ; NOFP16-NEXT: fcvt s7, h7 ; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: mov.h v2[1], v5[0] ; NOFP16-NEXT: fcvt h5, s6 ; NOFP16-NEXT: fcvt s6, h17 ; NOFP16-NEXT: fcvt s17, h18 @@ -403,11 +403,11 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 { ; NOFP16: ; %bb.0: ; NOFP16-NEXT: fcvtn v1.4h, v1.4s ; NOFP16-NEXT: fcvtn v2.4h, v2.4s -; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: mov h4, v0[1] ; NOFP16-NEXT: mov h5, v0[4] ; NOFP16-NEXT: fcvt s7, h0 ; NOFP16-NEXT: mov h17, v0[2] +; NOFP16-NEXT: mvni.4s v3, #128, lsl #24 ; NOFP16-NEXT: mov h6, v1[1] ; NOFP16-NEXT: fcvt s16, h1 ; NOFP16-NEXT: fcvt s4, h4 @@ -425,29 +425,29 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 { ; NOFP16-NEXT: fcvt h1, s7 ; NOFP16-NEXT: mov.16b v7, v3 ; NOFP16-NEXT: fcvt h4, s4 +; NOFP16-NEXT: bsl.16b v7, v17, v18 ; NOFP16-NEXT: fcvt s6, h6 ; NOFP16-NEXT: fcvt s16, h16 -; NOFP16-NEXT: fcvt h5, s5 -; NOFP16-NEXT: bsl.16b v7, v17, v18 ; NOFP16-NEXT: mov h17, v0[5] ; NOFP16-NEXT: mov h18, v2[1] +; NOFP16-NEXT: 
fcvt h5, s5 ; NOFP16-NEXT: mov.h v1[1], v4[0] -; NOFP16-NEXT: bif.16b v6, v16, v3 ; NOFP16-NEXT: fcvt h4, s7 +; NOFP16-NEXT: bif.16b v6, v16, v3 ; NOFP16-NEXT: fcvt s7, h17 ; NOFP16-NEXT: fcvt s17, h18 -; NOFP16-NEXT: mov h16, v2[2] -; NOFP16-NEXT: mov h2, v2[3] -; NOFP16-NEXT: fcvt h6, s6 ; NOFP16-NEXT: mov.h v1[2], v4[0] ; NOFP16-NEXT: mov h4, v0[6] -; NOFP16-NEXT: bif.16b v7, v17, v3 -; NOFP16-NEXT: fcvt s16, h16 +; NOFP16-NEXT: mov h16, v2[2] +; NOFP16-NEXT: fcvt h6, s6 ; NOFP16-NEXT: mov h0, v0[7] -; NOFP16-NEXT: fcvt s2, h2 +; NOFP16-NEXT: bif.16b v7, v17, v3 +; NOFP16-NEXT: mov h2, v2[3] ; NOFP16-NEXT: fcvt s4, h4 +; NOFP16-NEXT: fcvt s16, h16 ; NOFP16-NEXT: mov.h v1[3], v6[0] ; NOFP16-NEXT: fcvt s0, h0 +; NOFP16-NEXT: fcvt s2, h2 ; NOFP16-NEXT: bif.16b v4, v16, v3 ; NOFP16-NEXT: mov.h v1[4], v5[0] ; NOFP16-NEXT: fcvt h5, s7 @@ -464,9 +464,9 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 { ; FP16: ; %bb.0: ; FP16-NEXT: fcvtn v2.4h, v2.4s ; FP16-NEXT: fcvtn v1.4h, v1.4s -; FP16-NEXT: mvni.8h v3, #128, lsl #8 ; FP16-NEXT: mov.d v1[1], v2[0] -; FP16-NEXT: bif.16b v0, v1, v3 +; FP16-NEXT: mvni.8h v2, #128, lsl #8 +; FP16-NEXT: bif.16b v0, v1, v2 ; FP16-NEXT: ret %tmp0 = fptrunc <8 x float> %b to <8 x half> %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0) diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll index 763edf825e1f2..30ba6f2e34643 100644 --- a/llvm/test/CodeGen/AArch64/vselect-constants.ll +++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll @@ -10,11 +10,11 @@ define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) { ; CHECK-LABEL: sel_C1_or_C2_vec: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: adrp x9, .LCPI0_1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: 
ldr q2, [x9, :lo12:.LCPI0_1] ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b @@ -29,9 +29,9 @@ define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) { ; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: adrp x9, .LCPI1_1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_1] -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI1_1] +; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %cond = icmp eq <4 x i32> %x, %y %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> @@ -41,11 +41,11 @@ define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @sel_Cplus1_or_C_vec(<4 x i1> %cond) { ; CHECK-LABEL: sel_Cplus1_or_C_vec: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: adrp x9, .LCPI2_1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1] ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b @@ -60,9 +60,9 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) { ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: adrp x9, .LCPI3_1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_1] -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %cond = icmp eq <4 x i32> %x, %y %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> @@ -72,11 +72,11 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) { ; CHECK-LABEL: sel_Cminus1_or_C_vec: ; CHECK: // 
%bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: adrp x9, .LCPI4_1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_1] ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b @@ -91,9 +91,9 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) { ; CHECK-NEXT: adrp x8, .LCPI5_0 ; CHECK-NEXT: adrp x9, .LCPI5_1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1] -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI5_1] +; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b ; CHECK-NEXT: ret %cond = icmp eq <4 x i32> %x, %y %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s index fe34b839f3900..afffb854740ab 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s @@ -1070,27 +1070,27 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 4 0.50 abs d29, d24 -# CHECK-NEXT: 1 4 1.00 abs v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 abs v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 abs v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 abs v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 abs v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 abs v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 abs v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 add d17, d31, d29 -# CHECK-NEXT: 1 4 0.50 add v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 addhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 1.00 addhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 addhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 
addhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 addhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 1.00 addhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 addp v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 addp v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 and v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 bic v0.4h, #15, lsl #8 -# CHECK-NEXT: 1 4 0.50 bic v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 abs d29, d24 +# CHECK-NEXT: 1 3 1.00 abs v0.16b, v0.16b +# CHECK-NEXT: 1 3 1.00 abs v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 abs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 abs v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 abs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 abs v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 abs v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 add d17, d31, d29 +# CHECK-NEXT: 1 2 0.50 add v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 addhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 3 1.00 addhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 addhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 addhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 addhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 3 1.00 addhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 1 0.50 and v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 1 0.50 bic v0.4h, #15, lsl #8 +# CHECK-NEXT: 1 1 0.50 bic v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 1.00 bif v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 1.00 bit v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 bsl v0.8b, v0.8b, v0.8b @@ -1106,28 +1106,28 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 clz v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 clz v0.8b, v0.8b # CHECK-NEXT: 1 4 1.00 clz v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 cmeq d20, d21, #0 -# CHECK-NEXT: 1 4 0.50 cmeq d20, d21, d22 -# CHECK-NEXT: 1 4 1.00 cmeq v0.16b, v0.16b, #0 -# CHECK-NEXT: 1 4 1.00 cmeq v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 cmge d20, d21, #0 -# CHECK-NEXT: 1 4 0.50 cmge d20, d21, d22 -# CHECK-NEXT: 1 4 0.50 cmge v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 cmge 
v0.8b, v0.8b, #0 -# CHECK-NEXT: 1 4 0.50 cmgt d20, d21, #0 -# CHECK-NEXT: 1 4 0.50 cmgt d20, d21, d22 -# CHECK-NEXT: 1 4 0.50 cmgt v0.2s, v0.2s, #0 -# CHECK-NEXT: 1 4 1.00 cmgt v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 cmhi d20, d21, d22 -# CHECK-NEXT: 1 4 1.00 cmhi v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 cmhs d20, d21, d22 -# CHECK-NEXT: 1 4 0.50 cmhs v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 cmle d20, d21, #0 -# CHECK-NEXT: 1 4 1.00 cmle v0.2d, v0.2d, #0 -# CHECK-NEXT: 1 4 0.50 cmlt d20, d21, #0 -# CHECK-NEXT: 1 4 1.00 cmlt v0.8h, v0.8h, #0 -# CHECK-NEXT: 1 4 0.50 cmtst d20, d21, d22 -# CHECK-NEXT: 1 4 0.50 cmtst v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, d22 +# CHECK-NEXT: 1 2 1.00 cmeq v0.16b, v0.16b, #0 +# CHECK-NEXT: 1 2 1.00 cmeq v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 cmge d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmge d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmge v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 cmge v0.8b, v0.8b, #0 +# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, #0 +# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmgt v0.2s, v0.2s, #0 +# CHECK-NEXT: 1 2 1.00 cmgt v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 cmhi d20, d21, d22 +# CHECK-NEXT: 1 2 1.00 cmhi v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 cmhs d20, d21, d22 +# CHECK-NEXT: 1 2 0.50 cmhs v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 cmle d20, d21, #0 +# CHECK-NEXT: 1 2 1.00 cmle v0.2d, v0.2d, #0 +# CHECK-NEXT: 1 2 0.50 cmlt d20, d21, #0 +# CHECK-NEXT: 1 2 1.00 cmlt v0.8h, v0.8h, #0 +# CHECK-NEXT: 1 3 0.50 cmtst d20, d21, d22 +# CHECK-NEXT: 1 3 0.50 cmtst v0.2s, v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 cnt v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 cnt v0.8b, v0.8b # CHECK-NEXT: 1 2 0.50 dup v0.16b, w28 @@ -1137,7 +1137,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.50 dup v0.4s, w28 # CHECK-NEXT: 1 4 0.50 dup v0.8b, w28 # CHECK-NEXT: 1 2 0.50 dup v0.8h, w28 -# CHECK-NEXT: 1 4 1.00 eor v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 
1 1.00 eor v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 1.00 ext v0.16b, v0.16b, v0.16b, #3 # CHECK-NEXT: 1 4 0.50 ext v0.8b, v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 fabd d29, d24, d20 @@ -1429,8 +1429,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 mov d6, v0.d[1] # CHECK-NEXT: 1 4 0.50 mov h2, v0.h[5] # CHECK-NEXT: 1 4 0.50 mov s17, v0.s[2] -# CHECK-NEXT: 1 4 1.00 mov v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 mov v0.8b, v0.8b +# CHECK-NEXT: 1 1 1.00 mov v0.16b, v0.16b +# CHECK-NEXT: 1 1 0.50 mov v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 movi d15, #0xff00ff00ff00ff # CHECK-NEXT: 1 4 1.00 movi v0.16b, #31 # CHECK-NEXT: 1 4 1.00 movi v0.2d, #0xff0000ff0000ffff @@ -1438,31 +1438,31 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 movi v0.4s, #255, lsl #24 # CHECK-NEXT: 1 4 0.50 movi v0.8b, #255 # CHECK-NEXT: 1 4 0.50 mul v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 mvni v0.2s, #0 -# CHECK-NEXT: 1 4 1.00 mvni v0.4s, #16, msl #16 -# CHECK-NEXT: 1 4 0.50 neg d29, d24 -# CHECK-NEXT: 1 4 1.00 neg v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 neg v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 neg v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 neg v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 neg v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 neg v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 neg v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 mvn v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 mvn v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 orn v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 mov v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 orr v0.8h, #31 -# CHECK-NEXT: 1 4 1.00 pmul v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 pmul v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 pmull v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 pmull2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 raddhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 1.00 raddhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 raddhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 raddhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 raddhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 1.00 raddhn2 v0.8h, v0.4s, v0.4s +# 
CHECK-NEXT: 1 1 0.50 mvni v0.2s, #0 +# CHECK-NEXT: 1 1 1.00 mvni v0.4s, #16, msl #16 +# CHECK-NEXT: 1 2 0.50 neg d29, d24 +# CHECK-NEXT: 1 2 1.00 neg v0.16b, v0.16b +# CHECK-NEXT: 1 2 1.00 neg v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 neg v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 neg v0.4h, v0.4h +# CHECK-NEXT: 1 2 1.00 neg v0.4s, v0.4s +# CHECK-NEXT: 1 2 0.50 neg v0.8b, v0.8b +# CHECK-NEXT: 1 2 1.00 neg v0.8h, v0.8h +# CHECK-NEXT: 1 1 1.00 mvn v0.16b, v0.16b +# CHECK-NEXT: 1 1 0.50 mvn v0.8b, v0.8b +# CHECK-NEXT: 1 1 1.00 orn v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 1 1.00 mov v0.16b, v0.16b +# CHECK-NEXT: 1 1 1.00 orr v0.8h, #31 +# CHECK-NEXT: 1 3 1.00 pmul v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 pmul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 pmull v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 pmull2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 4 2.00 raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 4 2.00 raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 4 2.00 raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 4 2.00 raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 4 2.00 raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 4 2.00 raddhn2 v0.8h, v0.4s, v0.4s # CHECK-NEXT: 1 4 1.00 rbit v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 rbit v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 rev16 v21.8b, v1.8b @@ -1477,56 +1477,56 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 rev64 v2.8h, v4.8h # CHECK-NEXT: 1 4 0.50 rev64 v4.2s, v0.2s # CHECK-NEXT: 1 4 1.00 rev64 v6.4s, v8.4s -# CHECK-NEXT: 1 4 0.50 rshrn v0.2s, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 rshrn v0.4h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 rshrn v0.8b, v0.8h, #3 -# CHECK-NEXT: 1 4 1.00 rshrn2 v0.16b, v0.8h, #3 -# CHECK-NEXT: 1 4 1.00 rshrn2 v0.4s, v0.2d, #3 -# CHECK-NEXT: 1 4 1.00 rshrn2 v0.8h, v0.4s, #3 -# CHECK-NEXT: 1 4 1.00 rsubhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 1.00 rsubhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 rsubhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 rsubhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 rsubhn2 v0.4s, v0.2d, v0.2d -# 
CHECK-NEXT: 1 4 1.00 rsubhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 saba v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 sabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 sabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 sabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 sabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 sabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 sabal2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 sabd v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 sabdl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 sabdl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 sabdl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 sabdl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 sabdl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 sabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 sadalp v0.1d, v0.2s -# CHECK-NEXT: 1 4 1.00 sadalp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 sadalp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 sadalp v0.4h, v0.8b -# CHECK-NEXT: 1 4 1.00 sadalp v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 sadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 1.00 saddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 saddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 saddl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 saddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 saddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 saddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 saddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 1.00 saddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 saddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 saddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 1.00 saddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 saddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 1.00 saddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 1.00 saddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 1.00 saddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 1.00 saddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 1.00 saddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 rshrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 rshrn v0.4h, v0.4s, #3 +# 
CHECK-NEXT: 1 3 0.50 rshrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 3 1.00 rshrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 3 1.00 rshrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 3 1.00 rshrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 4 2.00 rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 4 2.00 rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 4 2.00 rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 4 2.00 rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 4 2.00 rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 4 2.00 rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 4 2.00 saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 4 2.00 sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 2.00 sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 2.00 sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 2.00 sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 2.00 sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 2.00 sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 sabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 sabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 1.00 sabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 sabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 sabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 sabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 sabdl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 4 2.00 sadalp v0.1d, v0.2s +# CHECK-NEXT: 1 4 2.00 sadalp v0.2d, v0.4s +# CHECK-NEXT: 1 4 2.00 sadalp v0.2s, v0.4h +# CHECK-NEXT: 1 4 2.00 sadalp v0.4h, v0.8b +# CHECK-NEXT: 1 4 2.00 sadalp v0.4s, v0.8h +# CHECK-NEXT: 1 4 2.00 sadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 1.00 saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 1.00 saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 saddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 1.00 saddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 1.00 saddlp v0.4s, v0.8h 
+# CHECK-NEXT: 1 3 1.00 saddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 1.00 saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 1.00 saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 1.00 saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 1.00 saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 1.00 saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 1.00 saddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 scvtf d21, d12 # CHECK-NEXT: 1 4 0.50 scvtf d21, d12, #64 # CHECK-NEXT: 1 4 0.50 scvtf s22, s13 @@ -1539,33 +1539,33 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 scvtf v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 scvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 scvtf v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 shadd v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 shl d7, d10, #12 -# CHECK-NEXT: 1 4 1.00 shl v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 shl v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 shl v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 shl v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 1.00 shll v0.2d, v0.2s, #32 -# CHECK-NEXT: 1 4 1.00 shll v0.4s, v0.4h, #16 -# CHECK-NEXT: 1 4 1.00 shll v0.8h, v0.8b, #8 -# CHECK-NEXT: 1 4 1.00 shll v0.2d, v0.2s, #32 -# CHECK-NEXT: 1 4 1.00 shll v0.4s, v0.4h, #16 -# CHECK-NEXT: 1 4 1.00 shll v0.8h, v0.8b, #8 -# CHECK-NEXT: 1 4 1.00 shll2 v0.2d, v0.4s, #32 -# CHECK-NEXT: 1 4 1.00 shll2 v0.4s, v0.8h, #16 -# CHECK-NEXT: 1 4 1.00 shll2 v0.8h, v0.16b, #8 -# CHECK-NEXT: 1 4 1.00 shll2 v0.2d, v0.4s, #32 -# CHECK-NEXT: 1 4 1.00 shll2 v0.4s, v0.8h, #16 -# CHECK-NEXT: 1 4 1.00 shll2 v0.8h, v0.16b, #8 -# CHECK-NEXT: 1 4 0.50 shrn v0.2s, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 shrn v0.4h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 shrn v0.8b, v0.8h, #3 -# CHECK-NEXT: 1 4 1.00 shrn2 v0.16b, v0.8h, #3 -# CHECK-NEXT: 1 4 1.00 shrn2 v0.4s, v0.2d, #3 -# CHECK-NEXT: 1 4 1.00 shrn2 v0.8h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 shsub v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 shsub v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 sli d10, d14, #12 +# CHECK-NEXT: 1 2 0.50 shadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 shl d7, d10, 
#12 +# CHECK-NEXT: 1 2 0.50 shl v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 shl v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 shl v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 shl v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 1.00 shll v0.2d, v0.2s, #32 +# CHECK-NEXT: 1 2 1.00 shll v0.4s, v0.4h, #16 +# CHECK-NEXT: 1 2 1.00 shll v0.8h, v0.8b, #8 +# CHECK-NEXT: 1 2 1.00 shll v0.2d, v0.2s, #32 +# CHECK-NEXT: 1 2 1.00 shll v0.4s, v0.4h, #16 +# CHECK-NEXT: 1 2 1.00 shll v0.8h, v0.8b, #8 +# CHECK-NEXT: 1 2 1.00 shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: 1 2 1.00 shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: 1 2 1.00 shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: 1 2 1.00 shll2 v0.2d, v0.4s, #32 +# CHECK-NEXT: 1 2 1.00 shll2 v0.4s, v0.8h, #16 +# CHECK-NEXT: 1 2 1.00 shll2 v0.8h, v0.16b, #8 +# CHECK-NEXT: 1 2 0.50 shrn v0.2s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 shrn v0.4h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 shrn v0.8b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 shsub v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 shsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sli d10, d14, #12 # CHECK-NEXT: 1 4 1.00 sli v0.16b, v0.16b, #3 # CHECK-NEXT: 1 4 1.00 sli v0.2d, v0.2d, #3 # CHECK-NEXT: 1 4 0.50 sli v0.2s, v0.2s, #3 @@ -1573,18 +1573,18 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 sli v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 sli v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 1.00 sli v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 smax v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smax v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smax v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 smaxp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smaxp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smaxp v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 0.50 smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 smax v0.8b, v0.8b, v0.8b +# 
CHECK-NEXT: 1 2 0.50 smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 1.00 smin v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 1.00 smin v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 smin v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 1.00 smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 1.00 sminp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 1.00 sminp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 1.00 sminp v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 smlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 smlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 1.00 smlal v0.8h, v0.8b, v0.8b @@ -1614,53 +1614,53 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 sqabs v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqabs v0.8b, v0.8b # CHECK-NEXT: 1 4 1.00 sqabs v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 sqadd b20, b11, b15 -# CHECK-NEXT: 1 4 1.00 sqadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 sqdmlal d19, s24, s12 -# CHECK-NEXT: 1 4 0.50 sqdmlal d8, s9, v0.s[1] -# CHECK-NEXT: 1 4 0.50 sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: 1 4 0.50 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 3 0.50 sqadd b20, b11, b15 +# CHECK-NEXT: 1 3 1.00 sqadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 sqadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 4 1.00 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 4 1.00 sqdmlal d8, s9, v0.s[1] +# CHECK-NEXT: 1 4 1.00 sqdmlal s0, h0, v0.h[3] +# CHECK-NEXT: 1 4 1.00 sqdmlal s17, h27, h12 # CHECK-NEXT: 1 4 1.00 sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 1.00 sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 1.00 sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 sqdmlsl d12, s23, s13 -# CHECK-NEXT: 1 4 0.50 sqdmlsl d8, s9, v0.s[1] -# CHECK-NEXT: 1 4 0.50 sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: 1 4 0.50 sqdmlsl s14, h12, h25 +# 
CHECK-NEXT: 1 4 1.00 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 4 1.00 sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: 1 4 1.00 sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: 1 4 1.00 sqdmlsl s14, h12, h25 # CHECK-NEXT: 1 4 1.00 sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 1.00 sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 1.00 sqdmlsl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sqdmulh h10, h11, h12 -# CHECK-NEXT: 1 4 0.50 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 4 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 1.00 sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 1.00 sqdmulh s15, s14, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 sqdmulh v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 sqdmull d1, s1, v0.s[1] -# CHECK-NEXT: 1 4 0.50 sqdmull d15, s22, s12 -# CHECK-NEXT: 1 4 0.50 sqdmull s1, h1, v0.h[3] -# CHECK-NEXT: 1 4 0.50 sqdmull s12, h22, h12 +# CHECK-NEXT: 1 4 1.00 sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: 1 4 1.00 sqdmull d15, s22, s12 +# CHECK-NEXT: 1 4 1.00 sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: 1 4 1.00 sqdmull s12, h22, h12 # CHECK-NEXT: 1 4 1.00 sqdmull v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 sqdmull v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 1.00 sqdmull2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 1.00 sqdmull2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 sqneg b19, b14 -# CHECK-NEXT: 1 4 0.50 sqneg d18, d12 -# CHECK-NEXT: 1 4 0.50 sqneg h21, h15 -# CHECK-NEXT: 1 4 0.50 sqneg s20, s12 -# CHECK-NEXT: 1 4 1.00 sqneg v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 sqneg v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 sqneg v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 sqneg v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 sqneg v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 sqneg v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 sqneg v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sqneg b19, b14 +# CHECK-NEXT: 1 3 0.50 sqneg d18, d12 +# CHECK-NEXT: 1 3 0.50 sqneg h21, h15 +# CHECK-NEXT: 1 3 0.50 sqneg s20, s12 +# CHECK-NEXT: 1 3 
1.00 sqneg v0.16b, v0.16b +# CHECK-NEXT: 1 3 1.00 sqneg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sqneg v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 sqneg v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 sqneg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqneg v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 sqneg v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sqrdmulh h10, h11, h12 -# CHECK-NEXT: 1 4 0.50 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 4 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 1.00 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 1.00 sqrdmulh s15, s14, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqrdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 4 1.00 sqrdmulh v0.8h, v0.8h, v0.8h @@ -1732,10 +1732,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 sqshrun2 v0.16b, v0.8h, #3 # CHECK-NEXT: 1 4 1.00 sqshrun2 v0.4s, v0.2d, #3 # CHECK-NEXT: 1 4 1.00 sqshrun2 v0.8h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 sqsub s20, s10, s7 -# CHECK-NEXT: 1 4 1.00 sqsub v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 1.00 sqsub v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 sqsub v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 sqsub s20, s10, s7 +# CHECK-NEXT: 1 3 1.00 sqsub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 1.00 sqsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 sqsub v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 sqxtn b18, h18 # CHECK-NEXT: 1 4 0.50 sqxtn h20, s17 # CHECK-NEXT: 1 4 0.50 sqxtn s19, d14 @@ -1754,10 +1754,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 sqxtun2 v0.16b, v0.8h # CHECK-NEXT: 1 4 1.00 sqxtun2 v0.4s, v0.2d # CHECK-NEXT: 1 4 1.00 sqxtun2 v0.8h, v0.4s -# CHECK-NEXT: 1 4 0.50 srhadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 srhadd v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 srhadd v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 sri d10, d12, #14 +# CHECK-NEXT: 1 2 0.50 srhadd v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 srhadd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 srhadd v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 sri d10, d12, #14 # CHECK-NEXT: 1 4 1.00 sri v0.16b, v0.16b, #3 # 
CHECK-NEXT: 1 4 1.00 sri v0.2d, v0.2d, #3 # CHECK-NEXT: 1 4 0.50 sri v0.2s, v0.2s, #3 @@ -1765,61 +1765,61 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 sri v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 sri v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 1.00 sri v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 srshl d16, d16, d16 -# CHECK-NEXT: 1 4 0.50 srshl v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 srshl v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 srshl v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 srshr d19, d18, #7 -# CHECK-NEXT: 1 4 1.00 srshr v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 srshr v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 srshr v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 srshr v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 srshr v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 srshr v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 srshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 srsra d15, d11, #19 -# CHECK-NEXT: 1 4 1.00 srsra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 srsra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 srsra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 srsra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 srsra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 srsra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 srsra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 sshl d31, d31, d31 -# CHECK-NEXT: 1 4 1.00 sshl v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 sshl v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 sshl v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 sshl v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 sshll v0.2d, v0.2s, #3 -# CHECK-NEXT: 1 4 1.00 sshll2 v0.4s, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 sshr d15, d16, #12 -# CHECK-NEXT: 1 4 1.00 sshr v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 sshr v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 sshr v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 sshr v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 sshr v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 sshr v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 sshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 ssra d18, d12, #21 -# CHECK-NEXT: 1 4 1.00 ssra v0.16b, v0.16b, 
#3 -# CHECK-NEXT: 1 4 1.00 ssra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 ssra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 ssra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 ssra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 ssra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 ssra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 1.00 ssubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 ssubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 ssubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 ssubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 ssubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 ssubl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 ssubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 1.00 ssubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 1.00 ssubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 1.00 ssubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 1.00 ssubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 srshl d16, d16, d16 +# CHECK-NEXT: 1 3 0.50 srshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 srshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 srshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 srshr d19, d18, #7 +# CHECK-NEXT: 1 3 1.00 srshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 1.00 srshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 1.00 srshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 srshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 1.00 srshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 4 2.00 srsra d15, d11, #19 +# CHECK-NEXT: 1 4 2.00 srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 4 2.00 srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 2.00 srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 2.00 srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 4 2.00 srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 2.00 srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 4 2.00 srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sshl d31, d31, d31 +# CHECK-NEXT: 1 2 1.00 sshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 2 0.50 sshl v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 
sshl v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 sshl v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 1.00 sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: 1 2 1.00 sshll2 v0.4s, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 sshr d15, d16, #12 +# CHECK-NEXT: 1 2 0.50 sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 sshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 sshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 sshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21 +# CHECK-NEXT: 1 3 1.00 ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 1.00 ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 1.00 ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 1.00 ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 1.00 ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 1.00 ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 1.00 ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 1.00 ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 1.00 ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 1.00 ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 1.00 ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 1.00 ssubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 1.00 * st1 { v0.16b }, [x0] # CHECK-NEXT: 2 5 2.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 # CHECK-NEXT: 1 5 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] @@ -1842,19 +1842,19 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 5 4.00 * st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64 # CHECK-NEXT: 1 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] # CHECK-NEXT: 2 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 -# CHECK-NEXT: 1 4 0.50 sub d15, d5, d16 -# CHECK-NEXT: 1 
4 1.00 sub v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 suqadd b19, b14 -# CHECK-NEXT: 1 4 0.50 suqadd d18, d22 -# CHECK-NEXT: 1 4 0.50 suqadd h20, h15 -# CHECK-NEXT: 1 4 0.50 suqadd s21, s12 -# CHECK-NEXT: 1 4 1.00 suqadd v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 suqadd v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 suqadd v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 suqadd v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 suqadd v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 suqadd v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 suqadd v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 sub d15, d5, d16 +# CHECK-NEXT: 1 2 1.00 sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 suqadd b19, b14 +# CHECK-NEXT: 1 3 0.50 suqadd d18, d22 +# CHECK-NEXT: 1 3 0.50 suqadd h20, h15 +# CHECK-NEXT: 1 3 0.50 suqadd s21, s12 +# CHECK-NEXT: 1 3 1.00 suqadd v0.16b, v0.16b +# CHECK-NEXT: 1 3 1.00 suqadd v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 suqadd v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 suqadd v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 suqadd v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 suqadd v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 suqadd v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 tbl v0.16b, { v0.16b }, v0.16b # CHECK-NEXT: 1 4 1.00 tbl v0.16b, { v0.16b, v1.16b }, v0.16b # CHECK-NEXT: 1 4 1.00 tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b @@ -1885,44 +1885,44 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 trn2 v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 trn2 v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 1.00 trn2 v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uaba v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 uabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 uabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 uabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 uabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 uabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 uabal2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uabd v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 uabdl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 uabdl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 uabdl v0.8h, v0.8b, v0.8b -# 
CHECK-NEXT: 1 4 1.00 uabdl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 uabdl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 uabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uadalp v0.1d, v0.2s -# CHECK-NEXT: 1 4 1.00 uadalp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uadalp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 uadalp v0.4h, v0.8b -# CHECK-NEXT: 1 4 1.00 uadalp v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 uadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 1.00 uaddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 uaddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 uaddl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 uaddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 uaddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 uaddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 1.00 uaddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 1.00 uaddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 uaddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 1.00 uaddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 1.00 uaddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 1.00 uaddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 1.00 uaddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 1.00 uaddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 4 2.00 uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 2.00 uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 4 2.00 uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 4 2.00 uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 4 2.00 uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 4 2.00 uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 4 2.00 uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uabd v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 uabdl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 1.00 uabdl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 uabdl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 uabdl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 uabdl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 uabdl2 v0.8h, v0.16b, v0.16b 
+# CHECK-NEXT: 1 4 2.00 uadalp v0.1d, v0.2s +# CHECK-NEXT: 1 4 2.00 uadalp v0.2d, v0.4s +# CHECK-NEXT: 1 4 2.00 uadalp v0.2s, v0.4h +# CHECK-NEXT: 1 4 2.00 uadalp v0.4h, v0.8b +# CHECK-NEXT: 1 4 2.00 uadalp v0.4s, v0.8h +# CHECK-NEXT: 1 4 2.00 uadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 1.00 uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 1.00 uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 1.00 uaddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 1.00 uaddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 1.00 uaddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 1.00 uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 1.00 uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 1.00 uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 1.00 uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 1.00 uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 1.00 uaddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14 # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14, #64 # CHECK-NEXT: 1 4 0.50 ucvtf s22, s13 @@ -1935,21 +1935,21 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 uhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 uhadd v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 uhsub v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 1.00 uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 1.00 uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 1.00 uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 1.00 umax v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 1.00 umax v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 umax v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 umaxp v0.16b, v0.16b, v0.16b +# 
CHECK-NEXT: 1 2 1.00 umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 1.00 umaxp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 1.00 umaxp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 umaxp v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 umin v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 umin v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 umin v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 uminp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 uminp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 1.00 umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 0.50 umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 2 0.50 uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 2 0.50 uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 2 0.50 uminp v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 1.00 umlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 umlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 1.00 umlal v0.8h, v0.8b, v0.8b @@ -1968,8 +1968,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 umull2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 1.00 umull2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 umull2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uqadd h0, h1, h5 -# CHECK-NEXT: 1 4 1.00 uqadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uqadd h0, h1, h5 +# CHECK-NEXT: 1 3 1.00 uqadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 uqrshl b11, b20, b30 # CHECK-NEXT: 1 4 0.50 uqrshl s23, s20, s16 # CHECK-NEXT: 1 4 1.00 uqrshl v0.16b, v0.16b, v0.16b @@ -2011,8 +2011,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 uqshrn2 v0.16b, v0.8h, #3 # CHECK-NEXT: 1 4 1.00 uqshrn2 v0.4s, v0.2d, #3 # CHECK-NEXT: 1 4 1.00 uqshrn2 v0.8h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 uqsub d16, d16, d16 -# CHECK-NEXT: 1 4 0.50 uqsub v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uqsub d16, d16, d16 +# CHECK-NEXT: 1 3 0.50 uqsub v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 uqxtn b18, h18 # CHECK-NEXT: 1 4 0.50 uqxtn h20, s17 # CHECK-NEXT: 1 4 0.50 uqxtn 
s19, d14 @@ -2024,77 +2024,77 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 1.00 uqxtn2 v0.8h, v0.4s # CHECK-NEXT: 1 4 0.50 urecpe v0.2s, v0.2s # CHECK-NEXT: 1 4 1.00 urecpe v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 urhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 urhadd v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 urhadd v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 urshl d8, d7, d4 -# CHECK-NEXT: 1 4 1.00 urshl v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 urshl v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 1.00 urshl v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 urshl v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 urshr d20, d23, #31 -# CHECK-NEXT: 1 4 1.00 urshr v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 urshr v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 urshr v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 urshr v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 urshr v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 urshr v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 urshr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 1.00 urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 1.00 urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 1.00 urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4 +# CHECK-NEXT: 1 3 1.00 urshl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 1.00 urshl v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 1.00 urshl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 urshl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urshr d20, d23, #31 +# CHECK-NEXT: 1 3 1.00 urshr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 1.00 urshr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 1.00 urshr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 urshr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 1.00 urshr v0.8h, v0.8h, #3 # CHECK-NEXT: 1 12 9.00 ursqrte v0.2s, v0.2s # CHECK-NEXT: 1 12 9.00 ursqrte v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 ursra d18, d10, #13 -# CHECK-NEXT: 1 4 1.00 ursra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 ursra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 ursra 
v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 ursra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 ursra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 ursra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 ursra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 ushl d0, d0, d0 -# CHECK-NEXT: 1 4 1.00 ushl v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 ushl v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 ushl v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 ushll v0.4s, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 ushll2 v0.8h, v0.16b, #3 -# CHECK-NEXT: 1 4 0.50 ushr d10, d17, #18 -# CHECK-NEXT: 1 4 1.00 ushr v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 ushr v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 ushr v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 ushr v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 ushr v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 ushr v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 ushr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 usqadd b19, b14 -# CHECK-NEXT: 1 4 0.50 usqadd d18, d22 -# CHECK-NEXT: 1 4 0.50 usqadd h20, h15 -# CHECK-NEXT: 1 4 0.50 usqadd s21, s12 -# CHECK-NEXT: 1 4 1.00 usqadd v0.16b, v0.16b -# CHECK-NEXT: 1 4 1.00 usqadd v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 usqadd v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 usqadd v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 usqadd v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 usqadd v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 usqadd v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 usra d20, d13, #61 -# CHECK-NEXT: 1 4 1.00 usra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 4 1.00 usra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 4 0.50 usra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 4 0.50 usra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 4 1.00 usra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 usra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 4 1.00 usra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 1.00 usubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 1.00 usubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 1.00 usubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 1.00 usubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 1.00 usubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 1.00 usubl2 v0.8h, v0.16b, v0.16b 
-# CHECK-NEXT: 1 4 1.00 usubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 1.00 usubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 1.00 usubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 1.00 usubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 1.00 usubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 1.00 usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 4 2.00 ursra d18, d10, #13 +# CHECK-NEXT: 1 4 2.00 ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 4 2.00 ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 4 2.00 ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 4 2.00 ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 4 2.00 ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 4 2.00 ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 4 2.00 ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 2 0.50 ushl d0, d0, d0 +# CHECK-NEXT: 1 2 1.00 ushl v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 2 1.00 ushl v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 2 1.00 ushl v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 2 1.00 ushll v0.4s, v0.4h, #3 +# CHECK-NEXT: 1 2 1.00 ushll2 v0.8h, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ushr d10, d17, #18 +# CHECK-NEXT: 1 2 0.50 ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 2 0.50 ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 2 0.50 ushr v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 2 0.50 ushr v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 2 0.50 ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 2 0.50 ushr v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 2 0.50 ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 usqadd b19, b14 +# CHECK-NEXT: 1 3 0.50 usqadd d18, d22 +# CHECK-NEXT: 1 3 0.50 usqadd h20, h15 +# CHECK-NEXT: 1 3 0.50 usqadd s21, s12 +# CHECK-NEXT: 1 3 1.00 usqadd v0.16b, v0.16b +# CHECK-NEXT: 1 3 1.00 usqadd v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 usqadd v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 usqadd v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 usqadd v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 usqadd v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 usqadd v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61 +# CHECK-NEXT: 1 3 1.00 usra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 1.00 usra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2s, v0.2s, #3 +# CHECK-NEXT: 
1 3 0.50 usra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 1.00 usra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 1.00 usra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 1.00 usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 1.00 usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 1.00 usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 1.00 usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 1.00 usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 1.00 usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 1.00 usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 1.00 usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 1.00 usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 1.00 usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 1.00 usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 1.00 usubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 1.00 uzp1 v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 1.00 uzp1 v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 uzp1 v0.2s, v0.2s, v0.2s @@ -2146,7 +2146,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] -# CHECK-NEXT: - - - - 716.50 716.50 197.00 3.00 3.00 107.00 - 52.00 +# CHECK-NEXT: - - - - 780.00 780.00 197.00 3.00 3.00 107.00 - 52.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions: @@ -2537,12 +2537,12 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - pmul v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - pmull v0.8h, v0.8b, v0.8b # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - pmull2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: - - 
- - 1.00 1.00 - - - - - - raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn2 v0.8h, v0.4s, v0.4s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rbit v0.16b, v0.16b # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - rbit v0.8b, v0.8b # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - rev16 v21.8b, v1.8b @@ -2563,19 +2563,19 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rshrn2 v0.16b, v0.8h, #3 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rshrn2 v0.4s, v0.2d, #3 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rshrn2 v0.8h, v0.4s, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saba v0.16b, v0.16b, v0.16b -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn 
v0.4h, v0.4s, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl v0.4s, v0.4h, v0.4h @@ -2583,12 +2583,12 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sadalp v0.1d, v0.2s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sadalp v0.2d, v0.4s -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sadalp v0.2s, v0.4h -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sadalp v0.4h, v0.8b -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sadalp v0.4s, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.4s, 
v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.8h, v0.16b # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saddl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saddl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saddl v0.8h, v0.8b, v0.8b @@ -2621,10 +2621,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - scvtf v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shadd v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl d7, d10, #12 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shl v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shl v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.2d, v0.2d, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shl v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.4s, v0.4s, #3 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shll v0.2d, v0.2s, #32 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shll v0.4s, v0.4h, #16 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shll v0.8h, v0.8b, #8 @@ -2640,9 +2640,9 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn v0.2s, v0.2d, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn v0.4h, v0.4s, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn v0.8b, v0.8h, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shrn2 v0.16b, v0.8h, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shrn2 v0.4s, v0.2d, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shrn2 v0.8h, v0.4s, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn2 v0.16b, v0.8h, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn2 v0.4s, v0.2d, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn2 v0.8h, v0.4s, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shsub v0.2s, v0.2s, v0.2s # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shsub v0.4h, v0.4h, v0.4h # CHECK-NEXT: 
- - - - 0.50 0.50 - - - - - - sli d10, d14, #12 @@ -2697,32 +2697,32 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqadd b20, b11, b15 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal d19, s24, s12 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal d8, s9, v0.s[1] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal d8, s9, v0.s[1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal s0, h0, v0.h[3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal s17, h27, h12 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl d12, s23, s13 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl d8, s9, v0.s[1] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl d8, s9, v0.s[1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl s0, h0, v0.h[3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl s14, h12, h25 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 
0.50 - - - - - - sqdmulh h10, h11, h12 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmulh s15, s14, v0.s[1] # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh s20, s21, s2 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmulh v0.4s, v0.4s, v0.4s -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull d1, s1, v0.s[1] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull d15, s22, s12 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull s1, h1, v0.h[3] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull s12, h22, h12 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull d1, s1, v0.s[1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull d15, s22, s12 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull s1, h1, v0.h[3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull s12, h22, h12 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull2 v0.2d, v0.4s, v0.4s @@ -2739,8 +2739,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqneg v0.8b, v0.8b # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqneg v0.8h, v0.8h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh h10, h11, h12 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqrdmulh s15, s14, v0.s[1] # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh s20, s21, s2 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - 
sqrdmulh v0.8h, v0.8h, v0.8h @@ -2857,29 +2857,29 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srshr v0.4s, v0.4s, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srshr v0.8b, v0.8b, #3 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srshr v0.8h, v0.8h, #3 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra d15, d11, #19 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra d15, d11, #19 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl d31, d31, d31 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshl v0.2d, v0.2d, v0.2d # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl v0.2s, v0.2s, v0.2s # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl v0.8b, v0.8b, v0.8b -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshll v0.2d, v0.2s, #3 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshll v0.2d, v0.2s, #3 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshll2 v0.4s, v0.8h, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr d15, d16, #12 -# CHECK-NEXT: - - - - 1.00 
1.00 - - - - - - sshr v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.2d, v0.2d, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.2s, v0.2s, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.4s, v0.4s, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ssra d18, d12, #21 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ssra v0.16b, v0.16b, #3 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ssra v0.2d, v0.2d, #3 @@ -2965,13 +2965,13 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - trn2 v0.4s, v0.4s, v0.4s # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - trn2 v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - trn2 v0.8h, v0.8h, v0.8h -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uaba v0.8b, v0.8b, v0.8b -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal2 
v0.2d, v0.4s, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl v0.4s, v0.4h, v0.4h @@ -2979,12 +2979,12 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uadalp v0.1d, v0.2s -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uadalp v0.2d, v0.4s -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uadalp v0.2s, v0.4h -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uadalp v0.4h, v0.8b -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uadalp v0.4s, v0.8h -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uadalp v0.8h, v0.16b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.1d, v0.2s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.2d, v0.4s +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.2s, v0.4h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.4h, v0.8b +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.4s, v0.8h +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.8h, v0.16b # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uaddl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uaddl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uaddl v0.8h, v0.8b, v0.8b @@ -3122,28 +3122,28 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - urshr v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - - - 9.00 - - - - - ursqrte v0.2s, v0.2s # CHECK-NEXT: - - - - - - 9.00 - - - - - ursqrte v0.4s, v0.4s -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra d18, d10, #13 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.16b, 
v0.16b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra d18, d10, #13 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushl d0, d0, d0 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushl v0.4s, v0.4s, v0.4s # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushl v0.8h, v0.8h, v0.8h -# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushll v0.4s, v0.4h, #3 +# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushll v0.4s, v0.4h, #3 # CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushll2 v0.8h, v0.16b, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr d10, d17, #18 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.2d, v0.2d, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.2s, v0.2s, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - 0.50 
0.50 - - - - - - ushr v0.4s, v0.4s, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - usqadd b19, b14 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - usqadd d18, d22 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - usqadd h20, h15 From 889317d47b7f046cf0e68746da8f7f264582fb5b Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Thu, 17 Feb 2022 13:25:48 +0300 Subject: [PATCH 067/748] [objcopy][NFC] Add doc comments to the executeObjcopy* functions. Add doc comments to the executeObjcopy* functions. Depends on D88827 --- llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h | 3 +++ llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h | 13 +++++++++++++ llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h | 6 ++++++ llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h | 3 +++ 4 files changed, 25 insertions(+) diff --git a/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h b/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h index f8925e21159be..d9043d6c5d019 100644 --- a/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h +++ b/llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h @@ -23,6 +23,9 @@ struct COFFConfig; namespace coff { +/// Apply the transformations described by \p Config and \p COFFConfig +/// to \p In and writes the result into \p Out. +/// \returns any Error encountered whilst performing the operation. 
Error executeObjcopyOnBinary(const CommonConfig &Config, const COFFConfig &, object::COFFObjectFile &In, raw_ostream &Out); diff --git a/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h b/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h index 676af4bec0844..552b6fb655f18 100644 --- a/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h +++ b/llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h @@ -23,12 +23,25 @@ struct CommonConfig; struct ELFConfig; namespace elf { +/// Apply the transformations described by \p Config and \p ELFConfig to +/// \p In, which must represent an IHex file, and writes the result +/// into \p Out. +/// \returns any Error encountered whilst performing the operation. Error executeObjcopyOnIHex(const CommonConfig &Config, const ELFConfig &ELFConfig, MemoryBuffer &In, raw_ostream &Out); + +/// Apply the transformations described by \p Config and \p ELFConfig to +/// \p In, which is treated as a raw binary input, and writes the result +/// into \p Out. +/// \returns any Error encountered whilst performing the operation. Error executeObjcopyOnRawBinary(const CommonConfig &Config, const ELFConfig &ELFConfig, MemoryBuffer &In, raw_ostream &Out); + +/// Apply the transformations described by \p Config and \p ELFConfig to +/// \p In and writes the result into \p Out. +/// \returns any Error encountered whilst performing the operation. Error executeObjcopyOnBinary(const CommonConfig &Config, const ELFConfig &ELFConfig, object::ELFObjectFileBase &In, raw_ostream &Out); diff --git a/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h b/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h index 79f6ba4cf8a84..73690d7ace8a5 100644 --- a/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h +++ b/llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h @@ -24,10 +24,16 @@ struct MachOConfig; class MultiFormatConfig; namespace macho { +/// Apply the transformations described by \p Config and \p MachOConfig to +/// \p In and writes the result into \p Out. 
+/// \returns any Error encountered whilst performing the operation. Error executeObjcopyOnBinary(const CommonConfig &Config, const MachOConfig &MachOConfig, object::MachOObjectFile &In, raw_ostream &Out); +/// Apply the transformations described by \p Config and \p MachOConfig to +/// \p In and writes the result into \p Out. +/// \returns any Error encountered whilst performing the operation. Error executeObjcopyOnMachOUniversalBinary( const MultiFormatConfig &Config, const object::MachOUniversalBinary &In, raw_ostream &Out); diff --git a/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h b/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h index 36a9103a35df3..5b4181c22b979 100644 --- a/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h +++ b/llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h @@ -22,6 +22,9 @@ struct CommonConfig; struct WasmConfig; namespace wasm { +/// Apply the transformations described by \p Config and \p WasmConfig +/// to \p In and writes the result into \p Out. +/// \returns any Error encountered whilst performing the operation. Error executeObjcopyOnBinary(const CommonConfig &Config, const WasmConfig &, object::WasmObjectFile &In, raw_ostream &Out); From 9798b33d1dc14f5334e2cc117e3896510fa57b82 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Thu, 17 Feb 2022 10:58:52 +0000 Subject: [PATCH 068/748] [OpenCL] Guard 64-bit atomic types Until now, overloads with a 64-bit atomic type argument were always made available with `-fdeclare-opencl-builtins`. Ensure these overloads are only available when both the `cl_khr_int64_base_atomics` and `cl_khr_int64_extended_atomics` extensions have been enabled, as required by the OpenCL specification. 
Differential Revision: https://reviews.llvm.org/D119858 --- clang/lib/Sema/OpenCLBuiltins.td | 12 +++++-- .../SemaOpenCL/fdeclare-opencl-builtins.cl | 19 +++++++++++ .../TableGen/ClangOpenCLBuiltinEmitter.cpp | 34 +++++++++++++------ 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td index 556d1778625e7..e6da5e34f7091 100644 --- a/clang/lib/Sema/OpenCLBuiltins.td +++ b/clang/lib/Sema/OpenCLBuiltins.td @@ -78,6 +78,8 @@ class concatExtension { def NoTypeExt : TypeExtension<"">; def Fp16TypeExt : TypeExtension<"cl_khr_fp16">; def Fp64TypeExt : TypeExtension<"cl_khr_fp64">; +def Atomic64TypeExt : TypeExtension<"cl_khr_int64_base_atomics cl_khr_int64_extended_atomics">; +def AtomicFp64TypeExt : TypeExtension<"cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp64">; // FunctionExtension definitions. def FuncExtNone : FunctionExtension<"">; @@ -389,10 +391,14 @@ def NDRange : TypedefType<"ndrange_t">; // OpenCL v2.0 s6.13.11: Atomic integer and floating-point types. 
def AtomicInt : Type<"atomic_int", QualType<"Context.getAtomicType(Context.IntTy)">>; def AtomicUInt : Type<"atomic_uint", QualType<"Context.getAtomicType(Context.UnsignedIntTy)">>; -def AtomicLong : Type<"atomic_long", QualType<"Context.getAtomicType(Context.LongTy)">>; -def AtomicULong : Type<"atomic_ulong", QualType<"Context.getAtomicType(Context.UnsignedLongTy)">>; +let Extension = Atomic64TypeExt in { + def AtomicLong : Type<"atomic_long", QualType<"Context.getAtomicType(Context.LongTy)">>; + def AtomicULong : Type<"atomic_ulong", QualType<"Context.getAtomicType(Context.UnsignedLongTy)">>; +} def AtomicFloat : Type<"atomic_float", QualType<"Context.getAtomicType(Context.FloatTy)">>; -def AtomicDouble : Type<"atomic_double", QualType<"Context.getAtomicType(Context.DoubleTy)">>; +let Extension = AtomicFp64TypeExt in { + def AtomicDouble : Type<"atomic_double", QualType<"Context.getAtomicType(Context.DoubleTy)">>; +} def AtomicHalf : Type<"atomic_half", QualType<"Context.getAtomicType(Context.HalfTy)">>; def AtomicIntPtr : Type<"atomic_intptr_t", QualType<"Context.getAtomicType(Context.getIntPtrType())">>; def AtomicUIntPtr : Type<"atomic_uintptr_t", QualType<"Context.getAtomicType(Context.getUIntPtrType())">>; diff --git a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl index d526c32d65a92..d2d7fff02efaa 100644 --- a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl +++ b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl @@ -163,6 +163,25 @@ void test_atomic_fetch_with_address_space(volatile __generic atomic_float *a_flo } #endif // !defined(NO_HEADER) && __OPENCL_C_VERSION__ >= 200 +#if !defined(NO_HEADER) && __OPENCL_C_VERSION__ == 200 && defined(__opencl_c_generic_address_space) + +// Test that overloads that use atomic_double are not available when the fp64 +// extension is disabled. Test this by counting the number of notes about +// candidate functions. 
+void test_atomic_double_reporting(volatile __generic atomic_int *a) { + atomic_init(a); + // expected-error@-1{{no matching function for call to 'atomic_init'}} +#if defined(NO_FP64) + // Expecting 5 candidates: int, uint, long, ulong, float + // expected-note@-4 5 {{candidate function not viable: requires 2 arguments, but 1 was provided}} +#else + // Expecting 6 candidates: int, uint, long, ulong, float, double + // expected-note@-7 6 {{candidate function not viable: requires 2 arguments, but 1 was provided}} +#endif +} + +#endif + #if defined(NO_ATOMSCOPE) && __OPENCL_C_VERSION__ >= 300 // Disable the feature by undefining the feature macro. #undef __opencl_c_atomic_scope_device diff --git a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp index 4795b008dda3c..34ca6cb36738c 100644 --- a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp +++ b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp @@ -733,6 +733,20 @@ static std::pair isOpenCLBuiltin(llvm::StringRef Name) { OS << "} // isOpenCLBuiltin\n"; } +// Emit an if-statement with an isMacroDefined call for each extension in +// the space-separated list of extensions. +static void EmitMacroChecks(raw_ostream &OS, StringRef Extensions) { + SmallVector ExtVec; + Extensions.split(ExtVec, " "); + OS << " if ("; + for (StringRef Ext : ExtVec) { + if (Ext != ExtVec.front()) + OS << " && "; + OS << "S.getPreprocessor().isMacroDefined(\"" << Ext << "\")"; + } + OS << ") {\n "; +} + void BuiltinNameEmitter::EmitQualTypeFinder() { OS << R"( @@ -825,15 +839,14 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty, // Collect all QualTypes for a single vector size into TypeList. 
OS << " SmallVector TypeList;\n"; for (const auto *T : BaseTypes) { - StringRef Ext = + StringRef Exts = T->getValueAsDef("Extension")->getValueAsString("ExtName"); - if (!Ext.empty()) { - OS << " if (S.getPreprocessor().isMacroDefined(\"" << Ext - << "\")) {\n "; + if (!Exts.empty()) { + EmitMacroChecks(OS, Exts); } OS << " TypeList.push_back(" << T->getValueAsDef("QTExpr")->getValueAsString("TypeExpr") << ");\n"; - if (!Ext.empty()) { + if (!Exts.empty()) { OS << " }\n"; } } @@ -877,15 +890,14 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty, // Emit the cases for non generic, non image types. OS << " case OCLT_" << T->getValueAsString("Name") << ":\n"; - StringRef Ext = T->getValueAsDef("Extension")->getValueAsString("ExtName"); - // If this type depends on an extension, ensure the extension macro is + StringRef Exts = T->getValueAsDef("Extension")->getValueAsString("ExtName"); + // If this type depends on an extension, ensure the extension macros are // defined. - if (!Ext.empty()) { - OS << " if (S.getPreprocessor().isMacroDefined(\"" << Ext - << "\")) {\n "; + if (!Exts.empty()) { + EmitMacroChecks(OS, Exts); } OS << " QT.push_back(" << QT->getValueAsString("TypeExpr") << ");\n"; - if (!Ext.empty()) { + if (!Exts.empty()) { OS << " }\n"; } OS << " break;\n"; From fc539b0004d4fe8072aca00e38599a2300a955ce Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Thu, 17 Feb 2022 17:38:42 +0700 Subject: [PATCH 069/748] [SCEV] Infer ranges for SCC consisting of cycled Phis Our current strategy of computing ranges of SCEVUnknown Phis was to simply compute the union of ranges of all its inputs. In order to avoid infinite recursion, we mark Phis as pending and conservatively return full set for them. As result, even simplest patterns of cycled phis always have a range of full set. This patch makes this logic a bit smarter. 
We basically do the same, but instead of taking inputs of single Phi we find its strongly connected component (SCC) and compute the union of all inputs that come into this SCC from outside. Processing entire SCC together has one more advantage: we can set range for all of them at once, because the only thing that happens to them is the same value is being passed between those Phis. So, despite we spend more time analyzing a single Phi, overall we may save time by not processing other SCC members, so amortized compile time spent should be approximately the same. Differential Revision: https://reviews.llvm.org/D110620 Reviewed By: reames --- llvm/include/llvm/Analysis/ScalarEvolution.h | 13 ++ llvm/lib/Analysis/ScalarEvolution.cpp | 143 +++++++++++++++--- .../Analysis/ScalarEvolution/cycled_phis.ll | 20 ++- .../Analysis/ScalarEvolution/unknown_phis.ll | 6 +- 4 files changed, 150 insertions(+), 32 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index c224346442f1f..768925433bed5 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1588,6 +1588,19 @@ class ScalarEvolution { /// SCEVUnknowns and thus don't use this mechanism. ConstantRange getRangeForUnknownRecurrence(const SCEVUnknown *U); + /// Return true and fill \p SCC with elements of PNINode-composed strongly + /// connected component that contains \p Phi. Here SCC is a maximum by + /// inclusion subgraph composed of Phis that transitively use one another as + /// inputs. Otherwise, return false and conservatively put \p Phi into \p SCC + /// as the only element of its strongly connected component. + bool collectSCC(const PHINode *Phi, + SmallVectorImpl &SCC) const; + + /// Sharpen range of entire SCEVUnknown Phi strongly connected component that + /// includes \p Phi. On output, \p ConservativeResult is the sharpened range. 
+ void sharpenPhiSCCRange(const PHINode *Phi, ConstantRange &ConservativeResult, + ScalarEvolution::RangeSignHint SignHint); + /// We know that there is no SCEV for the specified value. Analyze the /// expression. const SCEV *createSCEV(Value *V); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index e1b2b12a4df5b..613379a54a3d4 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -145,6 +145,8 @@ STATISTIC(NumTripCountsNotComputed, "Number of loops without predictable loop counts"); STATISTIC(NumBruteForceTripCountsComputed, "Number of loops with trip counts computed by force"); +STATISTIC(NumFoundPhiSCCs, + "Number of found Phi-composed strongly connected components"); static cl::opt MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden, @@ -231,6 +233,12 @@ static cl::opt UseExpensiveRangeSharpening( cl::desc("Use more powerful methods of sharpening expression ranges. May " "be costly in terms of compile time")); +static cl::opt MaxPhiSCCAnalysisSize( + "scalar-evolution-max-scc-analysis-depth", cl::Hidden, + cl::desc("Maximum amount of nodes to process while searching SCEVUnknown " + "Phi strongly connected components"), + cl::init(8)); + //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -6566,29 +6574,130 @@ ScalarEvolution::getRangeRef(const SCEV *S, RangeType); // A range of Phi is a subset of union of all ranges of its input. - if (const PHINode *Phi = dyn_cast(U->getValue())) { - // Make sure that we do not run over cycled Phis. 
- if (PendingPhiRanges.insert(Phi).second) { - ConstantRange RangeFromOps(BitWidth, /*isFullSet=*/false); - for (auto &Op : Phi->operands()) { - auto OpRange = getRangeRef(getSCEV(Op), SignHint); - RangeFromOps = RangeFromOps.unionWith(OpRange); - // No point to continue if we already have a full set. - if (RangeFromOps.isFullSet()) - break; + if (const PHINode *Phi = dyn_cast(U->getValue())) + if (!PendingPhiRanges.count(Phi)) + sharpenPhiSCCRange(Phi, ConservativeResult, SignHint); + + return setRange(U, SignHint, std::move(ConservativeResult)); + } + + return setRange(S, SignHint, std::move(ConservativeResult)); +} + +bool ScalarEvolution::collectSCC(const PHINode *Phi, + SmallVectorImpl &SCC) const { + assert(SCC.empty() && "Precondition: SCC should be empty."); + auto Bail = [&]() { + SCC.clear(); + SCC.push_back(Phi); + return false; + }; + SmallPtrSet Reachable; + { + // First, find all PHI nodes that are reachable from Phi. + SmallVector Worklist; + Reachable.insert(Phi); + Worklist.push_back(Phi); + while (!Worklist.empty()) { + if (Reachable.size() > MaxPhiSCCAnalysisSize) + // Too many nodes to process. Assume that SCC is composed of Phi alone. + return Bail(); + auto *Curr = Worklist.pop_back_val(); + for (auto &Op : Curr->operands()) { + if (auto *PhiOp = dyn_cast(&*Op)) { + if (PendingPhiRanges.count(PhiOp)) + // Do not want to deal with this situation, so conservatively bail. + return Bail(); + if (Reachable.insert(PhiOp).second) + Worklist.push_back(PhiOp); } - ConservativeResult = - ConservativeResult.intersectWith(RangeFromOps, RangeType); - bool Erased = PendingPhiRanges.erase(Phi); - assert(Erased && "Failed to erase Phi properly?"); - (void) Erased; } } + } + { + // Out of reachable nodes, find those from which Phi is also reachable. This + // defines a SCC. 
+ SmallVector Worklist; + SmallPtrSet SCCSet; + SCCSet.insert(Phi); + SCC.push_back(Phi); + Worklist.push_back(Phi); + while (!Worklist.empty()) { + auto *Curr = Worklist.pop_back_val(); + for (auto *User : Curr->users()) + if (auto *PN = dyn_cast(User)) + if (Reachable.count(PN) && SCCSet.insert(PN).second) { + Worklist.push_back(PN); + SCC.push_back(PN); + } + } + } + return true; +} - return setRange(U, SignHint, std::move(ConservativeResult)); +void +ScalarEvolution::sharpenPhiSCCRange(const PHINode *Phi, + ConstantRange &ConservativeResult, + ScalarEvolution::RangeSignHint SignHint) { + // Collect strongly connected component (further on - SCC ) composed of Phis. + // Analyze all values that are incoming to this SCC (we call them roots). + // All SCC elements have range that is not wider than union of ranges of + // roots. + SmallVector SCC; + if (collectSCC(Phi, SCC)) + ++NumFoundPhiSCCs; + + // Collect roots: inputs of SCC nodes that come from outside of SCC. + SmallPtrSet Roots; + const SmallPtrSet SCCSet(SCC.begin(), SCC.end()); + for (auto *PN : SCC) + for (auto &Op : PN->operands()) { + auto *PhiInput = dyn_cast(Op); + if (!PhiInput || !SCCSet.count(PhiInput)) + Roots.insert(Op); + } + + // Mark SCC elements as pending to avoid infinite recursion if there is a + // cyclic dependency through some instruction that is not a PHI. + for (auto *PN : SCC) { + bool Inserted = PendingPhiRanges.insert(PN).second; + assert(Inserted && "PHI is already pending?"); + (void)Inserted; + } + + auto BitWidth = ConservativeResult.getBitWidth(); + ConstantRange RangeFromRoots(BitWidth, /*isFullSet=*/false); + for (auto *Root : Roots) { + auto OpRange = getRangeRef(getSCEV(Root), SignHint); + RangeFromRoots = RangeFromRoots.unionWith(OpRange); + // No point to continue if we already have a full set. + if (RangeFromRoots.isFullSet()) + break; } + ConstantRange::PreferredRangeType RangeType = + SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? 
ConstantRange::Unsigned + : ConstantRange::Signed; + ConservativeResult = + ConservativeResult.intersectWith(RangeFromRoots, RangeType); - return setRange(S, SignHint, std::move(ConservativeResult)); + DenseMap &Cache = + SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? UnsignedRanges + : SignedRanges; + // Entire SCC has the same range. + for (auto *PN : SCC) { + bool Erased = PendingPhiRanges.erase(PN); + assert(Erased && "Failed to erase Phi properly?"); + (void)Erased; + auto *PNSCEV = getSCEV(const_cast(PN)); + auto I = Cache.find(PNSCEV); + if (I == Cache.end()) + setRange(PNSCEV, SignHint, ConservativeResult); + else { + auto SharpenedRange = + I->second.intersectWith(ConservativeResult, RangeType); + setRange(PNSCEV, SignHint, SharpenedRange); + } + } } // Given a StartRange, Step and MaxBECount for an expression compute a range of diff --git a/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll b/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll index 7183bb8c0a634..b8ac4a81d74ca 100644 --- a/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll +++ b/llvm/test/Analysis/ScalarEvolution/cycled_phis.ll @@ -3,14 +3,13 @@ declare i1 @cond() -; FIXME: Range of phi_1 and phi_2 here can be sharpened to [10, 21). 
define void @test_01() { ; CHECK-LABEL: 'test_01' ; CHECK-NEXT: Classifying expressions for: @test_01 ; CHECK-NEXT: %phi_1 = phi i32 [ 10, %entry ], [ %phi_2, %loop ] -; CHECK-NEXT: --> %phi_1 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %phi_1 U: [10,21) S: [10,21) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %phi_2 = phi i32 [ 20, %entry ], [ %phi_1, %loop ] -; CHECK-NEXT: --> %phi_2 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %phi_2 U: [10,21) S: [10,21) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %cond = call i1 @cond() ; CHECK-NEXT: --> %cond U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: Determining loop execution counts for: @test_01 @@ -38,15 +37,15 @@ define void @test_02(i32* %p, i32* %q) { ; CHECK-NEXT: %start = load i32, i32* %p, align 4, !range !0 ; CHECK-NEXT: --> %start U: [0,1000) S: [0,1000) ; CHECK-NEXT: %outer_phi = phi i32 [ %start, %entry ], [ %inner_lcssa, %outer_backedge ] -; CHECK-NEXT: --> %outer_phi U: full-set S: full-set Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } +; CHECK-NEXT: --> %outer_phi U: [0,3000) S: [0,3000) Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } ; CHECK-NEXT: %inner_phi = phi i32 [ %outer_phi, %outer_loop ], [ %inner_load, %inner_loop ] -; CHECK-NEXT: --> %inner_phi U: full-set S: full-set Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } +; CHECK-NEXT: --> %inner_phi U: [0,3000) S: [0,3000) Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } ; CHECK-NEXT: %inner_load = load i32, i32* %q, align 4, !range !1 ; CHECK-NEXT: --> %inner_load U: [2000,3000) S: [2000,3000) Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } ; CHECK-NEXT: %inner_cond = call i1 @cond() ; CHECK-NEXT: --> %inner_cond U: full-set S: full-set Exits: <> 
LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } ; CHECK-NEXT: %inner_lcssa = phi i32 [ %inner_phi, %inner_loop ] -; CHECK-NEXT: --> %inner_lcssa U: full-set S: full-set Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } +; CHECK-NEXT: --> %inner_lcssa U: [0,3000) S: [0,3000) Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } ; CHECK-NEXT: %outer_cond = call i1 @cond() ; CHECK-NEXT: --> %outer_cond U: full-set S: full-set Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } ; CHECK-NEXT: Determining loop execution counts for: @test_02 @@ -80,7 +79,6 @@ exit: ret void } -; FIXME: All phis should have range [0, 3000) define void @test_03(i32* %p, i32* %q) { ; CHECK-LABEL: 'test_03' ; CHECK-NEXT: Classifying expressions for: @test_03 @@ -89,15 +87,15 @@ define void @test_03(i32* %p, i32* %q) { ; CHECK-NEXT: %start_2 = load i32, i32* %q, align 4, !range !1 ; CHECK-NEXT: --> %start_2 U: [2000,3000) S: [2000,3000) ; CHECK-NEXT: %outer_phi = phi i32 [ %start_1, %entry ], [ %inner_lcssa, %outer_backedge ] -; CHECK-NEXT: --> %outer_phi U: full-set S: full-set Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } +; CHECK-NEXT: --> %outer_phi U: [0,3000) S: [0,3000) Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } ; CHECK-NEXT: %inner_phi_1 = phi i32 [ %outer_phi, %outer_loop ], [ %inner_phi_2, %inner_loop ] -; CHECK-NEXT: --> %inner_phi_1 U: full-set S: full-set Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } +; CHECK-NEXT: --> %inner_phi_1 U: [0,3000) S: [0,3000) Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } ; CHECK-NEXT: %inner_phi_2 = phi i32 [ %start_2, %outer_loop ], [ %inner_phi_1, %inner_loop ] -; CHECK-NEXT: --> %inner_phi_2 U: full-set S: full-set Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } +; CHECK-NEXT: --> %inner_phi_2 U: [0,3000) S: 
[0,3000) Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } ; CHECK-NEXT: %inner_cond = call i1 @cond() ; CHECK-NEXT: --> %inner_cond U: full-set S: full-set Exits: <> LoopDispositions: { %inner_loop: Variant, %outer_loop: Variant } ; CHECK-NEXT: %inner_lcssa = phi i32 [ %inner_phi_1, %inner_loop ] -; CHECK-NEXT: --> %inner_lcssa U: full-set S: full-set Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } +; CHECK-NEXT: --> %inner_lcssa U: [0,3000) S: [0,3000) Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } ; CHECK-NEXT: %outer_cond = call i1 @cond() ; CHECK-NEXT: --> %outer_cond U: full-set S: full-set Exits: <> LoopDispositions: { %outer_loop: Variant, %inner_loop: Invariant } ; CHECK-NEXT: Determining loop execution counts for: @test_03 diff --git a/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll b/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll index 3d2c774507dbc..84428d36d52d6 100644 --- a/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll +++ b/llvm/test/Analysis/ScalarEvolution/unknown_phis.ll @@ -30,8 +30,6 @@ merge: } define void @merge_values_with_ranges_looped(i32 *%a_len_ptr, i32 *%b_len_ptr) { -; TODO: We could be much smarter here. So far we just make sure that we do not -; go into infinite loop analyzing these Phis. 
; CHECK-LABEL: 'merge_values_with_ranges_looped' ; CHECK-NEXT: Classifying expressions for: @merge_values_with_ranges_looped ; CHECK-NEXT: %len_a = load i32, i32* %a_len_ptr, align 4, !range !0 @@ -39,9 +37,9 @@ define void @merge_values_with_ranges_looped(i32 *%a_len_ptr, i32 *%b_len_ptr) { ; CHECK-NEXT: %len_b = load i32, i32* %b_len_ptr, align 4, !range !0 ; CHECK-NEXT: --> %len_b U: [0,2147483647) S: [0,2147483647) ; CHECK-NEXT: %p1 = phi i32 [ %len_a, %entry ], [ %p2, %loop ] -; CHECK-NEXT: --> %p1 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %p1 U: [0,2147483647) S: [0,2147483647) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %p2 = phi i32 [ %len_b, %entry ], [ %p1, %loop ] -; CHECK-NEXT: --> %p2 U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> %p2 U: [0,2147483647) S: [0,2147483647) Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] ; CHECK-NEXT: --> {0,+,1}<%loop> U: [0,100) S: [0,100) Exits: 99 LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %iv.next = add i32 %iv, 1 From 36fdfaba191ce9e7f951d994379a0827c7b08ffe Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Feb 2022 11:59:04 +0100 Subject: [PATCH 070/748] [RelLookupTableConverter] Ensure that GV, GEP and load types match This code could be generalized to be type-independent, but for now just ensure that the same type constraints are enforced with opaque pointers as with typed pointers. 
--- .../Utils/RelLookupTableConverter.cpp | 6 ++- .../RelLookupTableConverter/X86/opaque-ptr.ll | 53 +++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 65207056a3f40..ad11015407550 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -38,11 +38,13 @@ static bool shouldConvertToRelLookupTable(Module &M, GlobalVariable &GV) { GetElementPtrInst *GEP = dyn_cast(GV.use_begin()->getUser()); - if (!GEP || !GEP->hasOneUse()) + if (!GEP || !GEP->hasOneUse() || + GV.getValueType() != GEP->getSourceElementType()) return false; LoadInst *Load = dyn_cast(GEP->use_begin()->getUser()); - if (!Load || !Load->hasOneUse()) + if (!Load || !Load->hasOneUse() || + Load->getType() != GEP->getResultElementType()) return false; // If the original lookup table does not have local linkage and is diff --git a/llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll b/llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll new file mode 100644 index 0000000000000..bed4fc6f5ba7d --- /dev/null +++ b/llvm/test/Transforms/RelLookupTableConverter/X86/opaque-ptr.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: x86-registered-target +; RUN: opt < %s -passes=rel-lookup-table-converter -relocation-model=pic -opaque-pointers -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = internal constant i32 0, align 4 +@b = internal constant i32 0, align 4 +@c = internal constant i32 0, align 4 + +@table1 = private unnamed_addr constant [3 x ptr] [ptr @a, 
ptr @b, ptr @c], align 8 +@table2 = private unnamed_addr constant [3 x ptr] [ptr @a, ptr @b, ptr @c], align 8 +@table3 = private unnamed_addr constant [3 x ptr] [ptr @a, ptr @b, ptr @c], align 8 + +define ptr @test(i32 %cond) { +; CHECK-LABEL: @test( +; CHECK-NEXT: [[RELTABLE_SHIFT:%.*]] = shl i32 [[COND:%.*]], 2 +; CHECK-NEXT: [[RELTABLE_INTRINSIC:%.*]] = call ptr @llvm.load.relative.i32(ptr @reltable.test, i32 [[RELTABLE_SHIFT]]) +; CHECK-NEXT: ret ptr [[RELTABLE_INTRINSIC]] +; + %switch.gep = getelementptr inbounds [3 x ptr], ptr @table1, i32 0, i32 %cond + %switch.load = load ptr, ptr %switch.gep, align 8 + ret ptr %switch.load +} + +define i32 @test_different_load_type(i32 %cond) { +; CHECK-LABEL: @test_different_load_type( +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [3 x ptr], ptr @table2, i32 0, i32 [[COND:%.*]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 8 +; CHECK-NEXT: ret i32 [[SWITCH_LOAD]] +; + %switch.gep = getelementptr inbounds [3 x ptr], ptr @table2, i32 0, i32 %cond + %switch.load = load i32, ptr %switch.gep, align 8 + ret i32 %switch.load +} + +define i8 @test_different_gep_type(i32 %cond) { +; CHECK-LABEL: @test_different_gep_type( +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [3 x i8], ptr @table3, i32 0, i32 [[COND:%.*]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i8, ptr [[SWITCH_GEP]], align 8 +; CHECK-NEXT: ret i8 [[SWITCH_LOAD]] +; + %switch.gep = getelementptr inbounds [3 x i8], ptr @table3, i32 0, i32 %cond + %switch.load = load i8, ptr %switch.gep, align 8 + ret i8 %switch.load +} + +!llvm.module.flags = !{!0, !1} +!0 = !{i32 7, !"PIC Level", i32 2} +!1 = !{i32 1, !"Code Model", i32 1} +!4 = !{!"any pointer", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} From f1877eb1bb76dd498fd931f2d3dd8d206a3db409 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 11:41:47 +0000 Subject: [PATCH 071/748] AArch64_MC::isQForm - Fix MSVC 'no default 
capture mode' lambda warning --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 61ec1de55b9cb..0c0615010ab4d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -245,7 +245,7 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { bool AArch64_MC::isQForm(const MCInst &MI, const MCInstrInfo *MCII) { const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; - return llvm::any_of(MI, [](const MCOperand &Op) { + return llvm::any_of(MI, [&FPR128](const MCOperand &Op) { return Op.isReg() && FPR128.contains(Op.getReg()); }); } From ada6bcc13f0519320c9e5eb1006d5e71beea1cec Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 11:54:12 +0000 Subject: [PATCH 072/748] [X86] X86tcret_1reg - use cast<> instead of dyn_cast<> to avoid dereference of nullptr The pointer is always dereferenced, so assert the cast is correct instead of returning nullptr --- llvm/lib/Target/X86/X86InstrCompiler.td | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 36f56cfcf1154..fe8126f3dc7d8 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1239,8 +1239,7 @@ def X86tcret_1reg : PatFrag<(ops node:$ptr, node:$off), (X86tcret node:$ptr, node:$off), [{ // X86tcret args: (*chain, ptr, imm, regs..., glue) unsigned NumRegs = 1; - LoadSDNode* ld = dyn_cast(N->getOperand(1)); - const SDValue& BasePtr = ld->getBasePtr(); + const SDValue& BasePtr = cast(N->getOperand(1))->getBasePtr(); if (isa(BasePtr)) NumRegs = 3; else if (BasePtr->getNumOperands() && isa(BasePtr->getOperand(0))) From 
5f4549c372676840a70dba455fea6b44c83391d6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 11:56:29 +0000 Subject: [PATCH 073/748] [SystemZ] lowerDYNAMIC_STACKALLOC_XPLINK - use cast<> instead of dyn_cast<> to avoid dereference of nullptr The pointer is always dereferenced, so assert the cast is correct instead of returning nullptr --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 0eded62347bee..7fc3e33309830 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -3577,7 +3577,7 @@ SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op, // If user has set the no alignment function attribute, ignore // alloca alignments. uint64_t AlignVal = - (RealignOpt ? dyn_cast(Align)->getZExtValue() : 0); + (RealignOpt ? cast(Align)->getZExtValue() : 0); uint64_t StackAlign = TFI->getStackAlignment(); uint64_t RequiredAlign = std::max(AlignVal, StackAlign); From 07cf95942f57a85d9626a1c9ef8b90deb123bdb6 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Thu, 17 Feb 2022 14:30:02 +0300 Subject: [PATCH 074/748] [NFC][PhaseOrdering] Improve test coverage for D119975 --- .../X86/hoist-load-of-baseptr.ll | 398 ++++++++++++++++++ .../PhaseOrdering/X86/speculation-vs-tbaa.ll | 232 ++++------ .../PhaseOrdering/X86/spurious-peeling.ll | 196 +++------ 3 files changed, 542 insertions(+), 284 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll new file mode 100644 index 0000000000000..20c3cb029d2ae --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll @@ -0,0 +1,398 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -O1 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O1 %s +; RUN: opt -O2 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O2 %s +; RUN: opt -O3 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O3 %s +; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O1 %s +; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O2 %s +; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O3 %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base>::_Vector_impl" } +%"struct.std::_Vector_base>::_Vector_impl" = type { %"struct.std::_Vector_base>::_Vector_impl_data" } +%"struct.std::_Vector_base>::_Vector_impl_data" = type { i32*, i32*, i32* } + +$_ZNSt6vectorIiSaIiEEixEm = comdat any + +define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* noundef nonnull align 8 dereferenceable(24) %data, i64 noundef %numElems) { +; OLDPM_O1-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy +; OLDPM_O1-SAME: (%"class.std::vector"* noundef nonnull align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr { +; OLDPM_O1-NEXT: entry: +; OLDPM_O1-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; OLDPM_O1-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; OLDPM_O1: for.cond1.preheader: +; OLDPM_O1-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; OLDPM_O1-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4:%.*]] +; OLDPM_O1: for.cond.cleanup: +; OLDPM_O1-NEXT: ret void +; OLDPM_O1: for.cond.cleanup3: +; OLDPM_O1-NEXT: [[INC7]] = add nuw nsw i64 [[I_08]], 1 +; OLDPM_O1-NEXT: [[EXITCOND9_NOT:%.*]] = icmp eq i64 
[[INC7]], 100 +; OLDPM_O1-NEXT: br i1 [[EXITCOND9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP0:![0-9]+]] +; OLDPM_O1: for.body4: +; OLDPM_O1-NEXT: [[J_07:%.*]] = phi i64 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_COND1_PREHEADER]] ] +; OLDPM_O1-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) i32* @_ZNSt6vectorIiSaIiEEixEm(%"class.std::vector"* noundef nonnull align 8 dereferenceable(24) [[DATA]], i64 noundef [[J_07]]) +; OLDPM_O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4, !tbaa [[TBAA2:![0-9]+]] +; OLDPM_O1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 +; OLDPM_O1-NEXT: store i32 [[INC]], i32* [[CALL]], align 4, !tbaa [[TBAA2]] +; OLDPM_O1-NEXT: [[INC5]] = add nuw i64 [[J_07]], 1 +; OLDPM_O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5]], [[NUMELEMS]] +; OLDPM_O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP6:![0-9]+]] +; +; OLDPM_O2-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy +; OLDPM_O2-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OLDPM_O2-NEXT: entry: +; OLDPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; OLDPM_O2-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; OLDPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 +; OLDPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 +; OLDPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] +; OLDPM_O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; OLDPM_O2: for.cond1.preheader: +; OLDPM_O2-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; OLDPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 +; OLDPM_O2-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], 
label [[FOR_BODY4_PREHEADER:%.*]] +; OLDPM_O2: for.body4.preheader: +; OLDPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER11:%.*]], label [[VECTOR_BODY:%.*]] +; OLDPM_O2: vector.body: +; OLDPM_O2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_BODY4_PREHEADER]] ] +; OLDPM_O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDEX]] +; OLDPM_O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; OLDPM_O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !tbaa [[TBAA0:![0-9]+]] +; OLDPM_O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; OLDPM_O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; OLDPM_O2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0]] +; OLDPM_O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], +; OLDPM_O2-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], +; OLDPM_O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; OLDPM_O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; OLDPM_O2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; OLDPM_O2-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] +; OLDPM_O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; OLDPM_O2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; OLDPM_O2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; OLDPM_O2: middle.block: +; OLDPM_O2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER11]] +; OLDPM_O2: for.body4.preheader11: +; OLDPM_O2-NEXT: [[J_07_PH:%.*]] = phi i64 [ 0, [[FOR_BODY4_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; OLDPM_O2-NEXT: br label [[FOR_BODY4:%.*]] +; OLDPM_O2: for.cond.cleanup: +; OLDPM_O2-NEXT: ret void +; OLDPM_O2: for.cond.cleanup3: +; OLDPM_O2-NEXT: [[INC7]] = add nuw nsw i64 [[I_08]], 1 +; 
OLDPM_O2-NEXT: [[EXITCOND9_NOT:%.*]] = icmp eq i64 [[INC7]], 100 +; OLDPM_O2-NEXT: br i1 [[EXITCOND9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; OLDPM_O2: for.body4: +; OLDPM_O2-NEXT: [[J_07:%.*]] = phi i64 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ [[J_07_PH]], [[FOR_BODY4_PREHEADER11]] ] +; OLDPM_O2-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[J_07]] +; OLDPM_O2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ADD_PTR_I]], align 4, !tbaa [[TBAA0]] +; OLDPM_O2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 +; OLDPM_O2-NEXT: store i32 [[INC]], i32* [[ADD_PTR_I]], align 4, !tbaa [[TBAA0]] +; OLDPM_O2-NEXT: [[INC5]] = add nuw i64 [[J_07]], 1 +; OLDPM_O2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5]], [[NUMELEMS]] +; OLDPM_O2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP8:![0-9]+]] +; +; OLDPM_O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy +; OLDPM_O3-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OLDPM_O3-NEXT: entry: +; OLDPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; OLDPM_O3-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; OLDPM_O3-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] +; OLDPM_O3: for.cond1.preheader.us.preheader: +; OLDPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 +; OLDPM_O3-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 +; OLDPM_O3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] +; OLDPM_O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] +; OLDPM_O3: for.cond1.preheader.us: +; OLDPM_O3-NEXT: [[I_08_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, 
[[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; OLDPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 +; OLDPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; OLDPM_O3: vector.body: +; OLDPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] +; OLDPM_O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDEX]] +; OLDPM_O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; OLDPM_O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !tbaa [[TBAA0:![0-9]+]] +; OLDPM_O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; OLDPM_O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; OLDPM_O3-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0]] +; OLDPM_O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], +; OLDPM_O3-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD11]], +; OLDPM_O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; OLDPM_O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; OLDPM_O3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; OLDPM_O3-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] +; OLDPM_O3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; OLDPM_O3-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; OLDPM_O3-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; OLDPM_O3: middle.block: +; OLDPM_O3-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US_PREHEADER]] +; OLDPM_O3: for.body4.us.preheader: +; OLDPM_O3-NEXT: [[J_07_US_PH:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; OLDPM_O3-NEXT: br label [[FOR_BODY4_US:%.*]] +; OLDPM_O3: for.body4.us: +; OLDPM_O3-NEXT: [[J_07_US:%.*]] = phi i64 
[ [[INC5_US:%.*]], [[FOR_BODY4_US]] ], [ [[J_07_US_PH]], [[FOR_BODY4_US_PREHEADER]] ] +; OLDPM_O3-NEXT: [[ADD_PTR_I_US:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[J_07_US]] +; OLDPM_O3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ADD_PTR_I_US]], align 4, !tbaa [[TBAA0]] +; OLDPM_O3-NEXT: [[INC_US:%.*]] = add nsw i32 [[TMP10]], 1 +; OLDPM_O3-NEXT: store i32 [[INC_US]], i32* [[ADD_PTR_I_US]], align 4, !tbaa [[TBAA0]] +; OLDPM_O3-NEXT: [[INC5_US]] = add nuw i64 [[J_07_US]], 1 +; OLDPM_O3-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5_US]], [[NUMELEMS]] +; OLDPM_O3-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP7:![0-9]+]] +; OLDPM_O3: for.cond1.for.cond.cleanup3_crit_edge.us: +; OLDPM_O3-NEXT: [[INC7_US]] = add nuw nsw i64 [[I_08_US]], 1 +; OLDPM_O3-NEXT: [[EXITCOND10_NOT:%.*]] = icmp eq i64 [[INC7_US]], 100 +; OLDPM_O3-NEXT: br i1 [[EXITCOND10_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]], !llvm.loop [[LOOP9:![0-9]+]] +; OLDPM_O3: for.cond.cleanup: +; OLDPM_O3-NEXT: ret void +; +; NEWPM_O1-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy +; NEWPM_O1-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; NEWPM_O1-NEXT: entry: +; NEWPM_O1-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O1-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O1-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; NEWPM_O1: for.cond1.preheader: +; NEWPM_O1-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; NEWPM_O1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 +; NEWPM_O1-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4:%.*]] +; NEWPM_O1: for.cond.cleanup: +; 
NEWPM_O1-NEXT: ret void +; NEWPM_O1: for.cond.cleanup3: +; NEWPM_O1-NEXT: [[INC7]] = add nuw nsw i64 [[I_08]], 1 +; NEWPM_O1-NEXT: [[EXITCOND9_NOT:%.*]] = icmp eq i64 [[INC7]], 100 +; NEWPM_O1-NEXT: br i1 [[EXITCOND9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP0:![0-9]+]] +; NEWPM_O1: for.body4: +; NEWPM_O1-NEXT: [[J_07:%.*]] = phi i64 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_COND1_PREHEADER]] ] +; NEWPM_O1-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[J_07]] +; NEWPM_O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[ADD_PTR_I]], align 4, !tbaa [[TBAA2:![0-9]+]] +; NEWPM_O1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 +; NEWPM_O1-NEXT: store i32 [[INC]], i32* [[ADD_PTR_I]], align 4, !tbaa [[TBAA2]] +; NEWPM_O1-NEXT: [[INC5]] = add nuw i64 [[J_07]], 1 +; NEWPM_O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5]], [[NUMELEMS]] +; NEWPM_O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP6:![0-9]+]] +; +; NEWPM_O2-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy +; NEWPM_O2-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; NEWPM_O2-NEXT: entry: +; NEWPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O2-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 +; NEWPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 +; NEWPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] +; NEWPM_O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; NEWPM_O2: for.cond1.preheader: +; NEWPM_O2-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] +; NEWPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 +; 
NEWPM_O2-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER:%.*]] +; NEWPM_O2: for.body4.preheader: +; NEWPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER11:%.*]], label [[VECTOR_BODY:%.*]] +; NEWPM_O2: vector.body: +; NEWPM_O2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_BODY4_PREHEADER]] ] +; NEWPM_O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDEX]] +; NEWPM_O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEWPM_O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !tbaa [[TBAA0:![0-9]+]] +; NEWPM_O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; NEWPM_O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; NEWPM_O2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0]] +; NEWPM_O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], +; NEWPM_O2-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], +; NEWPM_O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEWPM_O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; NEWPM_O2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; NEWPM_O2-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] +; NEWPM_O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; NEWPM_O2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NEWPM_O2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NEWPM_O2: middle.block: +; NEWPM_O2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER11]] +; NEWPM_O2: for.body4.preheader11: +; NEWPM_O2-NEXT: [[J_07_PH:%.*]] = phi i64 [ 0, [[FOR_BODY4_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; NEWPM_O2-NEXT: br label [[FOR_BODY4:%.*]] +; NEWPM_O2: for.cond.cleanup: +; NEWPM_O2-NEXT: ret void +; NEWPM_O2: for.cond.cleanup3: 
+; NEWPM_O2-NEXT: [[INC7]] = add nuw nsw i64 [[I_08]], 1 +; NEWPM_O2-NEXT: [[EXITCOND9_NOT:%.*]] = icmp eq i64 [[INC7]], 100 +; NEWPM_O2-NEXT: br i1 [[EXITCOND9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; NEWPM_O2: for.body4: +; NEWPM_O2-NEXT: [[J_07:%.*]] = phi i64 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ [[J_07_PH]], [[FOR_BODY4_PREHEADER11]] ] +; NEWPM_O2-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[J_07]] +; NEWPM_O2-NEXT: [[TMP10:%.*]] = load i32, i32* [[ADD_PTR_I]], align 4, !tbaa [[TBAA0]] +; NEWPM_O2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 +; NEWPM_O2-NEXT: store i32 [[INC]], i32* [[ADD_PTR_I]], align 4, !tbaa [[TBAA0]] +; NEWPM_O2-NEXT: [[INC5]] = add nuw i64 [[J_07]], 1 +; NEWPM_O2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5]], [[NUMELEMS]] +; NEWPM_O2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP8:![0-9]+]] +; +; NEWPM_O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy +; NEWPM_O3-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; NEWPM_O3-NEXT: entry: +; NEWPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O3-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O3-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] +; NEWPM_O3: for.cond1.preheader.us.preheader: +; NEWPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 +; NEWPM_O3-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 +; NEWPM_O3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] +; NEWPM_O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] +; NEWPM_O3: for.cond1.preheader.us: +; NEWPM_O3-NEXT: [[I_08_US:%.*]] = phi i64 [ [[INC7_US:%.*]], 
[[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; NEWPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 +; NEWPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; NEWPM_O3: vector.body: +; NEWPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] +; NEWPM_O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDEX]] +; NEWPM_O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEWPM_O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !tbaa [[TBAA0:![0-9]+]] +; NEWPM_O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; NEWPM_O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; NEWPM_O3-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA0]] +; NEWPM_O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], +; NEWPM_O3-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD11]], +; NEWPM_O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEWPM_O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]] +; NEWPM_O3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; NEWPM_O3-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]] +; NEWPM_O3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; NEWPM_O3-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NEWPM_O3-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NEWPM_O3: middle.block: +; NEWPM_O3-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US_PREHEADER]] +; NEWPM_O3: for.body4.us.preheader: +; NEWPM_O3-NEXT: [[J_07_US_PH:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; NEWPM_O3-NEXT: br label [[FOR_BODY4_US:%.*]] +; NEWPM_O3: 
for.body4.us: +; NEWPM_O3-NEXT: [[J_07_US:%.*]] = phi i64 [ [[INC5_US:%.*]], [[FOR_BODY4_US]] ], [ [[J_07_US_PH]], [[FOR_BODY4_US_PREHEADER]] ] +; NEWPM_O3-NEXT: [[ADD_PTR_I_US:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[J_07_US]] +; NEWPM_O3-NEXT: [[TMP10:%.*]] = load i32, i32* [[ADD_PTR_I_US]], align 4, !tbaa [[TBAA0]] +; NEWPM_O3-NEXT: [[INC_US:%.*]] = add nsw i32 [[TMP10]], 1 +; NEWPM_O3-NEXT: store i32 [[INC_US]], i32* [[ADD_PTR_I_US]], align 4, !tbaa [[TBAA0]] +; NEWPM_O3-NEXT: [[INC5_US]] = add nuw i64 [[J_07_US]], 1 +; NEWPM_O3-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5_US]], [[NUMELEMS]] +; NEWPM_O3-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP7:![0-9]+]] +; NEWPM_O3: for.cond1.for.cond.cleanup3_crit_edge.us: +; NEWPM_O3-NEXT: [[INC7_US]] = add nuw nsw i64 [[I_08_US]], 1 +; NEWPM_O3-NEXT: [[EXITCOND10_NOT:%.*]] = icmp eq i64 [[INC7_US]], 100 +; NEWPM_O3-NEXT: br i1 [[EXITCOND10_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]], !llvm.loop [[LOOP9:![0-9]+]] +; NEWPM_O3: for.cond.cleanup: +; NEWPM_O3-NEXT: ret void +; +entry: + %data.addr = alloca %"class.std::vector"*, align 8 + %numElems.addr = alloca i64, align 8 + %i = alloca i64, align 8 + %cleanup.dest.slot = alloca i32, align 4 + %j = alloca i64, align 8 + store %"class.std::vector"* %data, %"class.std::vector"** %data.addr, align 8, !tbaa !3 + store i64 %numElems, i64* %numElems.addr, align 8, !tbaa !7 + %0 = bitcast i64* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) + store i64 0, i64* %i, align 8, !tbaa !7 + br label %for.cond + +for.cond: + %1 = load i64, i64* %i, align 8, !tbaa !7 + %cmp = icmp ult i64 %1, 100 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + store i32 2, i32* %cleanup.dest.slot, align 4 + %2 = bitcast i64* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 8, i8* %2) + br label %for.end8 + +for.body: + %3 = bitcast i64* %j 
to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* %3) + store i64 0, i64* %j, align 8, !tbaa !7 + br label %for.cond1 + +for.cond1: + %4 = load i64, i64* %j, align 8, !tbaa !7 + %5 = load i64, i64* %numElems.addr, align 8, !tbaa !7 + %cmp2 = icmp ult i64 %4, %5 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 + +for.cond.cleanup3: + store i32 5, i32* %cleanup.dest.slot, align 4 + %6 = bitcast i64* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 8, i8* %6) + br label %for.end + +for.body4: + %7 = load %"class.std::vector"*, %"class.std::vector"** %data.addr, align 8, !tbaa !3 + %8 = load i64, i64* %j, align 8, !tbaa !7 + %call = call noundef nonnull align 4 dereferenceable(4) i32* @_ZNSt6vectorIiSaIiEEixEm(%"class.std::vector"* noundef nonnull align 8 dereferenceable(24) %7, i64 noundef %8) + %9 = load i32, i32* %call, align 4, !tbaa !9 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %call, align 4, !tbaa !9 + br label %for.inc + +for.inc: + %10 = load i64, i64* %j, align 8, !tbaa !7 + %inc5 = add i64 %10, 1 + store i64 %inc5, i64* %j, align 8, !tbaa !7 + br label %for.cond1, !llvm.loop !11 + +for.end: + br label %for.inc6 + +for.inc6: + %11 = load i64, i64* %i, align 8, !tbaa !7 + %inc7 = add i64 %11, 1 + store i64 %inc7, i64* %i, align 8, !tbaa !7 + br label %for.cond, !llvm.loop !13 + +for.end8: + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + +define linkonce_odr dso_local noundef nonnull align 4 dereferenceable(4) i32* @_ZNSt6vectorIiSaIiEEixEm(%"class.std::vector"* noundef nonnull align 8 dereferenceable(24) %this, i64 noundef %__n) comdat align 2 { +; OLDPM_O1-LABEL: define {{[^@]+}}@_ZNSt6vectorIiSaIiEEixEm +; OLDPM_O1-SAME: (%"class.std::vector"* noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]], i64 noundef [[__N:%.*]]) local_unnamed_addr comdat align 2 { +; OLDPM_O1-NEXT: entry: +; OLDPM_O1-NEXT: [[_M_START:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[THIS]], i64 
0, i32 0, i32 0, i32 0, i32 0 +; OLDPM_O1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START]], align 8, !tbaa [[TBAA7:![0-9]+]] +; OLDPM_O1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[__N]] +; OLDPM_O1-NEXT: ret i32* [[ADD_PTR]] +; +entry: + %this.addr = alloca %"class.std::vector"*, align 8 + %__n.addr = alloca i64, align 8 + store %"class.std::vector"* %this, %"class.std::vector"** %this.addr, align 8, !tbaa !3 + store i64 %__n, i64* %__n.addr, align 8, !tbaa !14 + %this1 = load %"class.std::vector"*, %"class.std::vector"** %this.addr, align 8 + %0 = bitcast %"class.std::vector"* %this1 to %"struct.std::_Vector_base"* + %_M_impl = getelementptr inbounds %"struct.std::_Vector_base", %"struct.std::_Vector_base"* %0, i32 0, i32 0 + %1 = bitcast %"struct.std::_Vector_base>::_Vector_impl"* %_M_impl to %"struct.std::_Vector_base>::_Vector_impl_data"* + %_M_start = getelementptr inbounds %"struct.std::_Vector_base>::_Vector_impl_data", %"struct.std::_Vector_base>::_Vector_impl_data"* %1, i32 0, i32 0 + %2 = load i32*, i32** %_M_start, align 8, !tbaa !16 + %3 = load i64, i64* %__n.addr, align 8, !tbaa !14 + %add.ptr = getelementptr inbounds i32, i32* %2, i64 %3 + ret i32* %add.ptr +} + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"uwtable", i32 2} +!2 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git 69297cf639044acf48dd5d9b39b95c54dd50561d)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"any pointer", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C++ TBAA"} +!7 = !{!8, !8, i64 0} +!8 = !{!"long long", !5, i64 0} +!9 = !{!10, !10, i64 0} +!10 = !{!"int", !5, i64 0} +!11 = distinct !{!11, !12} +!12 = !{!"llvm.loop.mustprogress"} +!13 = distinct !{!13, !12} +!14 = !{!15, !15, i64 0} +!15 = !{!"long", !5, i64 0} +!16 = !{!17, !4, i64 0} +!17 = !{!"_ZTSNSt12_Vector_baseIiSaIiEE17_Vector_impl_dataE", !4, i64 0, !4, i64 8, !4, i64 16} diff --git 
a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll index 6b349b57e5faa..3ac9104708405 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -O1 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O1 %s -; RUN: opt -O2 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O2 %s -; RUN: opt -O3 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O3 %s +; RUN: opt -O2 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O23 %s +; RUN: opt -O3 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O23 %s ; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O1 %s -; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O2 %s -; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O3 %s +; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O23 %s +; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O23 %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -27,81 +27,43 @@ define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) ; OLDPM_O1: for.cond.cleanup: ; OLDPM_O1-NEXT: ret void ; -; OLDPM_O2-LABEL: @licm( -; OLDPM_O2-NEXT: entry: -; OLDPM_O2-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 -; OLDPM_O2-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; OLDPM_O2-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; OLDPM_O2: for.body.preheader: -; OLDPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; OLDPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label 
[[VECTOR_PH:%.*]] -; OLDPM_O2: vector.ph: -; OLDPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 -; OLDPM_O2-NEXT: br label [[VECTOR_BODY:%.*]] -; OLDPM_O2: vector.body: -; OLDPM_O2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; OLDPM_O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] -; OLDPM_O2-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; OLDPM_O2-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] -; OLDPM_O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 -; OLDPM_O2-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; OLDPM_O2-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] -; OLDPM_O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; OLDPM_O2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; OLDPM_O2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; OLDPM_O2: middle.block: -; OLDPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; OLDPM_O2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; OLDPM_O2: for.body.preheader3: -; OLDPM_O2-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; OLDPM_O2-NEXT: br label [[FOR_BODY:%.*]] -; OLDPM_O2: for.body: -; OLDPM_O2-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] -; OLDPM_O2-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; OLDPM_O2-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] -; OLDPM_O2-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 -; OLDPM_O2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; OLDPM_O2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; 
OLDPM_O2: for.cond.cleanup: -; OLDPM_O2-NEXT: ret void -; -; OLDPM_O3-LABEL: @licm( -; OLDPM_O3-NEXT: entry: -; OLDPM_O3-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 -; OLDPM_O3-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; OLDPM_O3-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; OLDPM_O3: for.body.preheader: -; OLDPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; OLDPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] -; OLDPM_O3: vector.ph: -; OLDPM_O3-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 -; OLDPM_O3-NEXT: br label [[VECTOR_BODY:%.*]] -; OLDPM_O3: vector.body: -; OLDPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; OLDPM_O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] -; OLDPM_O3-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; OLDPM_O3-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] -; OLDPM_O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 -; OLDPM_O3-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; OLDPM_O3-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] -; OLDPM_O3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; OLDPM_O3-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; OLDPM_O3-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; OLDPM_O3: middle.block: -; OLDPM_O3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; OLDPM_O3-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; OLDPM_O3: for.body.preheader3: -; OLDPM_O3-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; OLDPM_O3-NEXT: br label [[FOR_BODY:%.*]] -; OLDPM_O3: for.body: -; 
OLDPM_O3-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] -; OLDPM_O3-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; OLDPM_O3-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] -; OLDPM_O3-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 -; OLDPM_O3-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; OLDPM_O3-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; OLDPM_O3: for.cond.cleanup: -; OLDPM_O3-NEXT: ret void +; OLDPM_O23-LABEL: @licm( +; OLDPM_O23-NEXT: entry: +; OLDPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 +; OLDPM_O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 +; OLDPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; OLDPM_O23: for.body.preheader: +; OLDPM_O23-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 +; OLDPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] +; OLDPM_O23: vector.ph: +; OLDPM_O23-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 +; OLDPM_O23-NEXT: br label [[VECTOR_BODY:%.*]] +; OLDPM_O23: vector.body: +; OLDPM_O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; OLDPM_O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] +; OLDPM_O23-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* +; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] +; OLDPM_O23-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 +; OLDPM_O23-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* +; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] +; OLDPM_O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; OLDPM_O23-NEXT: 
[[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; OLDPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; OLDPM_O23: middle.block: +; OLDPM_O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] +; OLDPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] +; OLDPM_O23: for.body.preheader3: +; OLDPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; OLDPM_O23-NEXT: br label [[FOR_BODY:%.*]] +; OLDPM_O23: for.body: +; OLDPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] +; OLDPM_O23-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] +; OLDPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] +; OLDPM_O23-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 +; OLDPM_O23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] +; OLDPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; OLDPM_O23: for.cond.cleanup: +; OLDPM_O23-NEXT: ret void ; ; NEWPM_O1-LABEL: @licm( ; NEWPM_O1-NEXT: entry: @@ -118,81 +80,43 @@ define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) ; NEWPM_O1: for.cond.cleanup: ; NEWPM_O1-NEXT: ret void ; -; NEWPM_O2-LABEL: @licm( -; NEWPM_O2-NEXT: entry: -; NEWPM_O2-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 -; NEWPM_O2-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; NEWPM_O2-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; NEWPM_O2: for.body.preheader: -; NEWPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; NEWPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] -; NEWPM_O2: vector.ph: -; NEWPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 -; 
NEWPM_O2-NEXT: br label [[VECTOR_BODY:%.*]] -; NEWPM_O2: vector.body: -; NEWPM_O2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NEWPM_O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] -; NEWPM_O2-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; NEWPM_O2-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] -; NEWPM_O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 -; NEWPM_O2-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; NEWPM_O2-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] -; NEWPM_O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; NEWPM_O2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NEWPM_O2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; NEWPM_O2: middle.block: -; NEWPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; NEWPM_O2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; NEWPM_O2: for.body.preheader3: -; NEWPM_O2-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; NEWPM_O2-NEXT: br label [[FOR_BODY:%.*]] -; NEWPM_O2: for.body: -; NEWPM_O2-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] -; NEWPM_O2-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; NEWPM_O2-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] -; NEWPM_O2-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 -; NEWPM_O2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; NEWPM_O2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; NEWPM_O2: for.cond.cleanup: -; NEWPM_O2-NEXT: ret void -; -; NEWPM_O3-LABEL: @licm( -; NEWPM_O3-NEXT: entry: -; 
NEWPM_O3-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 -; NEWPM_O3-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; NEWPM_O3-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; NEWPM_O3: for.body.preheader: -; NEWPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; NEWPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] -; NEWPM_O3: vector.ph: -; NEWPM_O3-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 -; NEWPM_O3-NEXT: br label [[VECTOR_BODY:%.*]] -; NEWPM_O3: vector.body: -; NEWPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; NEWPM_O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] -; NEWPM_O3-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; NEWPM_O3-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] -; NEWPM_O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 -; NEWPM_O3-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; NEWPM_O3-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] -; NEWPM_O3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; NEWPM_O3-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NEWPM_O3-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; NEWPM_O3: middle.block: -; NEWPM_O3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; NEWPM_O3-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; NEWPM_O3: for.body.preheader3: -; NEWPM_O3-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] -; NEWPM_O3-NEXT: br label [[FOR_BODY:%.*]] -; NEWPM_O3: for.body: -; NEWPM_O3-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] -; 
NEWPM_O3-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; NEWPM_O3-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] -; NEWPM_O3-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 -; NEWPM_O3-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; NEWPM_O3-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; NEWPM_O3: for.cond.cleanup: -; NEWPM_O3-NEXT: ret void +; NEWPM_O23-LABEL: @licm( +; NEWPM_O23-NEXT: entry: +; NEWPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 +; NEWPM_O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 +; NEWPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; NEWPM_O23: for.body.preheader: +; NEWPM_O23-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 +; NEWPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] +; NEWPM_O23: vector.ph: +; NEWPM_O23-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 +; NEWPM_O23-NEXT: br label [[VECTOR_BODY:%.*]] +; NEWPM_O23: vector.body: +; NEWPM_O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NEWPM_O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] +; NEWPM_O23-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* +; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] +; NEWPM_O23-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 +; NEWPM_O23-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* +; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] +; NEWPM_O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; NEWPM_O23-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NEWPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NEWPM_O23: middle.block: +; NEWPM_O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] +; NEWPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] +; NEWPM_O23: for.body.preheader3: +; NEWPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; NEWPM_O23-NEXT: br label [[FOR_BODY:%.*]] +; NEWPM_O23: for.body: +; NEWPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] +; NEWPM_O23-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] +; NEWPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] +; NEWPM_O23-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 +; NEWPM_O23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] +; NEWPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NEWPM_O23: for.cond.cleanup: +; NEWPM_O23-NEXT: ret void ; entry: br label %for.cond diff --git a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll index e8e5ed90e60f1..f75c7d6ea1316 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature ; RUN: opt -O1 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O1 %s -; RUN: opt -O2 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O2 %s -; RUN: opt -O3 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O3 %s +; RUN: opt -O2 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O23 %s +; RUN: opt -O3 -S -enable-new-pm=0 < %s | FileCheck --check-prefixes=OLDPM_O23 %s ; RUN: opt -passes='default' -S < %s | FileCheck 
--check-prefixes=NEWPM_O1 %s -; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O2 %s -; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O3 %s +; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O23 %s +; RUN: opt -passes='default' -S < %s | FileCheck --check-prefixes=NEWPM_O23 %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -22,69 +22,37 @@ define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(%class.FloatVecPair* %FV ; OLDPM_O1-NEXT: call void @_ZN12FloatVecPair6vecIncEv(%class.FloatVecPair* [[FVP]]) ; OLDPM_O1-NEXT: ret void ; -; OLDPM_O2-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair -; OLDPM_O2-SAME: (%class.FloatVecPair* nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OLDPM_O2-NEXT: entry: -; OLDPM_O2-NEXT: [[BASE_I_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[FVP]], i64 0, i32 1, i32 0 -; OLDPM_O2-NEXT: [[TMP0:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]] -; OLDPM_O2-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 1 -; OLDPM_O2-NEXT: [[TMP1:%.*]] = load i32, i32* [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] -; OLDPM_O2-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 -; OLDPM_O2-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] -; OLDPM_O2: for.body7.lr.ph.i: -; OLDPM_O2-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 -; OLDPM_O2-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 -; OLDPM_O2-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I6_I]], align 8, !tbaa 
[[TBAA8:![0-9]+]] -; OLDPM_O2-NEXT: [[ARRAYIDX_I7_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef -; OLDPM_O2-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I4_I]], align 8, !tbaa [[TBAA0]] -; OLDPM_O2-NEXT: [[BASE_I2_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 -; OLDPM_O2-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I2_I]], align 8, !tbaa [[TBAA8]] -; OLDPM_O2-NEXT: [[ARRAYIDX_I3_I:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 undef -; OLDPM_O2-NEXT: [[DOTPRE_I:%.*]] = load float, float* [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9:![0-9]+]] -; OLDPM_O2-NEXT: br label [[FOR_BODY7_I:%.*]] -; OLDPM_O2: for.body7.i: -; OLDPM_O2-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], [[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], [[FOR_BODY7_I]] ] -; OLDPM_O2-NEXT: [[J_011_I:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] -; OLDPM_O2-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_I7_I]], align 4, !tbaa [[TBAA9]] -; OLDPM_O2-NEXT: [[ADD_I]] = fadd float [[TMP5]], [[TMP6]] -; OLDPM_O2-NEXT: store float [[ADD_I]], float* [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9]] -; OLDPM_O2-NEXT: [[INC_I]] = add nuw i32 [[J_011_I]], 1 -; OLDPM_O2-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] -; OLDPM_O2-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] -; OLDPM_O2: _ZN12FloatVecPair6vecIncEv.exit: -; OLDPM_O2-NEXT: ret void -; -; OLDPM_O3-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair -; OLDPM_O3-SAME: (%class.FloatVecPair* nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; OLDPM_O3-NEXT: entry: -; OLDPM_O3-NEXT: [[BASE_I_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[FVP]], i64 0, i32 1, i32 0 -; OLDPM_O3-NEXT: [[TMP0:%.*]] = load %class.HomemadeVector.0*, 
%class.HomemadeVector.0** [[BASE_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]] -; OLDPM_O3-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 1 -; OLDPM_O3-NEXT: [[TMP1:%.*]] = load i32, i32* [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] -; OLDPM_O3-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 -; OLDPM_O3-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] -; OLDPM_O3: for.body7.lr.ph.i: -; OLDPM_O3-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 -; OLDPM_O3-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 -; OLDPM_O3-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I6_I]], align 8, !tbaa [[TBAA8:![0-9]+]] -; OLDPM_O3-NEXT: [[ARRAYIDX_I7_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef -; OLDPM_O3-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I4_I]], align 8, !tbaa [[TBAA0]] -; OLDPM_O3-NEXT: [[BASE_I2_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 -; OLDPM_O3-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I2_I]], align 8, !tbaa [[TBAA8]] -; OLDPM_O3-NEXT: [[ARRAYIDX_I3_I:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 undef -; OLDPM_O3-NEXT: [[DOTPRE_I:%.*]] = load float, float* [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9:![0-9]+]] -; OLDPM_O3-NEXT: br label [[FOR_BODY7_I:%.*]] -; OLDPM_O3: for.body7.i: -; OLDPM_O3-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], [[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], [[FOR_BODY7_I]] ] -; OLDPM_O3-NEXT: [[J_011_I:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] -; OLDPM_O3-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_I7_I]], align 4, !tbaa [[TBAA9]] -; OLDPM_O3-NEXT: 
[[ADD_I]] = fadd float [[TMP5]], [[TMP6]] -; OLDPM_O3-NEXT: store float [[ADD_I]], float* [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9]] -; OLDPM_O3-NEXT: [[INC_I]] = add nuw i32 [[J_011_I]], 1 -; OLDPM_O3-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] -; OLDPM_O3-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] -; OLDPM_O3: _ZN12FloatVecPair6vecIncEv.exit: -; OLDPM_O3-NEXT: ret void +; OLDPM_O23-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair +; OLDPM_O23-SAME: (%class.FloatVecPair* nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OLDPM_O23-NEXT: entry: +; OLDPM_O23-NEXT: [[BASE_I_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[FVP]], i64 0, i32 1, i32 0 +; OLDPM_O23-NEXT: [[TMP0:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]] +; OLDPM_O23-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 1 +; OLDPM_O23-NEXT: [[TMP1:%.*]] = load i32, i32* [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] +; OLDPM_O23-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 +; OLDPM_O23-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] +; OLDPM_O23: for.body7.lr.ph.i: +; OLDPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 +; OLDPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 +; OLDPM_O23-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I6_I]], align 8, !tbaa [[TBAA8:![0-9]+]] +; OLDPM_O23-NEXT: [[ARRAYIDX_I7_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; OLDPM_O23-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** 
[[BASE_I4_I]], align 8, !tbaa [[TBAA0]] +; OLDPM_O23-NEXT: [[BASE_I2_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 +; OLDPM_O23-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I2_I]], align 8, !tbaa [[TBAA8]] +; OLDPM_O23-NEXT: [[ARRAYIDX_I3_I:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 undef +; OLDPM_O23-NEXT: [[DOTPRE_I:%.*]] = load float, float* [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9:![0-9]+]] +; OLDPM_O23-NEXT: br label [[FOR_BODY7_I:%.*]] +; OLDPM_O23: for.body7.i: +; OLDPM_O23-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], [[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], [[FOR_BODY7_I]] ] +; OLDPM_O23-NEXT: [[J_011_I:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] +; OLDPM_O23-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_I7_I]], align 4, !tbaa [[TBAA9]] +; OLDPM_O23-NEXT: [[ADD_I]] = fadd float [[TMP5]], [[TMP6]] +; OLDPM_O23-NEXT: store float [[ADD_I]], float* [[ARRAYIDX_I3_I]], align 4, !tbaa [[TBAA9]] +; OLDPM_O23-NEXT: [[INC_I]] = add nuw i32 [[J_011_I]], 1 +; OLDPM_O23-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] +; OLDPM_O23-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] +; OLDPM_O23: _ZN12FloatVecPair6vecIncEv.exit: +; OLDPM_O23-NEXT: ret void ; ; NEWPM_O1-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair ; NEWPM_O1-SAME: (%class.FloatVecPair* nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { @@ -117,69 +85,37 @@ define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(%class.FloatVecPair* %FV ; NEWPM_O1: _ZN12FloatVecPair6vecIncEv.exit: ; NEWPM_O1-NEXT: ret void ; -; NEWPM_O2-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair -; NEWPM_O2-SAME: (%class.FloatVecPair* nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; NEWPM_O2-NEXT: entry: -; NEWPM_O2-NEXT: [[BASE_I_I:%.*]] = 
getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[FVP]], i64 0, i32 1, i32 0 -; NEWPM_O2-NEXT: [[TMP0:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]] -; NEWPM_O2-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 1 -; NEWPM_O2-NEXT: [[TMP1:%.*]] = load i32, i32* [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] -; NEWPM_O2-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 -; NEWPM_O2-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] -; NEWPM_O2: for.body7.lr.ph.i: -; NEWPM_O2-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 -; NEWPM_O2-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 -; NEWPM_O2-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I4_I]], align 8, !tbaa [[TBAA8:![0-9]+]] -; NEWPM_O2-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef -; NEWPM_O2-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I6_I]], align 8, !tbaa [[TBAA0]] -; NEWPM_O2-NEXT: [[BASE_I8_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 -; NEWPM_O2-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I8_I]], align 8, !tbaa [[TBAA8]] -; NEWPM_O2-NEXT: [[ARRAYIDX_I9_I:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 undef -; NEWPM_O2-NEXT: [[DOTPRE_I:%.*]] = load float, float* [[ARRAYIDX_I9_I]], align 4, !tbaa [[TBAA9:![0-9]+]] -; NEWPM_O2-NEXT: br label [[FOR_BODY7_I:%.*]] -; NEWPM_O2: for.body7.i: -; NEWPM_O2-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], [[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], [[FOR_BODY7_I]] ] -; NEWPM_O2-NEXT: [[J_011_I:%.*]] = phi i32 [ 0, 
[[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] -; NEWPM_O2-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_I5_I]], align 4, !tbaa [[TBAA9]] -; NEWPM_O2-NEXT: [[ADD_I]] = fadd float [[TMP5]], [[TMP6]] -; NEWPM_O2-NEXT: store float [[ADD_I]], float* [[ARRAYIDX_I9_I]], align 4, !tbaa [[TBAA9]] -; NEWPM_O2-NEXT: [[INC_I]] = add nuw i32 [[J_011_I]], 1 -; NEWPM_O2-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] -; NEWPM_O2-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] -; NEWPM_O2: _ZN12FloatVecPair6vecIncEv.exit: -; NEWPM_O2-NEXT: ret void -; -; NEWPM_O3-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair -; NEWPM_O3-SAME: (%class.FloatVecPair* nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; NEWPM_O3-NEXT: entry: -; NEWPM_O3-NEXT: [[BASE_I_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[FVP]], i64 0, i32 1, i32 0 -; NEWPM_O3-NEXT: [[TMP0:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]] -; NEWPM_O3-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 1 -; NEWPM_O3-NEXT: [[TMP1:%.*]] = load i32, i32* [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] -; NEWPM_O3-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 -; NEWPM_O3-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] -; NEWPM_O3: for.body7.lr.ph.i: -; NEWPM_O3-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 -; NEWPM_O3-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 -; NEWPM_O3-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I4_I]], align 8, !tbaa [[TBAA8:![0-9]+]] -; NEWPM_O3-NEXT: 
[[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef -; NEWPM_O3-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I6_I]], align 8, !tbaa [[TBAA0]] -; NEWPM_O3-NEXT: [[BASE_I8_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 -; NEWPM_O3-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I8_I]], align 8, !tbaa [[TBAA8]] -; NEWPM_O3-NEXT: [[ARRAYIDX_I9_I:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 undef -; NEWPM_O3-NEXT: [[DOTPRE_I:%.*]] = load float, float* [[ARRAYIDX_I9_I]], align 4, !tbaa [[TBAA9:![0-9]+]] -; NEWPM_O3-NEXT: br label [[FOR_BODY7_I:%.*]] -; NEWPM_O3: for.body7.i: -; NEWPM_O3-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], [[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], [[FOR_BODY7_I]] ] -; NEWPM_O3-NEXT: [[J_011_I:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] -; NEWPM_O3-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_I5_I]], align 4, !tbaa [[TBAA9]] -; NEWPM_O3-NEXT: [[ADD_I]] = fadd float [[TMP5]], [[TMP6]] -; NEWPM_O3-NEXT: store float [[ADD_I]], float* [[ARRAYIDX_I9_I]], align 4, !tbaa [[TBAA9]] -; NEWPM_O3-NEXT: [[INC_I]] = add nuw i32 [[J_011_I]], 1 -; NEWPM_O3-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] -; NEWPM_O3-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] -; NEWPM_O3: _ZN12FloatVecPair6vecIncEv.exit: -; NEWPM_O3-NEXT: ret void +; NEWPM_O23-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair +; NEWPM_O23-SAME: (%class.FloatVecPair* nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; NEWPM_O23-NEXT: entry: +; NEWPM_O23-NEXT: [[BASE_I_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[FVP]], i64 0, i32 1, i32 0 +; NEWPM_O23-NEXT: [[TMP0:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I_I]], 
align 8, !tbaa [[TBAA0:![0-9]+]] +; NEWPM_O23-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 1 +; NEWPM_O23-NEXT: [[TMP1:%.*]] = load i32, i32* [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]] +; NEWPM_O23-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 +; NEWPM_O23-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] +; NEWPM_O23: for.body7.lr.ph.i: +; NEWPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 +; NEWPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 +; NEWPM_O23-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I4_I]], align 8, !tbaa [[TBAA8:![0-9]+]] +; NEWPM_O23-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; NEWPM_O23-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I6_I]], align 8, !tbaa [[TBAA0]] +; NEWPM_O23-NEXT: [[BASE_I8_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 +; NEWPM_O23-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I8_I]], align 8, !tbaa [[TBAA8]] +; NEWPM_O23-NEXT: [[ARRAYIDX_I9_I:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 undef +; NEWPM_O23-NEXT: [[DOTPRE_I:%.*]] = load float, float* [[ARRAYIDX_I9_I]], align 4, !tbaa [[TBAA9:![0-9]+]] +; NEWPM_O23-NEXT: br label [[FOR_BODY7_I:%.*]] +; NEWPM_O23: for.body7.i: +; NEWPM_O23-NEXT: [[TMP5:%.*]] = phi float [ [[DOTPRE_I]], [[FOR_BODY7_LR_PH_I]] ], [ [[ADD_I:%.*]], [[FOR_BODY7_I]] ] +; NEWPM_O23-NEXT: [[J_011_I:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH_I]] ], [ [[INC_I:%.*]], [[FOR_BODY7_I]] ] +; NEWPM_O23-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_I5_I]], align 4, !tbaa [[TBAA9]] +; NEWPM_O23-NEXT: [[ADD_I]] = fadd float 
[[TMP5]], [[TMP6]] +; NEWPM_O23-NEXT: store float [[ADD_I]], float* [[ARRAYIDX_I9_I]], align 4, !tbaa [[TBAA9]] +; NEWPM_O23-NEXT: [[INC_I]] = add nuw i32 [[J_011_I]], 1 +; NEWPM_O23-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[INC_I]], [[TMP1]] +; NEWPM_O23-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT]], label [[FOR_BODY7_I]], !llvm.loop [[LOOP11:![0-9]+]] +; NEWPM_O23: _ZN12FloatVecPair6vecIncEv.exit: +; NEWPM_O23-NEXT: ret void ; entry: %FVP.addr = alloca %class.FloatVecPair*, align 8 From 4846568191ba1634598eb845ce0b5fab338e6d26 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Feb 2022 13:00:46 +0100 Subject: [PATCH 075/748] [Docs] Update opaque pointers docs Expand migration instructions. --- llvm/docs/OpaquePointers.rst | 89 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/llvm/docs/OpaquePointers.rst b/llvm/docs/OpaquePointers.rst index ef05b88e7262c..ed8fd5b1e32d1 100644 --- a/llvm/docs/OpaquePointers.rst +++ b/llvm/docs/OpaquePointers.rst @@ -24,15 +24,15 @@ Issues with explicit pointee types ================================== LLVM IR pointers can be cast back and forth between pointers with different -pointee types. The pointee type does not necessarily actually represent the -actual underlying type in memory. In other words, the pointee type contains no -real semantics. +pointee types. The pointee type does not necessarily represent the actual +underlying type in memory. In other words, the pointee type carries no real +semantics. Lots of operations do not actually care about the underlying type. These operations, typically intrinsics, usually end up taking an ``i8*``. This causes lots of redundant no-op bitcasts in the IR to and from a pointer with a different pointee type. The extra bitcasts take up space and require extra work -to look through in optimizations. And more bitcasts increases the chances of +to look through in optimizations. 
And more bitcasts increase the chances of incorrect bitcasts, especially in regards to address spaces. Some instructions still need to know what type to treat the memory pointed to by @@ -86,36 +86,6 @@ opaque pointers. ret ptr %p2 } -I Still Need Pointee Types! -=========================== - -The frontend should already know what type each operation operates on based on -the input source code. However, some frontends like Clang may end up relying on -LLVM pointer pointee types to keep track of pointee types. The frontend needs to -keep track of frontend pointee types on its own. - -For optimizations around frontend types, pointee types are not useful due their -lack of semantics. Rather, since LLVM IR works on untyped memory, for a frontend -to tell LLVM about frontend types for the purposes of alias analysis, extra -metadata is added to the IR. For more information, see `TBAA -`_. - -Some specific operations still need to know what type a pointer types to. For -the most part, this is codegen and ABI specific. For example, `byval -`_ arguments are pointers, but backends need -to know the underlying type of the argument to properly lower it. In cases like -these, the attributes contain a type argument. For example, - -.. code-block:: llvm - - call void @f(ptr byval(i32) %p) - -signifies that ``%p`` as an argument should be lowered as an ``i32`` passed -indirectly. - -If you have use cases that this sort of fix doesn't cover, please email -llvm-dev. - Migration Instructions ====================== @@ -128,14 +98,44 @@ the type of relevant operations instead. For example, memory access related analyses and optimizations should use the types encoded in the load and store instructions instead of querying the pointer type. -Frontends need to be adjusted to track pointee types independently of LLVM, -insofar as they are necessary for lowering. For example, clang now tracks the -pointee type in the ``Address`` structure. 
+Here are some common ways to avoid pointer element type accesses: + +* For loads, use ``getType()``. +* For stores, use ``getValueOperand()->getType()``. +* Use ``getLoadStoreType()`` to handle both of the above in one call. +* For getelementptr instructions, use ``getSourceElementType()``. +* For calls, use ``getFunctionType()``. +* For allocas, use ``getAllocatedType()``. +* For globals, use ``getValueType()``. +* For consistency assertions, use + ``PointerType::isOpaqueOrPointeeTypeEquals()``. +* To create a pointer type in a different address space, use + ``PointerType::getWithSamePointeeType()``. +* To check that two pointers have the same element type, use + ``PointerType::hasSameElementTypeAs()``. +* While it is preferred to write code in a way that accepts both typed and + opaque pointers, ``Type::isOpaquePointerTy()`` and + ``PointerType::isOpaque()`` can be used to handle opaque pointers specially. + ``PointerType::getNonOpaquePointerElementType()`` can be used as a marker in + code-paths where opaque pointers have been explicitly excluded. +* To get the type of a byval argument, use ``getParamByValType()``. A similar + method exists for other ABI-affecting attributes that need to know the + element type, such as byref, sret, inalloca and preallocated. +* Some intrinsics require an ``elementtype`` attribute, which can be retrieved + using ``getParamElementType()``. This attribute is required in cases where + the intrinsic does not naturally encode a needed element type. This is also + used for inline assembly. + +Note that some of the methods mentioned above only exist to support both typed +and opaque pointers at the same time, and will be dropped once the migration +has completed. For example, ``isOpaqueOrPointeeTypeEquals()`` becomes +meaningless once all pointers are opaque. 
While direct usage of pointer element types is immediately apparent in code, there is a more subtle issue that opaque pointers need to contend with: A lot of code assumes that pointer equality also implies that the used load/store -type is the same. Consider the following examples with typed an opaque pointers: +type or GEP source element type is the same. Consider the following examples +with typed and opaque pointers: .. code-block:: llvm @@ -163,6 +163,13 @@ of an incorrect type. Code making such assumptions needs to be adjusted to check the accessed type explicitly: ``LI->getType() == SI->getValueOperand()->getType()``. +Frontends +--------- + +Frontends need to be adjusted to track pointee types independently of LLVM, +insofar as they are necessary for lowering. For example, clang now tracks the +pointee type in the ``Address`` structure. + Frontends using the C API through an FFI interface should be aware that a number of C API functions are deprecated and will be removed as part of the opaque pointer transition:: @@ -195,9 +202,9 @@ open problems: in opaque pointer mode. * While clang has limited support for opaque pointers (sufficient to compile - CTMark on Linux), a major effort will be needed to systematically remove all - uses of ``getPointerElementType()`` and the deprecated ``Address()`` - constructor. + most C/C++ code on Linux), a major effort will be needed to systematically + remove all uses of ``getPointerElementType()`` and the deprecated + ``Address::deprecated()`` constructor. * We do not yet have a testing strategy for how we can test both typed and opaque pointers during the migration. 
Currently, individual tests for From b254a2a703407468ef471630d9dd7b0667d45229 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 17 Feb 2022 10:56:25 +0000 Subject: [PATCH 076/748] [libc][automemcpy] Add mean/variance and simplify implementation Differential Revision: https://reviews.llvm.org/D120031 --- .../include/automemcpy/ResultAnalyzer.h | 9 ++- .../automemcpy/lib/ResultAnalyzer.cpp | 63 ++++++++++++------- .../unittests/ResultAnalyzerTest.cpp | 19 +++--- 3 files changed, 59 insertions(+), 32 deletions(-) diff --git a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h index 845c3e1e1180f..9b861c6250611 100644 --- a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h +++ b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h @@ -49,9 +49,12 @@ struct FunctionId { }; struct PerDistributionData { - double MedianBytesPerSecond; // Median of samples for this distribution. - double Score; // Normalized score for this distribution. - Grade::GradeEnum Grade; // Grade for this distribution. + std::vector BytesPerSecondSamples; + double BytesPerSecondMedian; // Median of samples for this distribution. + double BytesPerSecondMean; // Mean of samples for this distribution. + double BytesPerSecondVariance; // Variance of samples for this distribution. + double Score; // Normalized score for this distribution. + Grade::GradeEnum Grade; // Grade for this distribution. 
}; struct FunctionData { diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp index 00298f69f77f6..ed9cd1f286c2c 100644 --- a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp +++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp @@ -76,29 +76,48 @@ Grade::GradeEnum Grade::judge(double Score) { return BAD; } +static double computeUnbiasedSampleVariance(const std::vector &Samples, + const double SampleMean) { + assert(!Samples.empty()); + if (Samples.size() == 1) + return 0; + double DiffSquaresSum = 0; + for (const double S : Samples) { + const double Diff = S - SampleMean; + DiffSquaresSum += Diff * Diff; + } + return DiffSquaresSum / (Samples.size() - 1); +} + +static void processPerDistributionData(PerDistributionData &Data) { + auto &Samples = Data.BytesPerSecondSamples; + assert(!Samples.empty()); + // Sample Mean + const double Sum = std::accumulate(Samples.begin(), Samples.end(), 0.0); + Data.BytesPerSecondMean = Sum / Samples.size(); + // Unbiased Sample Variance + Data.BytesPerSecondVariance = + computeUnbiasedSampleVariance(Samples, Data.BytesPerSecondMean); + // Median + const size_t HalfSize = Samples.size() / 2; + std::nth_element(Samples.begin(), Samples.begin() + HalfSize, Samples.end()); + Data.BytesPerSecondMedian = Samples[HalfSize]; +} + std::vector getThroughputs(ArrayRef Samples) { - std::unordered_map, SampleId::Hasher> - BucketedSamples; - for (const auto &S : Samples) - BucketedSamples[S.Id].push_back(S.BytesPerSecond); - std::unordered_map, FunctionId::Hasher> - Throughputs; - for (auto &Pair : BucketedSamples) { - const auto &Id = Pair.first; - auto &Values = Pair.second; - const size_t HalfSize = Values.size() / 2; - std::nth_element(Values.begin(), Values.begin() + HalfSize, Values.end()); - const double MedianValue = Values[HalfSize]; - Throughputs[Id.Function][Id.Distribution.Name] = MedianValue; + std::unordered_map Functions; + for (const auto &S : Samples) { + auto 
&Function = Functions[S.Id.Function]; + auto &Data = Function.PerDistributionData[S.Id.Distribution.Name]; + Data.BytesPerSecondSamples.push_back(S.BytesPerSecond); } + std::vector Output; - for (auto &Pair : Throughputs) { - FunctionData Data; - Data.Id = Pair.first; - for (const auto &Pair : Pair.second) - Data.PerDistributionData[Pair.getKey()].MedianBytesPerSecond = - Pair.getValue(); - Output.push_back(std::move(Data)); + for (auto &[FunctionId, Function] : Functions) { + Function.Id = FunctionId; + for (auto &Pair : Function.PerDistributionData) + processPerDistributionData(Pair.second); + Output.push_back(std::move(Function)); } return Output; } @@ -130,7 +149,7 @@ void fillScores(MutableArrayRef Functions) { const FunctionType Type = Function.Id.Type; for (const auto &Pair : Function.PerDistributionData) { const auto &Distribution = Pair.getKey(); - const double Throughput = Pair.getValue().MedianBytesPerSecond; + const double Throughput = Pair.getValue().BytesPerSecondMedian; const Key K{Type, Distribution}; ThroughputMinMax[K].update(Throughput); } @@ -140,7 +159,7 @@ void fillScores(MutableArrayRef Functions) { const FunctionType Type = Function.Id.Type; for (const auto &Pair : Function.PerDistributionData) { const auto &Distribution = Pair.getKey(); - const double Throughput = Pair.getValue().MedianBytesPerSecond; + const double Throughput = Pair.getValue().BytesPerSecondMedian; const Key K{Type, Distribution}; Function.PerDistributionData[Distribution].Score = ThroughputMinMax[K].normalize(Throughput); diff --git a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp index bce508d17acbd..56f7bbf3d5f80 100644 --- a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp +++ b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp @@ -10,6 +10,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +using testing::DoubleNear; using testing::ElementsAre; using testing::Pair; 
using testing::SizeIs; @@ -31,8 +32,10 @@ TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsOneSample) { EXPECT_THAT(Data[0].Id, Foo1); EXPECT_THAT(Data[0].PerDistributionData, SizeIs(1)); // A single value is provided. - EXPECT_THAT( - Data[0].PerDistributionData.lookup(DistA.Name).MedianBytesPerSecond, 4); + const auto &DistributionData = Data[0].PerDistributionData.lookup(DistA.Name); + EXPECT_THAT(DistributionData.BytesPerSecondMedian, 4); + EXPECT_THAT(DistributionData.BytesPerSecondMean, 4); + EXPECT_THAT(DistributionData.BytesPerSecondVariance, 0); } TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsManySamplesSameBucket) { @@ -48,8 +51,10 @@ TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsManySamplesSameBucket) { EXPECT_THAT(Data[0].PerDistributionData, SizeIs(1)); // When multiple values are provided we pick the median one (here median of 4, // 5, 5). - EXPECT_THAT( - Data[0].PerDistributionData.lookup(DistA.Name).MedianBytesPerSecond, 5); + const auto &DistributionData = Data[0].PerDistributionData.lookup(DistA.Name); + EXPECT_THAT(DistributionData.BytesPerSecondMedian, 5); + EXPECT_THAT(DistributionData.BytesPerSecondMean, DoubleNear(4.6, 0.1)); + EXPECT_THAT(DistributionData.BytesPerSecondVariance, DoubleNear(0.33, 0.01)); } TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsServeralFunctionAndDist) { @@ -86,11 +91,11 @@ TEST(AutomemcpyJsonResultsAnalyzer, getScore) { [](const FunctionData &A, const FunctionData &B) { return A.Id < B.Id; }); EXPECT_THAT(Data[0].Id, Foo1); - EXPECT_THAT(Data[0].PerDistributionData.lookup("A").MedianBytesPerSecond, 1); + EXPECT_THAT(Data[0].PerDistributionData.lookup("A").BytesPerSecondMedian, 1); EXPECT_THAT(Data[1].Id, Foo2); - EXPECT_THAT(Data[1].PerDistributionData.lookup("A").MedianBytesPerSecond, 2); + EXPECT_THAT(Data[1].PerDistributionData.lookup("A").BytesPerSecondMedian, 2); EXPECT_THAT(Data[2].Id, Foo3); - EXPECT_THAT(Data[2].PerDistributionData.lookup("A").MedianBytesPerSecond, 3); + 
EXPECT_THAT(Data[2].PerDistributionData.lookup("A").BytesPerSecondMedian, 3); // Normalizes throughput per distribution. fillScores(Data); From e993b20c049d2f933831c26139f024e022f3d7fe Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Fri, 4 Feb 2022 17:15:12 +0000 Subject: [PATCH 077/748] [flang][driver] Add support for `-emit-llvm` This patch adds support for the `-emit-llvm` option in the frontend driver (i.e. `flang-new -fc1`). Similarly to Clang, `flang-new -fc1 -emit-llvm file.f` will generate a textual LLVM IR file. Depends on D118985 Differential Revision: https://reviews.llvm.org/D119012 --- clang/include/clang/Driver/Options.td | 2 +- .../include/flang/Frontend/FrontendActions.h | 14 ++++ .../include/flang/Frontend/FrontendOptions.h | 6 +- flang/lib/Frontend/CompilerInvocation.cpp | 3 + flang/lib/Frontend/FrontendActions.cpp | 68 +++++++++++++++++++ .../ExecuteCompilerInvocation.cpp | 2 + flang/test/Driver/driver-help.f90 | 1 + flang/test/Driver/emit-llvm.f90 | 22 ++++++ .../unittests/Frontend/FrontendActionTest.cpp | 27 ++++++++ 9 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 flang/test/Driver/emit-llvm.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index b81973155cae6..37a8e9b77bbfb 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1071,7 +1071,7 @@ def d_Flag : Flag<["-"], "d">, Group; def d_Joined : Joined<["-"], "d">, Group; def emit_ast : Flag<["-"], "emit-ast">, HelpText<"Emit Clang AST files for source inputs">; -def emit_llvm : Flag<["-"], "emit-llvm">, Flags<[CC1Option]>, Group, +def emit_llvm : Flag<["-"], "emit-llvm">, Flags<[CC1Option, FC1Option]>, Group, HelpText<"Use the LLVM representation for assembler and object files">; def emit_interface_stubs : Flag<["-"], "emit-interface-stubs">, Flags<[CC1Option]>, Group, HelpText<"Generate Interface Stub Files.">; diff --git a/flang/include/flang/Frontend/FrontendActions.h 
b/flang/include/flang/Frontend/FrontendActions.h index e3def74e0f417..6a9afd1afc5c0 100644 --- a/flang/include/flang/Frontend/FrontendActions.h +++ b/flang/include/flang/Frontend/FrontendActions.h @@ -13,6 +13,7 @@ #include "flang/Semantics/semantics.h" #include "mlir/IR/BuiltinOps.h" +#include "llvm/IR/Module.h" #include namespace Fortran::frontend { @@ -163,12 +164,25 @@ class CodeGenAction : public FrontendAction { std::unique_ptr mlirModule; std::unique_ptr mlirCtx; /// } + + /// @name LLVM IR + std::unique_ptr llvmCtx; + std::unique_ptr llvmModule; + + /// Generates an LLVM IR module from CodeGenAction::mlirModule and saves it + /// in CodeGenAction::llvmModule. + void GenerateLLVMIR(); + /// } }; class EmitMLIRAction : public CodeGenAction { void ExecuteAction() override; }; +class EmitLLVMAction : public CodeGenAction { + void ExecuteAction() override; +}; + class EmitObjAction : public CodeGenAction { void ExecuteAction() override; }; diff --git a/flang/include/flang/Frontend/FrontendOptions.h b/flang/include/flang/Frontend/FrontendOptions.h index 0ff8d0a758873..060910e3d67cd 100644 --- a/flang/include/flang/Frontend/FrontendOptions.h +++ b/flang/include/flang/Frontend/FrontendOptions.h @@ -34,6 +34,9 @@ enum ActionKind { /// Emit a .mlir file EmitMLIR, + /// Emit an .ll file + EmitLLVM, + /// Emit a .o file. EmitObj, @@ -84,9 +87,6 @@ enum ActionKind { /// Run a plugin action PluginAction - - /// TODO: RunPreprocessor, EmitLLVM, EmitLLVMOnly, - /// EmitCodeGenOnly, EmitAssembly, (...) 
}; /// \param suffix The file extension diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index af59cb6636b3a..7507b0091e13c 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -137,6 +137,9 @@ static bool ParseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, case clang::driver::options::OPT_emit_mlir: opts.programAction = EmitMLIR; break; + case clang::driver::options::OPT_emit_llvm: + opts.programAction = EmitLLVM; + break; case clang::driver::options::OPT_emit_obj: opts.programAction = EmitObj; break; diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index d981faaa84980..43ab3f689522d 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -14,6 +14,7 @@ #include "flang/Lower/Bridge.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/Support/Verifier.h" +#include "flang/Optimizer/Support/FIRContext.h" #include "flang/Optimizer/Support/InitFIR.h" #include "flang/Optimizer/Support/KindMapping.h" #include "flang/Optimizer/Support/Utils.h" @@ -28,6 +29,7 @@ #include "mlir/IR/Dialect.h" #include "mlir/Pass/PassManager.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" #include @@ -407,6 +409,72 @@ void GetSymbolsSourcesAction::ExecuteAction() { ci.semantics().DumpSymbolsSources(llvm::outs()); } +#include "flang/Tools/CLOptions.inc" + +// Lower the previously generated MLIR module into an LLVM IR module +void CodeGenAction::GenerateLLVMIR() { + assert(mlirModule && "The MLIR module has not been generated yet."); + + CompilerInstance &ci = this->instance(); + + fir::support::loadDialects(*mlirCtx); + fir::support::registerLLVMTranslation(*mlirCtx); + + // Set-up the MLIR pass manager + mlir::PassManager pm(mlirCtx.get(), mlir::OpPassManager::Nesting::Implicit); + + 
pm.addPass(std::make_unique()); + pm.enableVerifier(/*verifyPasses=*/true); + mlir::PassPipelineCLParser passPipeline("", "Compiler passes to run"); + + // Create the pass pipeline + fir::createMLIRToLLVMPassPipeline(pm); + + // Run the pass manager + if (!mlir::succeeded(pm.run(*mlirModule))) { + unsigned diagID = ci.diagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Error, "Lowering to LLVM IR failed"); + ci.diagnostics().Report(diagID); + } + + // Translate to LLVM IR + llvm::Optional moduleName = mlirModule->getName(); + llvmCtx = std::make_unique(); + llvmModule = mlir::translateModuleToLLVMIR( + *mlirModule, *llvmCtx, moduleName ? *moduleName : "FIRModule"); + + if (!llvmModule) { + unsigned diagID = ci.diagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Error, "failed to create the LLVM module"); + ci.diagnostics().Report(diagID); + return; + } +} + +void EmitLLVMAction::ExecuteAction() { + CompilerInstance &ci = this->instance(); + GenerateLLVMIR(); + + // If set, use the predefined outupt stream to print the generated module. + if (!ci.IsOutputStreamNull()) { + llvmModule->print( + ci.GetOutputStream(), /*AssemblyAnnotationWriter=*/nullptr); + return; + } + + // No predefined output stream was set. Create an output file and dump the + // generated module there. 
+ std::unique_ptr os = ci.CreateDefaultOutputFile( + /*Binary=*/false, /*InFile=*/GetCurrentFileOrBufferName(), "ll"); + if (!os) { + unsigned diagID = ci.diagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Error, "failed to create the output file"); + ci.diagnostics().Report(diagID); + return; + } + llvmModule->print(*os, /*AssemblyAnnotationWriter=*/nullptr); +} + void EmitMLIRAction::ExecuteAction() { CompilerInstance &ci = this->instance(); diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index d69242f58666b..d5c15b1c7b567 100644 --- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -35,6 +35,8 @@ static std::unique_ptr CreateFrontendBaseAction( return std::make_unique(); case EmitMLIR: return std::make_unique(); + case EmitLLVM: + return std::make_unique(); case EmitObj: return std::make_unique(); case DebugUnparse: diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90 index 622548d353411..73d4697591919 100644 --- a/flang/test/Driver/driver-help.f90 +++ b/flang/test/Driver/driver-help.f90 @@ -65,6 +65,7 @@ ! HELP-FC1-NEXT:OPTIONS: ! HELP-FC1-NEXT: -cpp Enable predefined and command line preprocessor macros ! HELP-FC1-NEXT: -D = Define to (or 1 if omitted) +! HELP-FC1-NEXT: -emit-llvm Use the LLVM representation for assembler and object files ! HELP-FC1-NEXT: -emit-mlir Build the parse tree, then lower it to MLIR ! HELP-FC1-NEXT: -emit-obj Emit native object files ! HELP-FC1-NEXT: -E Only run the preprocessor diff --git a/flang/test/Driver/emit-llvm.f90 b/flang/test/Driver/emit-llvm.f90 new file mode 100644 index 0000000000000..c62680d6b5fbc --- /dev/null +++ b/flang/test/Driver/emit-llvm.f90 @@ -0,0 +1,22 @@ +! Test the `-emit-llvm` option + +! UNSUPPORTED: system-windows +! Windows is currently not supported in flang/lib/Optimizer/CodeGen/Target.cpp + +!------------ +! 
RUN COMMAND +!------------ +! RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck %s + +!---------------- +! EXPECTED OUTPUT +!---------------- +! CHECK: ; ModuleID = 'FIRModule' +! CHECK: define void @_QQmain() +! CHECK-NEXT: ret void +! CHECK-NEXT: } + +!------ +! INPUT +!------ +end program diff --git a/flang/unittests/Frontend/FrontendActionTest.cpp b/flang/unittests/Frontend/FrontendActionTest.cpp index 249392eb22882..81a57be1fad01 100644 --- a/flang/unittests/Frontend/FrontendActionTest.cpp +++ b/flang/unittests/Frontend/FrontendActionTest.cpp @@ -161,4 +161,31 @@ TEST_F(FrontendActionTest, ParseSyntaxOnly) { .contains( ":1:14: error: IF statement is not allowed in IF statement\n")); } + +TEST_F(FrontendActionTest, EmitLLVM) { + // Populate the input file with the pre-defined input and flush it. + *(inputFileOs_) << "end program"; + inputFileOs_.reset(); + + // Set-up the action kind. + compInst_.invocation().frontendOpts().programAction = EmitLLVM; + compInst_.invocation().preprocessorOpts().noReformat = true; + + // Set-up the output stream. We are using output buffer wrapped as an output + // stream, as opposed to an actual file (or a file descriptor). + llvm::SmallVector outputFileBuffer; + std::unique_ptr outputFileStream( + new llvm::raw_svector_ostream(outputFileBuffer)); + compInst_.set_outputStream(std::move(outputFileStream)); + + // Execute the action. + bool success = ExecuteCompilerInvocation(&compInst_); + + // Validate the expected output. + EXPECT_TRUE(success); + EXPECT_TRUE(!outputFileBuffer.empty()); + + EXPECT_TRUE(llvm::StringRef(outputFileBuffer.data()) + .contains("define void @_QQmain()")); +} } // namespace From e7d65fca7ec470469ad3f8e7689b5e563346e4d7 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 17 Feb 2022 13:02:14 +0100 Subject: [PATCH 078/748] [Bazel] Fix build after ObjCopy move. 
Differential Revision: https://reviews.llvm.org/D120039 --- .../llvm-project-overlay/llvm/BUILD.bazel | 46 ++++++++++--------- .../llvm/unittests/BUILD.bazel | 16 +++++++ 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 73fcd06a0a44b..de8fb283c9f68 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -786,6 +786,27 @@ cc_library( ], ) +cc_library( + name = "ObjCopy", + srcs = glob([ + "lib/ObjCopy/**/*.cpp", + "lib/ObjCopy/**/*.h", + ]), + hdrs = glob([ + "include/llvm/ObjCopy/**/*.h", + ]), + copts = llvm_copts, + includes = ["lib/ObjCopy"], + deps = [ + ":MC", + ":Object", + ":ObjectYAML", + ":Support", + ":Target", + ":intrinsics_impl_gen", + ], +) + cc_library( name = "Object", srcs = glob([ @@ -2432,22 +2453,6 @@ cc_library( ], ) -# FIXME: This library should use `textual_hdrs` instead of `hdrs` as we don't -# want to parse or build modules for them (and haven't duplicated the necessary -# dependencies), but unfortunately that doesn't work with -# `strip_include_prefix`: https://github.com/bazelbuild/bazel/issues/12424 -# -# For now, we simply disable features that might rely on the headers parsing. -cc_library( - name = "llvm-objcopy-headers", - hdrs = glob(["tools/llvm-objcopy/**/*.h"]), - features = [ - "-parse_headers", - "-header_modules", - ], - strip_include_prefix = "tools/llvm-objcopy", -) - cc_library( name = "MCA", srcs = glob([ @@ -3337,17 +3342,15 @@ cc_binary( cc_binary( name = "llvm-objcopy", srcs = glob([ - "tools/llvm-objcopy/**/*.cpp", - # Note that we redundantly include the headers here to allow files to - # include same-directory headers in addition to including headers via - # the `llvm-objcopy-headers` rule's stripped include prefix. 
- "tools/llvm-objcopy/**/*.h", + "tools/llvm-objcopy/*.cpp", + "tools/llvm-objcopy/*.h", ]), copts = llvm_copts, stamp = 0, deps = [ ":BinaryFormat", ":MC", + ":ObjCopy", ":Object", ":ObjectYAML", ":Option", @@ -3355,7 +3358,6 @@ cc_binary( ":Target", ":llvm-bitcode-strip-opts", ":llvm-installnametool-opts", - ":llvm-objcopy-headers", ":llvm-objcopy-opts", ":llvm-strip-opts", ], diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index 4d730eff7b96b..a0f51f489ef48 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -411,6 +411,22 @@ cc_test( ], ) +cc_test( + name = "objcopy_tests", + srcs = glob( + ["ObjCopy/*.cpp"], + allow_empty = False, + ), + deps = [ + "//llvm:ObjCopy", + "//llvm:Object", + "//llvm:ObjectYAML", + "//llvm:TestingSupport", + "//llvm:gtest", + "//llvm:gtest_main", + ], +) + cc_test( name = "object_tests", size = "small", From 030503e17cae315cbfb6a9adc537d2cc82304f5a Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 17 Feb 2022 11:18:11 +0000 Subject: [PATCH 079/748] Remove duplicated code for printing the `uwtable` attribute (NFC) Committed as obvious. Reviewed By: chill Differential Revision: https://reviews.llvm.org/D120030 --- llvm/lib/IR/Attributes.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 5751b99a2807e..f88f75e23d9de 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -446,14 +446,6 @@ std::string Attribute::getAsString(bool InAttrGrp) const { Twine(Kind == UWTableKind::Sync ? "sync" : "async") + ")") .str(); } - - if (Kind != UWTableKind::None) { - if (Kind == UWTableKind::Default) - return "uwtable"; - return ("uwtable(" + Twine(Kind == UWTableKind::Sync ? 
"sync" : "async") + - ")") - .str(); - } } // Convert target-dependent attributes to strings of the form: From dce3b403a7806fb839986f9f46975fb2c7251d42 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Feb 2022 14:08:29 +0100 Subject: [PATCH 080/748] [Docs] Use correct rst syntax --- llvm/docs/OpaquePointers.rst | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/docs/OpaquePointers.rst b/llvm/docs/OpaquePointers.rst index ed8fd5b1e32d1..470a9347e3faf 100644 --- a/llvm/docs/OpaquePointers.rst +++ b/llvm/docs/OpaquePointers.rst @@ -100,35 +100,35 @@ instructions instead of querying the pointer type. Here are some common ways to avoid pointer element type accesses: -* For loads, use ''getType()''. -* For stores, use ''getValueOperand()->getType()''. -* Use ''getLoadStoreType()'' to handle both of the above in one call. -* For getelementptr instructions, use ''getSourceElementType()''. -* For calls, use ''getFunctionType()''. -* For allocas, use ''getAllocatedType()''. -* For globals, use ''getValueType()''. +* For loads, use ``getType()``. +* For stores, use ``getValueOperand()->getType()``. +* Use ``getLoadStoreType()`` to handle both of the above in one call. +* For getelementptr instructions, use ``getSourceElementType()``. +* For calls, use ``getFunctionType()``. +* For allocas, use ``getAllocatedType()``. +* For globals, use ``getValueType()``. * For consistency assertions, use - ''PointerType::isOpaqueOrPointeeTypeEquals()''. + ``PointerType::isOpaqueOrPointeeTypeEquals()``. * To create a pointer type in a different address space, use - ''PointerType::getWithSamePointeeType()''. + ``PointerType::getWithSamePointeeType()``. * To check that two pointers have the same element type, use - ''PointerType::hasSameElementTypeAs()''. + ``PointerType::hasSameElementTypeAs()``. 
* While it is preferred to write code in a way that accepts both typed and - opaque pointers, ''Type::isOpaquePointerTy()'' and - ''PointerType::isOpaque()'' can be used to handle opaque pointers specially. - ''PointerType::getNonOpaquePointerElementType()'' can be used as a marker in + opaque pointers, ``Type::isOpaquePointerTy()`` and + ``PointerType::isOpaque()`` can be used to handle opaque pointers specially. + ``PointerType::getNonOpaquePointerElementType()`` can be used as a marker in code-paths where opaque pointers have been explicitly excluded. -* To get the type of a byval argument, use ''getParamByValType()''. Similar +* To get the type of a byval argument, use ``getParamByValType()``. Similar method exists for other ABI-affecting attributes that need to know the element type, such as byref, sret, inalloca and preallocated. -* Some intrinsics require an ''elementtype'' attribute, which can be retrieved - using ''getParamElementType()''. This attribute is required in cases where +* Some intrinsics require an ``elementtype`` attribute, which can be retrieved + using ``getParamElementType()``. This attribute is required in cases where the intrinsic does not naturally encode a needed element type. This is also used for inline assembly. Note that some of the methods mentioned above only exist to support both typed and opaque pointers at the same time, and will be dropped once the migration -has completed. For example, ''isOpaqueOrPointeeTypeEquals()'' becomes +has completed. For example, ``isOpaqueOrPointeeTypeEquals()`` becomes meaningless once all pointers are opaque. While direct usage of pointer element types is immediately apparent in code, From d74f15faffa659a2316a44f3a9bae07b08011544 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 17 Feb 2022 13:31:50 +0100 Subject: [PATCH 081/748] [AArch64][NFC] Fix unused-lambda-capture warning. 
Differential Revision: https://reviews.llvm.org/D120041 --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 0c0615010ab4d..7ecc3a58c841d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -245,7 +245,7 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { bool AArch64_MC::isQForm(const MCInst &MI, const MCInstrInfo *MCII) { const auto &FPR128 = AArch64MCRegisterClasses[AArch64::FPR128RegClassID]; - return llvm::any_of(MI, [&FPR128](const MCOperand &Op) { + return llvm::any_of(MI, [&](const MCOperand &Op) { return Op.isReg() && FPR128.contains(Op.getReg()); }); } From 48e0e6cedc5672230c517ab56aa5e2fa779bcce2 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 17 Feb 2022 11:51:59 +0000 Subject: [PATCH 082/748] [llvm][automemcpy] Allow distribution filtering in analysis Differential Revision: https://reviews.llvm.org/D120037 --- libc/benchmarks/automemcpy/lib/CodeGen.cpp | 2 +- .../automemcpy/lib/ResultAnalyzerMain.cpp | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/libc/benchmarks/automemcpy/lib/CodeGen.cpp b/libc/benchmarks/automemcpy/lib/CodeGen.cpp index c150ab554b46f..2ada57a2e33b0 100644 --- a/libc/benchmarks/automemcpy/lib/CodeGen.cpp +++ b/libc/benchmarks/automemcpy/lib/CodeGen.cpp @@ -613,7 +613,7 @@ llvm::ArrayRef getMemmoveConfigurations() { // Stores `VolatileStr` into a cache and returns a StringRef of the cached // version. 
StringRef getInternalizedString(std::string VolatileStr) { - static llvm::StringSet<> StringCache; + static llvm::StringSet StringCache; return StringCache.insert(std::move(VolatileStr)).first->getKey(); } diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp index 6a657e432c18f..4a6caec469e55 100644 --- a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp +++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp @@ -20,6 +20,12 @@ namespace llvm { static cl::list InputFilenames(cl::Positional, cl::OneOrMore, cl::desc("")); +// User can filter the distributions to be taken into account. +static cl::list + KeepOnlyDistributions("keep-only-distributions", cl::ZeroOrMore, + cl::desc("")); + namespace automemcpy { // This is defined in the autogenerated 'Implementations.cpp' file. @@ -48,7 +54,7 @@ static const FunctionDescriptor &getFunctionDescriptor(StringRef FunctionName) { // Functions and distributions names are stored quite a few times so it's more // efficient to internalize these strings and refer to them through 'StringRef'. static StringRef getInternalizedString(StringRef VolatileStr) { - static llvm::StringSet<> StringCache; + static llvm::StringSet StringCache; return StringCache.insert(VolatileStr).first->getKey(); } @@ -121,6 +127,15 @@ int Main(int argc, char **argv) { llvm::append_range(Samples, Result.Samples); } + if (!KeepOnlyDistributions.empty()) { + llvm::StringSet ValidDistributions; + ValidDistributions.insert(KeepOnlyDistributions.begin(), + KeepOnlyDistributions.end()); + llvm::erase_if(Samples, [&ValidDistributions](const Sample &S) { + return !ValidDistributions.contains(S.Id.Distribution.Name); + }); + } + // Extracts median of throughputs. 
std::vector Functions = getThroughputs(Samples); fillScores(Functions); From d4342efb69598f7e789a47cffc8827c54c115f31 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Fri, 7 Jan 2022 14:47:26 +0000 Subject: [PATCH 083/748] [AArch64] Add instruction selection for strict FP This consists of marking the various strict opcodes as legal, and adjusting instruction selection patterns so that 'op' is 'any_op'. FP16 and vector instructions additionally require some extra work in lowering and legalization, so we can't set IsStrictFPEnabled just yet. Also more work needs to be done for full strict fp support (marking instructions that can raise exceptions as such, and modelling FPCR use for controlling rounding). Differential Revision: https://reviews.llvm.org/D114946 --- .../Target/AArch64/AArch64ISelLowering.cpp | 199 +++++++++--------- .../lib/Target/AArch64/AArch64InstrFormats.td | 9 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 138 ++++++------ llvm/test/CodeGen/AArch64/arm64-fmadd.ll | 112 ++++++++++ llvm/test/CodeGen/AArch64/fp-intrinsics.ll | 35 ++- 5 files changed, 321 insertions(+), 172 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9f8b183635012..d06fd2b27341a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -404,6 +404,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently + // aren't handled. // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. 
@@ -647,37 +649,35 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // AArch64 has implementations of a lot of rounding-like FP operations. - for (MVT Ty : {MVT::f32, MVT::f64}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - setOperationAction(ISD::FMINNUM, Ty, Legal); - setOperationAction(ISD::FMAXNUM, Ty, Legal); - setOperationAction(ISD::FMINIMUM, Ty, Legal); - setOperationAction(ISD::FMAXIMUM, Ty, Legal); - setOperationAction(ISD::LROUND, Ty, Legal); - setOperationAction(ISD::LLROUND, Ty, Legal); - setOperationAction(ISD::LRINT, Ty, Legal); - setOperationAction(ISD::LLRINT, Ty, Legal); - } - - if (Subtarget->hasFullFP16()) { - setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); - setOperationAction(ISD::FFLOOR, MVT::f16, Legal); - setOperationAction(ISD::FCEIL, MVT::f16, Legal); - setOperationAction(ISD::FRINT, MVT::f16, Legal); - setOperationAction(ISD::FTRUNC, MVT::f16, Legal); - setOperationAction(ISD::FROUND, MVT::f16, Legal); - setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); - setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); - } + for (auto Op : + {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, + ISD::FRINT, ISD::FTRUNC, ISD::FROUND, + ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND, + ISD::LLROUND, ISD::LRINT, ISD::LLRINT, + ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT, + ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, + ISD::STRICT_FMINIMUM, 
ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND, + ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) { + for (MVT Ty : {MVT::f32, MVT::f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + setOperationAction(Op, MVT::f16, Legal); + } + + // Basic strict FP operations are legal + for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, + ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) { + for (MVT Ty : {MVT::f32, MVT::f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + setOperationAction(Op, MVT::f16, Legal); + } + + // Strict conversion to a larger type is legal + for (auto VT : {MVT::f32, MVT::f64}) + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); setOperationAction(ISD::PREFETCH, MVT::Other, Custom); @@ -938,43 +938,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: - setOperationAction(ISD::FABS, MVT::v1f64, Expand); - setOperationAction(ISD::FADD, MVT::v1f64, Expand); - setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); - setOperationAction(ISD::FCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FDIV, MVT::v1f64, Expand); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); - setOperationAction(ISD::FMA, MVT::v1f64, Expand); - setOperationAction(ISD::FMUL, MVT::v1f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); - setOperationAction(ISD::FNEG, MVT::v1f64, Expand); - setOperationAction(ISD::FPOW, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FROUND, MVT::v1f64, Expand); - setOperationAction(ISD::FROUNDEVEN, MVT::v1f64, Expand); - setOperationAction(ISD::FRINT, MVT::v1f64, Expand); - setOperationAction(ISD::FSIN, MVT::v1f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); - 
setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); - setOperationAction(ISD::FSUB, MVT::v1f64, Expand); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); - setOperationAction(ISD::SETCC, MVT::v1f64, Expand); - setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v1i64, Expand); - - setOperationAction(ISD::MUL, MVT::v1i64, Expand); + for (auto Op : + {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC, + ISD::BR_CC, ISD::FADD, ISD::FSUB, + ISD::FMUL, ISD::FDIV, ISD::FMA, + ISD::FNEG, ISD::FABS, ISD::FCEIL, + ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, + ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, + ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD, + ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, + ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR, + ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT, + ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, + ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM, + ISD::STRICT_FMAXIMUM}) + setOperationAction(Op, MVT::v1f64, Expand); + + for (auto Op : + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, + ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL, + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, + ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND}) + setOperationAction(Op, MVT::v1i64, Expand); // AArch64 doesn't have a 
direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. @@ -982,14 +968,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); // Or, direct i32 -> f16 vector conversion. Set it so custom, so the // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::STRICT_UINT_TO_FP}) + for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32}) + setOperationAction(Op, VT, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom); @@ -1103,26 +1087,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // AArch64 has implementations of a lot of rounding-like FP operations. 
- for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - } - - if (Subtarget->hasFullFP16()) { - for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - setOperationAction(ISD::FROUNDEVEN, Ty, Legal); - } + for (auto Op : + {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, + ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR, + ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT, + ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) { + for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) + setOperationAction(Op, Ty, Legal); + if (Subtarget->hasFullFP16()) + for (MVT Ty : {MVT::v4f16, MVT::v8f16}) + setOperationAction(Op, Ty, Legal); } setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); @@ -1481,10 +1455,10 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); - setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); + for (unsigned Opcode : + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, + ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) + setOperationAction(Opcode, VT, Custom); if (!VT.isFloatingPoint()) setOperationAction(ISD::ABS, VT, Legal); @@ -1494,14 +1468,39 @@ void 
AArch64TargetLowering::addTypeForNEON(MVT VT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); - // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. + // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP + // NEON types. if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::bf16 && (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) for (unsigned Opcode : - {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) + {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM, + ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, + ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, + ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, + ISD::STRICT_FSQRT}) setOperationAction(Opcode, VT, Legal); + // Strict fp extend and trunc are legal + if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16) + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); + if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64) + setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); + + // FIXME: We could potentially make use of the vector comparison instructions + // for STRICT_FSETCC and STRICT_FSETCSS, but there's a number of + // complications: + // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons, + // so we would need to expand when the condition code doesn't match the + // kind of comparison. + // * Some kinds of comparison require more than one FCMXY instruction so + // would need to be expanded instead. + // * The lowering of the non-strict versions involves target-specific ISD + // nodes so we would likely need to add strict versions of all of them and + // handle them appropriately. 
+ setOperationAction(ISD::STRICT_FSETCC, VT, Expand); + setOperationAction(ISD::STRICT_FSETCCS, VT, Expand); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 4c1e41b7efee0..659d2a62b8c40 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -4963,15 +4963,15 @@ multiclass FPConversion { // Half-precision to Double-precision def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>; + [(set FPR64:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>; // Half-precision to Single-precision def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>; + [(set FPR32:$Rd, (any_fpextend (f16 FPR16:$Rn)))]>; // Single-precision to Double-precision def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, - [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>; + [(set FPR64:$Rd, (any_fpextend FPR32:$Rn))]>; // Single-precision to Half-precision def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, @@ -5075,7 +5075,8 @@ multiclass TwoOperandFPData opcode, string asm, } } -multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { +multiclass TwoOperandFPDataNeg opcode, string asm, + SDPatternOperator node> { def Hrr : BaseTwoOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 17c11f8bbca46..45f8abc4585f4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3890,24 +3890,24 @@ defm : FPToIntegerPats; let Predicates = [HasFullFP16] in { - def : Pat<(i32 (lround f16:$Rn)), + def : Pat<(i32 (any_lround f16:$Rn)), (!cast(FCVTASUWHr) f16:$Rn)>; - def : Pat<(i64 (lround f16:$Rn)), 
+ def : Pat<(i64 (any_lround f16:$Rn)), (!cast(FCVTASUXHr) f16:$Rn)>; - def : Pat<(i64 (llround f16:$Rn)), + def : Pat<(i64 (any_llround f16:$Rn)), (!cast(FCVTASUXHr) f16:$Rn)>; } -def : Pat<(i32 (lround f32:$Rn)), +def : Pat<(i32 (any_lround f32:$Rn)), (!cast(FCVTASUWSr) f32:$Rn)>; -def : Pat<(i32 (lround f64:$Rn)), +def : Pat<(i32 (any_lround f64:$Rn)), (!cast(FCVTASUWDr) f64:$Rn)>; -def : Pat<(i64 (lround f32:$Rn)), +def : Pat<(i64 (any_lround f32:$Rn)), (!cast(FCVTASUXSr) f32:$Rn)>; -def : Pat<(i64 (lround f64:$Rn)), +def : Pat<(i64 (any_lround f64:$Rn)), (!cast(FCVTASUXDr) f64:$Rn)>; -def : Pat<(i64 (llround f32:$Rn)), +def : Pat<(i64 (any_llround f32:$Rn)), (!cast(FCVTASUXSr) f32:$Rn)>; -def : Pat<(i64 (llround f64:$Rn)), +def : Pat<(i64 (any_llround f64:$Rn)), (!cast(FCVTASUXDr) f64:$Rn)>; //===----------------------------------------------------------------------===// @@ -3951,17 +3951,17 @@ defm FCVT : FPConversion<"fcvt">; defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; defm FMOV : SingleOperandFPData<0b0000, "fmov">; defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; -defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>; -defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; -defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; -defm FRINTN : SingleOperandFPData<0b1000, "frintn", froundeven>; -defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", any_fround>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", any_fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", any_ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", any_froundeven>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", any_fceil>; -defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; +defm FRINTX : SingleOperandFPData<0b1110, "frintx", any_frint>; +defm FRINTZ : SingleOperandFPData<0b1011, 
"frintz", any_ftrunc>; let SchedRW = [WriteFDiv] in { -defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", any_fsqrt>; } let Predicates = [HasFRInt3264] in { @@ -3971,44 +3971,48 @@ let Predicates = [HasFRInt3264] in { defm FRINT64X : FRIntNNT<0b11, "frint64x", int_aarch64_frint64x>; } // HasFRInt3264 +// Emitting strict_lrint as two instructions is valid as any exceptions that +// occur will happen in exactly one of the instructions (e.g. if the input is +// not an integer the inexact exception will happen in the FRINTX but not then +// in the FCVTZS as the output of FRINTX is an integer). let Predicates = [HasFullFP16] in { - def : Pat<(i32 (lrint f16:$Rn)), + def : Pat<(i32 (any_lrint f16:$Rn)), (FCVTZSUWHr (!cast(FRINTXHr) f16:$Rn))>; - def : Pat<(i64 (lrint f16:$Rn)), + def : Pat<(i64 (any_lrint f16:$Rn)), (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; - def : Pat<(i64 (llrint f16:$Rn)), + def : Pat<(i64 (any_llrint f16:$Rn)), (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; } -def : Pat<(i32 (lrint f32:$Rn)), +def : Pat<(i32 (any_lrint f32:$Rn)), (FCVTZSUWSr (!cast(FRINTXSr) f32:$Rn))>; -def : Pat<(i32 (lrint f64:$Rn)), +def : Pat<(i32 (any_lrint f64:$Rn)), (FCVTZSUWDr (!cast(FRINTXDr) f64:$Rn))>; -def : Pat<(i64 (lrint f32:$Rn)), +def : Pat<(i64 (any_lrint f32:$Rn)), (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; -def : Pat<(i64 (lrint f64:$Rn)), +def : Pat<(i64 (any_lrint f64:$Rn)), (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; -def : Pat<(i64 (llrint f32:$Rn)), +def : Pat<(i64 (any_llrint f32:$Rn)), (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; -def : Pat<(i64 (llrint f64:$Rn)), +def : Pat<(i64 (any_llrint f64:$Rn)), (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; //===----------------------------------------------------------------------===// // Floating point two operand instructions. 
//===----------------------------------------------------------------------===// -defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +defm FADD : TwoOperandFPData<0b0010, "fadd", any_fadd>; let SchedRW = [WriteFDiv] in { -defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +defm FDIV : TwoOperandFPData<0b0001, "fdiv", any_fdiv>; } -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaximum>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", fminimum>; +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", any_fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", any_fmaximum>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", any_fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", any_fminimum>; let SchedRW = [WriteFMul] in { -defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; -defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +defm FMUL : TwoOperandFPData<0b0000, "fmul", any_fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>; } -defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; +defm FSUB : TwoOperandFPData<0b0011, "fsub", any_fsub>; def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; @@ -4023,13 +4027,13 @@ def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), // Floating point three operand instructions. 
//===----------------------------------------------------------------------===// -defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", any_fma>; defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", - TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; + TriOpFrag<(any_fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", - TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; + TriOpFrag<(fneg (any_fma node:$LHS, node:$MHS, node:$RHS))> >; defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", - TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; + TriOpFrag<(any_fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; // The following def pats catch the case where the LHS of an FMA is negated. // The TriOpFrag above catches the case where the middle operand is negated. @@ -4218,9 +4222,9 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), (i64 4)))), (FCVTLv8i16 V128:$Rn)>; -def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (any_fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (any_fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; @@ -4232,16 +4236,16 @@ def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), def : Pat<(concat_vectors V64:$Rd, (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; -def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))), +def 
: Pat<(v2f32 (any_fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(v4f16 (any_fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (any_fpround (v2f64 V128:$Rn)))), (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; -defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; -defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>; // AArch64's FCVT instructions saturate when out of range. multiclass SIMDTwoVectorFPToIntSatPats { @@ -4273,13 +4277,13 @@ def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>; defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; -defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>; -defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; -defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; -defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", froundeven>; -defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; -defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; -defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", any_fround>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", any_fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", any_ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", any_froundeven>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", 
any_fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", any_frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", any_ftrunc>; let Predicates = [HasFRInt3264] in { defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z", int_aarch64_neon_frint32z>; @@ -4289,7 +4293,7 @@ let Predicates = [HasFRInt3264] in { } // HasFRInt3264 defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; -defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", any_fsqrt>; defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; @@ -4313,7 +4317,7 @@ defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", BinOpFrag<(add node:$LHS, (AArch64saddlp node:$RHS))> >; defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", AArch64saddlp>; -defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", any_sint_to_fp>; defm SHLL : SIMDVectorLShiftLongBySizeBHS; defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; @@ -4323,7 +4327,7 @@ defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >; defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>; -defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", any_uint_to_fp>; defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; @@ 
-4447,32 +4451,32 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast("FABD"#VT) VT:$Rn, V defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", any_fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", any_fdiv>; defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", any_fmaxnum>; defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", any_fmaximum>; defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. 
defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >; defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", any_fmul>; defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", any_fsub>; // MLA and MLS are generated in MachineCombine defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; @@ -6371,7 +6375,7 @@ defm : FMLSIndexedAfterNegPatterns< TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>; def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, diff --git a/llvm/test/CodeGen/AArch64/arm64-fmadd.ll b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll index 0ee7af0ce21c0..d7cdb835cd3c3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fmadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll @@ -109,6 +109,114 @@ entry: ret double %0 } +define float @fma32_strict(float %a, float %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: fma32_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmadd s0, s0, s1, s2 +; CHECK-NEXT: ret +entry: + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %a, float %b, float %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float 
%0 +} + +define float @fnma32_strict(float %a, float %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: fnma32_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fnmadd s0, s0, s1, s2 +; CHECK-NEXT: ret +entry: + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %a, float %b, float %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %neg = fneg float %0 + ret float %neg +} + +define float @fms32_strict(float %a, float %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: fms32_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmsub s0, s0, s1, s2 +; CHECK-NEXT: ret +entry: + %neg = fneg float %b + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %a, float %neg, float %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %0 +} + +define float @fms32_com_strict(float %a, float %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: fms32_com_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmsub s0, s0, s1, s2 +; CHECK-NEXT: ret +entry: + %neg = fneg float %b + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %neg, float %a, float %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %0 +} + +define float @fnms32_strict(float %a, float %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: fnms32_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fnmsub s0, s0, s1, s2 +; CHECK-NEXT: ret +entry: + %neg = fneg float %c + %0 = tail call float @llvm.experimental.constrained.fma.f32(float %a, float %b, float %neg, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %0 +} + +define double @fma64_strict(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: fma64_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmadd d0, d0, d1, d2 +; CHECK-NEXT: ret +entry: + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %a, double %b, double %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 
+ ret double %0 +} + +define double @fnma64_strict(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: fnma64_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fnmadd d0, d0, d1, d2 +; CHECK-NEXT: ret +entry: + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %a, double %b, double %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %neg = fneg double %0 + ret double %neg +} + +define double @fms64_strict(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: fms64_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmsub d0, d0, d1, d2 +; CHECK-NEXT: ret +entry: + %neg = fneg double %b + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %a, double %neg, double %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %0 +} + +define double @fms64_com_strict(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: fms64_com_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmsub d0, d0, d1, d2 +; CHECK-NEXT: ret +entry: + %neg = fneg double %b + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %neg, double %a, double %c, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %0 +} + +define double @fnms64_strict(double %a, double %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: fnms64_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fnmsub d0, d0, d1, d2 +; CHECK-NEXT: ret +entry: + %neg = fneg double %c + %0 = tail call double @llvm.experimental.constrained.fma.f64(double %a, double %b, double %neg, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %0 +} + ; This would crash while trying getNegatedExpression(). 
define float @negated_constant(float %x) { @@ -127,5 +235,9 @@ define float @negated_constant(float %x) { ret float %nfma } +attributes #0 = { strictfp } + declare float @llvm.fma.f32(float, float, float) nounwind readnone declare double @llvm.fma.f64(double, double, double) nounwind readnone +declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll index f2694ab08a0db..a434332270ebb 100644 --- a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-eabi %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-none-eabi %s -disable-strictnode-mutation -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=true -global-isel-abort=2 -disable-strictnode-mutation %s -o - | FileCheck %s ; Check that constrained fp intrinsics are correctly lowered. 
@@ -231,6 +232,20 @@ define float @minnum_f32(float %x, float %y) #0 { ret float %val } +; CHECK-LABEL: maximum_f32: +; CHECK: fmax s0, s0, s1 +define float @maximum_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.maximum.f32(float %x, float %y, metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: minimum_f32: +; CHECK: fmin s0, s0, s1 +define float @minimum_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.minimum.f32(float %x, float %y, metadata !"fpexcept.strict") #0 + ret float %val +} + ; CHECK-LABEL: ceil_f32: ; CHECK: frintp s0, s0 define float @ceil_f32(float %x) #0 { @@ -701,6 +716,20 @@ define double @minnum_f64(double %x, double %y) #0 { ret double %val } +; CHECK-LABEL: maximum_f64: +; CHECK: fmax d0, d0, d1 +define double @maximum_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.maximum.f64(double %x, double %y, metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: minimum_f64: +; CHECK: fmin d0, d0, d1 +define double @minimum_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.minimum.f64(double %x, double %y, metadata !"fpexcept.strict") #0 + ret double %val +} + ; CHECK-LABEL: ceil_f64: ; CHECK: frintp d0, d0 define double @ceil_f64(double %x) #0 { @@ -1483,6 +1512,8 @@ declare i32 @llvm.experimental.constrained.lrint.f32(float, metadata, metadata) declare i64 @llvm.experimental.constrained.llrint.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.maxnum.f32(float, float, metadata) declare float @llvm.experimental.constrained.minnum.f32(float, float, metadata) +declare float @llvm.experimental.constrained.maximum.f32(float, float, metadata) +declare float @llvm.experimental.constrained.minimum.f32(float, float, metadata) declare float @llvm.experimental.constrained.ceil.f32(float, metadata) declare float @llvm.experimental.constrained.floor.f32(float, metadata) 
declare i32 @llvm.experimental.constrained.lround.f32(float, metadata) @@ -1525,6 +1556,8 @@ declare i32 @llvm.experimental.constrained.lrint.f64(double, metadata, metadata) declare i64 @llvm.experimental.constrained.llrint.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.maxnum.f64(double, double, metadata) declare double @llvm.experimental.constrained.minnum.f64(double, double, metadata) +declare double @llvm.experimental.constrained.maximum.f64(double, double, metadata) +declare double @llvm.experimental.constrained.minimum.f64(double, double, metadata) declare double @llvm.experimental.constrained.ceil.f64(double, metadata) declare double @llvm.experimental.constrained.floor.f64(double, metadata) declare i32 @llvm.experimental.constrained.lround.f64(double, metadata) From 9d68ed08178d590e294761bde2ef471fbf3fe14d Mon Sep 17 00:00:00 2001 From: John Brawn Date: Fri, 28 Jan 2022 14:10:51 +0000 Subject: [PATCH 084/748] [AArch64] Allow strict opcodes in fp->int->fp patterns These patterns don't change the fundamental instructions that are used, just the variants that are used in order to remove some extra MOVs. Differential Revision: https://reviews.llvm.org/D118485 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 +-- llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll | 87 +++++++++++++++++++++ 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 45f8abc4585f4..0f88fc950eb41 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4983,19 +4983,19 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. 
let Predicates = [HasNEON] in { -def : Pat<(f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))), +def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; -def : Pat<(f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))), +def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), (SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>; -def : Pat<(f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))), +def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), (UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>; -def : Pat<(f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))), +def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; let Predicates = [HasFullFP16] in { -def : Pat<(f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))), +def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; -def : Pat<(f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))), +def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } } diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll index 981818f683483..d7bdf2d264c4e 100644 --- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll @@ -72,3 +72,90 @@ entry: %conv1 = uitofp i32 %conv to half ret half %conv1 } + +define double @t1_strict(double %x) #0 { +; CHECK-LABEL: t1_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: scvtf d0, d0 +; CHECK-NEXT: ret +entry: + %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f64(double %x, metadata !"fpexcept.strict") #0 + %conv1 = call double @llvm.experimental.constrained.sitofp.i64.f64(i64 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %conv1 +} + +define float @t2_strict(float %x) #0 { +; CHECK-LABEL: t2_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: scvtf 
s0, s0 +; CHECK-NEXT: ret +entry: + %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %x, metadata !"fpexcept.strict") #0 + %conv1 = call float @llvm.experimental.constrained.sitofp.i32.f32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %conv1 +} + +define half @t3_strict(half %x) #0 { +; CHECK-LABEL: t3_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs h0, h0 +; CHECK-NEXT: scvtf h0, h0 +; CHECK-NEXT: ret +entry: + %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, metadata !"fpexcept.strict") #0 + %conv1 = call half @llvm.experimental.constrained.sitofp.i32.f16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %conv1 +} + +define double @t4_strict(double %x) #0 { +; CHECK-LABEL: t4_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ret +entry: + %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %x, metadata !"fpexcept.strict") #0 + %conv1 = call double @llvm.experimental.constrained.uitofp.i64.f64(i64 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %conv1 +} + +define float @t5_strict(float %x) #0 { +; CHECK-LABEL: t5_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: ucvtf s0, s0 +; CHECK-NEXT: ret +entry: + %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, metadata !"fpexcept.strict") #0 + %conv1 = call float @llvm.experimental.constrained.uitofp.i32.f32(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %conv1 +} + +define half @t6_strict(half %x) #0 { +; CHECK-LABEL: t6_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu h0, h0 +; CHECK-NEXT: ucvtf h0, h0 +; CHECK-NEXT: ret +entry: + %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, metadata !"fpexcept.strict") #0 + %conv1 = call half 
@llvm.experimental.constrained.uitofp.i32.f16(i32 %conv, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %conv1 +} + +attributes #0 = { strictfp } + +declare i32 @llvm.experimental.constrained.fptosi.i32.f16(half, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.f16(half, metadata) +declare i32 @llvm.experimental.constrained.fptosi.i32.f32(float, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.f32(float, metadata) +declare i64 @llvm.experimental.constrained.fptosi.i64.f64(double, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata) +declare half @llvm.experimental.constrained.sitofp.i32.f16(i32, metadata, metadata) +declare half @llvm.experimental.constrained.uitofp.i32.f16(i32, metadata, metadata) +declare float @llvm.experimental.constrained.sitofp.i32.f32(i32, metadata, metadata) +declare float @llvm.experimental.constrained.uitofp.i32.f32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.sitofp.i64.f64(i64, metadata, metadata) +declare double @llvm.experimental.constrained.uitofp.i64.f64(i64, metadata, metadata) From b670da798d352c2edcee1d5ad832905b3923c8f3 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Fri, 28 Jan 2022 14:31:17 +0000 Subject: [PATCH 085/748] [AArch64] Allow strict opcodes in indexed fmul and fma patterns Using an indexed version instead of a non-indexed version doesn't change anything with regards to exceptions or rounding. 
Differential Revision: https://reviews.llvm.org/D118487 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 22 ++-- llvm/test/CodeGen/AArch64/arm64-vmul.ll | 84 ++++++++++++++ .../AArch64/neon-scalar-by-elem-fma.ll | 106 ++++++++++++++++++ 3 files changed, 201 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 0f88fc950eb41..53a06c2b9e8e5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6280,18 +6280,18 @@ let hasSideEffects = 0 in { // On the other hand, there are quite a few valid combinatorial options due to // the commutativity of multiplication and the fact that (-x) * y = x * (-y). defm : SIMDFPIndexedTiedPatterns<"FMLA", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; + TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)>>; defm : SIMDFPIndexedTiedPatterns<"FMLA", - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)>>; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; + TriOpFrag<(any_fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; + TriOpFrag<(any_fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; defm : SIMDFPIndexedTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + TriOpFrag<(any_fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; multiclass FMLSIndexedAfterNegPatterns { // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit @@ -6370,22 +6370,22 @@ multiclass FMLSIndexedAfterNegPatterns { } defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; + 
TriOpFrag<(any_fma node:$RHS, node:$MHS, node:$LHS)> >; defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + TriOpFrag<(any_fma node:$MHS, node:$RHS, node:$LHS)> >; defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", any_fmul>; -def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), +def : Pat<(v2f32 (any_fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; -def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), +def : Pat<(v4f32 (any_fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv4i32_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; -def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), +def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), (FMULv2i64_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), (i64 0))>; diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 30a1bc5d8c1d4..482a1c5941e29 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -845,6 +845,90 @@ entry: ret <2 x double> %fmla1 } +define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp { +; CHECK-LABEL: fmls_indexed_2s_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmls.2s v0, v2, v1[0] +; CHECK-NEXT: ret +entry: + %0 = fneg <2 x float> %c + %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer + %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x float> %fmls1 +} + +define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, 
<4 x float> %b, <4 x float> %c) nounwind readnone ssp { +; CHECK-LABEL: fmls_indexed_4s_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls.4s v0, v2, v1[0] +; CHECK-NEXT: ret +entry: + %0 = fneg <4 x float> %c + %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer + %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %fmls1 +} + +define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp { +; CHECK-LABEL: fmls_indexed_2d_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls.2d v0, v2, v1[0] +; CHECK-NEXT: ret +entry: + %0 = fneg <2 x double> %c + %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer + %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %fmls1 +} + +define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: fmla_indexed_scalar_2s_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: fmla.2s v0, v1, v2[0] +; CHECK-NEXT: ret +entry: + %v1 = insertelement <2 x float> undef, float %c, i32 0 + %v2 = insertelement <2 x float> %v1, float %c, i32 1 + %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x float> %fmla1 +} + +define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp { +; CHECK-LABEL: fmla_indexed_scalar_4s_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; 
CHECK-NEXT: fmla.4s v0, v1, v2[0] +; CHECK-NEXT: ret +entry: + %v1 = insertelement <4 x float> undef, float %c, i32 0 + %v2 = insertelement <4 x float> %v1, float %c, i32 1 + %v3 = insertelement <4 x float> %v2, float %c, i32 2 + %v4 = insertelement <4 x float> %v3, float %c, i32 3 + %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %fmla1 +} + +define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp { +; CHECK-LABEL: fmla_indexed_scalar_2d_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla.2d v0, v1, v2[0] +; CHECK-NEXT: ret +entry: + %v1 = insertelement <2 x double> undef, double %c, i32 0 + %v2 = insertelement <2 x double> %v1, double %c, i32 1 + %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %fmla1 +} + +attributes #0 = { strictfp } + +declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata) + define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; CHECK-LABEL: mul_4h: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll index 32f59626b3812..5ae08cf20c392 100644 --- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll +++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll @@ -1,7 +1,11 @@ ; RUN: llc < %s -verify-machineinstrs 
-mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s +attributes #0 = { strictfp } + declare float @llvm.fma.f32(float, float, float) declare double @llvm.fma.f64(double, double, double) +declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_fmla_ss4S @@ -106,3 +110,105 @@ define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) { ret double %tmp3 } +define float @test_fmla_ss4S_strict(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmla_ss4S_strict + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %tmp2 +} + +define float @test_fmla_ss4S_swap_strict(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmla_ss4S_swap_strict + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %a, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %tmp2 +} + +define float @test_fmla_ss2S_strict(float %a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmla_ss2S_strict + ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %tmp2 +} + +define double @test_fmla_ddD_strict(double %a, double %b, <1 x double> %v) { + ; CHECK-LABEL: test_fmla_ddD_strict + ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd 
d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}} + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %tmp2 +} + +define double @test_fmla_dd2D_strict(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmla_dd2D_strict + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %tmp2 +} + +define double @test_fmla_dd2D_swap_strict(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmla_dd2D_swap_strict + ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %tmp2 +} + +define float @test_fmls_ss4S_strict(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmls_ss4S_strict + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = fneg float %tmp1 + %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %tmp3 +} + +define float @test_fmls_ss4S_swap_strict(float %a, float %b, <4 x float> %v) { + ; CHECK-LABEL: test_fmls_ss4S_swap_strict + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] + %tmp1 = extractelement <4 x float> %v, i32 3 + %tmp2 = fneg float %tmp1 + %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %tmp2, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %tmp3 +} + +define float @test_fmls_ss2S_strict(float 
%a, float %b, <2 x float> %v) { + ; CHECK-LABEL: test_fmls_ss2S_strict + ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] + %tmp1 = extractelement <2 x float> %v, i32 1 + %tmp2 = fneg float %tmp1 + %tmp3 = call float @llvm.experimental.constrained.fma.f32(float %tmp2, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %tmp3 +} + +define double @test_fmls_ddD_strict(double %a, double %b, <1 x double> %v) { + ; CHECK-LABEL: test_fmls_ddD_strict + ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}} + %tmp1 = extractelement <1 x double> %v, i32 0 + %tmp2 = fneg double %tmp1 + %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %tmp3 +} + +define double @test_fmls_dd2D_strict(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmls_dd2D_strict + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = fneg double %tmp1 + %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %tmp3 +} + +define double @test_fmls_dd2D_swap_strict(double %a, double %b, <2 x double> %v) { + ; CHECK-LABEL: test_fmls_dd2D_swap_strict + ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] + %tmp1 = extractelement <2 x double> %v, i32 1 + %tmp2 = fneg double %tmp1 + %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %tmp2, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %tmp3 +} + From d916856bee1165aa78ca342cdd43523c33333736 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Fri, 28 Jan 2022 15:05:39 +0000 Subject: [PATCH 086/748] [AArch64] Allow strict opcodes in faddp patterns This also requires adjustment to code in 
AArch64ISelLowering so that vector_extract is distributed over strict_fadd. Differential Revision: https://reviews.llvm.org/D118489 --- .../Target/AArch64/AArch64ISelLowering.cpp | 33 ++++++-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 +-- llvm/test/CodeGen/AArch64/faddp.ll | 80 +++++++++++++++++++ 3 files changed, 111 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d06fd2b27341a..6e763202ce917 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14228,6 +14228,7 @@ static SDValue performANDCombine(SDNode *N, static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { switch (Opcode) { + case ISD::STRICT_FADD: case ISD::FADD: return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; case ISD::ADD: @@ -14244,6 +14245,7 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); const bool FullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); + bool IsStrict = N0->isStrictFPOpcode(); // Rewrite for pairwise fadd pattern // (f32 (extract_vector_elt @@ -14252,11 +14254,14 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { // -> // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) // (extract_vector_elt (vXf32 Other) 1)) + // For strict_fadd we need to make sure the old strict_fadd can be deleted, so + // we can only do this when it's used only by the extract_vector_elt. if (ConstantN1 && ConstantN1->getZExtValue() == 0 && - hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) { + hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) && + (!IsStrict || N0.hasOneUse())) { SDLoc DL(N0); - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); + SDValue N00 = N0->getOperand(IsStrict ? 1 : 0); + SDValue N01 = N0->getOperand(IsStrict ? 
2 : 1); ShuffleVectorSDNode *Shuffle = dyn_cast(N01); SDValue Other = N00; @@ -14269,11 +14274,23 @@ static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { if (Shuffle && Shuffle->getMaskElt(0) == 1 && Other == Shuffle->getOperand(0)) { - return DAG.getNode(N0->getOpcode(), DL, VT, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(0, DL, MVT::i64)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(1, DL, MVT::i64))); + SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(0, DL, MVT::i64)); + SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(1, DL, MVT::i64)); + if (!IsStrict) + return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2); + + // For strict_fadd we need uses of the final extract_vector to be replaced + // with the strict_fadd, but we also need uses of the chain output of the + // original strict_fadd to use the chain output of the new strict_fadd as + // otherwise it may not be deleted. 
+ SDValue Ret = DAG.getNode(N0->getOpcode(), DL, + {VT, MVT::Other}, + {N0->getOperand(0), Extract1, Extract2}); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1)); + return SDValue(N, 0); } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 53a06c2b9e8e5..664f670d741c0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8100,17 +8100,17 @@ defm : InsertSubvectorUndef; def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; -def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), - (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), +def : Pat<(f64 (any_fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, // so we match on v4f32 here, not v2f32. This will also catch adding // the low two lanes of a true v4f32 vector. -def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), - (vector_extract (v4f32 FPR128:$Rn), (i64 1))), +def : Pat<(any_fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; -def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), - (vector_extract (v8f16 FPR128:$Rn), (i64 1))), +def : Pat<(any_fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), + (vector_extract (v8f16 FPR128:$Rn), (i64 1))), (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Scalar 64-bit shifts in FPR64 registers. 
diff --git a/llvm/test/CodeGen/AArch64/faddp.ll b/llvm/test/CodeGen/AArch64/faddp.ll index 06e976136c375..1476f7bcda5e0 100644 --- a/llvm/test/CodeGen/AArch64/faddp.ll +++ b/llvm/test/CodeGen/AArch64/faddp.ll @@ -100,3 +100,83 @@ entry: %1 = extractelement <2 x i64> %0, i32 0 ret i64 %1 } + +define float @faddp_2xfloat_strict(<2 x float> %a) #0 { +; CHECK-LABEL: faddp_2xfloat_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> + %0 = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> %shift, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %1 = extractelement <2 x float> %0, i32 0 + ret float %1 +} + +define float @faddp_4xfloat_strict(<4 x float> %a) #0 { +; CHECK-LABEL: faddp_4xfloat_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %0 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %a, <4 x float> %shift, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %1 = extractelement <4 x float> %0, i32 0 + ret float %1 +} + +define float @faddp_4xfloat_commute_strict(<4 x float> %a) #0 { +; CHECK-LABEL: faddp_4xfloat_commute_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> + %0 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %shift, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %1 = extractelement <4 x float> %0, i32 0 + ret float %1 +} + +define float @faddp_2xfloat_commute_strict(<2 x float> %a) #0 { +; CHECK-LABEL: faddp_2xfloat_commute_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; 
CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> + %0 = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %shift, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %1 = extractelement <2 x float> %0, i32 0 + ret float %1 +} + +define double @faddp_2xdouble_strict(<2 x double> %a) #0 { +; CHECK-LABEL: faddp_2xdouble_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> + %0 = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %a, <2 x double> %shift, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %1 = extractelement <2 x double> %0, i32 0 + ret double %1 +} + +define double @faddp_2xdouble_commute_strict(<2 x double> %a) #0 { +; CHECK-LABEL: faddp_2xdouble_commute_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret +entry: + %shift = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> + %0 = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %shift, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + %1 = extractelement <2 x double> %0, i32 0 + ret double %1 +} + +attributes #0 = { strictfp } + +declare <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float>, <2 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) From d955ca49379e73485304eb7f500db53e33109b0f Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 17 Feb 2022 13:51:27 +0100 Subject: [PATCH 087/748] [BufferDeallocation] Don't assume successor operands are unique This would create a double free when a memref is passed twice to 
the same op. This wasn't a problem at the time the pass was written but is common since the introduction of scf.while. There's a latent non-determinism that's triggered by the test, but this change is messy enough as-is so I'll leave that for later. Differential Revision: https://reviews.llvm.org/D120044 --- .../Transforms/BufferDeallocation.cpp | 17 +++--- .../Transforms/buffer-deallocation.mlir | 58 +++++++++++++++++++ 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp index f3646806639e3..6d04dd4e92e0d 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocation.cpp @@ -376,17 +376,20 @@ class BufferDeallocation : public BufferPlacementTransformationBase { // Determine the actual operand to introduce a clone for and rewire the // operand to point to the clone instead. 
- Value operand = - regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber()) - [llvm::find(it->getSuccessorInputs(), blockArg).getIndex()]; + auto operands = + regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber()); + size_t operandIndex = + llvm::find(it->getSuccessorInputs(), blockArg).getIndex() + + operands.getBeginOperandIndex(); + Value operand = parentOp->getOperand(operandIndex); + assert(operand == + operands[operandIndex - operands.getBeginOperandIndex()] && + "region interface operands don't match parentOp operands"); auto clone = introduceCloneBuffers(operand, parentOp); if (failed(clone)) return failure(); - auto op = llvm::find(parentOp->getOperands(), operand); - assert(op != parentOp->getOperands().end() && - "parentOp does not contain operand"); - parentOp->setOperand(op.getIndex(), *clone); + parentOp->setOperand(operandIndex, *clone); return success(); } diff --git a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation.mlir b/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation.mlir index 0a80265aba50f..e7219b5c8cb7a 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation.mlir @@ -1222,3 +1222,61 @@ func @dealloc_existing_clones(%arg0: memref, %arg1: memref) -> %1 = bufferization.clone %arg1 : memref to memref return %0 : memref } + +// ----- + +// CHECK-LABEL: func @while_two_arg +func @while_two_arg(%arg0: index) { + %a = memref.alloc(%arg0) : memref +// CHECK: %[[WHILE:.*]]:2 = scf.while (%[[ARG1:.*]] = %[[ALLOC:.*]], %[[ARG2:.*]] = %[[CLONE:.*]]) + scf.while (%arg1 = %a, %arg2 = %a) : (memref, memref) -> (memref, memref) { +// CHECK-NEXT: make_condition + %0 = "test.make_condition"() : () -> i1 +// CHECK-NEXT: bufferization.clone %[[ARG2]] +// CHECK-NEXT: memref.dealloc %[[ARG2]] + scf.condition(%0) %arg1, %arg2 : memref, memref + } do { + ^bb0(%arg1: memref, %arg2: memref): +// CHECK: 
%[[ALLOC2:.*]] = memref.alloc + %b = memref.alloc(%arg0) : memref +// CHECK: memref.dealloc %[[ARG2]] +// CHECK: %[[CLONE2:.*]] = bufferization.clone %[[ALLOC2]] +// CHECK: memref.dealloc %[[ALLOC2]] + scf.yield %arg1, %b : memref, memref + } +// CHECK: } +// CHECK-NEXT: memref.dealloc %[[WHILE]]#1 +// CHECK-NEXT: memref.dealloc %[[ALLOC]] +// CHECK-NEXT: return + return +} + +// ----- + +func @while_three_arg(%arg0: index) { +// CHECK: %[[ALLOC:.*]] = memref.alloc + %a = memref.alloc(%arg0) : memref +// CHECK-NEXT: %[[CLONE1:.*]] = bufferization.clone %[[ALLOC]] +// CHECK-NEXT: %[[CLONE2:.*]] = bufferization.clone %[[ALLOC]] +// CHECK-NEXT: %[[CLONE3:.*]] = bufferization.clone %[[ALLOC]] +// CHECK-NEXT: memref.dealloc %[[ALLOC]] +// CHECK-NEXT: %[[WHILE:.*]]:3 = scf.while +// FIXME: This is non-deterministic +// CHECK-SAME-DAG: [[CLONE1]] +// CHECK-SAME-DAG: [[CLONE2]] +// CHECK-SAME-DAG: [[CLONE3]] + scf.while (%arg1 = %a, %arg2 = %a, %arg3 = %a) : (memref, memref, memref) -> (memref, memref, memref) { + %0 = "test.make_condition"() : () -> i1 + scf.condition(%0) %arg1, %arg2, %arg3 : memref, memref, memref + } do { + ^bb0(%arg1: memref, %arg2: memref, %arg3: memref): + %b = memref.alloc(%arg0) : memref + %q = memref.alloc(%arg0) : memref + scf.yield %q, %b, %arg2: memref, memref, memref + } +// CHECK-DAG: memref.dealloc %[[WHILE]]#0 +// CHECK-DAG: memref.dealloc %[[WHILE]]#1 +// CHECK-DAG: memref.dealloc %[[WHILE]]#2 +// CHECK-NEXT: return + return +} From 21ac47439218e6222a48c451a0d4d86d5e8f02b7 Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Thu, 17 Feb 2022 21:13:00 +0800 Subject: [PATCH 088/748] [NFC] Correct typo `interger` to `integer` --- flang/lib/Optimizer/Support/KindMapping.cpp | 2 +- llvm/lib/Target/PowerPC/PPCISelLowering.h | 2 +- llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll | 2 +- .../Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py | 2 +- .../Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py | 2 +- 
.../Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/flang/lib/Optimizer/Support/KindMapping.cpp b/flang/lib/Optimizer/Support/KindMapping.cpp index d22a438e3b812..4535863abf8e4 100644 --- a/flang/lib/Optimizer/Support/KindMapping.cpp +++ b/flang/lib/Optimizer/Support/KindMapping.cpp @@ -87,7 +87,7 @@ static RT doLookup(std::function def, return def(kind); } -// do a lookup for INTERGER, LOGICAL, or CHARACTER +// do a lookup for integer, LOGICAL, or CHARACTER template static Bitsize getIntegerLikeBitsize(KindTy kind, const MAP &map) { return doLookup(defaultScalingKind, map, kind); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 4131fa498259d..826d26ce85631 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -77,7 +77,7 @@ namespace llvm { FCTIDUZ, FCTIWUZ, - /// Floating-point-to-interger conversion instructions + /// Floating-point-to-integer conversion instructions FP_TO_UINT_IN_VSR, FP_TO_SINT_IN_VSR, diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll b/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll index 9eb3c0d761c8a..36bb5c88adef0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; Regression test from https://bugs.llvm.org/show_bug.cgi?id=39168 ; Based on code from `compiler-rt/lib/builtins/multc3.c` -; On plaforms where fp128 lowers to an interger type (soft-fp) we +; On plaforms where fp128 lowers to an integer type (soft-fp) we ; shouldn't be calling isFAbsFree() on the legalized type. 
; RUN: opt -slp-vectorizer -slp-threshold=-10 -S %s | FileCheck %s diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py b/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py index cfed5a89c9ae4..d238e6fdb79b4 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py @@ -47,7 +47,7 @@ def sparse_tensor_to_coo_tensor(support_lib, sparse, dtype): Returns: A tuple that contains the following values: rank: An integer for the rank of the tensor. - nse: An interger for the number of non-zero values in the tensor. + nse: An integer for the number of non-zero values in the tensor. shape: A 1D numpy array of integers, for the shape of the tensor. values: A 1D numpy array, for the non-zero values in the tensor. indices: A 2D numpy array of integers, representing the indices for the diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py index 69dedf39c68cf..c44a84e25a25d 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py +++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py @@ -379,7 +379,7 @@ def get_index_vars(n: int) -> List[IndexVar]: This routine is defined by the TACO API. Args: - n: An interger representing the number of IndexVar to get. + n: An integer representing the number of IndexVar to get. Returns: A list of IndexVar. 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py index 25221f32fff99..62cd6baff6388 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py +++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py @@ -83,7 +83,7 @@ def sparse_tensor_to_coo_tensor( A tuple that contains the following values for the COO-flavored format tensor: rank: An integer for the rank of the tensor. - nse: An interger for the number of non-zero values in the tensor. + nse: An integer for the number of non-zero values in the tensor. shape: A 1D numpy array of integers, for the shape of the tensor. values: A 1D numpy array, for the non-zero values in the tensor. indices: A 2D numpy array of integers, representing the indices for the From 2614de82025bd9c04f8515747a611238c0ac4e05 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 13:15:19 +0000 Subject: [PATCH 089/748] [clang] CGCXXABI::EmitLoadOfMemberFunctionPointer - use castAs<> instead of getAs<> to avoid dereference of nullptr The pointer is always dereferenced by arrangeCXXMethodType, so assert the cast is correct instead of returning nullptr --- clang/lib/CodeGen/CGCXXABI.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGCXXABI.cpp b/clang/lib/CodeGen/CGCXXABI.cpp index 0b441e382f11c..42e6c916bed0c 100644 --- a/clang/lib/CodeGen/CGCXXABI.cpp +++ b/clang/lib/CodeGen/CGCXXABI.cpp @@ -45,8 +45,7 @@ CGCallee CGCXXABI::EmitLoadOfMemberFunctionPointer( ErrorUnsupportedABI(CGF, "calls through member pointers"); ThisPtrForCall = This.getPointer(); - const FunctionProtoType *FPT = - MPT->getPointeeType()->getAs(); + const auto *FPT = MPT->getPointeeType()->castAs(); const auto *RD = cast(MPT->getClass()->castAs()->getDecl()); llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType( From 
57fc9798d7145626809b0e81af9154a755b383eb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 13:18:02 +0000 Subject: [PATCH 090/748] [clang] CGDebugInfo::getOrCreateMethodType - use castAs<> instead of getAs<> to avoid dereference of nullptr The pointer is always dereferenced, so assert the cast is correct instead of returning nullptr --- clang/lib/CodeGen/CGDebugInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index c09adad09aa8f..d75b5a1a9d125 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1725,7 +1725,7 @@ void CGDebugInfo::CollectRecordFields( llvm::DISubroutineType * CGDebugInfo::getOrCreateMethodType(const CXXMethodDecl *Method, llvm::DIFile *Unit, bool decl) { - const FunctionProtoType *Func = Method->getType()->getAs(); + const auto *Func = Method->getType()->castAs(); if (Method->isStatic()) return cast_or_null( getOrCreateType(QualType(Func, 0), Unit)); From f29f86b60bf7ce7e6651d90b36eb36592f60c4bf Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Thu, 17 Feb 2022 21:19:14 +0800 Subject: [PATCH 091/748] [NFC] Fix comment --- flang/lib/Optimizer/Support/KindMapping.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Support/KindMapping.cpp b/flang/lib/Optimizer/Support/KindMapping.cpp index 4535863abf8e4..e5500c14476de 100644 --- a/flang/lib/Optimizer/Support/KindMapping.cpp +++ b/flang/lib/Optimizer/Support/KindMapping.cpp @@ -87,7 +87,7 @@ static RT doLookup(std::function def, return def(kind); } -// do a lookup for integer, LOGICAL, or CHARACTER +// do a lookup for INTEGER, LOGICAL, or CHARACTER template static Bitsize getIntegerLikeBitsize(KindTy kind, const MAP &map) { return doLookup(defaultScalingKind, map, kind); From 1c502c63cb77dd15e698087fdc6b3fb892ce0977 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 13:28:02 +0000 Subject: [PATCH 
092/748] [clang-doc] SerializeIndex - pass Index param by constant reference Silence coverity warnings about unnecessary copies --- clang-tools-extra/clang-doc/HTMLGenerator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp index e110f312d10c4..4ab962be7864d 100644 --- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp @@ -899,7 +899,7 @@ static llvm::Error SerializeIndex(ClangDocContext &CDCtx) { } CDCtx.Idx.sort(); llvm::json::OStream J(OS, 2); - std::function IndexToJSON = [&](Index I) { + std::function IndexToJSON = [&](const Index &I) { J.object([&] { J.attribute("USR", toHex(llvm::toStringRef(I.USR))); J.attribute("Name", I.Name); From 1a8bdf95a3361a90e49c96c3b4eaeda6462fe878 Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Fri, 4 Feb 2022 18:32:13 +0100 Subject: [PATCH 093/748] [DAG] Fix in ReplaceAllUsesOfValuesWith When doing SelectionDAG::ReplaceAllUsesOfValuesWith a worklist is prepared containing all users that should be updated. Then we use the RemoveNodeFromCSEMaps/AddModifiedNodeToCSEMaps helpers to handle recursive CSE updates while doing the replacements. This patch aims at solving a problem that could arise if the recursive CSE updates would result in an SDNode present in the worklist is being removed as a side-effect of morphing a prio user in the worklist. To examplify such a scenario, imagine that we have these nodes in the DAG t12: i64 = add t8, t11 t13: i64 = add t12, t8 t14: i64 = add t11, t11 t15: i64 = add t14, t8 t16: i64 = sub t13, t15 and that the t8 uses should be replaced by t11. An initial worklist (listing the users that should be morphed) could be [t12, t13, t15]. 
When updating t12 we get t12: i64 = add t11, t11 which results in a CSE update that replaces t14 by t12, so we get t15: i64 = add t12, t8 which results in a CSE update that replaces t13 by t12, so we get t16: i64 = sub t12, t15 and then t13 is removed given that it was the last use of t13. So when being done with the updates triggered by rewriting the use of t8 in t12 the t13 node no longer exist. And we used to end up hitting an assertion when continuing with the worklist aiming at replacing the t8 uses in t13. The solution is based on using a DAGUpdateListener, making sure that we prune a user from the worklist if it is removed during the recursive CSE updates. The bug was found using an OOT target. I think the problem is quite old, even if the particular intree target reproducer added in this patch seem to pass when using LLVM 13.0.0. Differential Revision: https://reviews.llvm.org/D119088 --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 44 +++++++++++++---- .../AArch64/dag-ReplaceAllUsesOfValuesWith.ll | 47 +++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) create mode 100755 llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6c142fee38b54..82c0990bf0201 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9708,19 +9708,36 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){ namespace { - /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith - /// to record information about a use. - struct UseMemo { - SDNode *User; - unsigned Index; - SDUse *Use; - }; +/// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith +/// to record information about a use. +struct UseMemo { + SDNode *User; + unsigned Index; + SDUse *Use; +}; + +/// operator< - Sort Memos by User. 
+bool operator<(const UseMemo &L, const UseMemo &R) { + return (intptr_t)L.User < (intptr_t)R.User; +} + +/// RAUOVWUpdateListener - Helper for ReplaceAllUsesOfValuesWith - When the node +/// pointed to by a UseMemo is deleted, set the User to nullptr to indicate that +/// the node already has been taken care of recursively. +class RAUOVWUpdateListener : public SelectionDAG::DAGUpdateListener { + SmallVector &Uses; - /// operator< - Sort Memos by User. - bool operator<(const UseMemo &L, const UseMemo &R) { - return (intptr_t)L.User < (intptr_t)R.User; + void NodeDeleted(SDNode *N, SDNode *E) override { + for (UseMemo &Memo : Uses) + if (Memo.User == N) + Memo.User = nullptr; } +public: + RAUOVWUpdateListener(SelectionDAG &d, SmallVector &uses) + : SelectionDAG::DAGUpdateListener(d), Uses(uses) {} +}; + } // end anonymous namespace bool SelectionDAG::calculateDivergence(SDNode *N) { @@ -9812,12 +9829,19 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From, // Sort the uses, so that all the uses from a given User are together. llvm::sort(Uses); + RAUOVWUpdateListener Listener(*this, Uses); for (unsigned UseIndex = 0, UseIndexEnd = Uses.size(); UseIndex != UseIndexEnd; ) { // We know that this user uses some value of From. If it is the right // value, update it. SDNode *User = Uses[UseIndex].User; + // If the node has been deleted by recursive CSE updates when updating + // another node, then just skip this entry. + if (User == nullptr) { + ++UseIndex; + continue; + } // This node is about to morph, remove its old self from the CSE maps. 
RemoveNodeFromCSEMaps(User); diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll new file mode 100755 index 0000000000000..90b004233fb52 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-- -start-after codegenprepare -o - %s | FileCheck %s + +; REQUIRES: asserts + +; This used to hit an assertion like this: +; +; llc: ../lib/CodeGen/SelectionDAG/SelectionDAG.cpp:1087: bool llvm::SelectionDAG::RemoveNodeFromCSEMaps(llvm::SDNode*): Assertion `N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!"' failed. +; Stack dump: +; 0. Program arguments: llc -mtriple aarch64 -o - reduced.ll -start-after codegenprepare +; 1. Running pass 'Function Pass Manager' on module 'reduced.ll'. +; 2. Running pass 'AArch64 Instruction Selection' on function '@g' +; #0 0x00000000031615b8 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) +; #1 0x000000000315effe SignalHandler(int) Signals.cpp:0:0 +; #2 0x00007f2c746b2630 __restore_rt sigaction.c:0:0 +; #3 0x00007f2c7200f387 raise (/lib64/libc.so.6+0x36387) +; #4 0x00007f2c72010a78 abort (/lib64/libc.so.6+0x37a78) +; #5 0x00007f2c720081a6 __assert_fail_base (/lib64/libc.so.6+0x2f1a6) +; #6 0x00007f2c72008252 (/lib64/libc.so.6+0x2f252) +; #7 0x0000000002f06de9 llvm::SelectionDAG::RemoveNodeFromCSEMaps(llvm::SDNode*) +; #8 0x0000000002f0f0b4 llvm::SelectionDAG::ReplaceAllUsesOfValuesWith(llvm::SDValue const*, llvm::SDValue const*, unsigned int) +; #9 0x0000000002dc8a4f (anonymous namespace)::DAGCombiner::scalarizeExtractedVectorLoad(llvm::SDNode*, llvm::EVT, llvm::SDValue, llvm::LoadSDNode*) DAGCombiner.cpp:0:0 +; #10 0x0000000002de1a8e (anonymous namespace)::DAGCombiner::visitEXTRACT_VECTOR_ELT(llvm::SDNode*) DAGCombiner.cpp:0:0 +; #11 0x0000000002e12f41 (anonymous 
namespace)::DAGCombiner::visit(llvm::SDNode*) DAGCombiner.cpp:0:0 +; #12 0x0000000002e14fe5 (anonymous namespace)::DAGCombiner::combine(llvm::SDNode*) DAGCombiner.cpp:0:0 + +define i64 @g({ i64, i64 }* %p) { +; CHECK-LABEL: g: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0, #8] +; CHECK-NEXT: add x9, x8, x8 +; CHECK-NEXT: add x8, x9, x8 +; CHECK-NEXT: sub x0, x8, x8 +; CHECK-NEXT: ret + %vecp = bitcast { i64, i64 }* %p to <2 x i64>* + %vec = load <2 x i64>, <2 x i64>* %vecp, align 1 + %elt = extractelement <2 x i64> %vec, i32 1 + %scalarp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %p, i32 0, i32 1 + %scalar = load i64, i64* %scalarp, align 1 + %add.i62 = add i64 %elt, %scalar + %add.i66 = add i64 %add.i62, %elt + %add.i72 = add i64 %scalar, %scalar + %add.i76 = add i64 %add.i72, %elt + %add.i80 = add i64 %add.i76, 0 + %sub.i82 = sub i64 %add.i66, %add.i80 + ret i64 %sub.i82 +} From 6457f42bde82fd9a514434c946b9d3fbe92a8619 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 10 Dec 2021 18:05:38 +0000 Subject: [PATCH 094/748] [DAGCombiner] Extend ISD::ABDS/U combine to handle more cases. The current ABD combine doesn't quite work for SVE because only a single scalable vector per scalar integer type is legal (e.g. for i32, is the only legal scalable vector type). This patch extends the combine to also trigger for the cases when operand extension must be retained. 
Differential Revision: https://reviews.llvm.org/D115739 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 +++-- llvm/test/CodeGen/AArch64/neon-abd.ll | 12 +-- llvm/test/CodeGen/AArch64/sve-abd.ll | 96 ++++++++++++++----- llvm/test/CodeGen/Thumb2/mve-vabdus.ll | 12 +-- 4 files changed, 99 insertions(+), 46 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9b156b2c49401..a0708336d26a2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9505,18 +9505,27 @@ static SDValue combineABSToABD(SDNode *N, SelectionDAG &DAG, (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND)) return SDValue(); + EVT VT = N->getValueType(0); EVT VT1 = Op0.getOperand(0).getValueType(); EVT VT2 = Op1.getOperand(0).getValueType(); - // Check if the operands are of same type and valid size. unsigned ABDOpcode = (Opc0 == ISD::SIGN_EXTEND) ? ISD::ABDS : ISD::ABDU; - if (VT1 != VT2 || !TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) - return SDValue(); - Op0 = Op0.getOperand(0); - Op1 = Op1.getOperand(0); - SDValue ABD = - DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1); - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD); + // fold abs(sext(x) - sext(y)) -> zext(abds(x, y)) + // fold abs(zext(x) - zext(y)) -> zext(abdu(x, y)) + // NOTE: Extensions must be equivalent. 
+ if (VT1 == VT2 && TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) { + Op0 = Op0.getOperand(0); + Op1 = Op1.getOperand(0); + SDValue ABD = DAG.getNode(ABDOpcode, SDLoc(N), VT1, Op0, Op1); + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, ABD); + } + + // fold abs(sext(x) - sext(y)) -> abds(sext(x), sext(y)) + // fold abs(zext(x) - zext(y)) -> abdu(zext(x), zext(y)) + if (TLI.isOperationLegalOrCustom(ABDOpcode, VT)) + return DAG.getNode(ABDOpcode, SDLoc(N), VT, Op0, Op1); + + return SDValue(); } SDValue DAGCombiner::visitABS(SDNode *N) { diff --git a/llvm/test/CodeGen/AArch64/neon-abd.ll b/llvm/test/CodeGen/AArch64/neon-abd.ll index 94c25e945f630..0279c832391ed 100644 --- a/llvm/test/CodeGen/AArch64/neon-abd.ll +++ b/llvm/test/CodeGen/AArch64/neon-abd.ll @@ -53,8 +53,7 @@ define <4 x i16> @sabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: abs v0.4h, v0.4h +; CHECK-NEXT: sabd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %a.sext = sext <4 x i8> %a to <4 x i16> %b.sext = sext <4 x i8> %b to <4 x i16> @@ -108,8 +107,7 @@ define <2 x i32> @sabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) #0 { ; CHECK-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-NEXT: sshr v1.2s, v1.2s, #16 -; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: abs v0.2s, v0.2s +; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %a.sext = sext <2 x i16> %a to <2 x i32> %b.sext = sext <2 x i16> %b to <2 x i32> @@ -234,8 +232,7 @@ define <4 x i16> @uabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: abs v0.4h, v0.4h +; CHECK-NEXT: uabd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %a.zext = zext <4 x i8> %a to <4 x i16> %b.zext = zext <4 x i8> %b to <4 x i16> @@ -288,8 +285,7 @@ 
define <2 x i32> @uabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) #0 { ; CHECK-NEXT: movi d2, #0x00ffff0000ffff ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b -; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: abs v0.2s, v0.2s +; CHECK-NEXT: uabd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret %a.zext = zext <2 x i16> %a to <2 x i32> %b.zext = zext <2 x i16> %b to <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-abd.ll b/llvm/test/CodeGen/AArch64/sve-abd.ll index affd6d5b15f79..1bdff3a42db93 100644 --- a/llvm/test/CodeGen/AArch64/sve-abd.ll +++ b/llvm/test/CodeGen/AArch64/sve-abd.ll @@ -24,11 +24,10 @@ define @sabd_b( %a, %b) define @sabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: sabd_b_promoted_ops: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p2.b -; CHECK-NEXT: sub z0.b, z0.b, z1.b -; CHECK-NEXT: abs z0.b, p2/m, z0.b +; CHECK-NEXT: sabd z0.b, p2/m, z0.b, z1.b ; CHECK-NEXT: ret %a.sext = sext %a to %b.sext = sext %b to @@ -57,8 +56,7 @@ define @sabd_h_promoted_ops( %a, %a to %b.sext = sext %b to @@ -87,8 +85,7 @@ define @sabd_s_promoted_ops( %a, %a to %b.sext = sext %b to @@ -117,8 +114,7 @@ define @sabd_d_promoted_ops( %a, %a to %b.sext = sext %b to @@ -148,11 +144,10 @@ define @uabd_b( %a, %b) define @uabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p2.b -; CHECK-NEXT: add z0.b, z0.b, z1.b -; CHECK-NEXT: abs z0.b, p2/m, z0.b +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1 +; CHECK-NEXT: uabd z0.b, p2/m, z0.b, z1.b ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -178,11 +173,10 @@ define @uabd_h( %a, %b) define @uabd_h_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_h_promoted_ops: ; CHECK: // 
%bb.0: +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: sub z0.h, z0.h, z1.h -; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: uabd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -208,11 +202,10 @@ define @uabd_s( %a, %b) define @uabd_s_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_s_promoted_ops: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sub z0.s, z0.s, z1.s -; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -238,11 +231,10 @@ define @uabd_d( %a, %b) define @uabd_d_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_d_promoted_ops: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub z0.d, z0.d, z1.d -; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: uabd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -251,6 +243,66 @@ define @uabd_d_promoted_ops( %a, %abs } +; Test the situation where isLegal(ISD::ABD, typeof(%a)) returns true but %a and +; %b have differing types. 
+define @uabd_non_matching_extension( %a, %b) #0 { +; CHECK-LABEL: uabd_non_matching_extension: +; CHECK: // %bb.0: +; CHECK-NEXT: and z1.s, z1.s, #0xff +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uunpkhi z3.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: sub z1.d, z2.d, z3.d +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %a.zext = zext %a to + %b.zext = zext %b to + %sub = sub %a.zext, %b.zext + %abs = call @llvm.abs.nxv4i64( %sub, i1 true) + %trunc = trunc %abs to + ret %trunc +} + +; Test the situation where isLegal(ISD::ABD, typeof(%a.zext)) returns true but +; %a and %b have differing types. +define @uabd_non_matching_promoted_ops( %a, %b) #0 { +; CHECK-LABEL: uabd_non_matching_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a.zext = zext %a to + %b.zext = zext %b to + %sub = sub %a.zext, %b.zext + %abs = call @llvm.abs.nxv4i32( %sub, i1 true) + ret %abs +} + +; Test the situation where isLegal(ISD::ABD, typeof(%a)) returns true but %a and +; %b are promoted differently. 
+define @uabd_non_matching_promotion( %a, %b) #0 { +; CHECK-LABEL: uabd_non_matching_promotion: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: sxtb z1.s, p0/m, z1.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %a.zext = zext %a to + %b.zext = sext %b to + %sub = sub %a.zext, %b.zext + %abs = call @llvm.abs.nxv4i32( %sub, i1 true) + ret %abs +} + declare @llvm.abs.nxv16i8(, i1) declare @llvm.abs.nxv8i16(, i1) diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll index aa37c70718ac0..5d93e5f179dbc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -21,8 +21,7 @@ define arm_aapcs_vfpcc <8 x i8> @vabd_v8s8(<8 x i8> %src1, <8 x i8> %src2) { ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vsub.i16 q0, q0, q1 -; CHECK-NEXT: vabs.s16 q0, q0 +; CHECK-NEXT: vabd.s16 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <8 x i8> %src1 to <8 x i16> %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -74,8 +73,7 @@ define arm_aapcs_vfpcc <4 x i16> @vabd_v4s16(<4 x i16> %src1, <4 x i16> %src2) { ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vsub.i32 q0, q0, q1 -; CHECK-NEXT: vabs.s32 q0, q0 +; CHECK-NEXT: vabd.s32 q0, q0, q1 ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i16> %src1 to <4 x i32> %sextsrc2 = sext <4 x i16> %src2 to <4 x i32> @@ -158,8 +156,7 @@ define arm_aapcs_vfpcc <8 x i8> @vabd_v8u8(<8 x i8> %src1, <8 x i8> %src2) { ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vsub.i16 q0, q0, q1 -; CHECK-NEXT: vabs.s16 q0, q0 +; CHECK-NEXT: vabd.u16 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <8 x i8> %src1 to <8 x i16> %zextsrc2 = zext <8 x i8> %src2 to <8 x i16> @@ -210,8 +207,7 @@ define arm_aapcs_vfpcc <4 x i16> @vabd_v4u16(<4 x i16> %src1, <4 x i16> 
%src2) { ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.u16 q1, q1 ; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vsub.i32 q0, q0, q1 -; CHECK-NEXT: vabs.s32 q0, q0 +; CHECK-NEXT: vabd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> %zextsrc2 = zext <4 x i16> %src2 to <4 x i32> From da5a4f16e84b6f0bbfe12e5b743951081502f771 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 17 Feb 2022 12:31:00 +0000 Subject: [PATCH 095/748] [libc][automemcpy] Introduce geomean of scores as a tie breaker Differential Revision: https://reviews.llvm.org/D120040 --- .../include/automemcpy/ResultAnalyzer.h | 3 +- .../automemcpy/lib/ResultAnalyzer.cpp | 5 ++- .../automemcpy/lib/ResultAnalyzerMain.cpp | 14 ++++---- .../unittests/ResultAnalyzerTest.cpp | 32 ++++++++++++------- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h index 9b861c6250611..2991df0aceba7 100644 --- a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h +++ b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h @@ -60,7 +60,8 @@ struct PerDistributionData { struct FunctionData { FunctionId Id; StringMap PerDistributionData; - GradeHistogram GradeHisto = {}; // GradeEnum indexed array + double ScoresGeoMean; // Geomean of scores for each distribution. 
+ GradeHistogram GradeHisto = {}; // GradeEnum indexed array Grade::GradeEnum FinalGrade = Grade::BAD; // Overall grade for this function }; diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp index ed9cd1f286c2c..6bfde0d2cb4be 100644 --- a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp +++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp @@ -168,14 +168,17 @@ void fillScores(MutableArrayRef Functions) { } void castVotes(MutableArrayRef Functions) { - for (FunctionData &Function : Functions) + for (FunctionData &Function : Functions) { + Function.ScoresGeoMean = 1.0; for (const auto &Pair : Function.PerDistributionData) { const StringRef Distribution = Pair.getKey(); const double Score = Pair.getValue().Score; + Function.ScoresGeoMean *= Score; const auto G = Grade::judge(Score); ++(Function.GradeHisto[G]); Function.PerDistributionData[Distribution].Grade = G; } + } for (FunctionData &Function : Functions) { const auto &GradeHisto = Function.GradeHisto; diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp index 4a6caec469e55..422bc575b6b72 100644 --- a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp +++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp @@ -141,18 +141,16 @@ int Main(int argc, char **argv) { fillScores(Functions); castVotes(Functions); - // TODO: Implement tie breaking algorithm. + // Present data by function type, Grade and Geomean of scores. std::sort(Functions.begin(), Functions.end(), [](const FunctionData &A, const FunctionData &B) { - return A.FinalGrade < B.FinalGrade; + const auto Less = [](const FunctionData &FD) { + return std::make_tuple(FD.Id.Type, FD.FinalGrade, + -FD.ScoresGeoMean); + }; + return Less(A) < Less(B); }); - // Present data by function type. 
- std::stable_sort(Functions.begin(), Functions.end(), - [](const FunctionData &A, const FunctionData &B) { - return A.Id.Type < B.Id.Type; - }); - // Print result. for (const FunctionData &Function : Functions) { outs() << formatv("{0,-10}", Grade::getString(Function.FinalGrade)); diff --git a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp index 56f7bbf3d5f80..10d0f98272b4b 100644 --- a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp +++ b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp @@ -139,27 +139,35 @@ TEST(AutomemcpyJsonResultsAnalyzer, castVotes) { EXPECT_THAT(Data[1].Id, Foo2); EXPECT_THAT(Data[2].Id, Foo3); + const auto GetDistData = [&Data](size_t Index, StringRef Name) { + return Data[Index].PerDistributionData.lookup(Name); + }; + // Distribution A // Throughput is 0, 1 and 7, so normalized scores are 0, 1/7 and 1. - EXPECT_NEAR(Data[0].PerDistributionData.lookup("A").Score, 0, kAbsErr); - EXPECT_NEAR(Data[1].PerDistributionData.lookup("A").Score, 1. / 7, kAbsErr); - EXPECT_NEAR(Data[2].PerDistributionData.lookup("A").Score, 1, kAbsErr); + EXPECT_THAT(GetDistData(0, "A").Score, DoubleNear(0, kAbsErr)); + EXPECT_THAT(GetDistData(1, "A").Score, DoubleNear(1. / 7, kAbsErr)); + EXPECT_THAT(GetDistData(2, "A").Score, DoubleNear(1, kAbsErr)); // which are turned into grades BAD, MEDIOCRE and EXCELLENT. - EXPECT_THAT(Data[0].PerDistributionData.lookup("A").Grade, Grade::BAD); - EXPECT_THAT(Data[1].PerDistributionData.lookup("A").Grade, Grade::MEDIOCRE); - EXPECT_THAT(Data[2].PerDistributionData.lookup("A").Grade, Grade::EXCELLENT); + EXPECT_THAT(GetDistData(0, "A").Grade, Grade::BAD); + EXPECT_THAT(GetDistData(1, "A").Grade, Grade::MEDIOCRE); + EXPECT_THAT(GetDistData(2, "A").Grade, Grade::EXCELLENT); // Distribution B // Throughput is 30, 100 and 100, so normalized scores are 0, 1 and 1. 
- EXPECT_NEAR(Data[0].PerDistributionData.lookup("B").Score, 0, kAbsErr); - EXPECT_NEAR(Data[1].PerDistributionData.lookup("B").Score, 1, kAbsErr); - EXPECT_NEAR(Data[2].PerDistributionData.lookup("B").Score, 1, kAbsErr); + EXPECT_THAT(GetDistData(0, "B").Score, DoubleNear(0, kAbsErr)); + EXPECT_THAT(GetDistData(1, "B").Score, DoubleNear(1, kAbsErr)); + EXPECT_THAT(GetDistData(2, "B").Score, DoubleNear(1, kAbsErr)); // which are turned into grades BAD, EXCELLENT and EXCELLENT. - EXPECT_THAT(Data[0].PerDistributionData.lookup("B").Grade, Grade::BAD); - EXPECT_THAT(Data[1].PerDistributionData.lookup("B").Grade, Grade::EXCELLENT); - EXPECT_THAT(Data[2].PerDistributionData.lookup("B").Grade, Grade::EXCELLENT); + EXPECT_THAT(GetDistData(0, "B").Grade, Grade::BAD); + EXPECT_THAT(GetDistData(1, "B").Grade, Grade::EXCELLENT); + EXPECT_THAT(GetDistData(2, "B").Grade, Grade::EXCELLENT); // Now looking from the functions point of view. + EXPECT_THAT(Data[0].ScoresGeoMean, DoubleNear(0, kAbsErr)); + EXPECT_THAT(Data[1].ScoresGeoMean, DoubleNear(1. * (1. / 7), kAbsErr)); + EXPECT_THAT(Data[2].ScoresGeoMean, DoubleNear(1, kAbsErr)); + // Note the array is indexed by GradeEnum values (EXCELLENT=0 / BAD = 6) EXPECT_THAT(Data[0].GradeHisto, ElementsAre(0, 0, 0, 0, 0, 0, 2)); EXPECT_THAT(Data[1].GradeHisto, ElementsAre(1, 0, 0, 0, 0, 1, 0)); From 3f22a4962dafe2718a92b3cd9b5be4a6fcc83b77 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 13:50:52 +0000 Subject: [PATCH 096/748] [X86] selectLEAAddr - add X86ISD::SMUL/UMULO handling After D118128 relaxed the heuristic to require only one EFLAGS generating operand, it now makes sense to avoid X86ISD::SMUL/UMULO duplication as well. 
Differential Revision: https://reviews.llvm.org/D119578 --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4 +- llvm/test/CodeGen/X86/select-lea.ll | 34 +- llvm/test/CodeGen/X86/umul_fix_sat.ll | 10 +- .../X86/umulo-128-legalisation-lowering.ll | 81 +++-- .../X86/umulo-64-legalisation-lowering.ll | 10 +- llvm/test/CodeGen/X86/vec_umulo.ll | 296 +++++++++--------- llvm/test/CodeGen/X86/xmulo.ll | 27 +- 7 files changed, 215 insertions(+), 247 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 17f4b1ec5bf78..66c44a49f4f68 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2782,10 +2782,10 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, case X86ISD::SUB: case X86ISD::ADC: case X86ISD::SBB: - /* TODO: These opcodes can be added safely, but we may want to justify - their inclusion for different reasons (better for reg-alloc). case X86ISD::SMUL: case X86ISD::UMUL: + /* TODO: These opcodes can be added safely, but we may want to justify + their inclusion for different reasons (better for reg-alloc). 
case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: diff --git a/llvm/test/CodeGen/X86/select-lea.ll b/llvm/test/CodeGen/X86/select-lea.ll index 487b1f3d3a223..a849280c1377e 100644 --- a/llvm/test/CodeGen/X86/select-lea.ll +++ b/llvm/test/CodeGen/X86/select-lea.ll @@ -330,35 +330,27 @@ define i32 @usub_add_load(i32 %x, i32 %y, i32* %pz) nounwind { define i32 @smul_add_imm(i32 %x, i32 %y) { ; X64-LABEL: smul_add_imm: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: imull %esi, %eax -; X64-NEXT: addl $100, %eax +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: imull %esi, %edi +; X64-NEXT: leal 100(%rdi), %eax ; X64-NEXT: cmovnol %edi, %eax ; X64-NEXT: retq ; ; CMOV-LABEL: smul_add_imm: ; CMOV: # %bb.0: -; CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CMOV-NEXT: movl %eax, %edx -; CMOV-NEXT: imull %ecx, %edx -; CMOV-NEXT: addl $100, %edx -; CMOV-NEXT: imull %ecx, %eax -; CMOV-NEXT: cmovol %edx, %eax +; CMOV-NEXT: imull {{[0-9]+}}(%esp), %ecx +; CMOV-NEXT: leal 100(%ecx), %eax +; CMOV-NEXT: cmovnol %ecx, %eax ; CMOV-NEXT: retl ; ; NOCMOV-LABEL: smul_add_imm: ; NOCMOV: # %bb.0: ; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx -; NOCMOV-NEXT: movl %eax, %ecx -; NOCMOV-NEXT: imull %edx, %ecx -; NOCMOV-NEXT: imull %edx, %eax +; NOCMOV-NEXT: imull {{[0-9]+}}(%esp), %eax ; NOCMOV-NEXT: jno .LBB8_2 ; NOCMOV-NEXT: # %bb.1: -; NOCMOV-NEXT: addl $100, %ecx -; NOCMOV-NEXT: movl %ecx, %eax +; NOCMOV-NEXT: addl $100, %eax ; NOCMOV-NEXT: .LBB8_2: ; NOCMOV-NEXT: retl %o = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %y) @@ -422,10 +414,8 @@ define i32 @umul_add_imm(i32 %x, i32 %y) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: mull %esi ; X64-NEXT: # kill: def $eax killed $eax def $rax -; X64-NEXT: seto %cl -; X64-NEXT: leal 100(%rax), %edx -; X64-NEXT: testb %cl, %cl -; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: leal 100(%rax), %ecx +; X64-NEXT: cmovol %ecx, %eax ; 
X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq ; @@ -433,10 +423,8 @@ define i32 @umul_add_imm(i32 %x, i32 %y) { ; CMOV: # %bb.0: ; CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; CMOV-NEXT: mull {{[0-9]+}}(%esp) -; CMOV-NEXT: seto %cl -; CMOV-NEXT: leal 100(%eax), %edx -; CMOV-NEXT: testb %cl, %cl -; CMOV-NEXT: cmovnel %edx, %eax +; CMOV-NEXT: leal 100(%eax), %ecx +; CMOV-NEXT: cmovol %ecx, %eax ; CMOV-NEXT: retl ; ; NOCMOV-LABEL: umul_add_imm: diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 504557242c305..247b5ee17e7a5 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -281,21 +281,21 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %bl ; X86-NEXT: andb %dl, %bl ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %edi ; X86-NEXT: seto %bh -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: seto %cl ; X86-NEXT: orb %bh, %cl -; X86-NEXT: addl %eax, %esi +; X86-NEXT: leal (%edi,%eax), %esi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %edx diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 40fc6db7fe6b2..3d7544f7f6814 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -19,10 +19,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X64-NEXT: mulq %rdi ; X64-NEXT: seto %r11b ; X64-NEXT: orb %r10b, %r11b -; X64-NEXT: addq %rax, %rsi +; 
X64-NEXT: leaq (%rsi,%rax), %rcx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: setb %cl ; X64-NEXT: orb %r11b, %cl ; X64-NEXT: orb %r9b, %cl @@ -38,64 +38,63 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 48 +; X86-NEXT: subl $24, %esp +; X86-NEXT: .cfi_def_cfa_offset 44 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl %ecx, %esi ; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: leal (%ecx,%eax), %ecx +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: seto 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %edi +; X86-NEXT: mull %edi +; X86-NEXT: leal (%ecx,%eax), %ecx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx +; X86-NEXT: addl %esi, %ecx ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -103,12 +102,12 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull 
{{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: setne %cl @@ -121,10 +120,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload ; X86-NEXT: orb %ch, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: setne %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %bh ; X86-NEXT: andb %cl, %bh ; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload @@ -133,7 +132,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: setne %bl -; X86-NEXT: orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, 4(%ecx) @@ -150,7 +149,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: andb $1, %al ; X86-NEXT: movb %al, 16(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll index 2b7e032fb4b7f..3bbeec17c7a9e 100644 --- 
a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll @@ -19,21 +19,21 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %bl ; X86-NEXT: andb %dl, %bl ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %edi ; X86-NEXT: seto %bh -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: seto %ch ; X86-NEXT: orb %bh, %ch -; X86-NEXT: addl %eax, %esi +; X86-NEXT: leal (%edi,%eax), %esi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %edx diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 51de68916596b..bd448d5d19244 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2952,63 +2952,61 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; SSE2-NEXT: movq %rcx, %r12 ; SSE2-NEXT: movq %rdx, %r11 ; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE2-NEXT: testq %r10, %r10 -; SSE2-NEXT: setne %cl +; SSE2-NEXT: setne %dl ; SSE2-NEXT: testq %rsi, %rsi -; SSE2-NEXT: setne %r13b -; SSE2-NEXT: andb %cl, %r13b +; SSE2-NEXT: setne %bpl +; SSE2-NEXT: andb %dl, %bpl ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rsi -; SSE2-NEXT: seto %bpl +; SSE2-NEXT: seto %bl ; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %rdi -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: seto %bl -; SSE2-NEXT: orb %bpl, %bl -; SSE2-NEXT: addq 
%rsi, %rcx +; SSE2-NEXT: seto %cl +; SSE2-NEXT: orb %bl, %cl +; SSE2-NEXT: leaq (%rsi,%rax), %rbx ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: movq %rax, %r8 +; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: addq %rcx, %rsi -; SSE2-NEXT: setb %cl -; SSE2-NEXT: orb %bl, %cl -; SSE2-NEXT: orb %r13b, %cl +; SSE2-NEXT: addq %rbx, %rsi +; SSE2-NEXT: setb %r13b +; SSE2-NEXT: orb %cl, %r13b +; SSE2-NEXT: orb %bpl, %r13b ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al ; SSE2-NEXT: testq %r12, %r12 ; SSE2-NEXT: setne %r10b ; SSE2-NEXT: andb %al, %r10b ; SSE2-NEXT: movq %r12, %rax -; SSE2-NEXT: mulq %r15 -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: seto %bpl +; SSE2-NEXT: mulq %r14 +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: seto %r8b ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: mulq %r11 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: seto %r9b -; SSE2-NEXT: orb %bpl, %r9b -; SSE2-NEXT: addq %rdi, %rbx +; SSE2-NEXT: seto %cl +; SSE2-NEXT: orb %r8b, %cl +; SSE2-NEXT: addq %rax, %rbp ; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: mulq %r15 -; SSE2-NEXT: addq %rbx, %rdx +; SSE2-NEXT: mulq %r14 +; SSE2-NEXT: addq %rbp, %rdx ; SSE2-NEXT: setb %bl -; SSE2-NEXT: orb %r9b, %bl +; SSE2-NEXT: orb %cl, %bl ; SSE2-NEXT: orb %r10b, %bl -; SSE2-NEXT: movzbl %bl, %edi -; SSE2-NEXT: negl %edi -; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: movzbl %cl, %ecx +; SSE2-NEXT: movzbl %bl, %ecx +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movzbl %r13b, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rax, 16(%r14) -; SSE2-NEXT: movq %r8, (%r14) -; SSE2-NEXT: movq %rdx, 24(%r14) -; SSE2-NEXT: movq %rsi, 8(%r14) +; SSE2-NEXT: movq %rax, 16(%r15) +; SSE2-NEXT: movq %rdi, (%r15) +; SSE2-NEXT: movq %rdx, 24(%r15) +; SSE2-NEXT: movq %rsi, 8(%r15) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3029,63 +3027,61 @@ define <2 x 
i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; SSSE3-NEXT: movq %rcx, %r12 ; SSSE3-NEXT: movq %rdx, %r11 ; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSSE3-NEXT: testq %r10, %r10 -; SSSE3-NEXT: setne %cl +; SSSE3-NEXT: setne %dl ; SSSE3-NEXT: testq %rsi, %rsi -; SSSE3-NEXT: setne %r13b -; SSSE3-NEXT: andb %cl, %r13b +; SSSE3-NEXT: setne %bpl +; SSSE3-NEXT: andb %dl, %bpl ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rsi -; SSSE3-NEXT: seto %bpl +; SSSE3-NEXT: seto %bl ; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %rdi -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: seto %bl -; SSSE3-NEXT: orb %bpl, %bl -; SSSE3-NEXT: addq %rsi, %rcx +; SSSE3-NEXT: seto %cl +; SSSE3-NEXT: orb %bl, %cl +; SSSE3-NEXT: leaq (%rsi,%rax), %rbx ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: movq %rax, %r8 +; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: addq %rcx, %rsi -; SSSE3-NEXT: setb %cl -; SSSE3-NEXT: orb %bl, %cl -; SSSE3-NEXT: orb %r13b, %cl +; SSSE3-NEXT: addq %rbx, %rsi +; SSSE3-NEXT: setb %r13b +; SSSE3-NEXT: orb %cl, %r13b +; SSSE3-NEXT: orb %bpl, %r13b ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: testq %r12, %r12 ; SSSE3-NEXT: setne %r10b ; SSSE3-NEXT: andb %al, %r10b ; SSSE3-NEXT: movq %r12, %rax -; SSSE3-NEXT: mulq %r15 -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: seto %bpl +; SSSE3-NEXT: mulq %r14 +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: seto %r8b ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: mulq %r11 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: seto %r9b -; SSSE3-NEXT: orb %bpl, %r9b -; SSSE3-NEXT: addq %rdi, %rbx +; SSSE3-NEXT: seto %cl +; SSSE3-NEXT: orb %r8b, %cl +; SSSE3-NEXT: addq %rax, %rbp ; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: mulq %r15 -; SSSE3-NEXT: addq %rbx, %rdx +; SSSE3-NEXT: mulq %r14 +; 
SSSE3-NEXT: addq %rbp, %rdx ; SSSE3-NEXT: setb %bl -; SSSE3-NEXT: orb %r9b, %bl +; SSSE3-NEXT: orb %cl, %bl ; SSSE3-NEXT: orb %r10b, %bl -; SSSE3-NEXT: movzbl %bl, %edi -; SSSE3-NEXT: negl %edi -; SSSE3-NEXT: movd %edi, %xmm1 -; SSSE3-NEXT: movzbl %cl, %ecx +; SSSE3-NEXT: movzbl %bl, %ecx +; SSSE3-NEXT: negl %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movzbl %r13b, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rax, 16(%r14) -; SSSE3-NEXT: movq %r8, (%r14) -; SSSE3-NEXT: movq %rdx, 24(%r14) -; SSSE3-NEXT: movq %rsi, 8(%r14) +; SSSE3-NEXT: movq %rax, 16(%r15) +; SSSE3-NEXT: movq %rdi, (%r15) +; SSSE3-NEXT: movq %rdx, 24(%r15) +; SSSE3-NEXT: movq %rsi, 8(%r15) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3106,62 +3102,60 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; SSE41-NEXT: movq %rcx, %r12 ; SSE41-NEXT: movq %rdx, %r11 ; SSE41-NEXT: movq %rsi, %rax -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; SSE41-NEXT: testq %r10, %r10 -; SSE41-NEXT: setne %cl +; SSE41-NEXT: setne %dl ; SSE41-NEXT: testq %rsi, %rsi -; SSE41-NEXT: setne %r13b -; SSE41-NEXT: andb %cl, %r13b +; SSE41-NEXT: setne %bpl +; SSE41-NEXT: andb %dl, %bpl ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rsi -; SSE41-NEXT: seto %bpl +; SSE41-NEXT: seto %bl ; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %rdi -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: seto %bl -; SSE41-NEXT: orb %bpl, %bl -; SSE41-NEXT: addq %rsi, %rcx +; SSE41-NEXT: seto %cl +; SSE41-NEXT: orb %bl, %cl +; SSE41-NEXT: leaq (%rsi,%rax), %rbx ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rax, %r8 +; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: addq %rcx, %rsi -; SSE41-NEXT: setb 
%cl -; SSE41-NEXT: orb %bl, %cl -; SSE41-NEXT: orb %r13b, %cl +; SSE41-NEXT: addq %rbx, %rsi +; SSE41-NEXT: setb %r13b +; SSE41-NEXT: orb %cl, %r13b +; SSE41-NEXT: orb %bpl, %r13b ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: testq %r12, %r12 ; SSE41-NEXT: setne %r10b ; SSE41-NEXT: andb %al, %r10b ; SSE41-NEXT: movq %r12, %rax -; SSE41-NEXT: mulq %r15 -; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: seto %bpl +; SSE41-NEXT: mulq %r14 +; SSE41-NEXT: movq %rax, %rbp +; SSE41-NEXT: seto %r8b ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %r11 -; SSE41-NEXT: movq %rax, %rbx -; SSE41-NEXT: seto %r9b -; SSE41-NEXT: orb %bpl, %r9b -; SSE41-NEXT: addq %rdi, %rbx +; SSE41-NEXT: seto %cl +; SSE41-NEXT: orb %r8b, %cl +; SSE41-NEXT: addq %rax, %rbp ; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %r15 -; SSE41-NEXT: addq %rbx, %rdx +; SSE41-NEXT: mulq %r14 +; SSE41-NEXT: addq %rbp, %rdx ; SSE41-NEXT: setb %bl -; SSE41-NEXT: orb %r9b, %bl +; SSE41-NEXT: orb %cl, %bl ; SSE41-NEXT: orb %r10b, %bl -; SSE41-NEXT: movzbl %bl, %edi -; SSE41-NEXT: negl %edi -; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movzbl %bl, %ecx ; SSE41-NEXT: negl %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %edi, %xmm0 -; SSE41-NEXT: movq %rax, 16(%r14) -; SSE41-NEXT: movq %r8, (%r14) -; SSE41-NEXT: movq %rdx, 24(%r14) -; SSE41-NEXT: movq %rsi, 8(%r14) +; SSE41-NEXT: movzbl %r13b, %ebp +; SSE41-NEXT: negl %ebp +; SSE41-NEXT: movd %ebp, %xmm0 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: movq %rax, 16(%r15) +; SSE41-NEXT: movq %rdi, (%r15) +; SSE41-NEXT: movq %rdx, 24(%r15) +; SSE41-NEXT: movq %rsi, 8(%r15) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3182,62 +3176,60 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX-NEXT: movq %rcx, %r12 ; AVX-NEXT: movq %rdx, %r11 ; AVX-NEXT: movq %rsi, %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX-NEXT: movq 
{{[0-9]+}}(%rsp), %r14 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; AVX-NEXT: testq %r10, %r10 -; AVX-NEXT: setne %cl +; AVX-NEXT: setne %dl ; AVX-NEXT: testq %rsi, %rsi -; AVX-NEXT: setne %r13b -; AVX-NEXT: andb %cl, %r13b +; AVX-NEXT: setne %bpl +; AVX-NEXT: andb %dl, %bpl ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rsi -; AVX-NEXT: seto %bpl +; AVX-NEXT: seto %bl ; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %rdi -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: seto %bl -; AVX-NEXT: orb %bpl, %bl -; AVX-NEXT: addq %rsi, %rcx +; AVX-NEXT: seto %cl +; AVX-NEXT: orb %bl, %cl +; AVX-NEXT: leaq (%rsi,%rax), %rbx ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 -; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: setb %cl -; AVX-NEXT: orb %bl, %cl -; AVX-NEXT: orb %r13b, %cl +; AVX-NEXT: addq %rbx, %rsi +; AVX-NEXT: setb %r13b +; AVX-NEXT: orb %cl, %r13b +; AVX-NEXT: orb %bpl, %r13b ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al ; AVX-NEXT: testq %r12, %r12 ; AVX-NEXT: setne %r10b ; AVX-NEXT: andb %al, %r10b ; AVX-NEXT: movq %r12, %rax -; AVX-NEXT: mulq %r15 -; AVX-NEXT: movq %rax, %rdi -; AVX-NEXT: seto %bpl +; AVX-NEXT: mulq %r14 +; AVX-NEXT: movq %rax, %rbp +; AVX-NEXT: seto %r8b ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: mulq %r11 -; AVX-NEXT: movq %rax, %rbx -; AVX-NEXT: seto %r9b -; AVX-NEXT: orb %bpl, %r9b -; AVX-NEXT: addq %rdi, %rbx +; AVX-NEXT: seto %cl +; AVX-NEXT: orb %r8b, %cl +; AVX-NEXT: addq %rax, %rbp ; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: mulq %r15 -; AVX-NEXT: addq %rbx, %rdx +; AVX-NEXT: mulq %r14 +; AVX-NEXT: addq %rbp, %rdx ; AVX-NEXT: setb %bl -; AVX-NEXT: orb %r9b, %bl +; AVX-NEXT: orb %cl, %bl ; AVX-NEXT: orb %r10b, %bl -; AVX-NEXT: movzbl %bl, %edi -; AVX-NEXT: negl %edi -; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: movzbl %bl, %ecx ; AVX-NEXT: negl %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, 16(%r14) -; AVX-NEXT: movq %r8, 
(%r14) -; AVX-NEXT: movq %rdx, 24(%r14) -; AVX-NEXT: movq %rsi, 8(%r14) +; AVX-NEXT: movzbl %r13b, %ebp +; AVX-NEXT: negl %ebp +; AVX-NEXT: vmovd %ebp, %xmm0 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movq %rax, 16(%r15) +; AVX-NEXT: movq %rdi, (%r15) +; AVX-NEXT: movq %rdx, 24(%r15) +; AVX-NEXT: movq %rsi, 8(%r15) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3251,7 +3243,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: pushq %r15 ; AVX512F-NEXT: pushq %r14 -; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx ; AVX512F-NEXT: movq %rcx, %rax @@ -3263,25 +3254,24 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512F-NEXT: testq %r10, %r10 ; AVX512F-NEXT: setne %dl ; AVX512F-NEXT: testq %rcx, %rcx -; AVX512F-NEXT: setne %r13b -; AVX512F-NEXT: andb %dl, %r13b +; AVX512F-NEXT: setne %bl +; AVX512F-NEXT: andb %dl, %bl ; AVX512F-NEXT: mulq %r15 ; AVX512F-NEXT: movq %rax, %rdi ; AVX512F-NEXT: seto %bpl ; AVX512F-NEXT: movq %r10, %rax ; AVX512F-NEXT: mulq %r12 -; AVX512F-NEXT: movq %rax, %rbx ; AVX512F-NEXT: seto %cl ; AVX512F-NEXT: orb %bpl, %cl -; AVX512F-NEXT: addq %rdi, %rbx +; AVX512F-NEXT: leaq (%rdi,%rax), %rbp ; AVX512F-NEXT: movq %r12, %rax ; AVX512F-NEXT: mulq %r15 ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: movq %rdx, %r15 -; AVX512F-NEXT: addq %rbx, %r15 +; AVX512F-NEXT: movq %rdx, %rdi +; AVX512F-NEXT: addq %rbp, %rdi ; AVX512F-NEXT: setb %al ; AVX512F-NEXT: orb %cl, %al -; AVX512F-NEXT: orb %r13b, %al +; AVX512F-NEXT: orb %bl, %al ; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: testq %r9, %r9 ; AVX512F-NEXT: setne %al @@ -3294,13 +3284,12 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512F-NEXT: seto %bpl ; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %r11 -; AVX512F-NEXT: movq %rax, %rdi ; AVX512F-NEXT: seto %bl ; 
AVX512F-NEXT: orb %bpl, %bl -; AVX512F-NEXT: addq %rsi, %rdi +; AVX512F-NEXT: addq %rax, %rsi ; AVX512F-NEXT: movq %r11, %rax ; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: addq %rdi, %rdx +; AVX512F-NEXT: addq %rsi, %rdx ; AVX512F-NEXT: setb %sil ; AVX512F-NEXT: orb %bl, %sil ; AVX512F-NEXT: orb %cl, %sil @@ -3312,11 +3301,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512F-NEXT: movq %r10, 16(%r14) ; AVX512F-NEXT: movq %rax, (%r14) -; AVX512F-NEXT: movq %r15, 24(%r14) +; AVX512F-NEXT: movq %rdi, 24(%r14) ; AVX512F-NEXT: movq %rdx, 8(%r14) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 -; AVX512F-NEXT: popq %r13 ; AVX512F-NEXT: popq %r14 ; AVX512F-NEXT: popq %r15 ; AVX512F-NEXT: popq %rbp @@ -3327,7 +3315,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512BW-NEXT: pushq %rbp ; AVX512BW-NEXT: pushq %r15 ; AVX512BW-NEXT: pushq %r14 -; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx ; AVX512BW-NEXT: movq %rcx, %rax @@ -3339,25 +3326,24 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512BW-NEXT: testq %r10, %r10 ; AVX512BW-NEXT: setne %dl ; AVX512BW-NEXT: testq %rcx, %rcx -; AVX512BW-NEXT: setne %r13b -; AVX512BW-NEXT: andb %dl, %r13b +; AVX512BW-NEXT: setne %bl +; AVX512BW-NEXT: andb %dl, %bl ; AVX512BW-NEXT: mulq %r15 ; AVX512BW-NEXT: movq %rax, %rdi ; AVX512BW-NEXT: seto %bpl ; AVX512BW-NEXT: movq %r10, %rax ; AVX512BW-NEXT: mulq %r12 -; AVX512BW-NEXT: movq %rax, %rbx ; AVX512BW-NEXT: seto %cl ; AVX512BW-NEXT: orb %bpl, %cl -; AVX512BW-NEXT: addq %rdi, %rbx +; AVX512BW-NEXT: leaq (%rdi,%rax), %rbp ; AVX512BW-NEXT: movq %r12, %rax ; AVX512BW-NEXT: mulq %r15 ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: movq %rdx, %r15 -; AVX512BW-NEXT: addq %rbx, %r15 +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: addq %rbp, %rdi ; AVX512BW-NEXT: setb %al ; 
AVX512BW-NEXT: orb %cl, %al -; AVX512BW-NEXT: orb %r13b, %al +; AVX512BW-NEXT: orb %bl, %al ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: testq %r9, %r9 ; AVX512BW-NEXT: setne %al @@ -3370,13 +3356,12 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512BW-NEXT: seto %bpl ; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %r11 -; AVX512BW-NEXT: movq %rax, %rdi ; AVX512BW-NEXT: seto %bl ; AVX512BW-NEXT: orb %bpl, %bl -; AVX512BW-NEXT: addq %rsi, %rdi +; AVX512BW-NEXT: addq %rax, %rsi ; AVX512BW-NEXT: movq %r11, %rax ; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: addq %rdi, %rdx +; AVX512BW-NEXT: addq %rsi, %rdx ; AVX512BW-NEXT: setb %sil ; AVX512BW-NEXT: orb %bl, %sil ; AVX512BW-NEXT: orb %cl, %sil @@ -3388,11 +3373,10 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512BW-NEXT: movq %r10, 16(%r14) ; AVX512BW-NEXT: movq %rax, (%r14) -; AVX512BW-NEXT: movq %r15, 24(%r14) +; AVX512BW-NEXT: movq %rdi, 24(%r14) ; AVX512BW-NEXT: movq %rdx, 8(%r14) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 -; AVX512BW-NEXT: popq %r13 ; AVX512BW-NEXT: popq %r14 ; AVX512BW-NEXT: popq %r15 ; AVX512BW-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index d416b1a547815..71d92af0dd94b 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -487,10 +487,9 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) { ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %ecx, %edx -; WIN32-NEXT: movl %eax, %esi ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bh, %ch -; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: leal (%edi,%eax), %esi ; WIN32-NEXT: movl %edx, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: addl %esi, %edx @@ -713,6 +712,7 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: testl %ebp, %ebp ; WIN32-NEXT: setne %al @@ -720,26 +720,26 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: setne %bl ; WIN32-NEXT: andb %al, %bl ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %edi, %edx ; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: seto %bh ; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; WIN32-NEXT: addl %edi, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: addl %eax, %edi ; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: addl %ebp, %edx +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: addl %edi, %edx ; WIN32-NEXT: setb %al ; WIN32-NEXT: orb %bh, %al ; WIN32-NEXT: orb %bl, %al ; WIN32-NEXT: testb %al, %al ; WIN32-NEXT: jne LBB14_2 ; WIN32-NEXT: # %bb.1: -; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: LBB14_2: ; WIN32-NEXT: movl %ecx, %eax @@ -1337,10 +1337,9 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %ecx, %edx -; WIN32-NEXT: movl %eax, %esi ; WIN32-NEXT: seto %cl ; WIN32-NEXT: orb %bh, %cl -; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: leal (%edi,%eax), %esi ; WIN32-NEXT: movl %edx, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: addl %esi, %edx @@ -2244,10 +2243,9 @@ define zeroext i1 @umuloi64_load(i64* %ptr1, i64 %v2, i64* %res) { ; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %esi ; WIN32-NEXT: seto %bh ; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload 
-; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: leal (%edi,%eax), %esi ; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: addl %esi, %edx @@ -2325,10 +2323,9 @@ define zeroext i1 @umuloi64_load2(i64 %v1, i64* %ptr2, i64* %res) { ; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %eax, %esi ; WIN32-NEXT: seto %bh ; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: leal (%edi,%eax), %esi ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: addl %esi, %edx From 7798ecca9c3db42241169d31fea4fb820ed01830 Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Thu, 17 Feb 2022 21:02:58 +0800 Subject: [PATCH 097/748] [RISCV] add the MC layer support of Zfinx extension This patch added the MC layer support of Zfinx extension. Authored-by: StephenFan Co-Authored-by: Shao-Ce Sun Reviewed By: asb Differential Revision: https://reviews.llvm.org/D93298 --- llvm/lib/Support/RISCVISAInfo.cpp | 23 +- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 35 ++- .../RISCV/Disassembler/RISCVDisassembler.cpp | 32 ++ llvm/lib/Target/RISCV/RISCV.td | 37 +++ llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 215 +++++++++----- llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 281 +++++++++++++----- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 213 ++++++++----- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 30 ++ llvm/lib/Target/RISCV/RISCVSubtarget.h | 8 + llvm/test/MC/RISCV/attribute-arch.s | 12 + llvm/test/MC/RISCV/rv32i-invalid.s | 6 +- llvm/test/MC/RISCV/rv32zdinx-invalid.s | 27 ++ llvm/test/MC/RISCV/rv32zdinx-valid.s | 124 ++++++++ llvm/test/MC/RISCV/rv32zfinx-invalid.s | 25 ++ llvm/test/MC/RISCV/rv32zfinx-valid.s | 128 ++++++++ llvm/test/MC/RISCV/rv32zhinx-invalid.s | 24 ++ llvm/test/MC/RISCV/rv32zhinx-valid.s | 128 ++++++++ llvm/test/MC/RISCV/rv32zhinxmin-invalid.s | 15 + llvm/test/MC/RISCV/rv32zhinxmin-valid.s | 18 ++ 
llvm/test/MC/RISCV/rv64zdinx-invalid.s | 9 + llvm/test/MC/RISCV/rv64zdinx-valid.s | 43 +++ llvm/test/MC/RISCV/rv64zfinx-invalid.s | 9 + llvm/test/MC/RISCV/rv64zfinx-valid.s | 43 +++ llvm/test/MC/RISCV/rv64zhinx-invalid.s | 9 + llvm/test/MC/RISCV/rv64zhinx-valid.s | 43 +++ llvm/test/MC/RISCV/rv64zhinxmin-invalid.s | 9 + llvm/test/MC/RISCV/rv64zhinxmin-valid.s | 13 + llvm/test/MC/RISCV/rvzdinx-aliases-valid.s | 49 +++ llvm/test/MC/RISCV/rvzfinx-aliases-valid.s | 82 +++++ llvm/test/MC/RISCV/rvzhinx-aliases-valid.s | 82 +++++ 30 files changed, 1548 insertions(+), 224 deletions(-) create mode 100644 llvm/test/MC/RISCV/rv32zdinx-invalid.s create mode 100644 llvm/test/MC/RISCV/rv32zdinx-valid.s create mode 100644 llvm/test/MC/RISCV/rv32zfinx-invalid.s create mode 100644 llvm/test/MC/RISCV/rv32zfinx-valid.s create mode 100644 llvm/test/MC/RISCV/rv32zhinx-invalid.s create mode 100644 llvm/test/MC/RISCV/rv32zhinx-valid.s create mode 100644 llvm/test/MC/RISCV/rv32zhinxmin-invalid.s create mode 100644 llvm/test/MC/RISCV/rv32zhinxmin-valid.s create mode 100644 llvm/test/MC/RISCV/rv64zdinx-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64zdinx-valid.s create mode 100644 llvm/test/MC/RISCV/rv64zfinx-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64zfinx-valid.s create mode 100644 llvm/test/MC/RISCV/rv64zhinx-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64zhinx-valid.s create mode 100644 llvm/test/MC/RISCV/rv64zhinxmin-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64zhinxmin-valid.s create mode 100644 llvm/test/MC/RISCV/rvzdinx-aliases-valid.s create mode 100644 llvm/test/MC/RISCV/rvzfinx-aliases-valid.s create mode 100644 llvm/test/MC/RISCV/rvzhinx-aliases-valid.s diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 7af375aef86b5..ed256b8aaa048 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -53,6 +53,11 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"zfhmin", 
RISCVExtensionVersion{1, 0}}, {"zfh", RISCVExtensionVersion{1, 0}}, + {"zfinx", RISCVExtensionVersion{1, 0}}, + {"zdinx", RISCVExtensionVersion{1, 0}}, + {"zhinxmin", RISCVExtensionVersion{1, 0}}, + {"zhinx", RISCVExtensionVersion{1, 0}}, + {"zba", RISCVExtensionVersion{1, 0}}, {"zbb", RISCVExtensionVersion{1, 0}}, {"zbc", RISCVExtensionVersion{1, 0}}, @@ -688,6 +693,8 @@ Error RISCVISAInfo::checkDependency() { bool HasE = Exts.count("e") != 0; bool HasD = Exts.count("d") != 0; bool HasF = Exts.count("f") != 0; + bool HasZfinx = Exts.count("zfinx") != 0; + bool HasZdinx = Exts.count("zdinx") != 0; bool HasVector = Exts.count("zve32x") != 0; bool HasZve32f = Exts.count("zve32f") != 0; bool HasZve64d = Exts.count("zve64d") != 0; @@ -706,17 +713,15 @@ Error RISCVISAInfo::checkDependency() { return createStringError(errc::invalid_argument, "d requires f extension to also be specified"); - // FIXME: Consider Zfinx in the future - if (HasZve32f && !HasF) + if (HasZve32f && !HasF && !HasZfinx) return createStringError( errc::invalid_argument, - "zve32f requires f extension to also be specified"); + "zve32f requires f or zfinx extension to also be specified"); - // FIXME: Consider Zdinx in the future - if (HasZve64d && !HasD) + if (HasZve64d && !HasD && !HasZdinx) return createStringError( errc::invalid_argument, - "zve64d requires d extension to also be specified"); + "zve64d requires d or zdinx extension to also be specified"); if (HasZvl && !HasVector) return createStringError( @@ -733,6 +738,9 @@ Error RISCVISAInfo::checkDependency() { static const char *ImpliedExtsV[] = {"zvl128b", "zve64d", "f", "d"}; static const char *ImpliedExtsZfhmin[] = {"f"}; static const char *ImpliedExtsZfh[] = {"f"}; +static const char *ImpliedExtsZdinx[] = {"zfinx"}; +static const char *ImpliedExtsZhinxmin[] = {"zfinx"}; +static const char *ImpliedExtsZhinx[] = {"zfinx"}; static const char *ImpliedExtsZve64d[] = {"zve64f"}; static const char *ImpliedExtsZve64f[] = {"zve64x", "zve32f"}; 
static const char *ImpliedExtsZve64x[] = {"zve32x", "zvl64b"}; @@ -767,8 +775,11 @@ struct ImpliedExtsEntry { // Note: The table needs to be sorted by name. static constexpr ImpliedExtsEntry ImpliedExts[] = { {{"v"}, {ImpliedExtsV}}, + {{"zdinx"}, {ImpliedExtsZdinx}}, {{"zfh"}, {ImpliedExtsZfh}}, {{"zfhmin"}, {ImpliedExtsZfhmin}}, + {{"zhinx"}, {ImpliedExtsZhinx}}, + {{"zhinxmin"}, {ImpliedExtsZhinxmin}}, {{"zk"}, {ImpliedExtsZk}}, {{"zkn"}, {ImpliedExtsZkn}}, {{"zks"}, {ImpliedExtsZks}}, diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index df1ed414d9156..81b097c74e6ec 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -171,6 +171,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseVTypeI(OperandVector &Operands); OperandMatchResultTy parseMaskReg(OperandVector &Operands); OperandMatchResultTy parseInsnDirectiveOpcode(OperandVector &Operands); + OperandMatchResultTy parseGPRAsFPR(OperandVector &Operands); bool parseOperand(OperandVector &Operands, StringRef Mnemonic); @@ -274,6 +275,8 @@ struct RISCVOperand : public MCParsedAsmOperand { bool IsRV64; + bool IsGPRAsFPR; + struct RegOp { MCRegister RegNum; }; @@ -344,6 +347,14 @@ struct RISCVOperand : public MCParsedAsmOperand { RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum); } + bool isGPRAsFPR() const { return isGPR() && IsGPRAsFPR; } + + bool isGPRF64AsFPR() const { return isGPR() && IsGPRAsFPR && IsRV64; } + + bool isGPRPF64AsFPR() const { + return isGPR() && IsGPRAsFPR && !IsRV64 && !((Reg.RegNum - RISCV::X0) & 1); + } + static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm, RISCVMCExpr::VariantKind &VK) { if (auto *RE = dyn_cast(Expr)) { @@ -840,12 +851,14 @@ struct RISCVOperand : public MCParsedAsmOperand { } static std::unique_ptr createReg(unsigned RegNo, SMLoc S, - SMLoc E, bool IsRV64) { + SMLoc E, bool 
IsRV64, + bool IsGPRAsFPR = false) { auto Op = std::make_unique(KindTy::Register); Op->Reg.RegNum = RegNo; Op->StartLoc = S; Op->EndLoc = E; Op->IsRV64 = IsRV64; + Op->IsGPRAsFPR = IsGPRAsFPR; return Op; } @@ -1799,6 +1812,26 @@ OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy RISCVAsmParser::parseGPRAsFPR(OperandVector &Operands) { + switch (getLexer().getKind()) { + default: + return MatchOperand_NoMatch; + case AsmToken::Identifier: + StringRef Name = getLexer().getTok().getIdentifier(); + MCRegister RegNo; + matchRegisterNameHelper(isRV32E(), RegNo, Name); + + if (RegNo == RISCV::NoRegister) + return MatchOperand_NoMatch; + SMLoc S = getLoc(); + SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + getLexer().Lex(); + Operands.push_back(RISCVOperand::createReg( + RegNo, S, E, isRV64(), !getSTI().hasFeature(RISCV::FeatureStdExtF))); + } + return MatchOperand_Success; +} + OperandMatchResultTy RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) { if (getLexer().isNot(AsmToken::LParen)) { diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index ff96b2b254cac..18947997dc583 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -161,6 +161,17 @@ static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodeGPRPF64RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo >= 32 || RegNo & 1) + return MCDisassembler::Fail; + + MCRegister Reg = RISCV::X0 + RegNo; + Inst.addOperand(MCOperand::createReg(Reg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeVRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { @@ -427,6 +438,27 @@ DecodeStatus 
RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } Insn = support::endian::read32le(Bytes.data()); + if (STI.getFeatureBits()[RISCV::FeatureStdExtZdinx] && + !STI.getFeatureBits()[RISCV::Feature64Bit]) { + LLVM_DEBUG(dbgs() << "Trying RV32Zdinx table (Double in Integer and" + "rv32)\n"); + Result = decodeInstruction(DecoderTableRV32Zdinx32, MI, Insn, Address, + this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + + if (STI.getFeatureBits()[RISCV::FeatureStdExtZfinx]) { + LLVM_DEBUG(dbgs() << "Trying RVZfinx table (Float in Integer):\n"); + Result = decodeInstruction(DecoderTableRVZfinx32, MI, Insn, Address, this, + STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } LLVM_DEBUG(dbgs() << "Trying RISCV32 table :\n"); Result = decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI); Size = 4; diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 5486649106941..adf20c8210667 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -70,6 +70,43 @@ def HasStdExtZfhOrZfhmin "'Zfh' (Half-Precision Floating-Point) or " "'Zfhmin' (Half-Precision Floating-Point Minimal)">; +def FeatureStdExtZfinx + : SubtargetFeature<"zfinx", "HasStdExtZfinx", "true", + "'Zfinx' (Float in Integer)">; +def HasStdExtZfinx : Predicate<"Subtarget->hasStdExtZfinx()">, + AssemblerPredicate<(all_of FeatureStdExtZfinx), + "'Zfinx' (Float in Integer)">; + +def FeatureStdExtZdinx + : SubtargetFeature<"zdinx", "HasStdExtZdinx", "true", + "'Zdinx' (Double in Integer)", + [FeatureStdExtZfinx]>; +def HasStdExtZdinx : Predicate<"Subtarget->hasStdExtZdinx()">, + AssemblerPredicate<(all_of FeatureStdExtZdinx), + "'Zdinx' (Double in Integer)">; + +def FeatureStdExtZhinxmin + : SubtargetFeature<"zhinxmin", "HasStdExtZhinxmin", "true", + "'Zhinxmin' (Half Float in Integer Minimal)", + [FeatureStdExtZfinx]>; +def HasStdExtZhinxmin : 
Predicate<"Subtarget->hasStdExtZhinxmin()">, + AssemblerPredicate<(all_of FeatureStdExtZhinxmin), + "'Zhinxmin' (Half Float in Integer Minimal)">; + +def FeatureStdExtZhinx + : SubtargetFeature<"zhinx", "HasStdExtZhinx", "true", + "'Zhinx' (Half Float in Integer)", + [FeatureStdExtZfinx]>; +def HasStdExtZhinx : Predicate<"Subtarget->hasStdExtZhinx()">, + AssemblerPredicate<(all_of FeatureStdExtZhinx), + "'Zhinx' (Half Float in Integer)">; + +def HasStdExtZhinxOrZhinxmin + : Predicate<"Subtarget->hasStdExtZhinx() || Subtarget->hasStdExtZhinxmin()">, + AssemblerPredicate<(any_of FeatureStdExtZhinx, FeatureStdExtZhinxmin), + "'Zhinx' (Half Float in Integer) or " + "'Zhinxmin' (Half Float in Integer Minimal)">; + def FeatureStdExtC : SubtargetFeature<"c", "HasStdExtC", "true", "'C' (Compressed Instructions)">; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 2837b92da81f4..4f5ec6aada615 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -25,6 +25,69 @@ def SDT_RISCVSplitF64 : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>; def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>; +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. 
+//===----------------------------------------------------------------------===// + +// Zdinx + +def GPRPF64AsFPR : AsmOperandClass { + let Name = "GPRPF64AsFPR"; + let ParserMethod = "parseGPRAsFPR"; + let RenderMethod = "addRegOperands"; +} + +def GPRF64AsFPR : AsmOperandClass { + let Name = "GPRF64AsFPR"; + let ParserMethod = "parseGPRAsFPR"; + let RenderMethod = "addRegOperands"; +} + +def FPR64INX : RegisterOperand { + let ParserMatchClass = GPRF64AsFPR; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +def FPR64IN32X : RegisterOperand { + let ParserMatchClass = GPRPF64AsFPR; +} + +def DExt : ExtInfo<0, [HasStdExtD]>; +def D64Ext : ExtInfo<0, [HasStdExtD, IsRV64]>; +def ZdinxExt : ExtInfo<1, [HasStdExtZdinx, IsRV64]>; +def Zdinx32Ext : ExtInfo<2, [HasStdExtZdinx, IsRV32]>; + +def D : ExtInfo_r; +def D_INX : ExtInfo_r; +def D_IN32X : ExtInfo_r; + +def DD : ExtInfo_rr; +def DD_INX : ExtInfo_rr; +def DD_IN32X : ExtInfo_rr; +def DF : ExtInfo_rr; +def DF_INX : ExtInfo_rr; +def DF_IN32X : ExtInfo_rr; +def DX : ExtInfo_rr; +def DX_INX : ExtInfo_rr; +def DX_IN32X : ExtInfo_rr; +def DX_64 : ExtInfo_rr; +def FD : ExtInfo_rr; +def FD_INX : ExtInfo_rr; +def FD_IN32X : ExtInfo_rr; +def XD : ExtInfo_rr; +def XD_INX : ExtInfo_rr; +def XD_IN32X : ExtInfo_rr; +def XD_64 : ExtInfo_rr; + +defvar DINX = [D, D_INX, D_IN32X]; +defvar DDINX = [DD, DD_INX, DD_IN32X]; +defvar DXINX = [DX, DX_INX, DX_IN32X]; +defvar DFINX = [DF, DF_INX, DF_IN32X]; +defvar FDINX = [FD, FD_INX, FD_IN32X]; +defvar XDINX = [XD, XD_INX, XD_IN32X]; +defvar DXIN64X = [DX_64, DX_INX]; +defvar XDIN64X = [XD_64, XD_INX]; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -36,106 +99,104 @@ def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // reflecting the order these fields are specified in the instruction // encoding. 
def FSD : FPStore_r<0b011, "fsd", FPR64, WriteFST64>; +} // Predicates = [HasStdExtD] let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64] in { -def FMADD_D : FPFMA_rrr_frm; -def FMSUB_D : FPFMA_rrr_frm; -def FNMSUB_D : FPFMA_rrr_frm; -def FNMADD_D : FPFMA_rrr_frm; +defm FMADD_D : FPFMA_rrr_frm_m; +defm FMSUB_D : FPFMA_rrr_frm_m; +defm FNMSUB_D : FPFMA_rrr_frm_m; +defm FNMADD_D : FPFMA_rrr_frm_m; +} + +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; + +let SchedRW = [WriteFALU64, ReadFALU64, ReadFALU64] in { +defm FADD_D : FPALU_rr_frm_m<0b0000001, "fadd.d", DINX>; +defm FSUB_D : FPALU_rr_frm_m<0b0000101, "fsub.d", DINX>; } +let SchedRW = [WriteFMul64, ReadFMul64, ReadFMul64] in +defm FMUL_D : FPALU_rr_frm_m<0b0001001, "fmul.d", DINX>; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; - -def FADD_D : FPALU_rr_frm<0b0000001, "fadd.d", FPR64>, - Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def FSUB_D : FPALU_rr_frm<0b0000101, "fsub.d", FPR64>, - Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def FMUL_D : FPALU_rr_frm<0b0001001, "fmul.d", FPR64>, - Sched<[WriteFMul64, ReadFMul64, ReadFMul64]>; -def FDIV_D : FPALU_rr_frm<0b0001101, "fdiv.d", FPR64>, - Sched<[WriteFDiv64, ReadFDiv64, ReadFDiv64]>; - -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; - -def FSQRT_D : FPUnaryOp_r_frm<0b0101101, 0b00000, FPR64, FPR64, "fsqrt.d">, - Sched<[WriteFSqrt64, ReadFSqrt64]>; -def : FPUnaryOpDynFrmAlias; +let SchedRW = [WriteFDiv64, ReadFDiv64, ReadFDiv64] in +defm FDIV_D : FPALU_rr_frm_m<0b0001101, "fdiv.d", DINX>; + +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; + +defm FSQRT_D : FPUnaryOp_r_frm_m<0b0101101, 0b00000, DDINX, "fsqrt.d">, + Sched<[WriteFSqrt64, ReadFSqrt64]>; +defm : FPUnaryOpDynFrmAlias_m; let SchedRW = [WriteFSGNJ64, ReadFSGNJ64, 
ReadFSGNJ64], mayRaiseFPException = 0 in { -def FSGNJ_D : FPALU_rr<0b0010001, 0b000, "fsgnj.d", FPR64>; -def FSGNJN_D : FPALU_rr<0b0010001, 0b001, "fsgnjn.d", FPR64>; -def FSGNJX_D : FPALU_rr<0b0010001, 0b010, "fsgnjx.d", FPR64>; +defm FSGNJ_D : FPALU_rr_m<0b0010001, 0b000, "fsgnj.d", DINX>; +defm FSGNJN_D : FPALU_rr_m<0b0010001, 0b001, "fsgnjn.d", DINX>; +defm FSGNJX_D : FPALU_rr_m<0b0010001, 0b010, "fsgnjx.d", DINX>; } let SchedRW = [WriteFMinMax64, ReadFMinMax64, ReadFMinMax64] in { -def FMIN_D : FPALU_rr<0b0010101, 0b000, "fmin.d", FPR64>; -def FMAX_D : FPALU_rr<0b0010101, 0b001, "fmax.d", FPR64>; +defm FMIN_D : FPALU_rr_m<0b0010101, 0b000, "fmin.d", DINX>; +defm FMAX_D : FPALU_rr_m<0b0010101, 0b001, "fmax.d", DINX>; } -def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, 0b00001, FPR32, FPR64, "fcvt.s.d">, - Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_S_D : FPUnaryOp_r_frm_m<0b0100000, 0b00001, FDINX, "fcvt.s.d">, + Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b00000, 0b000, FPR64, FPR32, "fcvt.d.s">, - Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; +defm FCVT_D_S : FPUnaryOp_r_m<0b0100001, 0b00000, 0b000, DFINX, "fcvt.d.s">, + Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; let SchedRW = [WriteFCmp64, ReadFCmp64, ReadFCmp64] in { -def FEQ_D : FPCmp_rr<0b1010001, 0b010, "feq.d", FPR64>; -def FLT_D : FPCmp_rr<0b1010001, 0b001, "flt.d", FPR64>; -def FLE_D : FPCmp_rr<0b1010001, 0b000, "fle.d", FPR64>; +defm FEQ_D : FPCmp_rr_m<0b1010001, 0b010, "feq.d", DINX>; +defm FLT_D : FPCmp_rr_m<0b1010001, 0b001, "flt.d", DINX>; +defm FLE_D : FPCmp_rr_m<0b1010001, 0b000, "fle.d", DINX>; } -let mayRaiseFPException = 0 in -def FCLASS_D : FPUnaryOp_r<0b1110001, 0b00000, 0b001, GPR, FPR64, "fclass.d">, - Sched<[WriteFClass64, ReadFClass64]>; +defm FCLASS_D : FPUnaryOp_r_m<0b1110001, 0b00000, 0b001, XDINX, "fclass.d">, + Sched<[WriteFClass64, ReadFClass64]>; -def FCVT_W_D : 
FPUnaryOp_r_frm<0b1100001, 0b00000, GPR, FPR64, "fcvt.w.d">, +defm FCVT_W_D : FPUnaryOp_r_frm_m<0b1100001, 0b00000, XDINX, "fcvt.w.d">, Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, 0b00001, GPR, FPR64, "fcvt.wu.d">, - Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b00000, 0b000, FPR64, GPR, "fcvt.d.w">, - Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; +defm FCVT_WU_D : FPUnaryOp_r_frm_m<0b1100001, 0b00001, XDINX, "fcvt.wu.d">, + Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b00001, 0b000, FPR64, GPR, "fcvt.d.wu">, +defm FCVT_D_W : FPUnaryOp_r_m<0b1101001, 0b00000, 0b000, DXINX, "fcvt.d.w">, Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; -} // Predicates = [HasStdExtD] -let Predicates = [HasStdExtD, IsRV64] in { -def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, 0b00010, GPR, FPR64, "fcvt.l.d">, - Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_D_WU : FPUnaryOp_r_m<0b1101001, 0b00001, 0b000, DXINX, "fcvt.d.wu">, + Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; -def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, 0b00011, GPR, FPR64, "fcvt.lu.d">, +defm FCVT_L_D : FPUnaryOp_r_frm_m<0b1100001, 0b00010, XDIN64X, "fcvt.l.d">, Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -let mayRaiseFPException = 0 in +defm FCVT_LU_D : FPUnaryOp_r_frm_m<0b1100001, 0b00011, XDIN64X, "fcvt.lu.d">, + Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; +defm : FPUnaryOpDynFrmAlias_m; + +let Predicates = [HasStdExtD, IsRV64], mayRaiseFPException = 0 in def FMV_X_D : FPUnaryOp_r<0b1110001, 0b00000, 0b000, GPR, FPR64, "fmv.x.d">, Sched<[WriteFMovF64ToI64, ReadFMovF64ToI64]>; -def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, 0b00010, FPR64, GPR, "fcvt.d.l">, - 
Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, 0b00011, FPR64, GPR, "fcvt.d.lu">, +defm FCVT_D_L : FPUnaryOp_r_frm_m<0b1101001, 0b00010, DXIN64X, "fcvt.d.l">, Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -let mayRaiseFPException = 0 in +defm FCVT_D_LU : FPUnaryOp_r_frm_m<0b1101001, 0b00011, DXIN64X, "fcvt.d.lu">, + Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; +defm : FPUnaryOpDynFrmAlias_m; + +let Predicates = [HasStdExtD, IsRV64], mayRaiseFPException = 0 in def FMV_D_X : FPUnaryOp_r<0b1111001, 0b00000, 0b000, FPR64, GPR, "fmv.d.x">, Sched<[WriteFMovI64ToF64, ReadFMovI64ToF64]>; -} // Predicates = [HasStdExtD, IsRV64] //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) @@ -164,6 +225,26 @@ def PseudoQuietFLT_D : PseudoQuietFCMP; } } // Predicates = [HasStdExtD] +let Predicates = [HasStdExtZdinx, IsRV64] in { +def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D_INX FPR64INX:$rd, FPR64INX:$rs, FPR64INX:$rs)>; +def : InstAlias<"fneg.d $rd, $rs", (FSGNJN_D_INX FPR64INX:$rd, FPR64INX:$rs, FPR64INX:$rs)>; + +def : InstAlias<"fgt.d $rd, $rs, $rt", + (FLT_D_INX GPR:$rd, FPR64INX:$rt, FPR64INX:$rs), 0>; +def : InstAlias<"fge.d $rd, $rs, $rt", + (FLE_D_INX GPR:$rd, FPR64INX:$rt, FPR64INX:$rs), 0>; +} // Predicates = [HasStdExtZdinx, IsRV64] + +let Predicates = [HasStdExtZdinx, IsRV32] in { +def : InstAlias<"fabs.d $rd, $rs", (FSGNJX_D_IN32X FPR64IN32X:$rd, FPR64IN32X:$rs, FPR64IN32X:$rs)>; +def : InstAlias<"fneg.d $rd, $rs", (FSGNJN_D_IN32X FPR64IN32X:$rd, FPR64IN32X:$rs, FPR64IN32X:$rs)>; + +def : InstAlias<"fgt.d $rd, $rs, $rt", + (FLT_D_IN32X GPR:$rd, FPR64IN32X:$rt, FPR64IN32X:$rs), 0>; +def : InstAlias<"fge.d $rd, $rs, $rt", + (FLE_D_IN32X GPR:$rd, FPR64IN32X:$rt, FPR64IN32X:$rs), 0>; +} // Predicates = [HasStdExtZdinx, IsRV32] + 
//===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index a8ac06ba8da3f..4b45b47af451c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -57,6 +57,73 @@ def riscv_any_fcvt_wu_rv64 : PatFrags<(ops node:$src, node:$frm), // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// +// Zfinx + +def GPRAsFPR : AsmOperandClass { + let Name = "GPRAsFPR"; + let ParserMethod = "parseGPRAsFPR"; + let RenderMethod = "addRegOperands"; +} + +def FPR32INX : RegisterOperand { + let ParserMatchClass = GPRAsFPR; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +// inx = 0 : f, d, zfh, zfhmin +// = 1 : zfinx, zdinx, zhinx, zhinxmin +// = 2 : zdinx_rv32 +class ExtInfo inx, list pres> { + string Suffix = !cond(!eq(inx, 0): "", + !eq(inx, 1): "_INX", + !eq(inx, 2): "_IN32X"); + list Predicates = pres; + string Space = !cond(!eq(inx, 0): "", + !eq(inx, 1): "RVZfinx", + !eq(inx, 2): "RV32Zdinx"); +} + +class ExtInfo_r { + string Suffix = ext.Suffix; + list Predicates = ext.Predicates; + string Space = ext.Space; + DAGOperand Reg = reg; +} + +class ExtInfo_rr { + string Suffix = ext.Suffix; + list Predicates = ext.Predicates; + string Space = ext.Space; + DAGOperand RdTy = rdty; + DAGOperand Rs1Ty = rs1ty; +} + +def FExt : ExtInfo<0, [HasStdExtF]>; +def F64Ext : ExtInfo<0, [HasStdExtF, IsRV64]>; +def ZfinxExt : ExtInfo<1, [HasStdExtZfinx]>; +def Zfinx64Ext : ExtInfo<1, [HasStdExtZfinx, IsRV64]>; + +def F : ExtInfo_r; +def F_INX : ExtInfo_r; + +def FF : ExtInfo_rr; +def FF_INX : ExtInfo_rr; +def FX : ExtInfo_rr; +def FX_INX : ExtInfo_rr; +def FX_64 : ExtInfo_rr; +def FX_INX_64 : ExtInfo_rr; +def XF : ExtInfo_rr; +def 
XF_64 : ExtInfo_rr; +def XF_INX : ExtInfo_rr; +def XF_INX_64 : ExtInfo_rr; + +defvar FINX = [F, F_INX]; +defvar FFINX = [FF, FF_INX]; +defvar FXINX = [FX, FX_INX]; +defvar XFINX = [XF, XF_INX]; +defvar XFIN64X = [XF_64, XF_INX_64]; +defvar FXIN64X = [FX_64, FX_INX_64]; + // Floating-point rounding mode def FRMArg : AsmOperandClass { @@ -94,62 +161,123 @@ class FPStore_r funct3, string opcodestr, RegisterClass rty, let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in class FPFMA_rrr_frm funct2, string opcodestr, - RegisterClass rty> + DAGOperand rty> : RVInstR4Frm; +multiclass FPFMA_rrr_frm_m funct2, + string opcodestr, list Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPFMA_rrr_frm; +} + class FPFMADynFrmAlias + DAGOperand rty> : InstAlias; +multiclass FPFMADynFrmAlias_m Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates in + def : FPFMADynFrmAlias(Inst#Ext.Suffix), OpcodeStr, + Ext.Reg>; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPALU_rr funct7, bits<3> funct3, string opcodestr, - RegisterClass rty> + DAGOperand rty> : RVInstR; +multiclass FPALU_rr_m funct7, bits<3> funct3, string opcodestr, + list Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPALU_rr; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in -class FPALU_rr_frm funct7, string opcodestr, RegisterClass rty> +class FPALU_rr_frm funct7, string opcodestr, DAGOperand rty> : RVInstRFrm; +multiclass FPALU_rr_frm_m funct7, string opcodestr, + list Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPALU_rr_frm; +} + class FPALUDynFrmAlias + DAGOperand rty> : InstAlias; +multiclass 
FPALUDynFrmAlias_m Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates in + def : FPALUDynFrmAlias(Inst#Ext.Suffix), OpcodeStr, + Ext.Reg>; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPUnaryOp_r funct7, bits<5> rs2val, bits<3> funct3, - RegisterClass rdty, RegisterClass rs1ty, string opcodestr> + DAGOperand rdty, DAGOperand rs1ty, string opcodestr> : RVInstR { let rs2 = rs2val; } +multiclass FPUnaryOp_r_m funct7, bits<5> rs2val, bits<3> funct3, + list Exts, string opcodestr> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPUnaryOp_r; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, UseNamedOperandTable = 1, hasPostISelHook = 1 in -class FPUnaryOp_r_frm funct7, bits<5> rs2val, RegisterClass rdty, - RegisterClass rs1ty, string opcodestr> +class FPUnaryOp_r_frm funct7, bits<5> rs2val, DAGOperand rdty, + DAGOperand rs1ty, string opcodestr> : RVInstRFrm { let rs2 = rs2val; } +multiclass FPUnaryOp_r_frm_m funct7, bits<5> rs2val, + list Exts, string opcodestr> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPUnaryOp_r_frm; +} class FPUnaryOpDynFrmAlias + DAGOperand rdty, DAGOperand rs1ty> : InstAlias; +multiclass FPUnaryOpDynFrmAlias_m Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates in + def : FPUnaryOpDynFrmAlias(Inst#Ext.Suffix), + OpcodeStr, Ext.RdTy, Ext.Rs1Ty>; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in class FPCmp_rr funct7, bits<3> funct3, string opcodestr, - RegisterClass rty> + DAGOperand rty> : RVInstR; +multiclass FPCmp_rr_m funct7, bits<3> funct3, string opcodestr, + list Exts> { + foreach Ext = Exts in + let Predicates = Ext.Predicates, DecoderNamespace = Ext.Space in + def Ext.Suffix : FPCmp_rr; +} //===----------------------------------------------------------------------===// // 
Instructions @@ -162,101 +290,100 @@ def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // reflecting the order these fields are specified in the instruction // encoding. def FSW : FPStore_r<0b010, "fsw", FPR32, WriteFST32>; +} // Predicates = [HasStdExtF] let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { -def FMADD_S : FPFMA_rrr_frm; -def FMSUB_S : FPFMA_rrr_frm; -def FNMSUB_S : FPFMA_rrr_frm; -def FNMADD_S : FPFMA_rrr_frm; +defm FMADD_S : FPFMA_rrr_frm_m; +defm FMSUB_S : FPFMA_rrr_frm_m; +defm FNMSUB_S : FPFMA_rrr_frm_m; +defm FNMADD_S : FPFMA_rrr_frm_m; +} + +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; + +let SchedRW = [WriteFALU32, ReadFALU32, ReadFALU32] in { +defm FADD_S : FPALU_rr_frm_m<0b0000000, "fadd.s", FINX>; +defm FSUB_S : FPALU_rr_frm_m<0b0000100, "fsub.s", FINX>; } +let SchedRW = [WriteFMul32, ReadFMul32, ReadFMul32] in +defm FMUL_S : FPALU_rr_frm_m<0b0001000, "fmul.s", FINX>; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; - -def FADD_S : FPALU_rr_frm<0b0000000, "fadd.s", FPR32>, - Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def FSUB_S : FPALU_rr_frm<0b0000100, "fsub.s", FPR32>, - Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def FMUL_S : FPALU_rr_frm<0b0001000, "fmul.s", FPR32>, - Sched<[WriteFMul32, ReadFMul32, ReadFMul32]>; -def FDIV_S : FPALU_rr_frm<0b0001100, "fdiv.s", FPR32>, - Sched<[WriteFDiv32, ReadFDiv32, ReadFDiv32]>; - -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; - -def FSQRT_S : FPUnaryOp_r_frm<0b0101100, 0b00000, FPR32, FPR32, "fsqrt.s">, - Sched<[WriteFSqrt32, ReadFSqrt32]>; -def : FPUnaryOpDynFrmAlias; +let SchedRW = [WriteFDiv32, ReadFDiv32, ReadFDiv32] in +defm FDIV_S : FPALU_rr_frm_m<0b0001100, "fdiv.s", FINX>; + +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; + +defm FSQRT_S 
: FPUnaryOp_r_frm_m<0b0101100, 0b00000, FFINX, "fsqrt.s">, + Sched<[WriteFSqrt32, ReadFSqrt32]>; +defm : FPUnaryOpDynFrmAlias_m; let SchedRW = [WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32], mayRaiseFPException = 0 in { -def FSGNJ_S : FPALU_rr<0b0010000, 0b000, "fsgnj.s", FPR32>; -def FSGNJN_S : FPALU_rr<0b0010000, 0b001, "fsgnjn.s", FPR32>; -def FSGNJX_S : FPALU_rr<0b0010000, 0b010, "fsgnjx.s", FPR32>; +defm FSGNJ_S : FPALU_rr_m<0b0010000, 0b000, "fsgnj.s", FINX>; +defm FSGNJN_S : FPALU_rr_m<0b0010000, 0b001, "fsgnjn.s", FINX>; +defm FSGNJX_S : FPALU_rr_m<0b0010000, 0b010, "fsgnjx.s", FINX>; } let SchedRW = [WriteFMinMax32, ReadFMinMax32, ReadFMinMax32] in { -def FMIN_S : FPALU_rr<0b0010100, 0b000, "fmin.s", FPR32>; -def FMAX_S : FPALU_rr<0b0010100, 0b001, "fmax.s", FPR32>; +defm FMIN_S : FPALU_rr_m<0b0010100, 0b000, "fmin.s", FINX>; +defm FMAX_S : FPALU_rr_m<0b0010100, 0b001, "fmax.s", FINX>; } -def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, 0b00000, GPR, FPR32, "fcvt.w.s">, - Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, 0b00001, GPR, FPR32, "fcvt.wu.s">, +defm FCVT_W_S : FPUnaryOp_r_frm_m<0b1100000, 0b00000, XFINX, "fcvt.w.s">, Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_WU_S : FPUnaryOp_r_frm_m<0b1100000, 0b00001, XFINX, "fcvt.wu.s">, + Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; +defm : FPUnaryOpDynFrmAlias_m; let mayRaiseFPException = 0 in def FMV_X_W : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR32, "fmv.x.w">, Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>; let SchedRW = [WriteFCmp32, ReadFCmp32, ReadFCmp32] in { -def FEQ_S : FPCmp_rr<0b1010000, 0b010, "feq.s", FPR32>; -def FLT_S : FPCmp_rr<0b1010000, 0b001, "flt.s", FPR32>; -def FLE_S : FPCmp_rr<0b1010000, 0b000, "fle.s", FPR32>; +defm FEQ_S : FPCmp_rr_m<0b1010000, 0b010, "feq.s", FINX>; +defm FLT_S : FPCmp_rr_m<0b1010000, 0b001, "flt.s", FINX>; +defm FLE_S : 
FPCmp_rr_m<0b1010000, 0b000, "fle.s", FINX>; } let mayRaiseFPException = 0 in -def FCLASS_S : FPUnaryOp_r<0b1110000, 0b00000, 0b001, GPR, FPR32, "fclass.s">, - Sched<[WriteFClass32, ReadFClass32]>; - -def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, 0b00000, FPR32, GPR, "fcvt.s.w">, - Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm FCLASS_S : FPUnaryOp_r_m<0b1110000, 0b00000, 0b001, XFINX, "fclass.s">, + Sched<[WriteFClass32, ReadFClass32]>; -def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, 0b00001, FPR32, GPR, "fcvt.s.wu">, +defm FCVT_S_W : FPUnaryOp_r_frm_m<0b1101000, 0b00000, FXINX, "fcvt.s.w">, Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_S_WU : FPUnaryOp_r_frm_m<0b1101000, 0b00001, FXINX, "fcvt.s.wu">, + Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; +defm : FPUnaryOpDynFrmAlias_m; let mayRaiseFPException = 0 in def FMV_W_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR32, GPR, "fmv.w.x">, Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>; -} // Predicates = [HasStdExtF] -let Predicates = [HasStdExtF, IsRV64] in { -def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, 0b00010, GPR, FPR32, "fcvt.l.s">, - Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, 0b00011, GPR, FPR32, "fcvt.lu.s">, +defm FCVT_L_S : FPUnaryOp_r_frm_m<0b1100000, 0b00010, XFIN64X, "fcvt.l.s">, Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, 0b00010, FPR32, GPR, "fcvt.s.l">, - Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_LU_S : FPUnaryOp_r_frm_m<0b1100000, 0b00011, XFIN64X, "fcvt.lu.s">, + Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, 0b00011, FPR32, GPR, "fcvt.s.lu">, +defm FCVT_S_L : FPUnaryOp_r_frm_m<0b1101000, 0b00010, FXIN64X, 
"fcvt.s.l">, Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; -def : FPUnaryOpDynFrmAlias; -} // Predicates = [HasStdExtF, IsRV64] +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_S_LU : FPUnaryOp_r_frm_m<0b1101000, 0b00011, FXIN64X, "fcvt.s.lu">, + Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; +defm : FPUnaryOpDynFrmAlias_m; //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) @@ -315,6 +442,16 @@ def PseudoQuietFLT_S : PseudoQuietFCMP; } } // Predicates = [HasStdExtF] +let Predicates = [HasStdExtZfinx] in { +def : InstAlias<"fabs.s $rd, $rs", (FSGNJX_S_INX FPR32INX:$rd, FPR32INX:$rs, FPR32INX:$rs)>; +def : InstAlias<"fneg.s $rd, $rs", (FSGNJN_S_INX FPR32INX:$rd, FPR32INX:$rs, FPR32INX:$rs)>; + +def : InstAlias<"fgt.s $rd, $rs, $rt", + (FLT_S_INX GPR:$rd, FPR32INX:$rt, FPR32INX:$rs), 0>; +def : InstAlias<"fge.s $rd, $rs, $rt", + (FLE_S_INX GPR:$rd, FPR32INX:$rt, FPR32INX:$rs), 0>; +} // Predicates = [HasStdExtZfinx] + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index a2753c1323548..631525484bd9d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -25,6 +25,62 @@ def riscv_fmv_h_x def riscv_fmv_x_anyexth : SDNode<"RISCVISD::FMV_X_ANYEXTH", SDT_RISCVFMV_X_ANYEXTH>; +//===----------------------------------------------------------------------===// +// Operand and SDNode transformation definitions. 
+//===----------------------------------------------------------------------===// + +// Zhinxmin and Zhinx + +def FPR16INX : RegisterOperand { + let ParserMatchClass = GPRAsFPR; + let DecoderMethod = "DecodeGPRRegisterClass"; +} + +def ZfhExt : ExtInfo<0, [HasStdExtZfh]>; +def Zfh64Ext : ExtInfo<0, [HasStdExtZfh, IsRV64]>; +def ZfhminExt : ExtInfo<0, [HasStdExtZfhOrZfhmin]>; +def ZhinxExt : ExtInfo<1, [HasStdExtZhinx]>; +def ZhinxminExt : ExtInfo<1, [HasStdExtZhinxOrZhinxmin]>; +def Zhinx64Ext : ExtInfo<1, [HasStdExtZhinx, IsRV64]>; + +def ZfhminDExt : ExtInfo<0, [HasStdExtZfhOrZfhmin, HasStdExtD]>; +def ZhinxminZdinxExt : ExtInfo<1, [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx]>; + +def H : ExtInfo_r; +def H_INX : ExtInfo_r; + +def HH : ExtInfo_rr; +def HH_INX : ExtInfo_rr; +def XH : ExtInfo_rr; +def XH_INX : ExtInfo_rr; +def HX : ExtInfo_rr; +def HX_INX : ExtInfo_rr; +def XH_64 : ExtInfo_rr; +def HX_64 : ExtInfo_rr; +def XH_INX_64 : ExtInfo_rr; +def HX_INX_64 : ExtInfo_rr; +def HFmin : ExtInfo_rr; +def HF_INXmin : ExtInfo_rr; +def HF_INX : ExtInfo_rr; +def FHmin : ExtInfo_rr; +def FH_INXmin : ExtInfo_rr; +def FH_INX : ExtInfo_rr; +def DHmin : ExtInfo_rr; +def DH_INXmin : ExtInfo_rr; +def HDmin : ExtInfo_rr; +def HD_INXmin : ExtInfo_rr; + +defvar HINX = [H, H_INX]; +defvar HHINX = [HH, HH_INX]; +defvar XHINX = [XH, XH_INX]; +defvar HXINX = [HX, HX_INX]; +defvar XHIN64X = [XH_64, XH_INX_64]; +defvar HXIN64X = [HX_64, HX_INX_64]; +defvar HFINXmin = [HFmin, HF_INXmin]; +defvar FHINXmin = [FHmin, FH_INXmin]; +defvar DHINXmin = [DHmin, DH_INXmin]; +defvar HDINXmin = [HDmin, HD_INXmin]; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -38,74 +94,73 @@ def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; def FSH : FPStore_r<0b001, "fsh", FPR16, WriteFST16>; } // Predicates = [HasStdExtZfhOrZfhmin] -let Predicates = [HasStdExtZfh] in { let 
SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16] in { -def FMADD_H : FPFMA_rrr_frm; -def FMSUB_H : FPFMA_rrr_frm; -def FNMSUB_H : FPFMA_rrr_frm; -def FNMADD_H : FPFMA_rrr_frm; +defm FMADD_H : FPFMA_rrr_frm_m; +defm FMSUB_H : FPFMA_rrr_frm_m; +defm FNMSUB_H : FPFMA_rrr_frm_m; +defm FNMADD_H : FPFMA_rrr_frm_m; } -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; -def : FPFMADynFrmAlias; - -def FADD_H : FPALU_rr_frm<0b0000010, "fadd.h", FPR16>, - Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def FSUB_H : FPALU_rr_frm<0b0000110, "fsub.h", FPR16>, - Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def FMUL_H : FPALU_rr_frm<0b0001010, "fmul.h", FPR16>, - Sched<[WriteFMul16, ReadFMul16, ReadFMul16]>; -def FDIV_H : FPALU_rr_frm<0b0001110, "fdiv.h", FPR16>, - Sched<[WriteFDiv16, ReadFDiv16, ReadFDiv16]>; - -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; -def : FPALUDynFrmAlias; - -def FSQRT_H : FPUnaryOp_r_frm<0b0101110, 0b00000, FPR16, FPR16, "fsqrt.h">, - Sched<[WriteFSqrt16, ReadFSqrt16]>; -def : FPUnaryOpDynFrmAlias; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; +defm : FPFMADynFrmAlias_m; + +let SchedRW = [WriteFALU16, ReadFALU16, ReadFALU16] in { +defm FADD_H : FPALU_rr_frm_m<0b0000010, "fadd.h", HINX>; +defm FSUB_H : FPALU_rr_frm_m<0b0000110, "fsub.h", HINX>; +} +let SchedRW = [WriteFMul16, ReadFMul16, ReadFMul16] in +defm FMUL_H : FPALU_rr_frm_m<0b0001010, "fmul.h", HINX>; + +let SchedRW = [WriteFDiv16, ReadFDiv16, ReadFDiv16] in +defm FDIV_H : FPALU_rr_frm_m<0b0001110, "fdiv.h", HINX>; + +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; +defm : FPALUDynFrmAlias_m; + +defm FSQRT_H : FPUnaryOp_r_frm_m<0b0101110, 0b00000, HHINX, "fsqrt.h">, + Sched<[WriteFSqrt16, ReadFSqrt16]>; +defm : FPUnaryOpDynFrmAlias_m; let SchedRW = [WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16], mayRaiseFPException = 0 in { -def FSGNJ_H : FPALU_rr<0b0010010, 0b000, 
"fsgnj.h", FPR16>; -def FSGNJN_H : FPALU_rr<0b0010010, 0b001, "fsgnjn.h", FPR16>; -def FSGNJX_H : FPALU_rr<0b0010010, 0b010, "fsgnjx.h", FPR16>; +defm FSGNJ_H : FPALU_rr_m<0b0010010, 0b000, "fsgnj.h", HINX>; +defm FSGNJN_H : FPALU_rr_m<0b0010010, 0b001, "fsgnjn.h", HINX>; +defm FSGNJX_H : FPALU_rr_m<0b0010010, 0b010, "fsgnjx.h", HINX>; } let SchedRW = [WriteFMinMax16, ReadFMinMax16, ReadFMinMax16] in { -def FMIN_H : FPALU_rr<0b0010110, 0b000, "fmin.h", FPR16>; -def FMAX_H : FPALU_rr<0b0010110, 0b001, "fmax.h", FPR16>; +defm FMIN_H : FPALU_rr_m<0b0010110, 0b000, "fmin.h", HINX>; +defm FMAX_H : FPALU_rr_m<0b0010110, 0b001, "fmax.h", HINX>; } -def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, 0b00000, GPR, FPR16, "fcvt.w.h">, - Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; -def : FPUnaryOpDynFrmAlias; - -def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, 0b00001, GPR, FPR16, "fcvt.wu.h">, +defm FCVT_W_H : FPUnaryOp_r_frm_m<0b1100010, 0b00000, XHINX, "fcvt.w.h">, Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; -def : FPUnaryOpDynFrmAlias; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, 0b00000, FPR16, GPR, "fcvt.h.w">, - Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_WU_H : FPUnaryOp_r_frm_m<0b1100010, 0b00001, XHINX, "fcvt.wu.h">, + Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, 0b00001, FPR16, GPR, "fcvt.h.wu">, +defm FCVT_H_W : FPUnaryOp_r_frm_m<0b1101010, 0b00000, HXINX, "fcvt.h.w">, Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; -def : FPUnaryOpDynFrmAlias; -} // Predicates = [HasStdExtZfh] +defm : FPUnaryOpDynFrmAlias_m; -let Predicates = [HasStdExtZfhOrZfhmin] in { -def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, 0b00000, FPR16, FPR32, "fcvt.h.s">, - Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_H_WU : FPUnaryOp_r_frm_m<0b1101010, 0b00001, HXINX, "fcvt.h.wu">, + Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; 
+defm : FPUnaryOpDynFrmAlias_m; -def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b00010, 0b000, FPR32, FPR16, "fcvt.s.h">, +defm FCVT_H_S : FPUnaryOp_r_frm_m<0b0100010, 0b00000, HFINXmin, "fcvt.h.s">, + Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_S_H : FPUnaryOp_r_m<0b0100000, 0b00010, 0b000, FHINXmin, "fcvt.s.h">, Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]>; +let Predicates = [HasStdExtZfhOrZfhmin] in { let mayRaiseFPException = 0 in def FMV_X_H : FPUnaryOp_r<0b1110010, 0b00000, 0b000, GPR, FPR16, "fmv.x.h">, Sched<[WriteFMovF16ToI16, ReadFMovF16ToI16]>; @@ -115,45 +170,38 @@ def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">, Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]>; } // Predicates = [HasStdExtZfhOrZfhmin] -let Predicates = [HasStdExtZfh] in { - let SchedRW = [WriteFCmp16, ReadFCmp16, ReadFCmp16] in { -def FEQ_H : FPCmp_rr<0b1010010, 0b010, "feq.h", FPR16>; -def FLT_H : FPCmp_rr<0b1010010, 0b001, "flt.h", FPR16>; -def FLE_H : FPCmp_rr<0b1010010, 0b000, "fle.h", FPR16>; +defm FEQ_H : FPCmp_rr_m<0b1010010, 0b010, "feq.h", HINX>; +defm FLT_H : FPCmp_rr_m<0b1010010, 0b001, "flt.h", HINX>; +defm FLE_H : FPCmp_rr_m<0b1010010, 0b000, "fle.h", HINX>; } let mayRaiseFPException = 0 in -def FCLASS_H : FPUnaryOp_r<0b1110010, 0b00000, 0b001, GPR, FPR16, "fclass.h">, - Sched<[WriteFClass16, ReadFClass16]>; -} // Predicates = [HasStdExtZfh] - -let Predicates = [HasStdExtZfh, IsRV64] in { -def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, 0b00010, GPR, FPR16, "fcvt.l.h">, - Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm FCLASS_H : FPUnaryOp_r_m<0b1110010, 0b00000, 0b001, XHINX, "fclass.h">, + Sched<[WriteFClass16, ReadFClass16]>; -def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, 0b00011, GPR, FPR16, "fcvt.lu.h">, +defm FCVT_L_H : FPUnaryOp_r_frm_m<0b1100010, 0b00010, XHIN64X, "fcvt.l.h">, Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; -def : FPUnaryOpDynFrmAlias; +defm : 
FPUnaryOpDynFrmAlias_m; -def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, 0b00010, FPR16, GPR, "fcvt.h.l">, - Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_LU_H : FPUnaryOp_r_frm_m<0b1100010, 0b00011, XHIN64X, "fcvt.lu.h">, + Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, 0b00011, FPR16, GPR, "fcvt.h.lu">, +defm FCVT_H_L : FPUnaryOp_r_frm_m<0b1101010, 0b00010, HXIN64X, "fcvt.h.l">, Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; -def : FPUnaryOpDynFrmAlias; -} // Predicates = [HasStdExtZfh, IsRV64] +defm : FPUnaryOpDynFrmAlias_m; -let Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] in { -def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, 0b00001, FPR16, FPR64, "fcvt.h.d">, - Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>; -def : FPUnaryOpDynFrmAlias; +defm FCVT_H_LU : FPUnaryOp_r_frm_m<0b1101010, 0b00011, HXIN64X, "fcvt.h.lu">, + Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; +defm : FPUnaryOpDynFrmAlias_m; -def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b00010, 0b000, FPR64, FPR16, "fcvt.d.h">, - Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>; -} // Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] +defm FCVT_H_D : FPUnaryOp_r_frm_m<0b0100010, 0b00001, HDINXmin, "fcvt.h.d">, + Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>; +defm : FPUnaryOpDynFrmAlias_m; + +defm FCVT_D_H : FPUnaryOp_r_m<0b0100001, 0b00010, 0b000, DHINXmin, "fcvt.d.h">, + Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>; //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) @@ -186,6 +234,17 @@ def PseudoQuietFLT_H : PseudoQuietFCMP; } } // Predicates = [HasStdExtZfhOrZfhmin] +let Predicates = [HasStdExtZhinx] in { +def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H_INX FPR16INX:$rd, FPR16INX:$rs, FPR16INX:$rs)>; +def : InstAlias<"fabs.h $rd, $rs", (FSGNJX_H_INX FPR16INX:$rd, FPR16INX:$rs, FPR16INX:$rs)>; +def : InstAlias<"fneg.h 
$rd, $rs", (FSGNJN_H_INX FPR16INX:$rd, FPR16INX:$rs, FPR16INX:$rs)>; + +def : InstAlias<"fgt.h $rd, $rs, $rt", + (FLT_H_INX GPR:$rd, FPR16INX:$rt, FPR16INX:$rs), 0>; +def : InstAlias<"fge.h $rd, $rs, $rt", + (FLE_H_INX GPR:$rd, FPR16INX:$rt, FPR16INX:$rs), 0>; +} // Predicates = [HasStdExtZhinx] + //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 8c1c03b51c249..b06af3787b5d3 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -66,6 +66,7 @@ def sub_vrm1_5 : ComposedSubRegIndex; def sub_vrm1_6 : ComposedSubRegIndex; def sub_vrm1_7 : ComposedSubRegIndex; +def sub_32_hi : SubRegIndex<32, 32>; } // Namespace = "RISCV" // Integer registers @@ -534,6 +535,35 @@ def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> { let Size = 64; } +let RegInfos = XLenRI in { +def GPRF16 : RegisterClass<"RISCV", [f16], 16, (add GPR)>; +def GPRF32 : RegisterClass<"RISCV", [f32], 32, (add GPR)>; +def GPRF64 : RegisterClass<"RISCV", [f64], 64, (add GPR)>; +} // RegInfos = XLenRI + +let RegAltNameIndices = [ABIRegAltName] in { + foreach Index = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, + 24, 26, 28, 30] in { + defvar Reg = !cast("X"#Index); + def X#Index#_PD : RISCVRegWithSubRegs("X"#Index), + !cast("X"#!add(Index, 1))], + Reg.AltNames> { + let SubRegIndices = [sub_32, sub_32_hi]; + } + } +} + +let RegInfos = RegInfoByHwMode<[RV64], [RegInfo<64, 64, 64>]> in +def GPRPF64 : RegisterClass<"RISCV", [f64], 64, (add + X10_PD, X12_PD, X14_PD, X16_PD, + X6_PD, + X28_PD, X30_PD, + X8_PD, + X18_PD, X20_PD, X22_PD, X24_PD, X26_PD, + X0_PD, X2_PD, X4_PD +)>; + // The register class is added for inline assembly for vector mask types. 
def VM : VReg&1 | FileCheck %s +# RUN: not llvm-mc -triple riscv32 %s 2>&1 | FileCheck %s # Out of range immediates ## fencearg @@ -172,6 +172,10 @@ xor s2, s2 # CHECK: :[[@LINE]]:1: error: too few operands for instruction mul a4, ra, s0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'M' (Integer Multiplication and Division) amomaxu.w s5, s4, (s3) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'A' (Atomic Instructions) fadd.s ft0, ft1, ft2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'F' (Single-Precision Floating-Point){{$}} +fadd.h ft0, ft1, ft2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) +fadd.s a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zfinx' (Float in Integer) +fadd.d a0, a2, a4 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zdinx' (Double in Integer) +fadd.h a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zhinx' (Half Float in Integer) flh ft0, (a0) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal){{$}} sh1add a0, a1, a2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zba' (Address Generation Instructions) clz a0, a1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zbb' (Basic Bit-Manipulation) diff --git a/llvm/test/MC/RISCV/rv32zdinx-invalid.s b/llvm/test/MC/RISCV/rv32zdinx-invalid.s new file mode 100644 index 0000000000000..54dd4bbfb308a --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zdinx-invalid.s @@ -0,0 +1,27 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+zdinx %s 2>&1 | FileCheck %s + +# Unsupport Odd Registers in RV32 +fadd.d a0, a1, a2 # CHECK: :[[@LINE]]:12: error: invalid operand for instruction + +# Not support float registers +flw fa4, 12(sp) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 
'F' (Single-Precision Floating-Point) +fadd.d fa0, fa1, fa2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'D' (Double-Precision Floating-Point) + +# Invalid instructions +fsw a5, 12(sp) # CHECK: :[[@LINE]]:5: error: invalid operand for instruction +fmv.x.w s0, s1 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + +# Invalid register names +fadd.d a100, a2, a3 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +fsgnjn.d a100, a2, a3 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction + +# Rounding mode when a register is expected +fmadd.d x10, x12, x14, ree # CHECK: :[[@LINE]]:24: error: invalid operand for instruction + +# Invalid rounding modes +fmadd.d x10, x12, x14, x16, ree # CHECK: :[[@LINE]]:29: error: operand must be a valid floating point rounding mode mnemonic +fmsub.d x10, x12, x14, x16, 0 # CHECK: :[[@LINE]]:29: error: operand must be a valid floating point rounding mode mnemonic +fnmsub.d x10, x12, x14, x16, 0b111 # CHECK: :[[@LINE]]:30: error: operand must be a valid floating point rounding mode mnemonic + +# FP registers where integer regs are expected +fcvt.wu.d ft2, a1 # CHECK: :[[@LINE]]:11: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv32zdinx-valid.s b/llvm/test/MC/RISCV/rv32zdinx-valid.s new file mode 100644 index 0000000000000..660116bc9a555 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zdinx-valid.s @@ -0,0 +1,124 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zdinx %s \ +# RUN: | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \ +# RUN: 
| llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: fmadd.d a0, a2, a4, a6, dyn +# CHECK-ASM: encoding: [0x43,0x75,0xe6,0x82] +fmadd.d x10, x12, x14, x16, dyn +# CHECK-ASM-AND-OBJ: fmsub.d a0, a2, a4, a6, dyn +# CHECK-ASM: encoding: [0x47,0x75,0xe6,0x82] +fmsub.d x10, x12, x14, x16, dyn +# CHECK-ASM-AND-OBJ: fnmsub.d a0, a2, a4, a6, dyn +# CHECK-ASM: encoding: [0x4b,0x75,0xe6,0x82] +fnmsub.d x10, x12, x14, x16, dyn +# CHECK-ASM-AND-OBJ: fnmadd.d a0, a2, a4, a6, dyn +# CHECK-ASM: encoding: [0x4f,0x75,0xe6,0x82] +fnmadd.d x10, x12, x14, x16, dyn + +# CHECK-ASM-AND-OBJ: fadd.d s10, t3, t5, dyn +# CHECK-ASM: encoding: [0x53,0x7d,0xee,0x03] +fadd.d x26, x28, x30, dyn +# CHECK-ASM-AND-OBJ: fsub.d s10, t3, t5, dyn +# CHECK-ASM: encoding: [0x53,0x7d,0xee,0x0b] +fsub.d x26, x28, x30, dyn +# CHECK-ASM-AND-OBJ: fmul.d s10, t3, t5, dyn +# CHECK-ASM: encoding: [0x53,0x7d,0xee,0x13] +fmul.d x26, x28, x30, dyn +# CHECK-ASM-AND-OBJ: fdiv.d s10, t3, t5, dyn +# CHECK-ASM: encoding: [0x53,0x7d,0xee,0x1b] +fdiv.d x26, x28, x30, dyn +# CHECK-ASM-AND-OBJ: fsqrt.d s4, s6, dyn +# CHECK-ASM: encoding: [0x53,0x7a,0x0b,0x5a] +fsqrt.d x20, x22, dyn +# CHECK-ASM-AND-OBJ: fsgnj.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x0d,0xee,0x23] +fsgnj.d x26, x28, x30 +# CHECK-ASM-AND-OBJ: fsgnjn.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x1d,0xee,0x23] +fsgnjn.d x26, x28, x30 +# CHECK-ASM-AND-OBJ: fsgnjx.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x2d,0xee,0x23] +fsgnjx.d x26, x28, x30 +# CHECK-ASM-AND-OBJ: fmin.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x0d,0xee,0x2b] +fmin.d x26, x28, x30 +# CHECK-ASM-AND-OBJ: fmax.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x1d,0xee,0x2b] +fmax.d x26, x28, x30 + +# CHECK-ASM-AND-OBJ: fcvt.s.d s10, t3, dyn +# CHECK-ASM: encoding: [0x53,0x7d,0x1e,0x40] +fcvt.s.d x26, x28, dyn +# CHECK-ASM-AND-OBJ: fcvt.d.s s10, t3 +# CHECK-ASM: encoding: [0x53,0x0d,0x0e,0x42] +fcvt.d.s x26, x28 +# 
CHECK-ASM-AND-OBJ: feq.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x2d,0xee,0xa3] +feq.d x26, x28, x30 +# CHECK-ASM-AND-OBJ: flt.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x1d,0xee,0xa3] +flt.d x26, x28, x30 +# CHECK-ASM-AND-OBJ: fle.d s10, t3, t5 +# CHECK-ASM: encoding: [0x53,0x0d,0xee,0xa3] +fle.d x26, x28, x30 +# CHECK-ASM-AND-OBJ: fclass.d s10, t3 +# CHECK-ASM: encoding: [0x53,0x1d,0x0e,0xe2] +fclass.d x26, x28 + +# CHECK-ASM-AND-OBJ: fcvt.w.d s4, s6, dyn +# CHECK-ASM: encoding: [0x53,0x7a,0x0b,0xc2] +fcvt.w.d x20, x22, dyn +# CHECK-ASM-AND-OBJ: fcvt.d.w s10, t3 +# CHECK-ASM: encoding: [0x53,0x0d,0x0e,0xd2] +fcvt.d.w x26, x28 +# CHECK-ASM-AND-OBJ: fcvt.d.wu s10, t3 +# CHECK-ASM: encoding: [0x53,0x0d,0x1e,0xd2] +fcvt.d.wu x26, x28 + +# Rounding modes + +# CHECK-ASM-AND-OBJ: fmadd.d a0, a2, a4, a6, rne +# CHECK-ASM: encoding: [0x43,0x05,0xe6,0x82] +fmadd.d x10, x12, x14, x16, rne +# CHECK-ASM-AND-OBJ: fmsub.d a0, a2, a4, a6, rtz +# CHECK-ASM: encoding: [0x47,0x15,0xe6,0x82] +fmsub.d x10, x12, x14, x16, rtz +# CHECK-ASM-AND-OBJ: fnmsub.d a0, a2, a4, a6, rdn +# CHECK-ASM: encoding: [0x4b,0x25,0xe6,0x82] +fnmsub.d x10, x12, x14, x16, rdn +# CHECK-ASM-AND-OBJ: fnmadd.d a0, a2, a4, a6, rup +# CHECK-ASM: encoding: [0x4f,0x35,0xe6,0x82] +fnmadd.d x10, x12, x14, x16, rup + +# CHECK-ASM-AND-OBJ: fadd.d s10, t3, t5, rmm +# CHECK-ASM: encoding: [0x53,0x4d,0xee,0x03] +fadd.d x26, x28, x30, rmm +# CHECK-ASM-AND-OBJ: fsub.d s10, t3, t5, dyn +# CHECK-ASM: encoding: [0x53,0x7d,0xee,0x0b] +fsub.d x26, x28, x30, dyn +# CHECK-ASM-AND-OBJ: fmul.d s10, t3, t5, rne +# CHECK-ASM: encoding: [0x53,0x0d,0xee,0x13] +fmul.d x26, x28, x30, rne +# CHECK-ASM-AND-OBJ: fdiv.d s10, t3, t5, rtz +# CHECK-ASM: encoding: [0x53,0x1d,0xee,0x1b] +fdiv.d x26, x28, x30, rtz + +# CHECK-ASM-AND-OBJ: fsqrt.d s4, s6, rdn +# CHECK-ASM: encoding: [0x53,0x2a,0x0b,0x5a] +fsqrt.d x20, x22, rdn +# CHECK-ASM-AND-OBJ: fcvt.s.d s4, s6, rup +# CHECK-ASM: encoding: [0x53,0x3a,0x1b,0x40] +fcvt.s.d x20, x22, rup +# 
CHECK-ASM-AND-OBJ: fcvt.w.d s4, s6, rmm +# CHECK-ASM: encoding: [0x53,0x4a,0x0b,0xc2] +fcvt.w.d x20, x22, rmm +# CHECK-ASM-AND-OBJ: fcvt.wu.d s4, s6, dyn +# CHECK-ASM: encoding: [0x53,0x7a,0x1b,0xc2] +fcvt.wu.d x20, x22, dyn diff --git a/llvm/test/MC/RISCV/rv32zfinx-invalid.s b/llvm/test/MC/RISCV/rv32zfinx-invalid.s new file mode 100644 index 0000000000000..b3712875673b1 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zfinx-invalid.s @@ -0,0 +1,25 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+zfinx %s 2>&1 | FileCheck %s + +# Not support float registers +flw fa4, 12(sp) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'F' (Single-Precision Floating-Point) +fadd.s fa0, fa1, fa2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'F' (Single-Precision Floating-Point) + +# Invalid instructions +fsw a5, 12(sp) # CHECK: :[[@LINE]]:5: error: invalid operand for instruction +fmv.x.w s0, s1 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction +fadd.d t1, t3, t5 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zdinx' (Double in Integer) + +# Invalid register names +fadd.d a100, a2, a3 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +fsgnjn.s a100, a2, a3 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction + +# Rounding mode when a register is expected +fmadd.s x10, x11, x12, ree # CHECK: :[[@LINE]]:24: error: invalid operand for instruction + +# Invalid rounding modes +fmadd.s x10, x11, x12, x13, ree # CHECK: :[[@LINE]]:29: error: operand must be a valid floating point rounding mode mnemonic +fmsub.s x14, x15, x16, x17, 0 # CHECK: :[[@LINE]]:29: error: operand must be a valid floating point rounding mode mnemonic +fnmsub.s x18, x19, x20, x21, 0b111 # CHECK: :[[@LINE]]:30: error: operand must be a valid floating point rounding mode mnemonic + +# Using 'Zdinx' instructions for an 'Zfinx'-only target +fadd.d t0, t1, t2 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction diff --git 
a/llvm/test/MC/RISCV/rv32zfinx-valid.s b/llvm/test/MC/RISCV/rv32zfinx-valid.s new file mode 100644 index 0000000000000..58f805c9fce71 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zfinx-valid.s @@ -0,0 +1,128 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zfinx %s \ +# RUN: | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfinx %s \ +# RUN: | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: fmadd.s a0, a1, a2, a3, dyn +# CHECK-ASM: encoding: [0x43,0xf5,0xc5,0x68] +fmadd.s x10, x11, x12, x13, dyn +# CHECK-ASM-AND-OBJ: fmsub.s a4, a5, a6, a7, dyn +# CHECK-ASM: encoding: [0x47,0xf7,0x07,0x89] +fmsub.s x14, x15, x16, x17, dyn +# CHECK-ASM-AND-OBJ: fnmsub.s s2, s3, s4, s5, dyn +# CHECK-ASM: encoding: [0x4b,0xf9,0x49,0xa9] +fnmsub.s x18, x19, x20, x21, dyn +# CHECK-ASM-AND-OBJ: fnmadd.s s6, s7, s8, s9, dyn +# CHECK-ASM: encoding: [0x4f,0xfb,0x8b,0xc9] +fnmadd.s x22, x23, x24, x25, dyn + +# CHECK-ASM-AND-OBJ: fadd.s s10, s11, t3, dyn +# CHECK-ASM: encoding: [0x53,0xfd,0xcd,0x01] +fadd.s x26, x27, x28, dyn +# CHECK-ASM-AND-OBJ: fsub.s t4, t5, t6, dyn +# CHECK-ASM: encoding: [0xd3,0x7e,0xff,0x09] +fsub.s x29, x30, x31, dyn +# CHECK-ASM-AND-OBJ: fmul.s s0, s1, s2, dyn +# CHECK-ASM: encoding: [0x53,0xf4,0x24,0x11] +fmul.s s0, s1, s2, dyn +# CHECK-ASM-AND-OBJ: fdiv.s s3, s4, s5, dyn +# CHECK-ASM: encoding: [0xd3,0x79,0x5a,0x19] +fdiv.s s3, s4, s5, dyn +# CHECK-ASM-AND-OBJ: fsqrt.s t1, t2, dyn +# CHECK-ASM: encoding: [0x53,0xf3,0x03,0x58] +fsqrt.s t1, t2, dyn +# CHECK-ASM-AND-OBJ: 
fsgnj.s s1, a0, a1 +# CHECK-ASM: encoding: [0xd3,0x04,0xb5,0x20] +fsgnj.s s1, a0, a1 +# CHECK-ASM-AND-OBJ: fsgnjn.s a1, a3, a4 +# CHECK-ASM: encoding: [0xd3,0x95,0xe6,0x20] +fsgnjn.s a1, a3, a4 +# CHECK-ASM-AND-OBJ: fsgnjx.s a4, a3, a2 +# CHECK-ASM: encoding: [0x53,0xa7,0xc6,0x20] +fsgnjx.s a4, a3, a2 +# CHECK-ASM-AND-OBJ: fmin.s a5, a6, a7 +# CHECK-ASM: encoding: [0xd3,0x07,0x18,0x29] +fmin.s a5, a6, a7 +# CHECK-ASM-AND-OBJ: fmax.s s2, s3, s4 +# CHECK-ASM: encoding: [0x53,0x99,0x49,0x29] +fmax.s s2, s3, s4 +# CHECK-ASM-AND-OBJ: fcvt.w.s a0, s5, dyn +# CHECK-ASM: encoding: [0x53,0xf5,0x0a,0xc0] +fcvt.w.s a0, s5, dyn +# CHECK-ASM-AND-OBJ: fcvt.wu.s a1, s6, dyn +# CHECK-ASM: encoding: [0xd3,0x75,0x1b,0xc0] +fcvt.wu.s a1, s6, dyn +# CHECK-ASM-AND-OBJ: feq.s a1, s8, s9 +# CHECK-ASM: encoding: [0xd3,0x25,0x9c,0xa1] +feq.s a1, s8, s9 +# CHECK-ASM-AND-OBJ: flt.s a2, s10, s11 +# CHECK-ASM: encoding: [0x53,0x16,0xbd,0xa1] +flt.s a2, s10, s11 +# CHECK-ASM-AND-OBJ: fle.s a3, t3, t4 +# CHECK-ASM: encoding: [0xd3,0x06,0xde,0xa1] +fle.s a3, t3, t4 +# CHECK-ASM-AND-OBJ: fclass.s a3, t5 +# CHECK-ASM: encoding: [0xd3,0x16,0x0f,0xe0] +fclass.s a3, t5 +# CHECK-ASM-AND-OBJ: fcvt.s.w t6, a4, dyn +# CHECK-ASM: encoding: [0xd3,0x7f,0x07,0xd0] +fcvt.s.w t6, a4, dyn +# CHECK-ASM-AND-OBJ: fcvt.s.wu s0, a5, dyn +# CHECK-ASM: encoding: [0x53,0xf4,0x17,0xd0] +fcvt.s.wu s0, a5, dyn + +# Rounding modes + +# CHECK-ASM-AND-OBJ: fmadd.s a0, a1, a2, a3, rne +# CHECK-ASM: encoding: [0x43,0x85,0xc5,0x68] +fmadd.s x10, x11, x12, x13, rne +# CHECK-ASM-AND-OBJ: fmsub.s a4, a5, a6, a7, rtz +# CHECK-ASM: encoding: [0x47,0x97,0x07,0x89] +fmsub.s x14, x15, x16, x17, rtz +# CHECK-ASM-AND-OBJ: fnmsub.s s2, s3, s4, s5, rdn +# CHECK-ASM: encoding: [0x4b,0xa9,0x49,0xa9] +fnmsub.s x18, x19, x20, x21, rdn +# CHECK-ASM-AND-OBJ: fnmadd.s s6, s7, s8, s9, rup +# CHECK-ASM: encoding: [0x4f,0xbb,0x8b,0xc9] +fnmadd.s x22, x23, x24, x25, rup +# CHECK-ASM-AND-OBJ: fmadd.s a0, a1, a2, a3, rmm +# CHECK-ASM: encoding: 
[0x43,0xc5,0xc5,0x68] +fmadd.s x10, x11, x12, x13, rmm +# CHECK-ASM-AND-OBJ: fmsub.s a4, a5, a6, a7 +# CHECK-ASM: encoding: [0x47,0xf7,0x07,0x89] +fmsub.s x14, x15, x16, x17, dyn + +# CHECK-ASM-AND-OBJ: fadd.s s10, s11, t3, rne +# CHECK-ASM: encoding: [0x53,0x8d,0xcd,0x01] +fadd.s x26, x27, x28, rne +# CHECK-ASM-AND-OBJ: fsub.s t4, t5, t6, rtz +# CHECK-ASM: encoding: [0xd3,0x1e,0xff,0x09] +fsub.s x29, x30, x31, rtz +# CHECK-ASM-AND-OBJ: fmul.s s0, s1, s2, rdn +# CHECK-ASM: encoding: [0x53,0xa4,0x24,0x11] +fmul.s s0, s1, s2, rdn +# CHECK-ASM-AND-OBJ: fdiv.s s3, s4, s5, rup +# CHECK-ASM: encoding: [0xd3,0x39,0x5a,0x19] +fdiv.s s3, s4, s5, rup + +# CHECK-ASM-AND-OBJ: fsqrt.s t1, t2, rmm +# CHECK-ASM: encoding: [0x53,0xc3,0x03,0x58] +fsqrt.s t1, t2, rmm +# CHECK-ASM-AND-OBJ: fcvt.w.s a0, s5, rup +# CHECK-ASM: encoding: [0x53,0xb5,0x0a,0xc0] +fcvt.w.s a0, s5, rup +# CHECK-ASM-AND-OBJ: fcvt.wu.s a1, s6, rdn +# CHECK-ASM: encoding: [0xd3,0x25,0x1b,0xc0] +fcvt.wu.s a1, s6, rdn +# CHECK-ASM-AND-OBJ: fcvt.s.w t6, a4, rtz +# CHECK-ASM: encoding: [0xd3,0x1f,0x07,0xd0] +fcvt.s.w t6, a4, rtz +# CHECK-ASM-AND-OBJ: fcvt.s.wu s0, a5, rne +# CHECK-ASM: encoding: [0x53,0x84,0x17,0xd0] +fcvt.s.wu s0, a5, rne diff --git a/llvm/test/MC/RISCV/rv32zhinx-invalid.s b/llvm/test/MC/RISCV/rv32zhinx-invalid.s new file mode 100644 index 0000000000000..2ab1dee8d08a5 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zhinx-invalid.s @@ -0,0 +1,24 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+zhinx %s 2>&1 | FileCheck %s + +# Not support float registers +flw fa4, 12(sp) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'F' (Single-Precision Floating-Point) +fadd.h fa0, fa1, fa2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) + +# Invalid instructions +fsw a5, 12(sp) # CHECK: :[[@LINE]]:5: error: invalid operand for instruction +fmv.x.h s0, s1 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + +# Invalid register names 
+fadd.h a100, a2, a3 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +fsgnjn.h a100, a2, a3 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction + +# Rounding mode when a register is expected +fmadd.h x10, x11, x12, ree # CHECK: :[[@LINE]]:24: error: invalid operand for instruction + +# Invalid rounding modes +fmadd.h x10, x11, x12, x13, ree # CHECK: :[[@LINE]]:29: error: operand must be a valid floating point rounding mode mnemonic +fmsub.h x14, x15, x16, x17, 0 # CHECK: :[[@LINE]]:29: error: operand must be a valid floating point rounding mode mnemonic +fnmsub.h x18, x19, x20, x21, 0b111 # CHECK: :[[@LINE]]:30: error: operand must be a valid floating point rounding mode mnemonic + +# FP registers where integer regs are expected +fcvt.wu.h ft2, a1 # CHECK: :[[@LINE]]:11: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv32zhinx-valid.s b/llvm/test/MC/RISCV/rv32zhinx-valid.s new file mode 100644 index 0000000000000..97ec9dd1a34f8 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zhinx-valid.s @@ -0,0 +1,128 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinx %s \ +# RUN: | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx %s \ +# RUN: | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: fmadd.h a0, a1, a2, a3, dyn +# CHECK-ASM: encoding: [0x43,0xf5,0xc5,0x6c] +fmadd.h x10, x11, x12, x13, dyn +# CHECK-ASM-AND-OBJ: fmsub.h a4, a5, a6, a7, dyn +# CHECK-ASM: encoding: [0x47,0xf7,0x07,0x8d] +fmsub.h x14, x15, x16, x17, dyn +# 
CHECK-ASM-AND-OBJ: fnmsub.h s2, s3, s4, s5, dyn +# CHECK-ASM: encoding: [0x4b,0xf9,0x49,0xad] +fnmsub.h x18, x19, x20, x21, dyn +# CHECK-ASM-AND-OBJ: fnmadd.h s6, s7, s8, s9, dyn +# CHECK-ASM: encoding: [0x4f,0xfb,0x8b,0xcd] +fnmadd.h x22, x23, x24, x25, dyn + +# CHECK-ASM-AND-OBJ: fadd.h s10, s11, t3, dyn +# CHECK-ASM: encoding: [0x53,0xfd,0xcd,0x05] +fadd.h x26, x27, x28, dyn +# CHECK-ASM-AND-OBJ: fsub.h t4, t5, t6, dyn +# CHECK-ASM: encoding: [0xd3,0x7e,0xff,0x0d] +fsub.h x29, x30, x31, dyn +# CHECK-ASM-AND-OBJ: fmul.h s0, s1, s2, dyn +# CHECK-ASM: encoding: [0x53,0xf4,0x24,0x15] +fmul.h s0, s1, s2, dyn +# CHECK-ASM-AND-OBJ: fdiv.h s3, s4, s5, dyn +# CHECK-ASM: encoding: [0xd3,0x79,0x5a,0x1d] +fdiv.h s3, s4, s5, dyn +# CHECK-ASM-AND-OBJ: fsqrt.h s6, s7, dyn +# CHECK-ASM: encoding: [0x53,0xfb,0x0b,0x5c] +fsqrt.h s6, s7, dyn +# CHECK-ASM-AND-OBJ: fsgnj.h s1, a0, a1 +# CHECK-ASM: encoding: [0xd3,0x04,0xb5,0x24] +fsgnj.h x9, x10, x11 +# CHECK-ASM-AND-OBJ: fsgnjn.h a1, a3, a4 +# CHECK-ASM: encoding: [0xd3,0x95,0xe6,0x24] +fsgnjn.h x11, x13, x14 +# CHECK-ASM-AND-OBJ: fsgnjx.h a4, a3, a2 +# CHECK-ASM: encoding: [0x53,0xa7,0xc6,0x24] +fsgnjx.h x14, x13, x12 +# CHECK-ASM-AND-OBJ: fmin.h a5, a6, a7 +# CHECK-ASM: encoding: [0xd3,0x07,0x18,0x2d] +fmin.h x15, x16, x17 +# CHECK-ASM-AND-OBJ: fmax.h s2, s3, s4 +# CHECK-ASM: encoding: [0x53,0x99,0x49,0x2d] +fmax.h x18, x19, x20 +# CHECK-ASM-AND-OBJ: fcvt.w.h a0, s5, dyn +# CHECK-ASM: encoding: [0x53,0xf5,0x0a,0xc4] +fcvt.w.h x10, x21, dyn +# CHECK-ASM-AND-OBJ: fcvt.wu.h a1, s6, dyn +# CHECK-ASM: encoding: [0xd3,0x75,0x1b,0xc4] +fcvt.wu.h x11, x22, dyn +# CHECK-ASM-AND-OBJ: feq.h a1, s8, s9 +# CHECK-ASM: encoding: [0xd3,0x25,0x9c,0xa5] +feq.h x11, x24, x25 +# CHECK-ASM-AND-OBJ: flt.h a2, s10, s11 +# CHECK-ASM: encoding: [0x53,0x16,0xbd,0xa5] +flt.h x12, x26, x27 +# CHECK-ASM-AND-OBJ: fle.h a3, t3, t4 +# CHECK-ASM: encoding: [0xd3,0x06,0xde,0xa5] +fle.h x13, x28, x29 +# CHECK-ASM-AND-OBJ: fclass.h a3, t5 +# CHECK-ASM: encoding: 
[0xd3,0x16,0x0f,0xe4] +fclass.h x13, x30 +# CHECK-ASM-AND-OBJ: fcvt.h.w t6, a4, dyn +# CHECK-ASM: encoding: [0xd3,0x7f,0x07,0xd4] +fcvt.h.w x31, x14, dyn +# CHECK-ASM-AND-OBJ: fcvt.h.wu s0, a5, dyn +# CHECK-ASM: encoding: [0x53,0xf4,0x17,0xd4] +fcvt.h.wu s0, x15, dyn + +# Rounding modes + +# CHECK-ASM-AND-OBJ: fmadd.h a0, a1, a2, a3, rne +# CHECK-ASM: encoding: [0x43,0x85,0xc5,0x6c] +fmadd.h x10, x11, x12, x13, rne +# CHECK-ASM-AND-OBJ: fmsub.h a4, a5, a6, a7, rtz +# CHECK-ASM: encoding: [0x47,0x97,0x07,0x8d] +fmsub.h x14, x15, x16, x17, rtz +# CHECK-ASM-AND-OBJ: fnmsub.h s2, s3, s4, s5, rdn +# CHECK-ASM: encoding: [0x4b,0xa9,0x49,0xad] +fnmsub.h x18, x19, x20, x21, rdn +# CHECK-ASM-AND-OBJ: fnmadd.h s6, s7, s8, s9, rup +# CHECK-ASM: encoding: [0x4f,0xbb,0x8b,0xcd] +fnmadd.h x22, x23, x24, x25, rup +# CHECK-ASM-AND-OBJ: fmadd.h a0, a1, a2, a3, rmm +# CHECK-ASM: encoding: [0x43,0xc5,0xc5,0x6c] +fmadd.h x10, x11, x12, x13, rmm +# CHECK-ASM-AND-OBJ: fmsub.h a4, a5, a6, a7 +# CHECK-ASM: encoding: [0x47,0xf7,0x07,0x8d] +fmsub.h x14, x15, x16, x17, dyn + +# CHECK-ASM-AND-OBJ: fadd.h s10, s11, t3, rne +# CHECK-ASM: encoding: [0x53,0x8d,0xcd,0x05] +fadd.h x26, x27, x28, rne +# CHECK-ASM-AND-OBJ: fsub.h t4, t5, t6, rtz +# CHECK-ASM: encoding: [0xd3,0x1e,0xff,0x0d] +fsub.h x29, x30, x31, rtz +# CHECK-ASM-AND-OBJ: fmul.h s0, s1, s2, rdn +# CHECK-ASM: encoding: [0x53,0xa4,0x24,0x15] +fmul.h s0, s1, s2, rdn +# CHECK-ASM-AND-OBJ: fdiv.h s3, s4, s5, rup +# CHECK-ASM: encoding: [0xd3,0x39,0x5a,0x1d] +fdiv.h s3, s4, s5, rup + +# CHECK-ASM-AND-OBJ: fsqrt.h s6, s7, rmm +# CHECK-ASM: encoding: [0x53,0xcb,0x0b,0x5c] +fsqrt.h s6, s7, rmm +# CHECK-ASM-AND-OBJ: fcvt.w.h a0, s5, rup +# CHECK-ASM: encoding: [0x53,0xb5,0x0a,0xc4] +fcvt.w.h x10, x21, rup +# CHECK-ASM-AND-OBJ: fcvt.wu.h a1, s6, rdn +# CHECK-ASM: encoding: [0xd3,0x25,0x1b,0xc4] +fcvt.wu.h x11, x22, rdn +# CHECK-ASM-AND-OBJ: fcvt.h.w t6, a4, rtz +# CHECK-ASM: encoding: [0xd3,0x1f,0x07,0xd4] +fcvt.h.w x31, x14, rtz +# 
CHECK-ASM-AND-OBJ: fcvt.h.wu s0, a5, rne +# CHECK-ASM: encoding: [0x53,0x84,0x17,0xd4] +fcvt.h.wu s0, a5, rne diff --git a/llvm/test/MC/RISCV/rv32zhinxmin-invalid.s b/llvm/test/MC/RISCV/rv32zhinxmin-invalid.s new file mode 100644 index 0000000000000..ebfd8a58562e3 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zhinxmin-invalid.s @@ -0,0 +1,15 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+zhinxmin %s 2>&1 | FileCheck %s + +# Not support float registers +flw fa4, 12(sp) # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'F' (Single-Precision Floating-Point) +fcvt.h.s fa0, fa1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) + +# Invalid instructions +fsw a5, 12(sp) # CHECK: :[[@LINE]]:5: error: invalid operand for instruction +fmv.x.h s0, s1 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + +# Invalid register names +fcvt.h.s a100, a1 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction + +# Valid in Zhinx +fmadd.h x10, x11, x12, x13, dyn # CHECK: :[[@LINE]]:1: error: instruction requires the following: 'Zhinx' (Half Float in Integer) diff --git a/llvm/test/MC/RISCV/rv32zhinxmin-valid.s b/llvm/test/MC/RISCV/rv32zhinxmin-valid.s new file mode 100644 index 0000000000000..536c0bdfe2b91 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zhinxmin-valid.s @@ -0,0 +1,18 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinxmin -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zhinxmin %s \ +# RUN: | llvm-objdump --mattr=+zhinxmin -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinxmin %s \ +# RUN: | 
llvm-objdump --mattr=+zhinxmin -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: fcvt.s.h a0, a1 +# CHECK-ASM: encoding: [0x53,0x85,0x25,0x40] +fcvt.s.h a0, a1 + +# CHECK-ASM-AND-OBJ: fcvt.h.s a0, a1, dyn +# CHECK-ASM: encoding: [0x53,0xf5,0x05,0x44] +fcvt.h.s a0, a1 diff --git a/llvm/test/MC/RISCV/rv64zdinx-invalid.s b/llvm/test/MC/RISCV/rv64zdinx-invalid.s new file mode 100644 index 0000000000000..8ba06963a553e --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zdinx-invalid.s @@ -0,0 +1,9 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+zdinx %s 2>&1 | FileCheck %s + +# Invalid Instructions +fmv.x.d t2, a2 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction +fmv.d.x a5, t5 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction + +# FP registers where integer regs are expected +fcvt.d.l a3, ft3 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction +fcvt.d.lu a4, ft4 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64zdinx-valid.s b/llvm/test/MC/RISCV/rv64zdinx-valid.s new file mode 100644 index 0000000000000..1e6e430686d01 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zdinx-valid.s @@ -0,0 +1,43 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zdinx %s \ +# RUN: | llvm-objdump --mattr=+zdinx -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# +# RUN: not llvm-mc -triple riscv32 -mattr=+zdinx %s 2>&1 \ +# RUN: | FileCheck -check-prefix=CHECK-RV32 %s + +# CHECK-ASM-AND-OBJ: fcvt.l.d a0, t0, dyn +# CHECK-ASM: encoding: [0x53,0xf5,0x22,0xc2] +# CHECK-RV32: :[[#@LINE+1]]:14: error: invalid operand for instruction +fcvt.l.d a0, t0, dyn +# CHECK-ASM-AND-OBJ: fcvt.lu.d a1, t1, dyn +# CHECK-ASM: encoding: [0xd3,0x75,0x33,0xc2] +# CHECK-RV32: :[[#@LINE+1]]:15: error: 
invalid operand for instruction +fcvt.lu.d a1, t1, dyn +# CHECK-ASM-AND-OBJ: fcvt.d.l t3, a3, dyn +# CHECK-ASM: encoding: [0x53,0xfe,0x26,0xd2] +# CHECK-RV32: :[[#@LINE+1]]:10: error: invalid operand for instruction +fcvt.d.l t3, a3, dyn +# CHECK-ASM-AND-OBJ: fcvt.d.lu t4, a4, dyn +# CHECK-ASM: encoding: [0xd3,0x7e,0x37,0xd2] +# CHECK-RV32: :[[#@LINE+1]]:11: error: invalid operand for instruction +fcvt.d.lu t4, a4, dyn + +# Rounding modes +# CHECK-ASM-AND-OBJ: fcvt.d.l t3, a3, rne +# CHECK-ASM: encoding: [0x53,0x8e,0x26,0xd2] +# CHECK-RV32: :[[#@LINE+1]]:10: error: invalid operand for instruction +fcvt.d.l t3, a3, rne +# CHECK-ASM-AND-OBJ: fcvt.d.lu t4, a4, rtz +# CHECK-ASM: encoding: [0xd3,0x1e,0x37,0xd2] +# CHECK-RV32: :[[#@LINE+1]]:11: error: invalid operand for instruction +fcvt.d.lu t4, a4, rtz +# CHECK-ASM-AND-OBJ: fcvt.l.d a0, t0, rdn +# CHECK-ASM: encoding: [0x53,0xa5,0x22,0xc2] +# CHECK-RV32: :[[#@LINE+1]]:14: error: invalid operand for instruction +fcvt.l.d a0, t0, rdn +# CHECK-ASM-AND-OBJ: fcvt.lu.d a1, t1, rup +# CHECK-ASM: encoding: [0xd3,0x35,0x33,0xc2] +# CHECK-RV32: :[[#@LINE+1]]:15: error: invalid operand for instruction +fcvt.lu.d a1, t1, rup diff --git a/llvm/test/MC/RISCV/rv64zfinx-invalid.s b/llvm/test/MC/RISCV/rv64zfinx-invalid.s new file mode 100644 index 0000000000000..5815a85cdf0ea --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zfinx-invalid.s @@ -0,0 +1,9 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+zfinx %s 2>&1 | FileCheck %s + +# Invalid instructions +fmv.x.w t2, a2 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction +fmv.w.x a5, t5 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction + +# FP registers where integer regs are expected +fcvt.s.l a2, ft2 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction +fcvt.s.lu a3, ft3 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64zfinx-valid.s b/llvm/test/MC/RISCV/rv64zfinx-valid.s new file mode 100644 index 
0000000000000..1a3e787e0d316 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zfinx-valid.s @@ -0,0 +1,43 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zfinx %s \ +# RUN: | llvm-objdump --mattr=+zfinx -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# +# RUN: not llvm-mc -triple riscv32 -mattr=+zfinx %s 2>&1 \ +# RUN: | FileCheck -check-prefix=CHECK-RV32 %s + +# CHECK-ASM-AND-OBJ: fcvt.l.s a0, t0, dyn +# CHECK-ASM: encoding: [0x53,0xf5,0x22,0xc0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.l.s a0, t0, dyn +# CHECK-ASM-AND-OBJ: fcvt.lu.s a1, t1, dyn +# CHECK-ASM: encoding: [0xd3,0x75,0x33,0xc0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.lu.s a1, t1, dyn +# CHECK-ASM-AND-OBJ: fcvt.s.l t2, a2, dyn +# CHECK-ASM: encoding: [0xd3,0x73,0x26,0xd0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.s.l t2, a2, dyn +# CHECK-ASM-AND-OBJ: fcvt.s.lu t3, a3, dyn +# CHECK-ASM: encoding: [0x53,0xfe,0x36,0xd0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.s.lu t3, a3, dyn + +# Rounding modes +# CHECK-ASM-AND-OBJ: fcvt.l.s a4, t4, rne +# CHECK-ASM: encoding: [0x53,0x87,0x2e,0xc0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.l.s a4, t4, rne +# CHECK-ASM-AND-OBJ: fcvt.lu.s a5, t5, rtz +# CHECK-ASM: encoding: [0xd3,0x17,0x3f,0xc0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.lu.s a5, t5, rtz +# CHECK-ASM-AND-OBJ: fcvt.s.l t6, a6, rdn +# CHECK-ASM: encoding: [0xd3,0x2f,0x28,0xd0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction 
requires the following: RV64I Base Instruction Set +fcvt.s.l t6, a6, rdn +# CHECK-ASM-AND-OBJ: fcvt.s.lu s7, a7, rup +# CHECK-ASM: encoding: [0xd3,0xbb,0x38,0xd0] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.s.lu s7, a7, rup diff --git a/llvm/test/MC/RISCV/rv64zhinx-invalid.s b/llvm/test/MC/RISCV/rv64zhinx-invalid.s new file mode 100644 index 0000000000000..90fb08b5af7c2 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zhinx-invalid.s @@ -0,0 +1,9 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+zhinx %s 2>&1 | FileCheck %s + +# Invalid instructions +fmv.x.h t2, a2 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction +fmv.h.x a5, t5 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction + +# FP registers where integer regs are expected +fcvt.h.l a2, ft2 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction +fcvt.h.lu a3, ft3 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64zhinx-valid.s b/llvm/test/MC/RISCV/rv64zhinx-valid.s new file mode 100644 index 0000000000000..57e47194632fd --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zhinx-valid.s @@ -0,0 +1,43 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx %s \ +# RUN: | llvm-objdump --mattr=+zhinx -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s +# +# RUN: not llvm-mc -triple riscv32 -mattr=+zhinx %s 2>&1 \ +# RUN: | FileCheck -check-prefix=CHECK-RV32 %s + +# CHECK-ASM-AND-OBJ: fcvt.l.h a0, t0, dyn +# CHECK-ASM: encoding: [0x53,0xf5,0x22,0xc4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.l.h a0, t0, dyn +# CHECK-ASM-AND-OBJ: fcvt.lu.h a1, t1, dyn +# CHECK-ASM: encoding: [0xd3,0x75,0x33,0xc4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: 
instruction requires the following: RV64I Base Instruction Set +fcvt.lu.h a1, t1, dyn +# CHECK-ASM-AND-OBJ: fcvt.h.l t2, a2, dyn +# CHECK-ASM: encoding: [0xd3,0x73,0x26,0xd4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.h.l t2, a2, dyn +# CHECK-ASM-AND-OBJ: fcvt.h.lu t3, a3, dyn +# CHECK-ASM: encoding: [0x53,0xfe,0x36,0xd4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.h.lu t3, a3, dyn + +# Rounding modes +# CHECK-ASM-AND-OBJ: fcvt.l.h a4, t4, rne +# CHECK-ASM: encoding: [0x53,0x87,0x2e,0xc4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.l.h a4, t4, rne +# CHECK-ASM-AND-OBJ: fcvt.lu.h a5, t5, rtz +# CHECK-ASM: encoding: [0xd3,0x17,0x3f,0xc4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.lu.h a5, t5, rtz +# CHECK-ASM-AND-OBJ: fcvt.h.l t6, a6, rdn +# CHECK-ASM: encoding: [0xd3,0x2f,0x28,0xd4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.h.l t6, a6, rdn +# CHECK-ASM-AND-OBJ: fcvt.h.lu s7, a7, rup +# CHECK-ASM: encoding: [0xd3,0xbb,0x38,0xd4] +# CHECK-RV32: :[[#@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set +fcvt.h.lu s7, a7, rup diff --git a/llvm/test/MC/RISCV/rv64zhinxmin-invalid.s b/llvm/test/MC/RISCV/rv64zhinxmin-invalid.s new file mode 100644 index 0000000000000..9a7e8b0c675d7 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zhinxmin-invalid.s @@ -0,0 +1,9 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+zhinx %s 2>&1 | FileCheck %s + +# Invalid instructions +fmv.x.h t2, a2 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction +fmv.h.x a5, t5 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction + +# FP registers where integer regs are expected +fcvt.d.h a0, fa2 # CHECK: :[[@LINE]]:14: error: invalid operand for 
instruction +fcvt.h.d a0, fa2 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64zhinxmin-valid.s b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s new file mode 100644 index 0000000000000..54f32e7a07b50 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zhinxmin-valid.s @@ -0,0 +1,13 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx,+zdinx -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zhinx,+zdinx %s \ +# RUN: | llvm-objdump --mattr=+zhinx,+zdinx -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: fcvt.d.h a0, a2 +# CHECK-ASM: encoding: [0x53,0x05,0x26,0x42] +fcvt.d.h a0, a2 + +# CHECK-ASM-AND-OBJ: fcvt.h.d a0, a2, dyn +# CHECK-ASM: encoding: [0x53,0x75,0x16,0x44] +fcvt.h.d a0, a2, dyn diff --git a/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s new file mode 100644 index 0000000000000..3262c88e92816 --- /dev/null +++ b/llvm/test/MC/RISCV/rvzdinx-aliases-valid.s @@ -0,0 +1,49 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx -riscv-no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zdinx \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx -riscv-no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zdinx \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zdinx %s \ +# RUN: | llvm-objdump -d --mattr=+zdinx -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zdinx %s \ +# RUN: | llvm-objdump -d --mattr=+zdinx - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zdinx %s \ +# RUN: | llvm-objdump -d 
--mattr=+zdinx -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zdinx %s \ +# RUN: | llvm-objdump -d --mattr=+zdinx - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s + +##===----------------------------------------------------------------------===## +## Aliases which omit the rounding mode. +##===----------------------------------------------------------------------===## + +# CHECK-INST: fmadd.d a0, a2, a4, a6, dyn +# CHECK-ALIAS: fmadd.d a0, a2, a4, a6 +fmadd.d x10, x12, x14, x16 +# CHECK-INST: fmsub.d a0, a2, a4, a6, dyn +# CHECK-ALIAS: fmsub.d a0, a2, a4, a6 +fmsub.d x10, x12, x14, x16 +# CHECK-INST: fnmsub.d a0, a2, a4, a6, dyn +# CHECK-ALIAS: fnmsub.d a0, a2, a4, a6 +fnmsub.d x10, x12, x14, x16 +# CHECK-INST: fnmadd.d a0, a2, a4, a6, dyn +# CHECK-ALIAS: fnmadd.d a0, a2, a4, a6 +fnmadd.d x10, x12, x14, x16 +# CHECK-INST: fadd.d a0, a2, a4, dyn +# CHECK-ALIAS: fadd.d a0, a2, a4 +fadd.d x10, x12, x14 +# CHECK-INST: fsub.d a0, a2, a4, dyn +# CHECK-ALIAS: fsub.d a0, a2, a4 +fsub.d x10, x12, x14 +# CHECK-INST: fmul.d a0, a2, a4, dyn +# CHECK-ALIAS: fmul.d a0, a2, a4 +fmul.d x10, x12, x14 +# CHECK-INST: fdiv.d a0, a2, a4, dyn +# CHECK-ALIAS: fdiv.d a0, a2, a4 +fdiv.d x10, x12, x14 diff --git a/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s new file mode 100644 index 0000000000000..71e9977404d7a --- /dev/null +++ b/llvm/test/MC/RISCV/rvzfinx-aliases-valid.s @@ -0,0 +1,82 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx -riscv-no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zfinx \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx -riscv-no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zfinx \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc -filetype=obj -triple riscv32 
-mattr=+zfinx %s \ +# RUN: | llvm-objdump -d --mattr=+zfinx -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zfinx %s \ +# RUN: | llvm-objdump -d --mattr=+zfinx - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zfinx %s \ +# RUN: | llvm-objdump -d --mattr=+zfinx -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zfinx %s \ +# RUN: | llvm-objdump -d --mattr=+zfinx - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s + +##===----------------------------------------------------------------------===## +## Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) +##===----------------------------------------------------------------------===## + +# CHECK-INST: fsgnjx.s s1, s2, s2 +# CHECK-ALIAS: fabs.s s1, s2 +fabs.s s1, s2 +# CHECK-INST: fsgnjn.s s2, s3, s3 +# CHECK-ALIAS: fneg.s s2, s3 +fneg.s s2, s3 + +# CHECK-INST: flt.s tp, s6, s5 +# CHECK-ALIAS: flt.s tp, s6, s5 +fgt.s x4, s5, s6 +# CHECK-INST: fle.s t2, s1, s0 +# CHECK-ALIAS: fle.s t2, s1, s0 +fge.s x7, x8, x9 + +##===----------------------------------------------------------------------===## +## Aliases which omit the rounding mode. 
+##===----------------------------------------------------------------------===## + +# CHECK-INST: fmadd.s a0, a1, a2, a3, dyn +# CHECK-ALIAS: fmadd.s a0, a1, a2, a3 +fmadd.s x10, x11, x12, x13 +# CHECK-INST: fmsub.s a4, a5, a6, a7, dyn +# CHECK-ALIAS: fmsub.s a4, a5, a6, a7 +fmsub.s x14, x15, x16, x17 +# CHECK-INST: fnmsub.s s2, s3, s4, s5, dyn +# CHECK-ALIAS: fnmsub.s s2, s3, s4, s5 +fnmsub.s x18, x19, x20, x21 +# CHECK-INST: fnmadd.s s6, s7, s8, s9, dyn +# CHECK-ALIAS: fnmadd.s s6, s7, s8, s9 +fnmadd.s x22, x23, x24, x25 +# CHECK-INST: fadd.s s10, s11, t3, dyn +# CHECK-ALIAS: fadd.s s10, s11, t3 +fadd.s x26, x27, x28 +# CHECK-INST: fsub.s t4, t5, t6, dyn +# CHECK-ALIAS: fsub.s t4, t5, t6 +fsub.s x29, x30, x31 +# CHECK-INST: fmul.s s0, s1, s2, dyn +# CHECK-ALIAS: fmul.s s0, s1, s2 +fmul.s s0, s1, s2 +# CHECK-INST: fdiv.s s3, s4, s5, dyn +# CHECK-ALIAS: fdiv.s s3, s4, s5 +fdiv.s s3, s4, s5 +# CHECK-INST: sqrt.s s6, s7, dyn +# CHECK-ALIAS: sqrt.s s6, s7 +fsqrt.s s6, s7 +# CHECK-INST: fcvt.w.s a0, s5, dyn +# CHECK-ALIAS: fcvt.w.s a0, s5 +fcvt.w.s a0, s5 +# CHECK-INST: fcvt.wu.s a1, s6, dyn +# CHECK-ALIAS: fcvt.wu.s a1, s6 +fcvt.wu.s a1, s6 +# CHECK-INST: fcvt.s.w t6, a4, dyn +# CHECK-ALIAS: fcvt.s.w t6, a4 +fcvt.s.w t6, a4 +# CHECK-INST: fcvt.s.wu s0, a5, dyn +# CHECK-ALIAS: fcvt.s.wu s0, a5 +fcvt.s.wu s0, a5 diff --git a/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s new file mode 100644 index 0000000000000..9a328e4441244 --- /dev/null +++ b/llvm/test/MC/RISCV/rvzhinx-aliases-valid.s @@ -0,0 +1,82 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx -riscv-no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv32 -mattr=+zhinx \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx -riscv-no-aliases \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+zhinx \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS 
%s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zhinx %s \ +# RUN: | llvm-objdump -d --mattr=+zhinx -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+zhinx %s \ +# RUN: | llvm-objdump -d --mattr=+zhinx - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zhinx %s \ +# RUN: | llvm-objdump -d --mattr=+zhinx -M no-aliases - \ +# RUN: | FileCheck -check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+zhinx %s \ +# RUN: | llvm-objdump -d --mattr=+zhinx - \ +# RUN: | FileCheck -check-prefix=CHECK-ALIAS %s + +##===----------------------------------------------------------------------===## +## Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) +##===----------------------------------------------------------------------===## + +# CHECK-INST: fsgnjx.h s1, s2, s2 +# CHECK-ALIAS: fabs.h s1, s2 +fabs.h s1, s2 +# CHECK-INST: fsgnjn.h s2, s3, s3 +# CHECK-ALIAS: fneg.h s2, s3 +fneg.h s2, s3 + +# CHECK-INST: flt.h tp, s6, s5 +# CHECK-ALIAS: flt.h tp, s6, s5 +fgt.h x4, s5, s6 +# CHECK-INST: fle.h t2, s1, s0 +# CHECK-ALIAS: fle.h t2, s1, s0 +fge.h x7, x8, x9 + +##===----------------------------------------------------------------------===## +## Aliases which omit the rounding mode. 
+##===----------------------------------------------------------------------===## + +# CHECK-INST: fmadd.h a0, a1, a2, a3, dyn +# CHECK-ALIAS: fmadd.h a0, a1, a2, a3 +fmadd.h x10, x11, x12, x13 +# CHECK-INST: fmsub.h a4, a5, a6, a7, dyn +# CHECK-ALIAS: fmsub.h a4, a5, a6, a7 +fmsub.h x14, x15, x16, x17 +# CHECK-INST: fnmsub.h s2, s3, s4, s5, dyn +# CHECK-ALIAS: fnmsub.h s2, s3, s4, s5 +fnmsub.h x18, x19, x20, x21 +# CHECK-INST: fnmadd.h s6, s7, s8, s9, dyn +# CHECK-ALIAS: fnmadd.h s6, s7, s8, s9 +fnmadd.h x22, x23, x24, x25 +# CHECK-INST: fadd.h s10, s11, t3, dyn +# CHECK-ALIAS: fadd.h s10, s11, t3 +fadd.h x26, x27, x28 +# CHECK-INST: fsub.h t4, t5, t6, dyn +# CHECK-ALIAS: fsub.h t4, t5, t6 +fsub.h x29, x30, x31 +# CHECK-INST: fmul.h s0, s1, s2, dyn +# CHECK-ALIAS: fmul.h s0, s1, s2 +fmul.h s0, s1, s2 +# CHECK-INST: fdiv.h s3, s4, s5, dyn +# CHECK-ALIAS: fdiv.h s3, s4, s5 +fdiv.h s3, s4, s5 +# CHECK-INST: fsqrt.h s6, s7, dyn +# CHECK-ALIAS: fsqrt.h s6, s7 +fsqrt.h s6, s7 +# CHECK-INST: fcvt.w.h a0, s5, dyn +# CHECK-ALIAS: fcvt.w.h a0, s5 +fcvt.w.h a0, s5 +# CHECK-INST: fcvt.wu.h a1, s6, dyn +# CHECK-ALIAS: fcvt.wu.h a1, s6 +fcvt.wu.h a1, s6 +# CHECK-INST: fcvt.h.w t6, a4, dyn +# CHECK-ALIAS: fcvt.h.w t6, a4 +fcvt.h.w t6, a4 +# CHECK-INST: fcvt.h.wu s0, a5, dyn +# CHECK-ALIAS: fcvt.h.wu s0, a5 +fcvt.h.wu s0, a5 From a569d6060ded62a57c40fb78def7e40dc5edacd3 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 17 Feb 2022 08:56:06 -0500 Subject: [PATCH 098/748] [gn build] (manually) port f75da0c8e65c (ObjCopy lib) --- .../gn/secondary/llvm/lib/ObjCopy/BUILD.gn | 29 +++++++++++++++++++ .../llvm/tools/llvm-objcopy/BUILD.gn | 19 ++---------- .../gn/secondary/llvm/unittests/BUILD.gn | 1 + .../secondary/llvm/unittests/ObjCopy/BUILD.gn | 11 +++++++ 4 files changed, 43 insertions(+), 17 deletions(-) create mode 100644 llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn create mode 100644 llvm/utils/gn/secondary/llvm/unittests/ObjCopy/BUILD.gn diff --git 
a/llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn new file mode 100644 index 0000000000000..78fceb2e92362 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn @@ -0,0 +1,29 @@ +static_library("ObjCopy") { + output_name = "LLVMObjCopy" + deps = [ + "//llvm/lib/Object", + "//llvm/lib/MC", + "//llvm/lib/Support", + ] + include_dirs = [ "." ] + sources = [ + "Archive.cpp", + "ObjCopy.cpp", + "ConfigManager.cpp", + "COFF/COFFObjcopy.cpp", + "COFF/Object.cpp", + "COFF/Reader.cpp", + "COFF/Writer.cpp", + "ELF/ELFObjcopy.cpp", + "ELF/Object.cpp", + "MachO/MachOObjcopy.cpp", + "MachO/MachOReader.cpp", + "MachO/MachOWriter.cpp", + "MachO/MachOLayoutBuilder.cpp", + "MachO/Object.cpp", + "wasm/Object.cpp", + "wasm/Reader.cpp", + "wasm/Writer.cpp", + "wasm/WasmObjcopy.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn index 64ea2b55c99e1..23ee0273c3a63 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-objcopy/BUILD.gn @@ -56,28 +56,13 @@ executable("llvm-objcopy") { ":ObjcopyOpts", ":StripOpts", "//llvm/lib/MC", + "//llvm/lib/ObjCopy", "//llvm/lib/Object", "//llvm/lib/Option", "//llvm/lib/Support", ] - include_dirs = [ "." 
] sources = [ - "COFF/COFFObjcopy.cpp", - "COFF/Object.cpp", - "COFF/Reader.cpp", - "COFF/Writer.cpp", - "ConfigManager.cpp", - "ELF/ELFObjcopy.cpp", - "ELF/Object.cpp", - "MachO/MachOLayoutBuilder.cpp", - "MachO/MachOObjcopy.cpp", - "MachO/MachOReader.cpp", - "MachO/MachOWriter.cpp", - "MachO/Object.cpp", + "ObjcopyOptions.cpp", "llvm-objcopy.cpp", - "wasm/Object.cpp", - "wasm/Reader.cpp", - "wasm/WasmObjcopy.cpp", - "wasm/Writer.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn index 4176e743bb13f..cd92504743a50 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn @@ -31,6 +31,7 @@ group("unittests") { "MC:MCTests", "MI:MITests", "MIR:MIRTests", + "ObjCopy:ObjCopyTests", "Object:ObjectTests", "ObjectYAML:ObjectYAMLTests", "Option:OptionTests", diff --git a/llvm/utils/gn/secondary/llvm/unittests/ObjCopy/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ObjCopy/BUILD.gn new file mode 100644 index 0000000000000..58c168156d073 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/unittests/ObjCopy/BUILD.gn @@ -0,0 +1,11 @@ +import("//llvm/utils/unittest/unittest.gni") + +unittest("ObjCopyTests") { + deps = [ + "//llvm/lib/ObjCopy", + "//llvm/lib/Object", + "//llvm/lib/ObjectYAML", + "//llvm/lib/Testing/Support", + ] + sources = [ "ObjCopyTest.cpp" ] +} From d1cd64ffdd832220dbe1829c2f09b880be67be31 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 17 Feb 2022 05:40:01 -0800 Subject: [PATCH 099/748] [SLP][NFC]Fix misprint in function name, NFC. 
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f1e42ae9ddec6..f7af3151e1894 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4800,12 +4800,12 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, /// Build shuffle mask for shuffle graph entries and lists of main and alternate /// operations operands. static void -buildSuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, - ArrayRef ReusesIndices, - const function_ref IsAltOp, - SmallVectorImpl &Mask, - SmallVectorImpl *OpScalars = nullptr, - SmallVectorImpl *AltScalars = nullptr) { +buildShuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, + ArrayRef ReusesIndices, + const function_ref IsAltOp, + SmallVectorImpl &Mask, + SmallVectorImpl *OpScalars = nullptr, + SmallVectorImpl *AltScalars = nullptr) { unsigned Sz = VL.size(); Mask.assign(Sz, UndefMaskElem); SmallVector OrderMask; @@ -5556,7 +5556,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } SmallVector Mask; - buildSuffleEntryMask( + buildShuffleEntryMask( E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); @@ -7106,7 +7106,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // each vector operation. 
ValueList OpScalars, AltScalars; SmallVector Mask; - buildSuffleEntryMask( + buildShuffleEntryMask( E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); From c9b36807beaf120f4e06d9da3b7df7625e440825 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 17 Feb 2022 09:08:15 -0500 Subject: [PATCH 100/748] [mlir][spirv] Add a pass to unify aliased resource variables In SPIR-V, resources are represented as global variables that are bound to certain descriptor. SPIR-V requires those global variables to be declared as aliased if multiple ones are bound to the same slot. Such aliased decorations can cause issues for transcompilers like SPIRV-Cross when converting to source shading languages like MSL. So this commit adds a pass to perform analysis of aliased resources and see if we can unify them into one. Reviewed By: ThomasRaoux Differential Revision: https://reviews.llvm.org/D119872 --- .../Dialect/SPIRV/IR/SPIRVStructureOps.td | 2 +- .../mlir/Dialect/SPIRV/Transforms/Passes.h | 5 + .../mlir/Dialect/SPIRV/Transforms/Passes.td | 7 + .../Dialect/SPIRV/Transforms/CMakeLists.txt | 2 + .../Transforms/UnifyAliasedResourcePass.cpp | 452 ++++++++++++++++++ .../Transforms/unify-aliased-resource.mlir | 215 +++++++++ 6 files changed, 682 insertions(+), 1 deletion(-) create mode 100644 mlir/lib/Dialect/SPIRV/Transforms/UnifyAliasedResourcePass.cpp create mode 100644 mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td index ebefa3167a249..4201e0ee09333 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVStructureOps.td @@ -385,7 +385,7 @@ def SPV_GlobalVariableOp : SPV_Op<"GlobalVariable", [InModuleScope, Symbol]> { OptionalAttr:$initializer, OptionalAttr:$location, OptionalAttr:$binding, - 
OptionalAttr:$descriptorSet, + OptionalAttr:$descriptor_set, OptionalAttr:$builtin ); diff --git a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.h b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.h index 38548fee32682..116a37dc0b534 100644 --- a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.h @@ -55,6 +55,11 @@ std::unique_ptr> createLowerABIAttributesPass(); /// spv.CompositeInsert into spv.CompositeConstruct. std::unique_ptr> createRewriteInsertsPass(); +/// Creates an operation pass that unifies access of multiple aliased resources +/// into access of one single resource. +std::unique_ptr> +createUnifyAliasedResourcePass(); + //===----------------------------------------------------------------------===// // Registration //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td index 575bb0898faad..32abca53f8a59 100644 --- a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td @@ -28,6 +28,13 @@ def SPIRVRewriteInsertsPass : Pass<"spirv-rewrite-inserts", "spirv::ModuleOp"> { let constructor = "mlir::spirv::createRewriteInsertsPass()"; } +def SPIRVUnifyAliasedResourcePass + : Pass<"spirv-unify-aliased-resource", "spirv::ModuleOp"> { + let summary = "Unify access of multiple aliased resources into access of one " + "single resource"; + let constructor = "mlir::spirv::createUnifyAliasedResourcePass()"; +} + def SPIRVUpdateVCE : Pass<"spirv-update-vce", "spirv::ModuleOp"> { let summary = "Deduce and attach minimal (version, capabilities, extensions) " "requirements to spv.module ops"; diff --git a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt index db274088bdf22..affceebcfd3d4 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt +++ 
b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt @@ -3,6 +3,7 @@ set(LLVM_OPTIONAL_SOURCES LowerABIAttributesPass.cpp RewriteInsertsPass.cpp SPIRVConversion.cpp + UnifyAliasedResourcePass.cpp UpdateVCEPass.cpp ) @@ -21,6 +22,7 @@ add_mlir_dialect_library(MLIRSPIRVTransforms DecorateCompositeTypeLayoutPass.cpp LowerABIAttributesPass.cpp RewriteInsertsPass.cpp + UnifyAliasedResourcePass.cpp UpdateVCEPass.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/Dialect/SPIRV/Transforms/UnifyAliasedResourcePass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/UnifyAliasedResourcePass.cpp new file mode 100644 index 0000000000000..fa0e551f5d53e --- /dev/null +++ b/mlir/lib/Dialect/SPIRV/Transforms/UnifyAliasedResourcePass.cpp @@ -0,0 +1,452 @@ +//===- UnifyAliasedResourcePass.cpp - Pass to Unify Aliased Resources -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that unifies access of multiple aliased resources +// into access of one single resource. 
+// +//===----------------------------------------------------------------------===// + +#include "PassDetail.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h" +#include "mlir/Dialect/SPIRV/Transforms/Passes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/Pass/AnalysisManager.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include + +#define DEBUG_TYPE "spirv-unify-aliased-resource" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +using Descriptor = std::pair; // (set #, binding #) +using AliasedResourceMap = + DenseMap>; + +/// Collects all aliased resources in the given SPIR-V `moduleOp`. +static AliasedResourceMap collectAliasedResources(spirv::ModuleOp moduleOp) { + AliasedResourceMap aliasedResoruces; + moduleOp->walk([&aliasedResoruces](spirv::GlobalVariableOp varOp) { + if (varOp->getAttrOfType("aliased")) { + Optional set = varOp.descriptor_set(); + Optional binding = varOp.binding(); + if (set && binding) + aliasedResoruces[{*set, *binding}].push_back(varOp); + } + }); + return aliasedResoruces; +} + +/// Returns the element type if the given `type` is a runtime array resource: +/// `!spv.ptr>>`. Returns null type otherwise. 
+static Type getRuntimeArrayElementType(Type type) { + auto ptrType = type.dyn_cast(); + if (!ptrType) + return {}; + + auto structType = ptrType.getPointeeType().dyn_cast(); + if (!structType || structType.getNumElements() != 1) + return {}; + + auto rtArrayType = + structType.getElementType(0).dyn_cast(); + if (!rtArrayType) + return {}; + + return rtArrayType.getElementType(); +} + +/// Returns true if all `types`, which can either be scalar or vector types, +/// have the same bitwidth base scalar type. +static bool hasSameBitwidthScalarType(ArrayRef types) { + SmallVector scalarTypes; + scalarTypes.reserve(types.size()); + for (spirv::SPIRVType type : types) { + assert(type.isScalarOrVector()); + if (auto vectorType = type.dyn_cast()) + scalarTypes.push_back( + vectorType.getElementType().getIntOrFloatBitWidth()); + else + scalarTypes.push_back(type.getIntOrFloatBitWidth()); + } + return llvm::is_splat(scalarTypes); +} + +//===----------------------------------------------------------------------===// +// Analysis +//===----------------------------------------------------------------------===// + +namespace { +/// A class for analyzing aliased resources. +/// +/// Resources are expected to be spv.GlobalVarible that has a descriptor set and +/// binding number. Such resources are of the type `!spv.ptr>` +/// per Vulkan requirements. +/// +/// Right now, we only support the case that there is a single runtime array +/// inside the struct. +class ResourceAliasAnalysis { +public: + explicit ResourceAliasAnalysis(Operation *); + + /// Returns true if the given `op` can be rewritten to use a canonical + /// resource. + bool shouldUnify(Operation *op) const; + + /// Returns all descriptors and their corresponding aliased resources. + const AliasedResourceMap &getResourceMap() const { return resourceMap; } + + /// Returns the canonical resource for the given descriptor/variable. 
+ spirv::GlobalVariableOp + getCanonicalResource(const Descriptor &descriptor) const; + spirv::GlobalVariableOp + getCanonicalResource(spirv::GlobalVariableOp varOp) const; + + /// Returns the element type for the given variable. + spirv::SPIRVType getElementType(spirv::GlobalVariableOp varOp) const; + +private: + /// Given the descriptor and aliased resources bound to it, analyze whether we + /// can unify them and record if so. + void recordIfUnifiable(const Descriptor &descriptor, + ArrayRef resources); + + /// Mapping from a descriptor to all aliased resources bound to it. + AliasedResourceMap resourceMap; + + /// Mapping from a descriptor to the chosen canonical resource. + DenseMap canonicalResourceMap; + + /// Mapping from an aliased resource to its descriptor. + DenseMap descriptorMap; + + /// Mapping from an aliased resource to its element (scalar/vector) type. + DenseMap elementTypeMap; +}; +} // namespace + +ResourceAliasAnalysis::ResourceAliasAnalysis(Operation *root) { + // Collect all aliased resources first and put them into different sets + // according to the descriptor. + AliasedResourceMap aliasedResoruces = + collectAliasedResources(cast(root)); + + // For each resource set, analyze whether we can unify; if so, try to identify + // a canonical resource, whose element type has the largest bitwidth. 
+ for (const auto &descriptorResoruce : aliasedResoruces) { + recordIfUnifiable(descriptorResoruce.first, descriptorResoruce.second); + } +} + +bool ResourceAliasAnalysis::shouldUnify(Operation *op) const { + if (auto varOp = dyn_cast(op)) { + auto canonicalOp = getCanonicalResource(varOp); + return canonicalOp && varOp != canonicalOp; + } + if (auto addressOp = dyn_cast(op)) { + auto moduleOp = addressOp->getParentOfType(); + auto *varOp = SymbolTable::lookupSymbolIn(moduleOp, addressOp.variable()); + return shouldUnify(varOp); + } + + if (auto acOp = dyn_cast(op)) + return shouldUnify(acOp.base_ptr().getDefiningOp()); + if (auto loadOp = dyn_cast(op)) + return shouldUnify(loadOp.ptr().getDefiningOp()); + if (auto storeOp = dyn_cast(op)) + return shouldUnify(storeOp.ptr().getDefiningOp()); + + return false; +} + +spirv::GlobalVariableOp ResourceAliasAnalysis::getCanonicalResource( + const Descriptor &descriptor) const { + auto varIt = canonicalResourceMap.find(descriptor); + if (varIt == canonicalResourceMap.end()) + return {}; + return varIt->second; +} + +spirv::GlobalVariableOp ResourceAliasAnalysis::getCanonicalResource( + spirv::GlobalVariableOp varOp) const { + auto descriptorIt = descriptorMap.find(varOp); + if (descriptorIt == descriptorMap.end()) + return {}; + return getCanonicalResource(descriptorIt->second); +} + +spirv::SPIRVType +ResourceAliasAnalysis::getElementType(spirv::GlobalVariableOp varOp) const { + auto it = elementTypeMap.find(varOp); + if (it == elementTypeMap.end()) + return {}; + return it->second; +} + +void ResourceAliasAnalysis::recordIfUnifiable( + const Descriptor &descriptor, ArrayRef resources) { + // Collect the element types and byte counts for all resources in the + // current set. + SmallVector elementTypes; + SmallVector numBytes; + + for (spirv::GlobalVariableOp resource : resources) { + Type elementType = getRuntimeArrayElementType(resource.type()); + if (!elementType) + return; // Unexpected resource variable type. 
+ + auto type = elementType.cast(); + if (!type.isScalarOrVector()) + return; // Unexpected resource element type. + + if (auto vectorType = type.dyn_cast()) + if (vectorType.getNumElements() % 2 != 0) + return; // Odd-sized vector has special layout requirements. + + Optional count = type.getSizeInBytes(); + if (!count) + return; + + elementTypes.push_back(type); + numBytes.push_back(*count); + } + + // Make sure base scalar types have the same bitwdith, so that we don't need + // to handle extracting components for now. + if (!hasSameBitwidthScalarType(elementTypes)) + return; + + // Make sure that the canonical resource's bitwidth is divisible by others. + // With out this, we cannot properly adjust the index later. + auto *maxCount = std::max_element(numBytes.begin(), numBytes.end()); + if (llvm::any_of(numBytes, [maxCount](int64_t count) { + return *maxCount % count != 0; + })) + return; + + spirv::GlobalVariableOp canonicalResource = + resources[std::distance(numBytes.begin(), maxCount)]; + + // Update internal data structures for later use. 
+ resourceMap[descriptor].assign(resources.begin(), resources.end()); + canonicalResourceMap[descriptor] = canonicalResource; + for (const auto &resource : llvm::enumerate(resources)) { + descriptorMap[resource.value()] = descriptor; + elementTypeMap[resource.value()] = elementTypes[resource.index()]; + } +} + +//===----------------------------------------------------------------------===// +// Patterns +//===----------------------------------------------------------------------===// + +template +class ConvertAliasResoruce : public OpConversionPattern { +public: + ConvertAliasResoruce(const ResourceAliasAnalysis &analysis, + MLIRContext *context, PatternBenefit benefit = 1) + : OpConversionPattern(context, benefit), analysis(analysis) {} + +protected: + const ResourceAliasAnalysis &analysis; +}; + +struct ConvertVariable : public ConvertAliasResoruce { + using ConvertAliasResoruce::ConvertAliasResoruce; + + LogicalResult + matchAndRewrite(spirv::GlobalVariableOp varOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Just remove the aliased resource. Users will be rewritten to use the + // canonical one. + rewriter.eraseOp(varOp); + return success(); + } +}; + +struct ConvertAddressOf : public ConvertAliasResoruce { + using ConvertAliasResoruce::ConvertAliasResoruce; + + LogicalResult + matchAndRewrite(spirv::AddressOfOp addressOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Rewrite the AddressOf op to get the address of the canoncical resource. 
+ auto moduleOp = addressOp->getParentOfType(); + auto srcVarOp = cast( + SymbolTable::lookupSymbolIn(moduleOp, addressOp.variable())); + auto dstVarOp = analysis.getCanonicalResource(srcVarOp); + rewriter.replaceOpWithNewOp(addressOp, dstVarOp); + return success(); + } +}; + +struct ConvertAccessChain : public ConvertAliasResoruce { + using ConvertAliasResoruce::ConvertAliasResoruce; + + LogicalResult + matchAndRewrite(spirv::AccessChainOp acOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto addressOp = acOp.base_ptr().getDefiningOp(); + if (!addressOp) + return rewriter.notifyMatchFailure(acOp, "base ptr not addressof op"); + + auto moduleOp = acOp->getParentOfType(); + auto srcVarOp = cast( + SymbolTable::lookupSymbolIn(moduleOp, addressOp.variable())); + auto dstVarOp = analysis.getCanonicalResource(srcVarOp); + + spirv::SPIRVType srcElemType = analysis.getElementType(srcVarOp); + spirv::SPIRVType dstElemType = analysis.getElementType(dstVarOp); + + if ((srcElemType == dstElemType) || + (srcElemType.isIntOrFloat() && dstElemType.isIntOrFloat())) { + // We have the same bitwidth for source and destination element types. + // Thie indices keep the same. + rewriter.replaceOpWithNewOp( + acOp, adaptor.base_ptr(), adaptor.indices()); + return success(); + } + + Location loc = acOp.getLoc(); + auto i32Type = rewriter.getI32Type(); + + if (srcElemType.isIntOrFloat() && dstElemType.isa()) { + // The source indices are for a buffer with scalar element types. Rewrite + // them into a buffer with vector element types. We need to scale the last + // index for the vector as a whole, then add one level of index for inside + // the vector. 
+ int ratio = *dstElemType.getSizeInBytes() / *srcElemType.getSizeInBytes(); + auto ratioValue = rewriter.create( + loc, i32Type, rewriter.getI32IntegerAttr(ratio)); + + auto indices = llvm::to_vector<4>(acOp.indices()); + Value oldIndex = indices.back(); + indices.back() = + rewriter.create(loc, i32Type, oldIndex, ratioValue); + indices.push_back( + rewriter.create(loc, i32Type, oldIndex, ratioValue)); + + rewriter.replaceOpWithNewOp( + acOp, adaptor.base_ptr(), indices); + return success(); + } + + return rewriter.notifyMatchFailure(acOp, "unsupported src/dst types"); + } +}; + +struct ConvertLoad : public ConvertAliasResoruce { + using ConvertAliasResoruce::ConvertAliasResoruce; + + LogicalResult + matchAndRewrite(spirv::LoadOp loadOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto srcElemType = + loadOp.ptr().getType().cast().getPointeeType(); + auto dstElemType = + adaptor.ptr().getType().cast().getPointeeType(); + if (!srcElemType.isIntOrFloat() || !dstElemType.isIntOrFloat()) + return rewriter.notifyMatchFailure(loadOp, "not scalar type"); + + Location loc = loadOp.getLoc(); + auto newLoadOp = rewriter.create(loc, adaptor.ptr()); + if (srcElemType == dstElemType) { + rewriter.replaceOp(loadOp, newLoadOp->getResults()); + } else { + auto castOp = rewriter.create(loc, srcElemType, + newLoadOp.value()); + rewriter.replaceOp(loadOp, castOp->getResults()); + } + + return success(); + } +}; + +struct ConvertStore : public ConvertAliasResoruce { + using ConvertAliasResoruce::ConvertAliasResoruce; + + LogicalResult + matchAndRewrite(spirv::StoreOp storeOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto srcElemType = + storeOp.ptr().getType().cast().getPointeeType(); + auto dstElemType = + adaptor.ptr().getType().cast().getPointeeType(); + if (!srcElemType.isIntOrFloat() || !dstElemType.isIntOrFloat()) + return rewriter.notifyMatchFailure(storeOp, "not scalar type"); + + Location loc = 
storeOp.getLoc();
+    Value value = adaptor.value();
+    if (srcElemType != dstElemType)
+      value = rewriter.create(loc, dstElemType, value);
+    rewriter.replaceOpWithNewOp(storeOp, adaptor.ptr(), value,
+                                storeOp->getAttrs());
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class UnifyAliasedResourcePass final
+    : public SPIRVUnifyAliasedResourcePassBase {
+public:
+  void runOnOperation() override;
+};
+} // namespace
+
+void UnifyAliasedResourcePass::runOnOperation() {
+  spirv::ModuleOp moduleOp = getOperation();
+  MLIRContext *context = &getContext();
+
+  // Analyze aliased resources first.
+  ResourceAliasAnalysis &analysis = getAnalysis();
+
+  ConversionTarget target(*context);
+  target.addDynamicallyLegalOp(
+      [&analysis](Operation *op) { return !analysis.shouldUnify(op); });
+  target.addLegalDialect();
+
+  // Run patterns to rewrite usages of non-canonical resources.
+  RewritePatternSet patterns(context);
+  patterns.add(analysis, context);
+  if (failed(applyPartialConversion(moduleOp, target, std::move(patterns))))
+    return signalPassFailure();
+
+  // Drop aliased attribute if we only have one single bound resource for a
+  // descriptor. We need to re-collect the map here given that the above
+  // conversion is best effort; certain sets may not be converted.
+ AliasedResourceMap resourceMap = + collectAliasedResources(cast(moduleOp)); + for (const auto &dr : resourceMap) { + const auto &resources = dr.second; + if (resources.size() == 1) + resources.front()->removeAttr("aliased"); + } +} + +std::unique_ptr> +spirv::createUnifyAliasedResourcePass() { + return std::make_unique(); +} diff --git a/mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir b/mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir new file mode 100644 index 0000000000000..546fc1f93b097 --- /dev/null +++ b/mlir/test/Dialect/SPIRV/Transforms/unify-aliased-resource.mlir @@ -0,0 +1,215 @@ +// RUN: mlir-opt -split-input-file -spirv-unify-aliased-resource %s -o - | FileCheck %s + +spv.module Logical GLSL450 { + spv.GlobalVariable @var01s bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01v bind(0, 1) {aliased} : !spv.ptr, stride=16> [0])>, StorageBuffer> + + spv.func @load_store_scalar(%index: i32) -> f32 "None" { + %c0 = spv.Constant 0 : i32 + %addr = spv.mlir.addressof @var01s : !spv.ptr [0])>, StorageBuffer> + %ac = spv.AccessChain %addr[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %value = spv.Load "StorageBuffer" %ac : f32 + spv.Store "StorageBuffer" %ac, %value : f32 + spv.ReturnValue %value : f32 + } +} + +// CHECK-LABEL: spv.module + +// CHECK-NOT: @var01s +// CHECK: spv.GlobalVariable @var01v bind(0, 1) : !spv.ptr, stride=16> [0])>, StorageBuffer> +// CHECK-NOT: @var01s + +// CHECK: spv.func @load_store_scalar(%[[INDEX:.+]]: i32) +// CHECK-DAG: %[[C0:.+]] = spv.Constant 0 : i32 +// CHECK-DAG: %[[C4:.+]] = spv.Constant 4 : i32 +// CHECK-DAG: %[[ADDR:.+]] = spv.mlir.addressof @var01v +// CHECK: %[[DIV:.+]] = spv.SDiv %[[INDEX]], %[[C4]] : i32 +// CHECK: %[[MOD:.+]] = spv.SMod %[[INDEX]], %[[C4]] : i32 +// CHECK: %[[AC:.+]] = spv.AccessChain %[[ADDR]][%[[C0]], %[[DIV]], %[[MOD]]] +// CHECK: spv.Load "StorageBuffer" %[[AC]] +// CHECK: spv.Store "StorageBuffer" %[[AC]] + +// ----- + 
+spv.module Logical GLSL450 { + spv.GlobalVariable @var01s bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01v bind(0, 1) {aliased} : !spv.ptr, stride=16> [0])>, StorageBuffer> + + spv.func @multiple_uses(%i0: i32, %i1: i32) -> f32 "None" { + %c0 = spv.Constant 0 : i32 + %addr = spv.mlir.addressof @var01s : !spv.ptr [0])>, StorageBuffer> + %ac0 = spv.AccessChain %addr[%c0, %i0] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %val0 = spv.Load "StorageBuffer" %ac0 : f32 + %ac1 = spv.AccessChain %addr[%c0, %i1] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %val1 = spv.Load "StorageBuffer" %ac1 : f32 + %value = spv.FAdd %val0, %val1 : f32 + spv.ReturnValue %value : f32 + } +} + +// CHECK-LABEL: spv.module + +// CHECK-NOT: @var01s +// CHECK: spv.GlobalVariable @var01v bind(0, 1) : !spv.ptr, stride=16> [0])>, StorageBuffer> +// CHECK-NOT: @var01s + +// CHECK: spv.func @multiple_uses +// CHECK: %[[ADDR:.+]] = spv.mlir.addressof @var01v +// CHECK: spv.AccessChain %[[ADDR]][%{{.+}}, %{{.+}}, %{{.+}}] +// CHECK: spv.AccessChain %[[ADDR]][%{{.+}}, %{{.+}}, %{{.+}}] + +// ----- + +spv.module Logical GLSL450 { + spv.GlobalVariable @var01s bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01v bind(0, 1) {aliased} : !spv.ptr, stride=16> [0])>, StorageBuffer> + + spv.func @vector3(%index: i32) -> f32 "None" { + %c0 = spv.Constant 0 : i32 + %addr = spv.mlir.addressof @var01s : !spv.ptr [0])>, StorageBuffer> + %ac = spv.AccessChain %addr[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %value = spv.Load "StorageBuffer" %ac : f32 + spv.ReturnValue %value : f32 + } +} + +// CHECK-LABEL: spv.module + +// CHECK: spv.GlobalVariable @var01s bind(0, 1) {aliased} +// CHECK: spv.GlobalVariable @var01v bind(0, 1) {aliased} +// CHECK: spv.func @vector3 + +// ----- + +spv.module Logical GLSL450 { + spv.GlobalVariable @var01s bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01v bind(1, 0) {aliased} : 
!spv.ptr, stride=16> [0])>, StorageBuffer> + + spv.func @not_aliased(%index: i32) -> f32 "None" { + %c0 = spv.Constant 0 : i32 + %addr = spv.mlir.addressof @var01s : !spv.ptr [0])>, StorageBuffer> + %ac = spv.AccessChain %addr[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %value = spv.Load "StorageBuffer" %ac : f32 + spv.Store "StorageBuffer" %ac, %value : f32 + spv.ReturnValue %value : f32 + } +} + +// CHECK-LABEL: spv.module + +// CHECK: spv.GlobalVariable @var01s bind(0, 1) : !spv.ptr [0])>, StorageBuffer> +// CHECK: spv.GlobalVariable @var01v bind(1, 0) : !spv.ptr, stride=16> [0])>, StorageBuffer> +// CHECK: spv.func @not_aliased + +// ----- + +spv.module Logical GLSL450 { + spv.GlobalVariable @var01s bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01s_1 bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01v bind(0, 1) {aliased} : !spv.ptr, stride=16> [0])>, StorageBuffer> + spv.GlobalVariable @var01v_1 bind(0, 1) {aliased} : !spv.ptr, stride=16> [0])>, StorageBuffer> + + spv.func @multiple_aliases(%index: i32) -> f32 "None" { + %c0 = spv.Constant 0 : i32 + + %addr0 = spv.mlir.addressof @var01s : !spv.ptr [0])>, StorageBuffer> + %ac0 = spv.AccessChain %addr0[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %val0 = spv.Load "StorageBuffer" %ac0 : f32 + + %addr1 = spv.mlir.addressof @var01s_1 : !spv.ptr [0])>, StorageBuffer> + %ac1 = spv.AccessChain %addr1[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %val1 = spv.Load "StorageBuffer" %ac1 : f32 + + %addr2 = spv.mlir.addressof @var01v_1 : !spv.ptr, stride=16> [0])>, StorageBuffer> + %ac2 = spv.AccessChain %addr2[%c0, %index, %c0] : !spv.ptr, stride=16> [0])>, StorageBuffer>, i32, i32, i32 + %val2 = spv.Load "StorageBuffer" %ac2 : f32 + + %add0 = spv.FAdd %val0, %val1 : f32 + %add1 = spv.FAdd %add0, %val2 : f32 + spv.ReturnValue %add1 : f32 + } +} + +// CHECK-LABEL: spv.module + +// CHECK-NOT: @var01s +// CHECK: 
spv.GlobalVariable @var01v bind(0, 1) : !spv.ptr, stride=16> [0])>, StorageBuffer> +// CHECK-NOT: @var01v_1 + +// CHECK: spv.func @multiple_aliases +// CHECK: %[[ADDR0:.+]] = spv.mlir.addressof @var01v : +// CHECK: spv.AccessChain %[[ADDR0]][%{{.+}}, %{{.+}}, %{{.+}}] +// CHECK: %[[ADDR1:.+]] = spv.mlir.addressof @var01v : +// CHECK: spv.AccessChain %[[ADDR1]][%{{.+}}, %{{.+}}, %{{.+}}] +// CHECK: %[[ADDR2:.+]] = spv.mlir.addressof @var01v : +// CHECK: spv.AccessChain %[[ADDR2]][%{{.+}}, %{{.+}}, %{{.+}}] + +// ----- + +spv.module Logical GLSL450 { + spv.GlobalVariable @var01s_i32 bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01s_f32 bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + + spv.func @different_scalar_type(%index: i32, %val1: f32) -> i32 "None" { + %c0 = spv.Constant 0 : i32 + + %addr0 = spv.mlir.addressof @var01s_i32 : !spv.ptr [0])>, StorageBuffer> + %ac0 = spv.AccessChain %addr0[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %val0 = spv.Load "StorageBuffer" %ac0 : i32 + + %addr1 = spv.mlir.addressof @var01s_f32 : !spv.ptr [0])>, StorageBuffer> + %ac1 = spv.AccessChain %addr1[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + spv.Store "StorageBuffer" %ac1, %val1 : f32 + + spv.ReturnValue %val0 : i32 + } +} + +// CHECK-LABEL: spv.module + +// CHECK-NOT: @var01s_f32 +// CHECK: spv.GlobalVariable @var01s_i32 bind(0, 1) : !spv.ptr [0])>, StorageBuffer> +// CHECK-NOT: @var01s_f32 + +// CHECK: spv.func @different_scalar_type(%[[INDEX:.+]]: i32, %[[VAL1:.+]]: f32) + +// CHECK: %[[IADDR:.+]] = spv.mlir.addressof @var01s_i32 +// CHECK: %[[IAC:.+]] = spv.AccessChain %[[IADDR]][%{{.+}}, %[[INDEX]]] +// CHECK: spv.Load "StorageBuffer" %[[IAC]] : i32 + +// CHECK: %[[FADDR:.+]] = spv.mlir.addressof @var01s_i32 +// CHECK: %[[FAC:.+]] = spv.AccessChain %[[FADDR]][%cst0_i32, %[[INDEX]]] +// CHECK: %[[CAST:.+]] = spv.Bitcast %[[VAL1]] : f32 to i32 +// CHECK: spv.Store "StorageBuffer" %[[FAC]], %[[CAST]] : i32 + 
+// ----- + +spv.module Logical GLSL450 { + spv.GlobalVariable @var01s bind(0, 1) {aliased} : !spv.ptr [0])>, StorageBuffer> + spv.GlobalVariable @var01v bind(0, 1) {aliased} : !spv.ptr, stride=16> [0])>, StorageBuffer> + + spv.func @different_scalar_type(%index: i32, %val0: i32) -> i32 "None" { + %c0 = spv.Constant 0 : i32 + %addr = spv.mlir.addressof @var01s : !spv.ptr [0])>, StorageBuffer> + %ac = spv.AccessChain %addr[%c0, %index] : !spv.ptr [0])>, StorageBuffer>, i32, i32 + %val1 = spv.Load "StorageBuffer" %ac : i32 + spv.Store "StorageBuffer" %ac, %val0 : i32 + spv.ReturnValue %val1 : i32 + } +} + +// CHECK-LABEL: spv.module + +// CHECK-NOT: @var01s +// CHECK: spv.GlobalVariable @var01v bind(0, 1) : !spv.ptr, stride=16> [0])>, StorageBuffer> +// CHECK-NOT: @var01s + +// CHECK: spv.func @different_scalar_type(%{{.+}}: i32, %[[VAL0:.+]]: i32) +// CHECK: %[[ADDR:.+]] = spv.mlir.addressof @var01v +// CHECK: %[[AC:.+]] = spv.AccessChain %[[ADDR]][%{{.+}}, %{{.+}}, %{{.+}}] +// CHECK: %[[VAL1:.+]] = spv.Load "StorageBuffer" %[[AC]] : f32 +// CHECK: %[[CAST1:.+]] = spv.Bitcast %[[VAL1]] : f32 to i32 +// CHECK: %[[CAST2:.+]] = spv.Bitcast %[[VAL0]] : i32 to f32 +// CHECK: spv.Store "StorageBuffer" %[[AC]], %[[CAST2]] : f32 +// CHECK: spv.ReturnValue %[[CAST1]] : i32 From be77afe43dd39f517ab2468c359fd0d5633d9be6 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 17 Feb 2022 15:09:31 +0100 Subject: [PATCH 101/748] tsan: Add a missing disable_sanitizer_instrumentation attribute Turns out the test was working by accident: we need to ensure TSan instrumentation is not called from the fork() hook, otherwise the tool will deadlock. Previously it worked because alloc_free_blocks() got inlined into __tsan_test_only_on_fork(), but it cannot always be the case. Adding __attribute__((disable_sanitizer_instrumentation)) will prevent TSan from instrumenting alloc_free_blocks(). 
Reviewed By: dvyukov

Differential Revision: https://reviews.llvm.org/D120050
---
 compiler-rt/test/tsan/Linux/fork_deadlock.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/test/tsan/Linux/fork_deadlock.cpp b/compiler-rt/test/tsan/Linux/fork_deadlock.cpp
index 8f38ab92e69f6..952507032df65 100644
--- a/compiler-rt/test/tsan/Linux/fork_deadlock.cpp
+++ b/compiler-rt/test/tsan/Linux/fork_deadlock.cpp
@@ -11,7 +11,10 @@
 #include
 #include
 
-void alloc_free_blocks() {
+// disable_sanitizer_instrumentation on __tsan_test_only_on_fork is not
+// transitive, so we must apply it here as well.
+// Instrumenting alloc_free_blocks() will result in deadlocks in TSan.
+__attribute__((disable_sanitizer_instrumentation)) void alloc_free_blocks() {
   // Allocate a bunch of blocks to drain local allocator cache
   // and provoke it to lock allocator global mutexes.
   const int kBlocks = 1000;

From eeb7754f6853626c8ac1cb6b1436c1a3599ea182 Mon Sep 17 00:00:00 2001
From: Zakk Chen
Date: Sun, 13 Feb 2022 18:09:27 -0800
Subject: [PATCH 102/748] [RISCV] Add the passthru operand for
 vmv.vv/vmv.vx/vfmv.vf IR intrinsics.

Add the passthru operand for
VMV_V_X_VL, VFMV_V_F_VL and SPLAT_VECTOR_SPLIT_I64_VL also.

The goal is to support tail and mask policy in RVV builtins.
We focus on IR part first.
If the passthru operand is undef, we use tail agnostic, otherwise
use tail undisturbed.
Reviewed By: rogfer01 Differential Revision: https://reviews.llvm.org/D119688 --- clang/include/clang/Basic/riscv_vector.td | 5 +- .../RISCV/rvv-intrinsics-overloaded/vmv.c | 106 ++++----- .../test/CodeGen/RISCV/rvv-intrinsics/vfmv.c | 30 +-- clang/test/CodeGen/RISCV/rvv-intrinsics/vmv.c | 206 +++++++++--------- llvm/include/llvm/IR/IntrinsicsRISCV.td | 21 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 41 ++-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 182 +++++++++------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 7 +- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 32 ++- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 2 +- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 43 +++- .../CodeGen/RISCV/rvv/setcc-integer-rv32.ll | 6 +- .../CodeGen/RISCV/rvv/setcc-integer-rv64.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll | 109 +++++++++ llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll | 45 ++++ llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll | 74 +++++++ llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll | 74 +++++++ llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll | 70 ++++++ llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll | 66 ++++++ .../RISCV/rvv/vsetvli-insert-crossbb.ll | 20 +- llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll | 12 +- 21 files changed, 848 insertions(+), 309 deletions(-) diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index a497f85705c72..94202f6359cee 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -1755,7 +1755,7 @@ let HasMask = false, HasPolicy = false, } // 12.16. Vector Integer Move Instructions -let HasMask = false, HasPolicy = false in { +let HasMask = false, HasNoMaskPassThru = true, HasPolicy = false in { let MangledName = "vmv_v" in { defm vmv_v : RVVOutBuiltinSet<"vmv_v_v", "csil", [["v", "Uv", "UvUv"]]>; @@ -1890,7 +1890,8 @@ let HasMask = false, HasPolicy = false, } // 14.16. 
Vector Floating-Point Move Instruction -let HasMask = false, HasNoMaskedOverloaded = false, HasPolicy = false in +let HasMask = false, HasNoMaskPassThru = true, HasNoMaskedOverloaded = false, + HasPolicy = false in defm vfmv_v : RVVOutBuiltinSet<"vfmv_v_f", "xfd", [["f", "v", "ve"]]>; diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmv.c index b6890fd8829d3..466621c19db86 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmv.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vmv_v_v_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmv_v_v_i8mf8(vint8mf8_t src, size_t vl) { @@ -16,7 +16,7 @@ vint8mf8_t test_vmv_v_v_i8mf8(vint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmv_v_v_i8mf4(vint8mf4_t src, size_t vl) { @@ -25,7 +25,7 @@ vint8mf4_t test_vmv_v_v_i8mf4(vint8mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmv_v_v_i8mf2(vint8mf2_t src, size_t vl) { @@ -34,35 +34,35 @@ vint8mf2_t test_vmv_v_v_i8mf2(vint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8m1( 
// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmv_v_v_i8m1(vint8m1_t src, size_t vl) { return vmv_v(src, vl); } // CHECK-RV64-LABEL: @test_vmv_v_v_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmv_v_v_i8m2(vint8m2_t src, size_t vl) { return vmv_v(src, vl); } // CHECK-RV64-LABEL: @test_vmv_v_v_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmv_v_v_i8m4(vint8m4_t src, size_t vl) { return vmv_v(src, vl); } // CHECK-RV64-LABEL: @test_vmv_v_v_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv64i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv64i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmv_v_v_i8m8(vint8m8_t src, size_t vl) { return vmv_v(src, vl); } // CHECK-RV64-LABEL: @test_vmv_v_v_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmv_v_v_i16mf4(vint16mf4_t src, size_t vl) { @@ -71,7 +71,7 @@ vint16mf4_t test_vmv_v_v_i16mf4(vint16mf4_t src, 
size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmv_v_v_i16mf2(vint16mf2_t src, size_t vl) { @@ -80,7 +80,7 @@ vint16mf2_t test_vmv_v_v_i16mf2(vint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmv_v_v_i16m1(vint16m1_t src, size_t vl) { @@ -89,7 +89,7 @@ vint16m1_t test_vmv_v_v_i16m1(vint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmv_v_v_i16m2(vint16m2_t src, size_t vl) { @@ -98,7 +98,7 @@ vint16m2_t test_vmv_v_v_i16m2(vint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmv_v_v_i16m4(vint16m4_t src, size_t vl) { @@ -107,7 +107,7 @@ vint16m4_t test_vmv_v_v_i16m4(vint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( [[SRC:%.*]], 
i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmv_v_v_i16m8(vint16m8_t src, size_t vl) { @@ -116,7 +116,7 @@ vint16m8_t test_vmv_v_v_i16m8(vint16m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmv_v_v_i32mf2(vint32mf2_t src, size_t vl) { @@ -125,7 +125,7 @@ vint32mf2_t test_vmv_v_v_i32mf2(vint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmv_v_v_i32m1(vint32m1_t src, size_t vl) { @@ -134,7 +134,7 @@ vint32m1_t test_vmv_v_v_i32m1(vint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmv_v_v_i32m2(vint32m2_t src, size_t vl) { @@ -143,7 +143,7 @@ vint32m2_t test_vmv_v_v_i32m2(vint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vint32m4_t test_vmv_v_v_i32m4(vint32m4_t src, size_t vl) { @@ -152,7 +152,7 @@ vint32m4_t test_vmv_v_v_i32m4(vint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmv_v_v_i32m8(vint32m8_t src, size_t vl) { @@ -161,7 +161,7 @@ vint32m8_t test_vmv_v_v_i32m8(vint32m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmv_v_v_i64m1(vint64m1_t src, size_t vl) { @@ -170,7 +170,7 @@ vint64m1_t test_vmv_v_v_i64m1(vint64m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmv_v_v_i64m2(vint64m2_t src, size_t vl) { @@ -179,7 +179,7 @@ vint64m2_t test_vmv_v_v_i64m2(vint64m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmv_v_v_i64m4(vint64m4_t src, size_t vl) { @@ -188,7 +188,7 @@ vint64m4_t test_vmv_v_v_i64m4(vint64m4_t src, size_t vl) { // CHECK-RV64-LABEL: 
@test_vmv_v_v_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmv_v_v_i64m8(vint64m8_t src, size_t vl) { @@ -197,7 +197,7 @@ vint64m8_t test_vmv_v_v_i64m8(vint64m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmv_v_v_u8mf8(vuint8mf8_t src, size_t vl) { @@ -206,7 +206,7 @@ vuint8mf8_t test_vmv_v_v_u8mf8(vuint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmv_v_v_u8mf4(vuint8mf4_t src, size_t vl) { @@ -215,7 +215,7 @@ vuint8mf4_t test_vmv_v_v_u8mf4(vuint8mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmv_v_v_u8mf2(vuint8mf2_t src, size_t vl) { @@ -224,7 +224,7 @@ vuint8mf2_t test_vmv_v_v_u8mf2(vuint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmv_v_v_u8m1(vuint8m1_t src, size_t vl) { @@ -233,7 +233,7 @@ vuint8m1_t test_vmv_v_v_u8m1(vuint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmv_v_v_u8m2(vuint8m2_t src, size_t vl) { @@ -242,7 +242,7 @@ vuint8m2_t test_vmv_v_v_u8m2(vuint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmv_v_v_u8m4(vuint8m4_t src, size_t vl) { @@ -251,7 +251,7 @@ vuint8m4_t test_vmv_v_v_u8m4(vuint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv64i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv64i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmv_v_v_u8m8(vuint8m8_t src, size_t vl) { @@ -260,7 +260,7 @@ vuint8m8_t test_vmv_v_v_u8m8(vuint8m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmv_v_v_u16mf4(vuint16mf4_t src, size_t vl) 
{ @@ -269,7 +269,7 @@ vuint16mf4_t test_vmv_v_v_u16mf4(vuint16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmv_v_v_u16mf2(vuint16mf2_t src, size_t vl) { @@ -278,7 +278,7 @@ vuint16mf2_t test_vmv_v_v_u16mf2(vuint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmv_v_v_u16m1(vuint16m1_t src, size_t vl) { @@ -287,7 +287,7 @@ vuint16m1_t test_vmv_v_v_u16m1(vuint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmv_v_v_u16m2(vuint16m2_t src, size_t vl) { @@ -296,7 +296,7 @@ vuint16m2_t test_vmv_v_v_u16m2(vuint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmv_v_v_u16m4(vuint16m4_t src, size_t vl) { @@ -305,7 +305,7 @@ vuint16m4_t test_vmv_v_v_u16m4(vuint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m8( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmv_v_v_u16m8(vuint16m8_t src, size_t vl) { @@ -314,7 +314,7 @@ vuint16m8_t test_vmv_v_v_u16m8(vuint16m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmv_v_v_u32mf2(vuint32mf2_t src, size_t vl) { @@ -323,7 +323,7 @@ vuint32mf2_t test_vmv_v_v_u32mf2(vuint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmv_v_v_u32m1(vuint32m1_t src, size_t vl) { @@ -332,7 +332,7 @@ vuint32m1_t test_vmv_v_v_u32m1(vuint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmv_v_v_u32m2(vuint32m2_t src, size_t vl) { @@ -341,7 +341,7 @@ vuint32m2_t test_vmv_v_v_u32m2(vuint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmv.v.v.nxv8i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmv_v_v_u32m4(vuint32m4_t src, size_t vl) { @@ -350,7 +350,7 @@ vuint32m4_t test_vmv_v_v_u32m4(vuint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmv_v_v_u32m8(vuint32m8_t src, size_t vl) { @@ -359,7 +359,7 @@ vuint32m8_t test_vmv_v_v_u32m8(vuint32m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmv_v_v_u64m1(vuint64m1_t src, size_t vl) { @@ -368,7 +368,7 @@ vuint64m1_t test_vmv_v_v_u64m1(vuint64m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmv_v_v_u64m2(vuint64m2_t src, size_t vl) { @@ -377,7 +377,7 @@ vuint64m2_t test_vmv_v_v_u64m2(vuint64m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmv_v_v_u64m4(vuint64m4_t src, 
size_t vl) { @@ -386,7 +386,7 @@ vuint64m4_t test_vmv_v_v_u64m4(vuint64m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmv_v_v_u64m8(vuint64m8_t src, size_t vl) { @@ -395,7 +395,7 @@ vuint64m8_t test_vmv_v_v_u64m8(vuint64m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vmv_v_v_f32mf2(vfloat32mf2_t src, size_t vl) { @@ -404,7 +404,7 @@ vfloat32mf2_t test_vmv_v_v_f32mf2(vfloat32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vmv_v_v_f32m1(vfloat32m1_t src, size_t vl) { @@ -413,7 +413,7 @@ vfloat32m1_t test_vmv_v_v_f32m1(vfloat32m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vmv_v_v_f32m2(vfloat32m2_t src, size_t vl) { @@ -422,7 +422,7 @@ vfloat32m2_t test_vmv_v_v_f32m2(vfloat32m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m4( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vmv_v_v_f32m4(vfloat32m4_t src, size_t vl) { @@ -431,7 +431,7 @@ vfloat32m4_t test_vmv_v_v_f32m4(vfloat32m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vmv_v_v_f32m8(vfloat32m8_t src, size_t vl) { @@ -440,7 +440,7 @@ vfloat32m8_t test_vmv_v_v_f32m8(vfloat32m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vmv_v_v_f64m1(vfloat64m1_t src, size_t vl) { @@ -449,7 +449,7 @@ vfloat64m1_t test_vmv_v_v_f64m1(vfloat64m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vmv_v_v_f64m2(vfloat64m2_t src, size_t vl) { @@ -458,7 +458,7 @@ vfloat64m2_t test_vmv_v_v_f64m2(vfloat64m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vmv_v_v_f64m4(vfloat64m4_t src, size_t vl) { @@ -467,7 +467,7 @@ vfloat64m4_t test_vmv_v_v_f64m4(vfloat64m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vmv_v_v_f64m8(vfloat64m8_t src, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmv.c index 74148c846c436..438ef84035b5d 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmv.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfmv_v_f_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1f32.i64(float [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1f32.i64( undef, float [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmv_v_f_f32mf2(float src, size_t vl) { @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfmv_v_f_f32mf2(float src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv2f32.i64(float [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv2f32.i64( undef, float [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmv_v_f_f32m1(float src, size_t vl) { @@ -26,7 +26,7 @@ vfloat32m1_t test_vfmv_v_f_f32m1(float src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmv.v.f.nxv4f32.i64(float [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv4f32.i64( undef, float [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmv_v_f_f32m2(float src, size_t vl) { @@ -35,7 +35,7 @@ vfloat32m2_t test_vfmv_v_f_f32m2(float src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8f32.i64(float [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8f32.i64( undef, float [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmv_v_f_f32m4(float src, size_t vl) { @@ -44,7 +44,7 @@ vfloat32m4_t test_vfmv_v_f_f32m4(float src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv16f32.i64(float [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv16f32.i64( undef, float [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmv_v_f_f32m8(float src, size_t vl) { @@ -53,7 +53,7 @@ vfloat32m8_t test_vfmv_v_f_f32m8(float src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1f64.i64(double [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1f64.i64( undef, double [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmv_v_f_f64m1(double src, size_t vl) { @@ -62,7 +62,7 @@ vfloat64m1_t test_vfmv_v_f_f64m1(double src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv2f64.i64(double [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmv.v.f.nxv2f64.i64( undef, double [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmv_v_f_f64m2(double src, size_t vl) { @@ -71,7 +71,7 @@ vfloat64m2_t test_vfmv_v_f_f64m2(double src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv4f64.i64(double [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv4f64.i64( undef, double [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmv_v_f_f64m4(double src, size_t vl) { @@ -80,7 +80,7 @@ vfloat64m4_t test_vfmv_v_f_f64m4(double src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8f64.i64(double [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8f64.i64( undef, double [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmv_v_f_f64m8(double src, size_t vl) { @@ -251,7 +251,7 @@ vfloat64m8_t test_vfmv_s_f_f64m8(vfloat64m8_t dst, double src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1f16.i64(half [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1f16.i64( undef, half [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmv_v_f_f16mf4 (_Float16 src, size_t vl) { @@ -260,7 +260,7 @@ vfloat16mf4_t test_vfmv_v_f_f16mf4 (_Float16 src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv2f16.i64(half [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv2f16.i64( undef, half [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vfloat16mf2_t test_vfmv_v_f_f16mf2 (_Float16 src, size_t vl) { @@ -269,7 +269,7 @@ vfloat16mf2_t test_vfmv_v_f_f16mf2 (_Float16 src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv4f16.i64(half [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv4f16.i64( undef, half [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmv_v_f_f16m1 (_Float16 src, size_t vl) { @@ -278,7 +278,7 @@ vfloat16m1_t test_vfmv_v_f_f16m1 (_Float16 src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8f16.i64(half [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8f16.i64( undef, half [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmv_v_f_f16m2 (_Float16 src, size_t vl) { @@ -287,7 +287,7 @@ vfloat16m2_t test_vfmv_v_f_f16m2 (_Float16 src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv16f16.i64(half [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv16f16.i64( undef, half [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmv_v_f_f16m4 (_Float16 src, size_t vl) { @@ -296,7 +296,7 @@ vfloat16m4_t test_vfmv_v_f_f16m4 (_Float16 src, size_t vl) { // CHECK-RV64-LABEL: @test_vfmv_v_f_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv32f16.i64(half [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv32f16.i64( undef, half [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmv_v_f_f16m8 (_Float16 src, size_t vl) { diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmv.c index 0f8517f447a13..ce4e2cf69df91 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmv.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vmv_v_v_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmv_v_v_i8mf8(vint8mf8_t src, size_t vl) { @@ -17,7 +17,7 @@ vint8mf8_t test_vmv_v_v_i8mf8(vint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmv_v_x_i8mf8(int8_t src, size_t vl) { @@ -26,7 +26,7 @@ vint8mf8_t test_vmv_v_x_i8mf8(int8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmv_v_v_i8mf4(vint8mf4_t src, size_t vl) { @@ -35,7 +35,7 @@ vint8mf4_t test_vmv_v_v_i8mf4(vint8mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmv_v_x_i8mf4(int8_t src, size_t vl) { @@ -44,7 
+44,7 @@ vint8mf4_t test_vmv_v_x_i8mf4(int8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmv_v_v_i8mf2(vint8mf2_t src, size_t vl) { @@ -53,7 +53,7 @@ vint8mf2_t test_vmv_v_v_i8mf2(vint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmv_v_x_i8mf2(int8_t src, size_t vl) { @@ -62,7 +62,7 @@ vint8mf2_t test_vmv_v_x_i8mf2(int8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmv_v_v_i8m1(vint8m1_t src, size_t vl) { @@ -71,7 +71,7 @@ vint8m1_t test_vmv_v_v_i8m1(vint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmv_v_x_i8m1(int8_t src, size_t vl) { @@ -80,7 +80,7 @@ vint8m1_t test_vmv_v_x_i8m1(int8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( 
[[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmv_v_v_i8m2(vint8m2_t src, size_t vl) { @@ -89,7 +89,7 @@ vint8m2_t test_vmv_v_v_i8m2(vint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmv_v_x_i8m2(int8_t src, size_t vl) { @@ -98,7 +98,7 @@ vint8m2_t test_vmv_v_x_i8m2(int8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmv_v_v_i8m4(vint8m4_t src, size_t vl) { @@ -107,7 +107,7 @@ vint8m4_t test_vmv_v_v_i8m4(vint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv32i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv32i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmv_v_x_i8m4(int8_t src, size_t vl) { @@ -116,7 +116,7 @@ vint8m4_t test_vmv_v_x_i8m4(int8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv64i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv64i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t 
test_vmv_v_v_i8m8(vint8m8_t src, size_t vl) { @@ -125,7 +125,7 @@ vint8m8_t test_vmv_v_v_i8m8(vint8m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv64i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv64i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmv_v_x_i8m8(int8_t src, size_t vl) { @@ -134,7 +134,7 @@ vint8m8_t test_vmv_v_x_i8m8(int8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmv_v_v_i16mf4(vint16mf4_t src, size_t vl) { @@ -143,7 +143,7 @@ vint16mf4_t test_vmv_v_v_i16mf4(vint16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmv_v_x_i16mf4(int16_t src, size_t vl) { @@ -152,7 +152,7 @@ vint16mf4_t test_vmv_v_x_i16mf4(int16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmv_v_v_i16mf2(vint16mf2_t src, size_t vl) { @@ -161,7 +161,7 @@ vint16mf2_t test_vmv_v_v_i16mf2(vint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: 
@test_vmv_v_x_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmv_v_x_i16mf2(int16_t src, size_t vl) { @@ -170,7 +170,7 @@ vint16mf2_t test_vmv_v_x_i16mf2(int16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmv_v_v_i16m1(vint16m1_t src, size_t vl) { @@ -179,7 +179,7 @@ vint16m1_t test_vmv_v_v_i16m1(vint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmv_v_x_i16m1(int16_t src, size_t vl) { @@ -188,7 +188,7 @@ vint16m1_t test_vmv_v_x_i16m1(int16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmv_v_v_i16m2(vint16m2_t src, size_t vl) { @@ -197,7 +197,7 @@ vint16m2_t test_vmv_v_v_i16m2(vint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmv_v_x_i16m2(int16_t src, size_t vl) { @@ -206,7 +206,7 @@ vint16m2_t test_vmv_v_x_i16m2(int16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmv_v_v_i16m4(vint16m4_t src, size_t vl) { @@ -215,7 +215,7 @@ vint16m4_t test_vmv_v_v_i16m4(vint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmv_v_x_i16m4(int16_t src, size_t vl) { @@ -224,7 +224,7 @@ vint16m4_t test_vmv_v_x_i16m4(int16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmv_v_v_i16m8(vint16m8_t src, size_t vl) { @@ -233,7 +233,7 @@ vint16m8_t test_vmv_v_v_i16m8(vint16m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv32i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv32i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t 
test_vmv_v_x_i16m8(int16_t src, size_t vl) { @@ -242,7 +242,7 @@ vint16m8_t test_vmv_v_x_i16m8(int16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmv_v_v_i32mf2(vint32mf2_t src, size_t vl) { @@ -251,7 +251,7 @@ vint32mf2_t test_vmv_v_v_i32mf2(vint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmv_v_x_i32mf2(int32_t src, size_t vl) { @@ -260,7 +260,7 @@ vint32mf2_t test_vmv_v_x_i32mf2(int32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmv_v_v_i32m1(vint32m1_t src, size_t vl) { @@ -269,7 +269,7 @@ vint32m1_t test_vmv_v_v_i32m1(vint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmv_v_x_i32m1(int32_t src, size_t vl) { @@ -278,7 +278,7 @@ vint32m1_t test_vmv_v_x_i32m1(int32_t src, size_t vl) { // CHECK-RV64-LABEL: 
@test_vmv_v_v_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmv_v_v_i32m2(vint32m2_t src, size_t vl) { @@ -287,7 +287,7 @@ vint32m2_t test_vmv_v_v_i32m2(vint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmv_v_x_i32m2(int32_t src, size_t vl) { @@ -296,7 +296,7 @@ vint32m2_t test_vmv_v_x_i32m2(int32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmv_v_v_i32m4(vint32m4_t src, size_t vl) { @@ -305,7 +305,7 @@ vint32m4_t test_vmv_v_v_i32m4(vint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmv_v_x_i32m4(int32_t src, size_t vl) { @@ -314,7 +314,7 @@ vint32m4_t test_vmv_v_x_i32m4(int32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmv_v_v_i32m8(vint32m8_t src, size_t vl) { @@ -323,7 +323,7 @@ vint32m8_t test_vmv_v_v_i32m8(vint32m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmv_v_x_i32m8(int32_t src, size_t vl) { @@ -332,7 +332,7 @@ vint32m8_t test_vmv_v_x_i32m8(int32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmv_v_v_i64m1(vint64m1_t src, size_t vl) { @@ -341,7 +341,7 @@ vint64m1_t test_vmv_v_v_i64m1(vint64m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmv_v_x_i64m1(int64_t src, size_t vl) { @@ -350,7 +350,7 @@ vint64m1_t test_vmv_v_x_i64m1(int64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmv_v_v_i64m2(vint64m2_t src, 
size_t vl) { @@ -359,7 +359,7 @@ vint64m2_t test_vmv_v_v_i64m2(vint64m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmv_v_x_i64m2(int64_t src, size_t vl) { @@ -368,7 +368,7 @@ vint64m2_t test_vmv_v_x_i64m2(int64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmv_v_v_i64m4(vint64m4_t src, size_t vl) { @@ -377,7 +377,7 @@ vint64m4_t test_vmv_v_v_i64m4(vint64m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmv_v_x_i64m4(int64_t src, size_t vl) { @@ -386,7 +386,7 @@ vint64m4_t test_vmv_v_x_i64m4(int64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmv_v_v_i64m8(vint64m8_t src, size_t vl) { @@ -395,7 +395,7 @@ vint64m8_t test_vmv_v_v_i64m8(vint64m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_i64m8( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmv_v_x_i64m8(int64_t src, size_t vl) { @@ -404,7 +404,7 @@ vint64m8_t test_vmv_v_x_i64m8(int64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmv_v_v_u8mf8(vuint8mf8_t src, size_t vl) { @@ -413,7 +413,7 @@ vuint8mf8_t test_vmv_v_v_u8mf8(vuint8mf8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmv_v_x_u8mf8(uint8_t src, size_t vl) { @@ -422,7 +422,7 @@ vuint8mf8_t test_vmv_v_x_u8mf8(uint8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmv_v_v_u8mf4(vuint8mf4_t src, size_t vl) { @@ -431,7 +431,7 @@ vuint8mf4_t test_vmv_v_v_u8mf4(vuint8mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmv.v.x.nxv2i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmv_v_x_u8mf4(uint8_t src, size_t vl) { @@ -440,7 +440,7 @@ vuint8mf4_t test_vmv_v_x_u8mf4(uint8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmv_v_v_u8mf2(vuint8mf2_t src, size_t vl) { @@ -449,7 +449,7 @@ vuint8mf2_t test_vmv_v_v_u8mf2(vuint8mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmv_v_x_u8mf2(uint8_t src, size_t vl) { @@ -458,7 +458,7 @@ vuint8mf2_t test_vmv_v_x_u8mf2(uint8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmv_v_v_u8m1(vuint8m1_t src, size_t vl) { @@ -467,7 +467,7 @@ vuint8m1_t test_vmv_v_v_u8m1(vuint8m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmv_v_x_u8m1(uint8_t src, size_t vl) { @@ -476,7 +476,7 @@ 
vuint8m1_t test_vmv_v_x_u8m1(uint8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmv_v_v_u8m2(vuint8m2_t src, size_t vl) { @@ -485,7 +485,7 @@ vuint8m2_t test_vmv_v_v_u8m2(vuint8m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmv_v_x_u8m2(uint8_t src, size_t vl) { @@ -494,7 +494,7 @@ vuint8m2_t test_vmv_v_x_u8m2(uint8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmv_v_v_u8m4(vuint8m4_t src, size_t vl) { @@ -503,7 +503,7 @@ vuint8m4_t test_vmv_v_v_u8m4(vuint8m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv32i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv32i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmv_v_x_u8m4(uint8_t src, size_t vl) { @@ -512,7 +512,7 @@ vuint8m4_t test_vmv_v_x_u8m4(uint8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmv.v.v.nxv64i8.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv64i8.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmv_v_v_u8m8(vuint8m8_t src, size_t vl) { @@ -521,7 +521,7 @@ vuint8m8_t test_vmv_v_v_u8m8(vuint8m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv64i8.i64(i8 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv64i8.i64( undef, i8 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmv_v_x_u8m8(uint8_t src, size_t vl) { @@ -530,7 +530,7 @@ vuint8m8_t test_vmv_v_x_u8m8(uint8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmv_v_v_u16mf4(vuint16mf4_t src, size_t vl) { @@ -539,7 +539,7 @@ vuint16mf4_t test_vmv_v_v_u16mf4(vuint16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmv_v_x_u16mf4(uint16_t src, size_t vl) { @@ -548,7 +548,7 @@ vuint16mf4_t test_vmv_v_x_u16mf4(uint16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i16.i64( undef, [[SRC:%.*]], i64 
[[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmv_v_v_u16mf2(vuint16mf2_t src, size_t vl) { @@ -557,7 +557,7 @@ vuint16mf2_t test_vmv_v_v_u16mf2(vuint16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmv_v_x_u16mf2(uint16_t src, size_t vl) { @@ -566,7 +566,7 @@ vuint16mf2_t test_vmv_v_x_u16mf2(uint16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmv_v_v_u16m1(vuint16m1_t src, size_t vl) { @@ -575,7 +575,7 @@ vuint16m1_t test_vmv_v_v_u16m1(vuint16m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmv_v_x_u16m1(uint16_t src, size_t vl) { @@ -584,7 +584,7 @@ vuint16m1_t test_vmv_v_x_u16m1(uint16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmv_v_v_u16m2(vuint16m2_t src, size_t vl) { @@ -593,7 +593,7 @@ vuint16m2_t 
test_vmv_v_v_u16m2(vuint16m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmv_v_x_u16m2(uint16_t src, size_t vl) { @@ -602,7 +602,7 @@ vuint16m2_t test_vmv_v_x_u16m2(uint16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmv_v_v_u16m4(vuint16m4_t src, size_t vl) { @@ -611,7 +611,7 @@ vuint16m4_t test_vmv_v_v_u16m4(vuint16m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmv_v_x_u16m4(uint16_t src, size_t vl) { @@ -620,7 +620,7 @@ vuint16m4_t test_vmv_v_x_u16m4(uint16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32i16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmv_v_v_u16m8(vuint16m8_t src, size_t vl) { @@ -629,7 +629,7 @@ vuint16m8_t test_vmv_v_v_u16m8(vuint16m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vmv.v.x.nxv32i16.i64(i16 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv32i16.i64( undef, i16 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmv_v_x_u16m8(uint16_t src, size_t vl) { @@ -638,7 +638,7 @@ vuint16m8_t test_vmv_v_x_u16m8(uint16_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmv_v_v_u32mf2(vuint32mf2_t src, size_t vl) { @@ -647,7 +647,7 @@ vuint32mf2_t test_vmv_v_v_u32mf2(vuint32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmv_v_x_u32mf2(uint32_t src, size_t vl) { @@ -656,7 +656,7 @@ vuint32mf2_t test_vmv_v_x_u32mf2(uint32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmv_v_v_u32m1(vuint32m1_t src, size_t vl) { @@ -665,7 +665,7 @@ vuint32m1_t test_vmv_v_v_u32m1(vuint32m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmv.v.x.nxv2i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmv_v_x_u32m1(uint32_t src, size_t vl) { @@ -674,7 +674,7 @@ vuint32m1_t test_vmv_v_x_u32m1(uint32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmv_v_v_u32m2(vuint32m2_t src, size_t vl) { @@ -683,7 +683,7 @@ vuint32m2_t test_vmv_v_v_u32m2(vuint32m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmv_v_x_u32m2(uint32_t src, size_t vl) { @@ -692,7 +692,7 @@ vuint32m2_t test_vmv_v_x_u32m2(uint32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmv_v_v_u32m4(vuint32m4_t src, size_t vl) { @@ -701,7 +701,7 @@ vuint32m4_t test_vmv_v_v_u32m4(vuint32m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmv_v_x_u32m4(uint32_t src, 
size_t vl) { @@ -710,7 +710,7 @@ vuint32m4_t test_vmv_v_x_u32m4(uint32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16i32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmv_v_v_u32m8(vuint32m8_t src, size_t vl) { @@ -719,7 +719,7 @@ vuint32m8_t test_vmv_v_v_u32m8(vuint32m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i32.i64(i32 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv16i32.i64( undef, i32 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmv_v_x_u32m8(uint32_t src, size_t vl) { @@ -728,7 +728,7 @@ vuint32m8_t test_vmv_v_x_u32m8(uint32_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmv_v_v_u64m1(vuint64m1_t src, size_t vl) { @@ -737,7 +737,7 @@ vuint64m1_t test_vmv_v_v_u64m1(vuint64m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv1i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmv_v_x_u64m1(uint64_t src, size_t vl) { @@ -746,7 +746,7 @@ vuint64m1_t test_vmv_v_x_u64m1(uint64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m2( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmv_v_v_u64m2(vuint64m2_t src, size_t vl) { @@ -755,7 +755,7 @@ vuint64m2_t test_vmv_v_v_u64m2(vuint64m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv2i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmv_v_x_u64m2(uint64_t src, size_t vl) { @@ -764,7 +764,7 @@ vuint64m2_t test_vmv_v_x_u64m2(uint64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmv_v_v_u64m4(vuint64m4_t src, size_t vl) { @@ -773,7 +773,7 @@ vuint64m4_t test_vmv_v_v_u64m4(vuint64m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv4i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmv_v_x_u64m4(uint64_t src, size_t vl) { @@ -782,7 +782,7 @@ vuint64m4_t test_vmv_v_x_u64m4(uint64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8i64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmv.v.v.nxv8i64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmv_v_v_u64m8(vuint64m8_t src, size_t vl) { @@ -791,7 +791,7 @@ vuint64m8_t test_vmv_v_v_u64m8(vuint64m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_x_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i64.i64(i64 [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.x.nxv8i64.i64( undef, i64 [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmv_v_x_u64m8(uint64_t src, size_t vl) { @@ -800,7 +800,7 @@ vuint64m8_t test_vmv_v_x_u64m8(uint64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vmv_v_v_f32mf2(vfloat32mf2_t src, size_t vl) { @@ -809,7 +809,7 @@ vfloat32mf2_t test_vmv_v_v_f32mf2(vfloat32mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vmv_v_v_f32m1(vfloat32m1_t src, size_t vl) { @@ -818,7 +818,7 @@ vfloat32m1_t test_vmv_v_v_f32m1(vfloat32m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t 
test_vmv_v_v_f32m2(vfloat32m2_t src, size_t vl) { @@ -827,7 +827,7 @@ vfloat32m2_t test_vmv_v_v_f32m2(vfloat32m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vmv_v_v_f32m4(vfloat32m4_t src, size_t vl) { @@ -836,7 +836,7 @@ vfloat32m4_t test_vmv_v_v_f32m4(vfloat32m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16f32.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16f32.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vmv_v_v_f32m8(vfloat32m8_t src, size_t vl) { @@ -845,7 +845,7 @@ vfloat32m8_t test_vmv_v_v_f32m8(vfloat32m8_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vmv_v_v_f64m1(vfloat64m1_t src, size_t vl) { @@ -854,7 +854,7 @@ vfloat64m1_t test_vmv_v_v_f64m1(vfloat64m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vmv_v_v_f64m2(vfloat64m2_t src, size_t vl) { @@ -863,7 +863,7 @@ vfloat64m2_t test_vmv_v_v_f64m2(vfloat64m2_t src, size_t vl) { // 
CHECK-RV64-LABEL: @test_vmv_v_v_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vmv_v_v_f64m4(vfloat64m4_t src, size_t vl) { @@ -872,7 +872,7 @@ vfloat64m4_t test_vmv_v_v_f64m4(vfloat64m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f64.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f64.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vmv_v_v_f64m8(vfloat64m8_t src, size_t vl) { @@ -1645,7 +1645,7 @@ vuint64m8_t test_vmv_s_x_u64m8(vuint64m8_t dst, uint64_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv1f16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vmv_v_v_f16mf4 (vfloat16mf4_t src, size_t vl) { @@ -1654,7 +1654,7 @@ vfloat16mf4_t test_vmv_v_v_f16mf4 (vfloat16mf4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv2f16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vmv_v_v_f16mf2 (vfloat16mf2_t src, size_t vl) { @@ -1663,7 +1663,7 @@ vfloat16mf2_t test_vmv_v_v_f16mf2 (vfloat16mf2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmv.v.v.nxv4f16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv4f16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vmv_v_v_f16m1 (vfloat16m1_t src, size_t vl) { @@ -1672,7 +1672,7 @@ vfloat16m1_t test_vmv_v_v_f16m1 (vfloat16m1_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv8f16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vmv_v_v_f16m2 (vfloat16m2_t src, size_t vl) { @@ -1681,7 +1681,7 @@ vfloat16m2_t test_vmv_v_v_f16m2 (vfloat16m2_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16f16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv16f16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vmv_v_v_f16m4 (vfloat16m4_t src, size_t vl) { @@ -1690,7 +1690,7 @@ vfloat16m4_t test_vmv_v_v_f16m4 (vfloat16m4_t src, size_t vl) { // CHECK-RV64-LABEL: @test_vmv_v_v_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32f16.i64( [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmv.v.v.nxv32f16.i64( undef, [[SRC:%.*]], i64 [[VL:%.*]]) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vmv_v_v_f16m8 (vfloat16m8_t src, size_t vl) { diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 2c338f139cdb5..33bbf2a2bf4c2 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1266,20 +1266,29 @@ let TargetPrefix = "riscv" in { defm vmerge : RISCVBinaryWithV0; + 
// Output: (vector) + // Input: (passthru, vector_in, vl) def int_riscv_vmv_v_v : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } + // Output: (vector) + // Input: (passthru, scalar, vl) def int_riscv_vmv_v_x : Intrinsic<[llvm_anyint_ty], - [LLVMVectorElementType<0>, llvm_anyint_ty], + [LLVMMatchType<0>, LLVMVectorElementType<0>, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } + // Output: (vector) + // Input: (passthru, scalar, vl) def int_riscv_vfmv_v_f : Intrinsic<[llvm_anyfloat_ty], - [LLVMVectorElementType<0>,llvm_anyint_ty], + [LLVMMatchType<0>, LLVMVectorElementType<0>, + llvm_anyint_ty], [IntrNoMem]>, RISCVVIntrinsic { - let VLOperand = 1; + let VLOperand = 2; } def int_riscv_vmv_x_s : Intrinsic<[LLVMVectorElementType<0>], diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 6e7ae9e34d232..8b315960eff9b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -56,7 +56,8 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { VT.isInteger() ? 
RISCVISD::VMV_V_X_VL : RISCVISD::VFMV_V_F_VL; SDLoc DL(N); SDValue VL = CurDAG->getRegister(RISCV::X0, Subtarget->getXLenVT()); - SDValue Result = CurDAG->getNode(Opc, DL, VT, N->getOperand(0), VL); + SDValue Result = CurDAG->getNode(Opc, DL, VT, CurDAG->getUNDEF(VT), + N->getOperand(0), VL); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); @@ -71,11 +72,12 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { if (N->getOpcode() != RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL) continue; - assert(N->getNumOperands() == 3 && "Unexpected number of operands"); + assert(N->getNumOperands() == 4 && "Unexpected number of operands"); MVT VT = N->getSimpleValueType(0); - SDValue Lo = N->getOperand(0); - SDValue Hi = N->getOperand(1); - SDValue VL = N->getOperand(2); + SDValue Passthru = N->getOperand(0); + SDValue Lo = N->getOperand(1); + SDValue Hi = N->getOperand(2); + SDValue VL = N->getOperand(3); assert(VT.getVectorElementType() == MVT::i64 && VT.isScalableVector() && Lo.getValueType() == MVT::i32 && Hi.getValueType() == MVT::i32 && "Unexpected VTs!"); @@ -106,7 +108,7 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() { CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64); SDValue Ops[] = {Chain, IntID, - CurDAG->getUNDEF(VT), + Passthru, StackSlot, CurDAG->getRegister(RISCV::X0, MVT::i64), VL}; @@ -1624,9 +1626,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // Try to match splat of a scalar load to a strided load with stride of x0. bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL || Node->getOpcode() == RISCVISD::VFMV_S_F_VL; - if (IsScalarMove && !Node->getOperand(0).isUndef()) + bool HasPassthruOperand = Node->getOpcode() != ISD::SPLAT_VECTOR; + if (HasPassthruOperand && !IsScalarMove && !Node->getOperand(0).isUndef()) break; - SDValue Src = IsScalarMove ? Node->getOperand(1) : Node->getOperand(0); + SDValue Src = HasPassthruOperand ? 
Node->getOperand(1) : Node->getOperand(0); auto *Ld = dyn_cast(Src); if (!Ld) break; @@ -1648,7 +1651,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; selectVLOp(Node->getOperand(2), VL); } else - selectVLOp(Node->getOperand(1), VL); + selectVLOp(Node->getOperand(2), VL); unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); SDValue SEW = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT); @@ -1924,9 +1927,9 @@ bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) { } bool RISCVDAGToDAGISel::selectVSplat(SDValue N, SDValue &SplatVal) { - if (N.getOpcode() != RISCVISD::VMV_V_X_VL) + if (N.getOpcode() != RISCVISD::VMV_V_X_VL || !N.getOperand(0).isUndef()) return false; - SplatVal = N.getOperand(0); + SplatVal = N.getOperand(1); return true; } @@ -1936,11 +1939,12 @@ static bool selectVSplatSimmHelper(SDValue N, SDValue &SplatVal, SelectionDAG &DAG, const RISCVSubtarget &Subtarget, ValidateFn ValidateImm) { - if (N.getOpcode() != RISCVISD::VMV_V_X_VL || - !isa(N.getOperand(0))) + if (N.getOpcode() != RISCVISD::VMV_V_X_VL || !N.getOperand(0).isUndef() || + !isa(N.getOperand(1))) return false; - int64_t SplatImm = cast(N.getOperand(0))->getSExtValue(); + int64_t SplatImm = + cast(N.getOperand(1))->getSExtValue(); // The semantics of RISCVISD::VMV_V_X_VL is that when the operand // type is wider than the resulting vector element type: an implicit @@ -1950,7 +1954,7 @@ static bool selectVSplatSimmHelper(SDValue N, SDValue &SplatVal, // For example, we wish to match (i8 -1) -> (XLenVT 255) as a simm5 by first // sign-extending to (XLenVT -1). 
MVT XLenVT = Subtarget.getXLenVT(); - assert(XLenVT == N.getOperand(0).getSimpleValueType() && + assert(XLenVT == N.getOperand(1).getSimpleValueType() && "Unexpected splat operand type"); MVT EltVT = N.getSimpleValueType().getVectorElementType(); if (EltVT.bitsLT(XLenVT)) @@ -1983,11 +1987,12 @@ bool RISCVDAGToDAGISel::selectVSplatSimm5Plus1NonZero(SDValue N, } bool RISCVDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &SplatVal) { - if (N.getOpcode() != RISCVISD::VMV_V_X_VL || - !isa(N.getOperand(0))) + if (N.getOpcode() != RISCVISD::VMV_V_X_VL || !N.getOperand(0).isUndef() || + !isa(N.getOperand(1))) return false; - int64_t SplatImm = cast(N.getOperand(0))->getSExtValue(); + int64_t SplatImm = + cast(N.getOperand(1))->getSExtValue(); if (!isUInt<5>(SplatImm)) return false; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1beb4dceb9d89..c20d2d1ef710a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1906,7 +1906,8 @@ static SDValue lowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG, unsigned Opc = VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL; - SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, Op.getOperand(0), VL); + SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), + Op.getOperand(0), VL); return convertFromScalableVector(VT, Splat, DAG, Subtarget); } @@ -2164,7 +2165,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return Gather; unsigned Opc = VT.isFloatingPoint() ? 
RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL; - Splat = DAG.getNode(Opc, DL, ContainerVT, Splat, VL); + Splat = + DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Splat, VL); return convertFromScalableVector(VT, Splat, DAG, Subtarget); } @@ -2272,6 +2274,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, getContainerForFixedLengthVector(DAG, ViaVecVT, Subtarget); SDValue Splat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ViaContainerVT, + DAG.getUNDEF(ViaContainerVT), DAG.getConstant(SplatValue, DL, XLenVT), ViaVL); Splat = convertFromScalableVector(ViaVecVT, Splat, DAG, Subtarget); return DAG.getBitcast(VT, Splat); @@ -2359,15 +2362,19 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return SDValue(); } -static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Lo, - SDValue Hi, SDValue VL, SelectionDAG &DAG) { +static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru, + SDValue Lo, SDValue Hi, SDValue VL, + SelectionDAG &DAG) { + bool HasPassthru = Passthru && !Passthru.isUndef(); + if (!HasPassthru && !Passthru) + Passthru = DAG.getUNDEF(VT); if (isa(Lo) && isa(Hi)) { int32_t LoC = cast(Lo)->getSExtValue(); int32_t HiC = cast(Hi)->getSExtValue(); // If Hi constant is all the same sign bit as Lo, lower this as a custom // node in order to try and match RVV vector/scalar instructions. if ((LoC >> 31) == HiC) - return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL); + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Lo, VL); // If vl is equal to XLEN_MAX and Hi constant is equal to Lo, we could use // vmv.v.x whose EEW = 32 to lower it. @@ -2376,41 +2383,46 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Lo, MVT InterVT = MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2); // TODO: if vl <= min(VLMAX), we can also do this. But we could not // access the subtarget here now. 
- auto InterVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterVT, Lo, + auto InterVec = DAG.getNode( + RISCVISD::VMV_V_X_VL, DL, InterVT, DAG.getUNDEF(InterVT), Lo, DAG.getRegister(RISCV::X0, MVT::i32)); return DAG.getNode(ISD::BITCAST, DL, VT, InterVec); } } // Fall back to a stack store and stride x0 vector load. - return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VT, Lo, Hi, VL); + return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VT, Passthru, Lo, + Hi, VL); } // Called by type legalization to handle splat of i64 on RV32. // FIXME: We can optimize this when the type has sign or zero bits in one // of the halves. -static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar, - SDValue VL, SelectionDAG &DAG) { +static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru, + SDValue Scalar, SDValue VL, + SelectionDAG &DAG) { assert(Scalar.getValueType() == MVT::i64 && "Unexpected VT!"); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, DAG.getConstant(0, DL, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Scalar, DAG.getConstant(1, DL, MVT::i32)); - return splatPartsI64WithVL(DL, VT, Lo, Hi, VL, DAG); + return splatPartsI64WithVL(DL, VT, Passthru, Lo, Hi, VL, DAG); } // This function lowers a splat of a scalar operand Splat with the vector // length VL. It ensures the final sequence is type legal, which is useful when // lowering a splat after type legalization. -static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL, - SelectionDAG &DAG, +static SDValue lowerScalarSplat(SDValue Passthru, SDValue Scalar, SDValue VL, + MVT VT, SDLoc DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { + bool HasPassthru = Passthru && !Passthru.isUndef(); + if (!HasPassthru && !Passthru) + Passthru = DAG.getUNDEF(VT); if (VT.isFloatingPoint()) { // If VL is 1, we could use vfmv.s.f. 
if (isOneConstant(VL)) - return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, VT, DAG.getUNDEF(VT), - Scalar, VL); - return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL); + return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, VT, Passthru, Scalar, VL); + return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Passthru, Scalar, VL); } MVT XLenVT = Subtarget.getXLenVT(); @@ -2429,20 +2441,19 @@ static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL, // use vmv.s.x. if (isOneConstant(VL) && (!Const || isNullConstant(Scalar) || !isInt<5>(Const->getSExtValue()))) - return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT), Scalar, - VL); - return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL); + return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, Passthru, Scalar, VL); + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Scalar, VL); } assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 && "Unexpected scalar for splat lowering!"); if (isOneConstant(VL) && isNullConstant(Scalar)) - return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT), + return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, Passthru, DAG.getConstant(0, DL, XLenVT), VL); // Otherwise use the more complicated splatting algorithm. - return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG); + return splatSplitI64WithVL(DL, VT, Passthru, Scalar, VL, DAG); } // Is the mask a slidedown that shifts in undefs. @@ -2658,7 +2669,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, unsigned Opc = VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL; - SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, V, VL); + SDValue Splat = + DAG.getNode(Opc, DL, ContainerVT, DAG.getUNDEF(ContainerVT), V, VL); return convertFromScalableVector(VT, Splat, DAG, Subtarget); } @@ -2767,6 +2779,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, V2, TrueMask, VL); // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer. 
SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT, + DAG.getUNDEF(IntHalfVT), DAG.getAllOnesConstant(DL, XLenVT)); SDValue WidenMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, V2, Multiplier, TrueMask, VL); @@ -2870,7 +2883,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // TODO: This doesn't trigger for i64 vectors on RV32, since there we // encounter a bitcasted BUILD_VECTOR with low/high i32 values. if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) { - Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget); + Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG, + Subtarget); } else { V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); // If only one index is used, we can use a "splat" vrgather. @@ -4242,7 +4256,8 @@ SDValue RISCVTargetLowering::lowerSPLAT_VECTOR_PARTS(SDValue Op, std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - SDValue Res = splatPartsI64WithVL(DL, ContainerVT, Lo, Hi, VL, DAG); + SDValue Res = + splatPartsI64WithVL(DL, ContainerVT, SDValue(), Lo, Hi, VL, DAG); return convertFromScalableVector(VecVT, Res, DAG, Subtarget); } @@ -4252,19 +4267,20 @@ SDValue RISCVTargetLowering::lowerSPLAT_VECTOR_PARTS(SDValue Op, // If Hi constant is all the same sign bit as Lo, lower this as a custom // node in order to try and match RVV vector/scalar instructions. if ((LoC >> 31) == HiC) - return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, Lo, - DAG.getRegister(RISCV::X0, MVT::i32)); + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), + Lo, DAG.getRegister(RISCV::X0, MVT::i32)); } // Detect cases where Hi is (SRA Lo, 31) which means Hi is Lo sign extended. 
if (Hi.getOpcode() == ISD::SRA && Hi.getOperand(0) == Lo && isa(Hi.getOperand(1)) && Hi.getConstantOperandVal(1) == 31) - return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, Lo, + return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), Lo, DAG.getRegister(RISCV::X0, MVT::i32)); // Fall back to use a stack store and stride x0 vector load. Use X0 as VL. - return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VecVT, Lo, Hi, + return DAG.getNode(RISCVISD::SPLAT_VECTOR_SPLIT_I64_VL, DL, VecVT, + DAG.getUNDEF(VecVT), Lo, Hi, DAG.getRegister(RISCV::X0, MVT::i32)); } @@ -4297,10 +4313,12 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, SplatZero = DAG.getSplatVector(VecVT, DL, SplatZero); SplatTrueVal = DAG.getSplatVector(VecVT, DL, SplatTrueVal); } else { - SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, SplatZero, - DAG.getRegister(RISCV::X0, XLenVT)); - SplatTrueVal = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, SplatTrueVal, - DAG.getRegister(RISCV::X0, XLenVT)); + SplatZero = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), + SplatZero, DAG.getRegister(RISCV::X0, XLenVT)); + SplatTrueVal = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), + SplatTrueVal, DAG.getRegister(RISCV::X0, XLenVT)); } return DAG.getNode(ISD::VSELECT, DL, VecVT, Src, SplatTrueVal, SplatZero); @@ -4315,9 +4333,10 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatZero, VL); - SplatTrueVal = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatTrueVal, VL); + SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatZero, VL); + SplatTrueVal = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatTrueVal, VL); SDValue Select = 
DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, SplatTrueVal, SplatZero, VL); @@ -4375,8 +4394,10 @@ SDValue RISCVTargetLowering::lowerVectorMaskTrunc(SDValue Op, SDValue SplatOne = DAG.getConstant(1, DL, Subtarget.getXLenVT()); SDValue SplatZero = DAG.getConstant(0, DL, Subtarget.getXLenVT()); - SplatOne = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatOne); - SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, SplatZero); + SplatOne = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatOne); + SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), SplatZero); if (VecVT.isScalableVector()) { SDValue Trunc = DAG.getNode(ISD::AND, DL, VecVT, Src, SplatOne); @@ -4472,8 +4493,8 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SDValue InsertI64VL = DAG.getConstant(2, DL, XLenVT); // Note: We can't pass a UNDEF to the first VSLIDE1UP_VL since an untied // undef doesn't obey the earlyclobber constraint. Just splat a zero value. - ValInVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, I32ContainerVT, Zero, - InsertI64VL); + ValInVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, I32ContainerVT, + DAG.getUNDEF(I32ContainerVT), Zero, InsertI64VL); // First slide in the hi value, then the lo in underneath it. ValInVec = DAG.getNode(RISCVISD::VSLIDE1UP_VL, DL, I32ContainerVT, DAG.getUNDEF(I32ContainerVT), ValInVec, ValHi, @@ -4653,7 +4674,7 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG, // be sign extended? 
SDValue VL = getVLOperand(Op); assert(VL.getValueType() == XLenVT); - ScalarOp = splatSplitI64WithVL(DL, VT, ScalarOp, VL, DAG); + ScalarOp = splatSplitI64WithVL(DL, VT, SDValue(), ScalarOp, VL, DAG); return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands); } @@ -4722,10 +4743,11 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1)); case Intrinsic::riscv_vmv_v_x: return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2), - Op.getSimpleValueType(), DL, DAG, Subtarget); + Op.getOperand(3), Op.getSimpleValueType(), DL, DAG, + Subtarget); case Intrinsic::riscv_vfmv_v_f: return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::riscv_vmv_s_x: { SDValue Scalar = Op.getOperand(2); @@ -4756,9 +4778,10 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Vec = Op.getOperand(1); SDValue VL = getVLOperand(Op); - SDValue SplattedVal = splatSplitI64WithVL(DL, VT, Scalar, VL, DAG); - SDValue SplattedIdx = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, - DAG.getConstant(0, DL, MVT::i32), VL); + SDValue SplattedVal = splatSplitI64WithVL(DL, VT, SDValue(), Scalar, VL, DAG); + SDValue SplattedIdx = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), + DAG.getConstant(0, DL, MVT::i32), VL); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); @@ -5130,8 +5153,9 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op, SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags()); - SDValue IdentitySplat = lowerScalarSplat( - NeutralElem, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); + SDValue IdentitySplat = + lowerScalarSplat(SDValue(), NeutralElem, DAG.getConstant(1, DL, XLenVT), + M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), 
Vec, IdentitySplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, @@ -5192,8 +5216,9 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op, SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - SDValue ScalarSplat = lowerScalarSplat( - ScalarVal, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); + SDValue ScalarSplat = + lowerScalarSplat(SDValue(), ScalarVal, DAG.getConstant(1, DL, XLenVT), + M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), VectorVal, ScalarSplat, Mask, VL); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, @@ -5259,9 +5284,9 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op, MVT XLenVT = Subtarget.getXLenVT(); MVT ResVT = !VecVT.isInteger() || VecEltVT.bitsGE(XLenVT) ? VecEltVT : XLenVT; - SDValue StartSplat = - lowerScalarSplat(Op.getOperand(0), DAG.getConstant(1, DL, XLenVT), M1VT, - DL, DAG, Subtarget); + SDValue StartSplat = lowerScalarSplat(SDValue(), Op.getOperand(0), + DAG.getConstant(1, DL, XLenVT), M1VT, + DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, StartSplat, Vec, StartSplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction, @@ -5563,13 +5588,13 @@ SDValue RISCVTargetLowering::lowerSTEP_VECTOR(SDValue Op, if (StepValImm != 1) { if (isPowerOf2_64(StepValImm)) { SDValue StepVal = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), DAG.getConstant(Log2_64(StepValImm), DL, XLenVT)); StepVec = DAG.getNode(ISD::SHL, DL, VT, StepVec, StepVal); } else { SDValue StepVal = lowerScalarSplat( - DAG.getConstant(StepValImm, DL, VT.getVectorElementType()), VL, VT, - DL, DAG, Subtarget); + SDValue(), DAG.getConstant(StepValImm, DL, VT.getVectorElementType()), + VL, VT, DL, DAG, Subtarget); StepVec = DAG.getNode(ISD::MUL, DL, VT, StepVec, StepVal); } } @@ 
-5645,8 +5670,8 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, if (!IsRV32E64) SplatVL = DAG.getSplatVector(IntVT, DL, VLMinus1); else - SplatVL = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT, VLMinus1, - DAG.getRegister(RISCV::X0, XLenVT)); + SplatVL = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT, DAG.getUNDEF(IntVT), + VLMinus1, DAG.getRegister(RISCV::X0, XLenVT)); SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, IntVT, Mask, VL); SDValue Indices = @@ -5900,9 +5925,9 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const { SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); - SDValue SplatZero = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + SDValue SplatZero = DAG.getNode( + RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), + DAG.getConstant(0, DL, Subtarget.getXLenVT())); SDValue NegX = DAG.getNode(RISCVISD::SUB_VL, DL, ContainerVT, SplatZero, X, Mask, VL); SDValue Max = @@ -6814,6 +6839,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, // To extract the upper XLEN bits of the vector element, shift the first // element right by 32 bits and re-extract the lower XLEN bits. 
SDValue ThirtyTwoV = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), DAG.getConstant(32, DL, XLenVT), VL); SDValue LShr32 = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, Vec, ThirtyTwoV, Mask, VL); @@ -6916,8 +6942,9 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue VL = DAG.getConstant(1, DL, XLenVT); MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount()); SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL); - SDValue ThirtyTwoV = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, - DAG.getConstant(32, DL, XLenVT), VL); + SDValue ThirtyTwoV = + DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VecVT, DAG.getUNDEF(VecVT), + DAG.getConstant(32, DL, XLenVT), VL); SDValue LShr32 = DAG.getNode(RISCVISD::SRL_VL, DL, VecVT, Vec, ThirtyTwoV, Mask, VL); SDValue EltHi = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, LShr32); @@ -7725,8 +7752,8 @@ static SDValue combineVWADD_W_VL_VWSUB_W_VL(SDNode *N, SelectionDAG &DAG) { // Look for splats on the left hand side of a vwadd(u).wv. We might be able // to commute and use a vwadd(u).vx instead. if (IsAdd && Op0.getOpcode() == RISCVISD::VMV_V_X_VL && - Op0.getOperand(1) == VL) { - Op0 = Op0.getOperand(0); + Op0.getOperand(0).isUndef() && Op0.getOperand(2) == VL) { + Op0 = Op0.getOperand(1); // See if have enough sign bits or zero bits in the scalar to use a // widening add/sub by splatting to smaller element size. @@ -7746,7 +7773,8 @@ static SDValue combineVWADD_W_VL_VWSUB_W_VL(SDNode *N, SelectionDAG &DAG) { return SDValue(); } - Op0 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op0, VL); + Op0 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, + DAG.getUNDEF(NarrowVT), Op0, VL); return DAG.getNode(VOpc, DL, VT, Op1, Op0, Mask, VL); } @@ -7798,12 +7826,15 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, } else if (Op1.getOpcode() == RISCVISD::VMV_V_X_VL) { // The operand is a splat of a scalar. 
+ // The pasthru must be undef for tail agnostic + if (!Op1.getOperand(0).isUndef()) + return SDValue(); // The VL must be the same. - if (Op1.getOperand(1) != VL) + if (Op1.getOperand(2) != VL) return SDValue(); // Get the scalar value. - Op1 = Op1.getOperand(0); + Op1 = Op1.getOperand(1); // See if have enough sign bits or zero bits in the scalar to use a // widening multiply by splatting to smaller element size. @@ -7826,7 +7857,8 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG, return SDValue(); } - Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op1, VL); + Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, + DAG.getUNDEF(NarrowVT), Op1, VL); } else return SDValue(); @@ -8398,8 +8430,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SDLoc DL(N); SDValue VL = N->getOperand(3); EVT VT = N->getValueType(0); - ShAmt = - DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, ShAmt.getOperand(0), VL); + ShAmt = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), + ShAmt.getOperand(1), VL); return DAG.getNode(N->getOpcode(), DL, VT, N->getOperand(0), ShAmt, N->getOperand(2), N->getOperand(3)); } @@ -8413,7 +8445,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // We don't need the upper 32 bits of a 64-bit element for a shift amount. SDLoc DL(N); EVT VT = N->getValueType(0); - ShAmt = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, ShAmt.getOperand(0), + ShAmt = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), + ShAmt.getOperand(1), DAG.getRegister(RISCV::X0, Subtarget.getXLenVT())); return DAG.getNode(N->getOpcode(), DL, VT, N->getOperand(0), ShAmt); } @@ -8470,11 +8503,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case RISCVISD::VMV_V_X_VL: { - // VMV.V.X only demands the vector element bitwidth from the scalar input. - unsigned ScalarSize = N->getOperand(0).getValueSizeInBits(); + // Tail agnostic VMV.V.X only demands the vector element bitwidth from the + // scalar input. 
+ unsigned ScalarSize = N->getOperand(1).getValueSizeInBits(); unsigned EltWidth = N->getValueType(0).getScalarSizeInBits(); - if (ScalarSize > EltWidth) - if (SimplifyDemandedLowBitsHelper(0, EltWidth)) + if (ScalarSize > EltWidth && N->getOperand(0).isUndef()) + if (SimplifyDemandedLowBitsHelper(1, EltWidth)) return SDValue(N, 0); break; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 134d5fa89739f..2a4fa57aad662 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -129,10 +129,12 @@ enum NodeType : unsigned { BFPW, // Vector Extension // VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand - // for the VL value to be used for the operation. + // for the VL value to be used for the operation. The first operand is + // passthru operand. VMV_V_X_VL, // VFMV_V_F_VL matches the semantics of vfmv.v.f but includes an extra operand - // for the VL value to be used for the operation. + // for the VL value to be used for the operation. The first operand is + // passthru operand. VFMV_V_F_VL, // VMV_X_S matches the semantics of vmv.x.s. The result is always XLenVT sign // extended from the vector element size. @@ -143,6 +145,7 @@ enum NodeType : unsigned { VFMV_S_F_VL, // Splats an 64-bit value that has been split into two i32 parts. This is // expanded late to two scalar stores and a stride 0 vector load. + // The first operand is passthru operand. 
SPLAT_VECTOR_SPLIT_I64_VL, // Read VLENB CSR READ_VLENB, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index f0caf72e01204..fc6ec3879c779 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -859,6 +859,21 @@ class VPseudoUnaryNoDummyMask(PseudoToVInst.VInst); } +class VPseudoUnaryNoDummyMaskTU : + Pseudo<(outs RetClass:$rd), + (ins RetClass:$dest, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>, + RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let HasVLOp = 1; + let HasSEWOp = 1; + let HasMergeOp = 1; + let Constraints = "$rd = $dest"; + let BaseInstr = !cast(PseudoToVInst.VInst); +} + class VPseudoNullaryNoMask: Pseudo<(outs RegClass:$rd), (ins AVL:$vl, ixlenimm:$sew), @@ -2000,6 +2015,12 @@ multiclass VPseudoUnaryVMV_V_X_I { Sched<[WriteVIMovX, ReadVIMovX]>; def "_I_" # m.MX : VPseudoUnaryNoDummyMask, Sched<[WriteVIMovI]>; + def "_V_" # m.MX # "_TU": VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVIMovV, ReadVIMovV]>; + def "_X_" # m.MX # "_TU": VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVIMovX, ReadVIMovX]>; + def "_I_" # m.MX # "_TU": VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVIMovI]>; } } } @@ -2011,6 +2032,9 @@ multiclass VPseudoVMV_F { def "_" # f.FX # "_" # m.MX : VPseudoUnaryNoDummyMask, Sched<[WriteVFMovV, ReadVFMovF]>; + def "_" # f.FX # "_" # m.MX # "_TU": + VPseudoUnaryNoDummyMaskTU, + Sched<[WriteVFMovV, ReadVFMovF]>; } } } @@ -5071,10 +5095,16 @@ defm : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">; // 12.16. 
Vector Integer Move Instructions //===----------------------------------------------------------------------===// foreach vti = AllVectors in { - def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$rs1), + def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector undef), + (vti.Vector vti.RegClass:$rs1), VLOpFrag)), (!cast("PseudoVMV_V_V_"#vti.LMul.MX) $rs1, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$passthru), + (vti.Vector vti.RegClass:$rs1), + VLOpFrag)), + (!cast("PseudoVMV_V_V_"#vti.LMul.MX#"_TU") + $passthru, $rs1, GPR:$vl, vti.Log2SEW)>; // vmv.v.x/vmv.v.i are handled in RISCInstrVInstrInfoVVLPatterns.td } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 9514ed8f3ff4d..1ce006d3da990 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -584,7 +584,7 @@ defm : VPatBinarySDNode_VV_VX_VI; foreach vti = AllIntegerVectors in { // Emit shift by 1 as an add since it might be faster. 
def : Pat<(shl (vti.Vector vti.RegClass:$rs1), - (vti.Vector (riscv_vmv_v_x_vl 1, (XLenVT srcvalue)))), + (vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), 1, (XLenVT srcvalue)))), (!cast("PseudoVADD_VV_"# vti.LMul.MX) vti.RegClass:$rs1, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index cefdd4d3c6098..9e84cae445a9b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -47,13 +47,15 @@ def SDT_RISCVFPBinOp_VL : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisVT<4, XLenVT>]>; def riscv_vmv_v_x_vl : SDNode<"RISCVISD::VMV_V_X_VL", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>, - SDTCisVT<1, XLenVT>, - SDTCisVT<2, XLenVT>]>>; + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<0>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, XLenVT>, + SDTCisVT<3, XLenVT>]>>; def riscv_vfmv_v_f_vl : SDNode<"RISCVISD::VFMV_V_F_VL", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>, - SDTCisEltOfVec<1, 0>, - SDTCisVT<2, XLenVT>]>>; + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0, 1>, + SDTCisEltOfVec<2, 0>, + SDTCisVT<3, XLenVT>]>>; def riscv_vmv_s_x_vl : SDNode<"RISCVISD::VMV_S_X_VL", SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, @@ -295,7 +297,7 @@ def SplatPat_simm5_plus1_nonzero // Ignore the vl operand. def SplatFPOp : PatFrag<(ops node:$op), - (riscv_vfmv_v_f_vl node:$op, srcvalue)>; + (riscv_vfmv_v_f_vl undef, node:$op, srcvalue)>; def sew8simm5 : ComplexPattern", []>; def sew16simm5 : ComplexPattern", []>; @@ -765,7 +767,7 @@ defm : VPatBinaryVL_VV_VX_VI; foreach vti = AllIntegerVectors in { // Emit shift by 1 as an add since it might be faster. 
def : Pat<(riscv_shl_vl (vti.Vector vti.RegClass:$rs1), - (riscv_vmv_v_x_vl 1, (XLenVT srcvalue)), + (riscv_vmv_v_x_vl (vti.Vector undef), 1, (XLenVT srcvalue)), (vti.Mask true_mask), VLOpFrag), (!cast("PseudoVADD_VV_"# vti.LMul.MX) @@ -1045,14 +1047,21 @@ foreach vti = AllIntegerVectors in { // 12.16. Vector Integer Move Instructions foreach vti = AllIntegerVectors in { - def : Pat<(vti.Vector (riscv_vmv_v_x_vl GPR:$rs2, VLOpFrag)), + def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), GPR:$rs2, VLOpFrag)), (!cast("PseudoVMV_V_X_"#vti.LMul.MX) $rs2, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.Vector:$passthru, GPR:$rs2, VLOpFrag)), + (!cast("PseudoVMV_V_X_"#vti.LMul.MX#"_TU") + $passthru, $rs2, GPR:$vl, vti.Log2SEW)>; defvar ImmPat = !cast("sew"#vti.SEW#"simm5"); - def : Pat<(vti.Vector (riscv_vmv_v_x_vl (ImmPat XLenVT:$imm5), + def : Pat<(vti.Vector (riscv_vmv_v_x_vl (vti.Vector undef), (ImmPat XLenVT:$imm5), VLOpFrag)), (!cast("PseudoVMV_V_I_"#vti.LMul.MX) XLenVT:$imm5, GPR:$vl, vti.Log2SEW)>; + def : Pat<(vti.Vector (riscv_vmv_v_x_vl vti.Vector:$passthru, (ImmPat XLenVT:$imm5), + VLOpFrag)), + (!cast("PseudoVMV_V_I_"#vti.LMul.MX#"_TU") + $passthru, XLenVT:$imm5, GPR:$vl, vti.Log2SEW)>; } // 12.1. Vector Single-Width Saturating Add and Subtract @@ -1336,16 +1345,26 @@ foreach fvti = AllFloatVectors in { // 14.16. Vector Floating-Point Move Instruction // If we're splatting fpimm0, use vmv.v.x vd, x0. 
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - (fvti.Scalar (fpimm0)), VLOpFrag)), + (fvti.Vector undef), (fvti.Scalar (fpimm0)), VLOpFrag)), (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) 0, GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar (fpimm0)), VLOpFrag)), + (!cast("PseudoVMV_V_I_"#fvti.LMul.MX#"_TU") + $passthru, 0, GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), + (fvti.Vector undef), (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # fvti.LMul.MX) (fvti.Scalar fvti.ScalarRegClass:$rs2), GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl + fvti.Vector:$passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), VLOpFrag)), + (!cast("PseudoVFMV_V_" # fvti.ScalarSuffix # "_" # + fvti.LMul.MX # "_TU") + $passthru, (fvti.Scalar fvti.ScalarRegClass:$rs2), + GPR:$vl, fvti.Log2SEW)>; // 14.17. Vector Single-Width Floating-Point/Integer Type-Convert Instructions defm : VPatConvertFP2ISDNode_V_VL; diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv32.ll index 4591e30275dcf..fe0e8acb0e595 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv32.ll @@ -395,7 +395,7 @@ define @icmp_ult_vi_nxv8i8_4( %va) { ret %vc } -declare @llvm.riscv.vmv.v.x.nxv8i8(i8, i32); +declare @llvm.riscv.vmv.v.x.nxv8i8(, i8, i32); ; Test that we don't optimize ult x, 0 -> ule x, -1 define @icmp_ult_vi_nxv8i8_5( %va, i32 %vl) { @@ -404,7 +404,7 @@ define @icmp_ult_vi_nxv8i8_5( %va, i32 %vl) { ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu ; CHECK-NEXT: vmsltu.vx v0, v8, zero ; CHECK-NEXT: ret - %splat = call @llvm.riscv.vmv.v.x.nxv8i8(i8 0, i32 %vl) + %splat = call @llvm.riscv.vmv.v.x.nxv8i8( undef, i8 0, i32 %vl) %vc = icmp ult %va, %splat ret %vc } @@ -1038,7 +1038,7 @@ define @icmp_uge_vi_nxv8i8_6( 
%va, i32 %vl) { ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu ; CHECK-NEXT: vmsleu.vv v0, v9, v8 ; CHECK-NEXT: ret - %splat = call @llvm.riscv.vmv.v.x.nxv8i8(i8 0, i32 %vl) + %splat = call @llvm.riscv.vmv.v.x.nxv8i8( undef, i8 0, i32 %vl) %vc = icmp uge %va, %splat ret %vc } diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv64.ll index b5fd086badf9f..6e86aaa4b7257 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer-rv64.ll @@ -296,7 +296,7 @@ define @icmp_uge_vi_nxv8i8_6( %va, i64 %vl) { ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu ; CHECK-NEXT: vmsleu.vv v0, v9, v8 ; CHECK-NEXT: ret - %splat = call @llvm.riscv.vmv.v.x.nxv8i8(i8 0, i64 %vl) + %splat = call @llvm.riscv.vmv.v.x.nxv8i8( undef, i8 0, i64 %vl) %vc = icmp uge %va, %splat ret %vc } @@ -409,7 +409,7 @@ define @icmp_ult_vi_nxv8i8_4( %va) { ret %vc } -declare @llvm.riscv.vmv.v.x.nxv8i8(i8, i64); +declare @llvm.riscv.vmv.v.x.nxv8i8(, i8, i64); ; Test that we don't optimize ult x, 0 -> ule x, -1 define @icmp_ult_vi_nxv8i8_5( %va, i64 %vl) { @@ -418,7 +418,7 @@ define @icmp_ult_vi_nxv8i8_5( %va, i64 %vl) { ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu ; CHECK-NEXT: vmsltu.vx v0, v8, zero ; CHECK-NEXT: ret - %splat = call @llvm.riscv.vmv.v.x.nxv8i8(i8 0, i64 %vl) + %splat = call @llvm.riscv.vmv.v.x.nxv8i8( undef, i8 0, i64 %vl) %vc = icmp ult %va, %splat ret %vc } diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll index 8147a08481923..a1758a0d67b46 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll @@ -3235,3 +3235,112 @@ entry: ret %a } + +declare @llvm.riscv.vmv.v.v.nxv1i8( + , + , + iXLen); + +define @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, tu, mu 
+; RV32-NEXT: vmv.v.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; RV64-NEXT: vmv.v.v v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmv.v.v.nxv1i8( + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vmv.v.v.nxv1f32( + , + , + iXLen); + +define @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, tu, mu +; RV32-NEXT: vmv.v.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, tu, mu +; RV64-NEXT: vmv.v.v v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmv.v.v.nxv1f32( + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vmv.v.x.nxv1i64( + , + i64, + iXLen); + +define @intrinsic_vmv.v.x_x_nxv1i64( %0, i64 %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vmv.v.x_x_nxv1i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmv.v.x_x_nxv1i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmv.v.x.nxv1i64( + %0, + i64 %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vfmv.v.f.nxv1f32( + , + float, + iXLen); + +define @intrinsic_vfmv.v.f_f_nxv1f32( %0, float %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vfmv.v.f_f_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, tu, mu +; RV32-NEXT: vfmv.v.f v8, fa0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmv.v.f_f_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: 
vsetvli zero, a0, e32, mf2, tu, mu +; RV64-NEXT: vfmv.v.f v8, fa0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmv.v.f.nxv1f32( + %0, + float %1, + iXLen %2) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll index 6e0613e3e49ba..65f737a859e3e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.v.f.ll @@ -4,6 +4,7 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare @llvm.riscv.vfmv.v.f.nxv1f16( + , half, iXLen); @@ -15,6 +16,7 @@ define @intrinsic_vfmv.v.f_f_nxv1f16(half %0, iXLen %1) noun ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv1f16( + undef, half %0, iXLen %1) @@ -22,6 +24,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv2f16( + , half, iXLen); @@ -33,6 +36,7 @@ define @intrinsic_vfmv.v.f_f_nxv2f16(half %0, iXLen %1) noun ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv2f16( + undef, half %0, iXLen %1) @@ -40,6 +44,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv4f16( + , half, iXLen); @@ -51,6 +56,7 @@ define @intrinsic_vfmv.v.f_f_nxv4f16(half %0, iXLen %1) noun ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv4f16( + undef, half %0, iXLen %1) @@ -58,6 +64,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv8f16( + , half, iXLen); @@ -69,6 +76,7 @@ define @intrinsic_vfmv.v.f_f_nxv8f16(half %0, iXLen %1) noun ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv8f16( + undef, half %0, iXLen %1) @@ -76,6 +84,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv16f16( + , half, iXLen); @@ -87,6 +96,7 @@ define @intrinsic_vfmv.v.f_f_nxv16f16(half %0, iXLen %1) no ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv16f16( + undef, half %0, iXLen %1) @@ -94,6 +104,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv32f16( + , half, iXLen); @@ -105,6 +116,7 @@ define @intrinsic_vfmv.v.f_f_nxv32f16(half %0, iXLen %1) no ; CHECK-NEXT: ret entry: %a = 
call @llvm.riscv.vfmv.v.f.nxv32f16( + undef, half %0, iXLen %1) @@ -112,6 +124,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv1f32( + , float, iXLen); @@ -123,6 +136,7 @@ define @intrinsic_vfmv.v.f_f_nxv1f32(float %0, iXLen %1) no ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv1f32( + undef, float %0, iXLen %1) @@ -130,6 +144,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv2f32( + , float, iXLen); @@ -141,6 +156,7 @@ define @intrinsic_vfmv.v.f_f_nxv2f32(float %0, iXLen %1) no ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv2f32( + undef, float %0, iXLen %1) @@ -148,6 +164,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv4f32( + , float, iXLen); @@ -159,6 +176,7 @@ define @intrinsic_vfmv.v.f_f_nxv4f32(float %0, iXLen %1) no ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv4f32( + undef, float %0, iXLen %1) @@ -166,6 +184,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv8f32( + , float, iXLen); @@ -177,6 +196,7 @@ define @intrinsic_vfmv.v.f_f_nxv8f32(float %0, iXLen %1) no ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv8f32( + undef, float %0, iXLen %1) @@ -184,6 +204,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv16f32( + , float, iXLen); @@ -195,6 +216,7 @@ define @intrinsic_vfmv.v.f_f_nxv16f32(float %0, iXLen %1) ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv16f32( + undef, float %0, iXLen %1) @@ -202,6 +224,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv1f64( + , double, iXLen); @@ -213,6 +236,7 @@ define @intrinsic_vfmv.v.f_f_nxv1f64(double %0, iXLen %1) ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv1f64( + undef, double %0, iXLen %1) @@ -220,6 +244,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv2f64( + , double, iXLen); @@ -231,6 +256,7 @@ define @intrinsic_vfmv.v.f_f_nxv2f64(double %0, iXLen %1) ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv2f64( + undef, double %0, iXLen %1) @@ -238,6 +264,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv4f64( + , double, iXLen); @@ -249,6 +276,7 @@ define 
@intrinsic_vfmv.v.f_f_nxv4f64(double %0, iXLen %1) ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv4f64( + undef, double %0, iXLen %1) @@ -256,6 +284,7 @@ entry: } declare @llvm.riscv.vfmv.v.f.nxv8f64( + , double, iXLen); @@ -267,6 +296,7 @@ define @intrinsic_vfmv.v.f_f_nxv8f64(double %0, iXLen %1) ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv8f64( + undef, double %0, iXLen %1) @@ -281,6 +311,7 @@ define @intrinsic_vfmv.v.f_zero_nxv1f16(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv1f16( + undef, half 0.0, iXLen %0) @@ -295,6 +326,7 @@ define @intrinsic_vmv.v.i_zero_nxv2f16(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv2f16( + undef, half 0.0, iXLen %0) @@ -309,6 +341,7 @@ define @intrinsic_vmv.v.i_zero_nxv4f16(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv4f16( + undef, half 0.0, iXLen %0) @@ -323,6 +356,7 @@ define @intrinsic_vmv.v.i_zero_nxv8f16(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv8f16( + undef, half 0.0, iXLen %0) @@ -337,6 +371,7 @@ define @intrinsic_vmv.v.i_zero_nxv16f16(iXLen %0) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv16f16( + undef, half 0.0, iXLen %0) @@ -351,6 +386,7 @@ define @intrinsic_vmv.v.i_zero_nxv32f16(iXLen %0) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv32f16( + undef, half 0.0, iXLen %0) @@ -365,6 +401,7 @@ define @intrinsic_vmv.v.i_zero_nxv1f32(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv1f32( + undef, float 0.0, iXLen %0) @@ -379,6 +416,7 @@ define @intrinsic_vmv.v.i_zero_nxv2f32(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv2f32( + undef, float 0.0, iXLen %0) @@ -393,6 +431,7 @@ define @intrinsic_vmv.v.i_zero_nxv4f32(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv4f32( + undef, float 0.0, iXLen %0) @@ -407,6 +446,7 @@ 
define @intrinsic_vmv.v.i_zero_nxv8f32(iXLen %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv8f32( + undef, float 0.0, iXLen %0) @@ -421,6 +461,7 @@ define @intrinsic_vmv.v.i_zero_nxv16f32(iXLen %0) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv16f32( + undef, float 0.0, iXLen %0) @@ -435,6 +476,7 @@ define @intrinsic_vmv.v.i_zero_nxv1f64(iXLen %0) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv1f64( + undef, double 0.0, iXLen %0) @@ -449,6 +491,7 @@ define @intrinsic_vmv.v.i_zero_nxv2f64(iXLen %0) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv2f64( + undef, double 0.0, iXLen %0) @@ -463,6 +506,7 @@ define @intrinsic_vmv.v.i_zero_nxv4f64(iXLen %0) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv4f64( + undef, double 0.0, iXLen %0) @@ -477,6 +521,7 @@ define @intrinsic_vmv.v.i_zero_nxv8f64(iXLen %0) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vfmv.v.f.nxv8f64( + undef, double 0.0, iXLen %0) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll index d4cebd0ab615e..6bcada4e6f18e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vmv.v.v.nxv1i8( + , , i32); @@ -13,6 +14,7 @@ define @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8( %0 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i8( + undef, %0, i32 %1) @@ -20,6 +22,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i8( + , , i32); @@ -31,6 +34,7 @@ define @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8( %0 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i8( + undef, %0, i32 %1) @@ -38,6 +42,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i8( + , , i32); @@ -49,6 +54,7 @@ define @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8( %0 ; CHECK-NEXT: ret entry: %a = call 
@llvm.riscv.vmv.v.v.nxv4i8( + undef, %0, i32 %1) @@ -56,6 +62,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i8( + , , i32); @@ -67,6 +74,7 @@ define @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8( %0 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i8( + undef, %0, i32 %1) @@ -74,6 +82,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16i8( + , , i32); @@ -85,6 +94,7 @@ define @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8( @llvm.riscv.vmv.v.v.nxv16i8( + undef, %0, i32 %1) @@ -92,6 +102,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv32i8( + , , i32); @@ -103,6 +114,7 @@ define @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8( @llvm.riscv.vmv.v.v.nxv32i8( + undef, %0, i32 %1) @@ -110,6 +122,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv64i8( + , , i32); @@ -121,6 +134,7 @@ define @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8( @llvm.riscv.vmv.v.v.nxv64i8( + undef, %0, i32 %1) @@ -128,6 +142,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1i16( + , , i32); @@ -139,6 +154,7 @@ define @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16( @llvm.riscv.vmv.v.v.nxv1i16( + undef, %0, i32 %1) @@ -146,6 +162,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i16( + , , i32); @@ -157,6 +174,7 @@ define @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16( @llvm.riscv.vmv.v.v.nxv2i16( + undef, %0, i32 %1) @@ -164,6 +182,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i16( + , , i32); @@ -175,6 +194,7 @@ define @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16( @llvm.riscv.vmv.v.v.nxv4i16( + undef, %0, i32 %1) @@ -182,6 +202,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i16( + , , i32); @@ -193,6 +214,7 @@ define @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16( @llvm.riscv.vmv.v.v.nxv8i16( + undef, %0, i32 %1) @@ -200,6 +222,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16i16( + , , i32); @@ -211,6 +234,7 @@ define @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16( @llvm.riscv.vmv.v.v.nxv16i16( + undef, %0, i32 %1) @@ -218,6 +242,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv32i16( + , , i32); @@ -229,6 +254,7 @@ define @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16( @llvm.riscv.vmv.v.v.nxv32i16( + undef, 
%0, i32 %1) @@ -236,6 +262,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1i32( + , , i32); @@ -247,6 +274,7 @@ define @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32( @llvm.riscv.vmv.v.v.nxv1i32( + undef, %0, i32 %1) @@ -254,6 +282,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i32( + , , i32); @@ -265,6 +294,7 @@ define @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32( @llvm.riscv.vmv.v.v.nxv2i32( + undef, %0, i32 %1) @@ -272,6 +302,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i32( + , , i32); @@ -283,6 +314,7 @@ define @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32( @llvm.riscv.vmv.v.v.nxv4i32( + undef, %0, i32 %1) @@ -290,6 +322,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i32( + , , i32); @@ -301,6 +334,7 @@ define @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32( @llvm.riscv.vmv.v.v.nxv8i32( + undef, %0, i32 %1) @@ -308,6 +342,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16i32( + , , i32); @@ -319,6 +354,7 @@ define @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32( @llvm.riscv.vmv.v.v.nxv16i32( + undef, %0, i32 %1) @@ -326,6 +362,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1i64( + , , i32); @@ -337,6 +374,7 @@ define @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64( @llvm.riscv.vmv.v.v.nxv1i64( + undef, %0, i32 %1) @@ -344,6 +382,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i64( + , , i32); @@ -355,6 +394,7 @@ define @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64( @llvm.riscv.vmv.v.v.nxv2i64( + undef, %0, i32 %1) @@ -362,6 +402,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i64( + , , i32); @@ -373,6 +414,7 @@ define @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64( @llvm.riscv.vmv.v.v.nxv4i64( + undef, %0, i32 %1) @@ -380,6 +422,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i64( + , , i32); @@ -391,6 +434,7 @@ define @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64( @llvm.riscv.vmv.v.v.nxv8i64( + undef, %0, i32 %1) @@ -398,6 +442,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1f16( + , , i32); @@ -409,6 +454,7 @@ define @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16( @llvm.riscv.vmv.v.v.nxv1f16( + undef, %0, i32 %1) @@ -416,6 +462,7 @@ entry: } declare 
@llvm.riscv.vmv.v.v.nxv2f16( + , , i32); @@ -427,6 +474,7 @@ define @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16( @llvm.riscv.vmv.v.v.nxv2f16( + undef, %0, i32 %1) @@ -434,6 +482,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4f16( + , , i32); @@ -445,6 +494,7 @@ define @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16( @llvm.riscv.vmv.v.v.nxv4f16( + undef, %0, i32 %1) @@ -452,6 +502,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8f16( + , , i32); @@ -463,6 +514,7 @@ define @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16( @llvm.riscv.vmv.v.v.nxv8f16( + undef, %0, i32 %1) @@ -470,6 +522,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16f16( + , , i32); @@ -481,6 +534,7 @@ define @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16( @llvm.riscv.vmv.v.v.nxv16f16( + undef, %0, i32 %1) @@ -488,6 +542,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv32f16( + , , i32); @@ -499,6 +554,7 @@ define @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16( @llvm.riscv.vmv.v.v.nxv32f16( + undef, %0, i32 %1) @@ -506,6 +562,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1f32( + , , i32); @@ -517,6 +574,7 @@ define @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32( @llvm.riscv.vmv.v.v.nxv1f32( + undef, %0, i32 %1) @@ -524,6 +582,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2f32( + , , i32); @@ -535,6 +594,7 @@ define @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32( @llvm.riscv.vmv.v.v.nxv2f32( + undef, %0, i32 %1) @@ -542,6 +602,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4f32( + , , i32); @@ -553,6 +614,7 @@ define @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32( @llvm.riscv.vmv.v.v.nxv4f32( + undef, %0, i32 %1) @@ -560,6 +622,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8f32( + , , i32); @@ -571,6 +634,7 @@ define @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32( @llvm.riscv.vmv.v.v.nxv8f32( + undef, %0, i32 %1) @@ -578,6 +642,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16f32( + , , i32); @@ -589,6 +654,7 @@ define @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32( @llvm.riscv.vmv.v.v.nxv16f32( + undef, %0, i32 %1) @@ -596,6 +662,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1f64( + , , i32); @@ -607,6 +674,7 
@@ define @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64( @llvm.riscv.vmv.v.v.nxv1f64( + undef, %0, i32 %1) @@ -614,6 +682,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2f64( + , , i32); @@ -625,6 +694,7 @@ define @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64( @llvm.riscv.vmv.v.v.nxv2f64( + undef, %0, i32 %1) @@ -632,6 +702,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4f64( + , , i32); @@ -643,6 +714,7 @@ define @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64( @llvm.riscv.vmv.v.v.nxv4f64( + undef, %0, i32 %1) @@ -650,6 +722,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8f64( + , , i32); @@ -661,6 +734,7 @@ define @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64( @llvm.riscv.vmv.v.v.nxv8f64( + undef, %0, i32 %1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll index 7038f655e5bfa..93c1936527b64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vmv.v.v.nxv1i8( + , , i64); @@ -13,6 +14,7 @@ define @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8( %0 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv1i8( + undef, %0, i64 %1) @@ -20,6 +22,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i8( + , , i64); @@ -31,6 +34,7 @@ define @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8( %0 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv2i8( + undef, %0, i64 %1) @@ -38,6 +42,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i8( + , , i64); @@ -49,6 +54,7 @@ define @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8( %0 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv4i8( + undef, %0, i64 %1) @@ -56,6 +62,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i8( + , , i64); @@ -67,6 +74,7 @@ define @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8( %0 ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.v.nxv8i8( + undef, %0, i64 %1) @@ -74,6 +82,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16i8( + , , i64); @@ -85,6 +94,7 @@ 
define @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8( @llvm.riscv.vmv.v.v.nxv16i8( + undef, %0, i64 %1) @@ -92,6 +102,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv32i8( + , , i64); @@ -103,6 +114,7 @@ define @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8( @llvm.riscv.vmv.v.v.nxv32i8( + undef, %0, i64 %1) @@ -110,6 +122,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv64i8( + , , i64); @@ -121,6 +134,7 @@ define @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8( @llvm.riscv.vmv.v.v.nxv64i8( + undef, %0, i64 %1) @@ -128,6 +142,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1i16( + , , i64); @@ -139,6 +154,7 @@ define @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16( @llvm.riscv.vmv.v.v.nxv1i16( + undef, %0, i64 %1) @@ -146,6 +162,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i16( + , , i64); @@ -157,6 +174,7 @@ define @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16( @llvm.riscv.vmv.v.v.nxv2i16( + undef, %0, i64 %1) @@ -164,6 +182,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i16( + , , i64); @@ -175,6 +194,7 @@ define @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16( @llvm.riscv.vmv.v.v.nxv4i16( + undef, %0, i64 %1) @@ -182,6 +202,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i16( + , , i64); @@ -193,6 +214,7 @@ define @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16( @llvm.riscv.vmv.v.v.nxv8i16( + undef, %0, i64 %1) @@ -200,6 +222,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16i16( + , , i64); @@ -211,6 +234,7 @@ define @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16( @llvm.riscv.vmv.v.v.nxv16i16( + undef, %0, i64 %1) @@ -218,6 +242,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv32i16( + , , i64); @@ -229,6 +254,7 @@ define @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16( @llvm.riscv.vmv.v.v.nxv32i16( + undef, %0, i64 %1) @@ -236,6 +262,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1i32( + , , i64); @@ -247,6 +274,7 @@ define @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32( @llvm.riscv.vmv.v.v.nxv1i32( + undef, %0, i64 %1) @@ -254,6 +282,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i32( + , , i64); @@ -265,6 +294,7 @@ define @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32( 
@llvm.riscv.vmv.v.v.nxv2i32( + undef, %0, i64 %1) @@ -272,6 +302,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i32( + , , i64); @@ -283,6 +314,7 @@ define @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32( @llvm.riscv.vmv.v.v.nxv4i32( + undef, %0, i64 %1) @@ -290,6 +322,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i32( + , , i64); @@ -301,6 +334,7 @@ define @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32( @llvm.riscv.vmv.v.v.nxv8i32( + undef, %0, i64 %1) @@ -308,6 +342,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16i32( + , , i64); @@ -319,6 +354,7 @@ define @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32( @llvm.riscv.vmv.v.v.nxv16i32( + undef, %0, i64 %1) @@ -326,6 +362,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1i64( + , , i64); @@ -337,6 +374,7 @@ define @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64( @llvm.riscv.vmv.v.v.nxv1i64( + undef, %0, i64 %1) @@ -344,6 +382,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2i64( + , , i64); @@ -355,6 +394,7 @@ define @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64( @llvm.riscv.vmv.v.v.nxv2i64( + undef, %0, i64 %1) @@ -362,6 +402,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4i64( + , , i64); @@ -373,6 +414,7 @@ define @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64( @llvm.riscv.vmv.v.v.nxv4i64( + undef, %0, i64 %1) @@ -380,6 +422,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8i64( + , , i64); @@ -391,6 +434,7 @@ define @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64( @llvm.riscv.vmv.v.v.nxv8i64( + undef, %0, i64 %1) @@ -398,6 +442,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1f16( + , , i64); @@ -409,6 +454,7 @@ define @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16( @llvm.riscv.vmv.v.v.nxv1f16( + undef, %0, i64 %1) @@ -416,6 +462,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2f16( + , , i64); @@ -427,6 +474,7 @@ define @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16( @llvm.riscv.vmv.v.v.nxv2f16( + undef, %0, i64 %1) @@ -434,6 +482,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4f16( + , , i64); @@ -445,6 +494,7 @@ define @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16( @llvm.riscv.vmv.v.v.nxv4f16( + undef, %0, i64 %1) @@ -452,6 
+502,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8f16( + , , i64); @@ -463,6 +514,7 @@ define @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16( @llvm.riscv.vmv.v.v.nxv8f16( + undef, %0, i64 %1) @@ -470,6 +522,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16f16( + , , i64); @@ -481,6 +534,7 @@ define @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16( @llvm.riscv.vmv.v.v.nxv16f16( + undef, %0, i64 %1) @@ -488,6 +542,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv32f16( + , , i64); @@ -499,6 +554,7 @@ define @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16( @llvm.riscv.vmv.v.v.nxv32f16( + undef, %0, i64 %1) @@ -506,6 +562,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1f32( + , , i64); @@ -517,6 +574,7 @@ define @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32( @llvm.riscv.vmv.v.v.nxv1f32( + undef, %0, i64 %1) @@ -524,6 +582,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2f32( + , , i64); @@ -535,6 +594,7 @@ define @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32( @llvm.riscv.vmv.v.v.nxv2f32( + undef, %0, i64 %1) @@ -542,6 +602,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4f32( + , , i64); @@ -553,6 +614,7 @@ define @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32( @llvm.riscv.vmv.v.v.nxv4f32( + undef, %0, i64 %1) @@ -560,6 +622,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8f32( + , , i64); @@ -571,6 +634,7 @@ define @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32( @llvm.riscv.vmv.v.v.nxv8f32( + undef, %0, i64 %1) @@ -578,6 +642,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv16f32( + , , i64); @@ -589,6 +654,7 @@ define @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32( @llvm.riscv.vmv.v.v.nxv16f32( + undef, %0, i64 %1) @@ -596,6 +662,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv1f64( + , , i64); @@ -607,6 +674,7 @@ define @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64( @llvm.riscv.vmv.v.v.nxv1f64( + undef, %0, i64 %1) @@ -614,6 +682,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv2f64( + , , i64); @@ -625,6 +694,7 @@ define @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64( @llvm.riscv.vmv.v.v.nxv2f64( + undef, %0, i64 %1) @@ -632,6 +702,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv4f64( + 
, , i64); @@ -643,6 +714,7 @@ define @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64( @llvm.riscv.vmv.v.v.nxv4f64( + undef, %0, i64 %1) @@ -650,6 +722,7 @@ entry: } declare @llvm.riscv.vmv.v.v.nxv8f64( + , , i64); @@ -661,6 +734,7 @@ define @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64( @llvm.riscv.vmv.v.v.nxv8f64( + undef, %0, i64 %1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll index 320fa626e7aad..95c5ece26ae11 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vmv.v.x.nxv1i8( + , i8, i32); @@ -13,6 +14,7 @@ define @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, i32 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i8( + undef, i8 %0, i32 %1) @@ -20,6 +22,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i8( + , i8, i32); @@ -31,6 +34,7 @@ define @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, i32 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i8( + undef, i8 %0, i32 %1) @@ -38,6 +42,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i8( + , i8, i32); @@ -49,6 +54,7 @@ define @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, i32 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i8( + undef, i8 %0, i32 %1) @@ -56,6 +62,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i8( + , i8, i32); @@ -67,6 +74,7 @@ define @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, i32 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i8( + undef, i8 %0, i32 %1) @@ -74,6 +82,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv16i8( + , i8, i32); @@ -85,6 +94,7 @@ define @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, i32 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i8( + undef, i8 %0, i32 %1) @@ -92,6 +102,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv32i8( + , i8, i32); @@ -103,6 +114,7 @@ define @intrinsic_vmv.v.x_x_nxv32i8(i8 
%0, i32 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i8( + undef, i8 %0, i32 %1) @@ -110,6 +122,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv64i8( + , i8, i32); @@ -121,6 +134,7 @@ define @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, i32 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv64i8( + undef, i8 %0, i32 %1) @@ -128,6 +142,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv1i16( + , i16, i32); @@ -139,6 +154,7 @@ define @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i16( + undef, i16 %0, i32 %1) @@ -146,6 +162,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i16( + , i16, i32); @@ -157,6 +174,7 @@ define @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i16( + undef, i16 %0, i32 %1) @@ -164,6 +182,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i16( + , i16, i32); @@ -175,6 +194,7 @@ define @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i16( + undef, i16 %0, i32 %1) @@ -182,6 +202,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i16( + , i16, i32); @@ -193,6 +214,7 @@ define @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i16( + undef, i16 %0, i32 %1) @@ -200,6 +222,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv16i16( + , i16, i32); @@ -211,6 +234,7 @@ define @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, i32 %1) nounwin ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i16( + undef, i16 %0, i32 %1) @@ -218,6 +242,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv32i16( + , i16, i32); @@ -229,6 +254,7 @@ define @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, i32 %1) nounwin ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i16( + undef, i16 %0, i32 %1) @@ -236,6 +262,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv1i32( + , i32, i32); @@ -247,6 +274,7 @@ define 
@intrinsic_vmv.v.x_x_nxv1i32(i32 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i32( + undef, i32 %0, i32 %1) @@ -254,6 +282,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i32( + , i32, i32); @@ -265,6 +294,7 @@ define @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i32( + undef, i32 %0, i32 %1) @@ -272,6 +302,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i32( + , i32, i32); @@ -283,6 +314,7 @@ define @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i32( + undef, i32 %0, i32 %1) @@ -290,6 +322,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i32( + , i32, i32); @@ -301,6 +334,7 @@ define @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i32( + undef, i32 %0, i32 %1) @@ -308,6 +342,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv16i32( + , i32, i32); @@ -319,6 +354,7 @@ define @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, i32 %1) nounwin ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i32( + undef, i32 %0, i32 %1) @@ -326,6 +362,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv1i64( + , i64, i32); @@ -342,6 +379,7 @@ define @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i64( + undef, i64 %0, i32 %1) @@ -349,6 +387,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i64( + , i64, i32); @@ -365,6 +404,7 @@ define @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i64( + undef, i64 %0, i32 %1) @@ -372,6 +412,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i64( + , i64, i32); @@ -388,6 +429,7 @@ define @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i64( + undef, i64 %0, i32 %1) @@ -395,6 +437,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i64( + , i64, i32); @@ -411,6 +454,7 
@@ define @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, i32 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i64( + undef, i64 %0, i32 %1) @@ -425,6 +469,7 @@ define @intrinsic_vmv.v.x_i_nxv1i8(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i8( + undef, i8 9, i32 %0) @@ -439,6 +484,7 @@ define @intrinsic_vmv.v.x_i_nxv2i8(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i8( + undef, i8 9, i32 %0) @@ -453,6 +499,7 @@ define @intrinsic_vmv.v.x_i_nxv4i8(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i8( + undef, i8 9, i32 %0) @@ -467,6 +514,7 @@ define @intrinsic_vmv.v.x_i_nxv8i8(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i8( + undef, i8 9, i32 %0) @@ -481,6 +529,7 @@ define @intrinsic_vmv.v.x_i_nxv16i8(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i8( + undef, i8 9, i32 %0) @@ -495,6 +544,7 @@ define @intrinsic_vmv.v.x_i_nxv32i8(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i8( + undef, i8 9, i32 %0) @@ -509,6 +559,7 @@ define @intrinsic_vmv.v.x_i_nxv64i8(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv64i8( + undef, i8 9, i32 %0) @@ -523,6 +574,7 @@ define @intrinsic_vmv.v.x_i_nxv1i16(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i16( + undef, i16 9, i32 %0) @@ -537,6 +589,7 @@ define @intrinsic_vmv.v.x_i_nxv2i16(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i16( + undef, i16 9, i32 %0) @@ -551,6 +604,7 @@ define @intrinsic_vmv.v.x_i_nxv4i16(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i16( + undef, i16 9, i32 %0) @@ -565,6 +619,7 @@ define @intrinsic_vmv.v.x_i_nxv8i16(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i16( + undef, i16 9, i32 %0) @@ -579,6 +634,7 @@ define @intrinsic_vmv.v.x_i_nxv16i16(i32 %0) 
nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i16( + undef, i16 9, i32 %0) @@ -593,6 +649,7 @@ define @intrinsic_vmv.v.x_i_nxv32i16(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i16( + undef, i16 9, i32 %0) @@ -607,6 +664,7 @@ define @intrinsic_vmv.v.x_i_nxv1i32(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i32( + undef, i32 9, i32 %0) @@ -621,6 +679,7 @@ define @intrinsic_vmv.v.x_i_nxv2i32(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i32( + undef, i32 9, i32 %0) @@ -635,6 +694,7 @@ define @intrinsic_vmv.v.x_i_nxv4i32(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i32( + undef, i32 9, i32 %0) @@ -649,6 +709,7 @@ define @intrinsic_vmv.v.x_i_nxv8i32(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i32( + undef, i32 9, i32 %0) @@ -663,6 +724,7 @@ define @intrinsic_vmv.v.x_i_nxv16i32(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i32( + undef, i32 9, i32 %0) @@ -677,6 +739,7 @@ define @intrinsic_vmv.v.x_i_nxv1i64(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i64( + undef, i64 9, i32 %0) @@ -691,6 +754,7 @@ define @intrinsic_vmv.v.x_i_nxv2i64(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i64( + undef, i64 9, i32 %0) @@ -705,6 +769,7 @@ define @intrinsic_vmv.v.x_i_nxv4i64(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i64( + undef, i64 9, i32 %0) @@ -719,6 +784,7 @@ define @intrinsic_vmv.v.x_i_nxv8i64(i32 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i64( + undef, i64 9, i32 %0) @@ -733,6 +799,7 @@ define @intrinsic_vmv.v.x_i_nxv1i64_vlmax() nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i64( + undef, i64 12884901891, i32 -1) @@ -747,6 +814,7 @@ define @intrinsic_vmv.v.x_i_nxv2i64_vlmax() nounwind { ; CHECK-NEXT: ret 
entry: %a = call @llvm.riscv.vmv.v.x.nxv2i64( + undef, i64 12884901891, i32 -1) @@ -761,6 +829,7 @@ define @intrinsic_vmv.v.x_i_nxv4i64_vlmax() nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i64( + undef, i64 12884901891, i32 -1) @@ -775,6 +844,7 @@ define @intrinsic_vmv.v.x_i_nxv8i64_vlmax() nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i64( + undef, i64 12884901891, i32 -1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll index 6bfc5ace93717..fa060ecea9d56 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \ ; RUN: < %s | FileCheck %s declare @llvm.riscv.vmv.v.x.nxv1i8( + , i8, i64); @@ -13,6 +14,7 @@ define @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, i64 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i8( + undef, i8 %0, i64 %1) @@ -20,6 +22,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i8( + , i8, i64); @@ -31,6 +34,7 @@ define @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, i64 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i8( + undef, i8 %0, i64 %1) @@ -38,6 +42,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i8( + , i8, i64); @@ -49,6 +54,7 @@ define @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, i64 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i8( + undef, i8 %0, i64 %1) @@ -56,6 +62,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i8( + , i8, i64); @@ -67,6 +74,7 @@ define @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, i64 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i8( + undef, i8 %0, i64 %1) @@ -74,6 +82,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv16i8( + , i8, i64); @@ -85,6 +94,7 @@ define @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, i64 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i8( + undef, i8 %0, i64 %1) @@ -92,6 +102,7 @@ entry: } 
declare @llvm.riscv.vmv.v.x.nxv32i8( + , i8, i64); @@ -103,6 +114,7 @@ define @intrinsic_vmv.v.x_x_nxv32i8(i8 %0, i64 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i8( + undef, i8 %0, i64 %1) @@ -110,6 +122,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv64i8( + , i8, i64); @@ -121,6 +134,7 @@ define @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, i64 %1) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv64i8( + undef, i8 %0, i64 %1) @@ -128,6 +142,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv1i16( + , i16, i64); @@ -139,6 +154,7 @@ define @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i16( + undef, i16 %0, i64 %1) @@ -146,6 +162,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i16( + , i16, i64); @@ -157,6 +174,7 @@ define @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i16( + undef, i16 %0, i64 %1) @@ -164,6 +182,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i16( + , i16, i64); @@ -175,6 +194,7 @@ define @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i16( + undef, i16 %0, i64 %1) @@ -182,6 +202,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i16( + , i16, i64); @@ -193,6 +214,7 @@ define @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i16( + undef, i16 %0, i64 %1) @@ -200,6 +222,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv16i16( + , i16, i64); @@ -211,6 +234,7 @@ define @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, i64 %1) nounwin ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i16( + undef, i16 %0, i64 %1) @@ -218,6 +242,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv32i16( + , i16, i64); @@ -229,6 +254,7 @@ define @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, i64 %1) nounwin ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i16( + undef, i16 %0, i64 %1) @@ -236,6 +262,7 
@@ entry: } declare @llvm.riscv.vmv.v.x.nxv1i32( + , i32, i64); @@ -247,6 +274,7 @@ define @intrinsic_vmv.v.x_x_nxv1i32(i32 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i32( + undef, i32 %0, i64 %1) @@ -254,6 +282,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i32( + , i32, i64); @@ -265,6 +294,7 @@ define @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i32( + undef, i32 %0, i64 %1) @@ -272,6 +302,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i32( + , i32, i64); @@ -283,6 +314,7 @@ define @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i32( + undef, i32 %0, i64 %1) @@ -290,6 +322,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i32( + , i32, i64); @@ -301,6 +334,7 @@ define @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i32( + undef, i32 %0, i64 %1) @@ -308,6 +342,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv16i32( + , i32, i64); @@ -319,6 +354,7 @@ define @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, i64 %1) nounwin ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i32( + undef, i32 %0, i64 %1) @@ -326,6 +362,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv1i64( + , i64, i64); @@ -337,6 +374,7 @@ define @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i64( + undef, i64 %0, i64 %1) @@ -344,6 +382,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv2i64( + , i64, i64); @@ -355,6 +394,7 @@ define @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i64( + undef, i64 %0, i64 %1) @@ -362,6 +402,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv4i64( + , i64, i64); @@ -373,6 +414,7 @@ define @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i64( + undef, i64 %0, i64 %1) @@ 
-380,6 +422,7 @@ entry: } declare @llvm.riscv.vmv.v.x.nxv8i64( + , i64, i64); @@ -391,6 +434,7 @@ define @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, i64 %1) nounwind ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i64( + undef, i64 %0, i64 %1) @@ -405,6 +449,7 @@ define @intrinsic_vmv.v.x_i_nxv1i8(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i8( + undef, i8 9, i64 %0) @@ -419,6 +464,7 @@ define @intrinsic_vmv.v.x_i_nxv2i8(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i8( + undef, i8 9, i64 %0) @@ -433,6 +479,7 @@ define @intrinsic_vmv.v.x_i_nxv4i8(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i8( + undef, i8 9, i64 %0) @@ -447,6 +494,7 @@ define @intrinsic_vmv.v.x_i_nxv8i8(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i8( + undef, i8 9, i64 %0) @@ -461,6 +509,7 @@ define @intrinsic_vmv.v.x_i_nxv16i8(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i8( + undef, i8 9, i64 %0) @@ -475,6 +524,7 @@ define @intrinsic_vmv.v.x_i_nxv32i8(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i8( + undef, i8 9, i64 %0) @@ -489,6 +539,7 @@ define @intrinsic_vmv.v.x_i_nxv64i8(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv64i8( + undef, i8 9, i64 %0) @@ -503,6 +554,7 @@ define @intrinsic_vmv.v.x_i_nxv1i16(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i16( + undef, i16 9, i64 %0) @@ -517,6 +569,7 @@ define @intrinsic_vmv.v.x_i_nxv2i16(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i16( + undef, i16 9, i64 %0) @@ -531,6 +584,7 @@ define @intrinsic_vmv.v.x_i_nxv4i16(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i16( + undef, i16 9, i64 %0) @@ -545,6 +599,7 @@ define @intrinsic_vmv.v.x_i_nxv8i16(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call 
@llvm.riscv.vmv.v.x.nxv8i16( + undef, i16 9, i64 %0) @@ -559,6 +614,7 @@ define @intrinsic_vmv.v.x_i_nxv16i16(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i16( + undef, i16 9, i64 %0) @@ -573,6 +629,7 @@ define @intrinsic_vmv.v.x_i_nxv32i16(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv32i16( + undef, i16 9, i64 %0) @@ -587,6 +644,7 @@ define @intrinsic_vmv.v.x_i_nxv1i32(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i32( + undef, i32 9, i64 %0) @@ -601,6 +659,7 @@ define @intrinsic_vmv.v.x_i_nxv2i32(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i32( + undef, i32 9, i64 %0) @@ -615,6 +674,7 @@ define @intrinsic_vmv.v.x_i_nxv4i32(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i32( + undef, i32 9, i64 %0) @@ -629,6 +689,7 @@ define @intrinsic_vmv.v.x_i_nxv8i32(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i32( + undef, i32 9, i64 %0) @@ -643,6 +704,7 @@ define @intrinsic_vmv.v.x_i_nxv16i32(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv16i32( + undef, i32 9, i64 %0) @@ -657,6 +719,7 @@ define @intrinsic_vmv.v.x_i_nxv1i64(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv1i64( + undef, i64 9, i64 %0) @@ -671,6 +734,7 @@ define @intrinsic_vmv.v.x_i_nxv2i64(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv2i64( + undef, i64 9, i64 %0) @@ -685,6 +749,7 @@ define @intrinsic_vmv.v.x_i_nxv4i64(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv4i64( + undef, i64 9, i64 %0) @@ -699,6 +764,7 @@ define @intrinsic_vmv.v.x_i_nxv8i64(i64 %0) nounwind { ; CHECK-NEXT: ret entry: %a = call @llvm.riscv.vmv.v.x.nxv8i64( + undef, i64 9, i64 %0) diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll index 
5d0c898e43691..f93b0a5812d44 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll @@ -14,8 +14,8 @@ declare @llvm.riscv.vfsub.nxv1f64.nxv1f64( @llvm.riscv.vfmul.nxv1f64.nxv1f64(, , , i64) -declare @llvm.riscv.vfmv.v.f.nxv1f64.f64(double, i64) -declare @llvm.riscv.vfmv.v.f.nxv2f32.f32(float, i64) +declare @llvm.riscv.vfmv.v.f.nxv1f64.f64(, double, i64) +declare @llvm.riscv.vfmv.v.f.nxv2f32.f32( , float, i64) declare void @llvm.riscv.vse.nxv1f64(, * nocapture, i64) declare void @llvm.riscv.vse.nxv2f32(, * nocapture, i64) @@ -156,16 +156,16 @@ entry: br i1 %tobool, label %if.else, label %if.then if.then: ; preds = %entry - %0 = tail call @llvm.riscv.vfmv.v.f.nxv1f64.f64(double 1.000000e+00, i64 %avl) - %1 = tail call @llvm.riscv.vfmv.v.f.nxv1f64.f64(double 2.000000e+00, i64 %avl) + %0 = tail call @llvm.riscv.vfmv.v.f.nxv1f64.f64( undef, double 1.000000e+00, i64 %avl) + %1 = tail call @llvm.riscv.vfmv.v.f.nxv1f64.f64( undef, double 2.000000e+00, i64 %avl) %2 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( undef, %0, %1, i64 %avl) %3 = bitcast i8* @scratch to * tail call void @llvm.riscv.vse.nxv1f64( %2, * %3, i64 %avl) br label %if.end if.else: ; preds = %entry - %4 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32(float 1.000000e+00, i64 %avl) - %5 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32(float 2.000000e+00, i64 %avl) + %4 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32( undef, float 1.000000e+00, i64 %avl) + %5 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32( undef, float 2.000000e+00, i64 %avl) %6 = tail call @llvm.riscv.vfadd.nxv2f32.nxv2f32( undef, %4, %5, i64 %avl) %7 = bitcast i8* @scratch to * tail call void @llvm.riscv.vse.nxv2f32( %6, * %7, i64 %avl) @@ -299,8 +299,8 @@ if.end: ; preds = %if.else, %if.then if.then4: ; preds = %if.end %3 = tail call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 0) - %4 = tail call @llvm.riscv.vfmv.v.f.nxv1f64.f64(double 1.000000e+00, i64 %3) - %5 = tail call 
@llvm.riscv.vfmv.v.f.nxv1f64.f64(double 2.000000e+00, i64 %3) + %4 = tail call @llvm.riscv.vfmv.v.f.nxv1f64.f64( undef, double 1.000000e+00, i64 %3) + %5 = tail call @llvm.riscv.vfmv.v.f.nxv1f64.f64( undef, double 2.000000e+00, i64 %3) %6 = tail call @llvm.riscv.vfadd.nxv1f64.nxv1f64( undef, %4, %5, i64 %3) %7 = bitcast i8* @scratch to * tail call void @llvm.riscv.vse.nxv1f64( %6, * %7, i64 %3) @@ -308,8 +308,8 @@ if.then4: ; preds = %if.end if.else5: ; preds = %if.end %8 = tail call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 0) - %9 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32(float 1.000000e+00, i64 %8) - %10 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32(float 2.000000e+00, i64 %8) + %9 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32( undef, float 1.000000e+00, i64 %8) + %10 = tail call @llvm.riscv.vfmv.v.f.nxv2f32.f32( undef, float 2.000000e+00, i64 %8) %11 = tail call @llvm.riscv.vfadd.nxv2f32.nxv2f32( undef, %9, %10, i64 %8) %12 = bitcast i8* @scratch to * tail call void @llvm.riscv.vse.nxv2f32( %11, * %12, i64 %8) diff --git a/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll b/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll index fcf6a8c2b0e0a..cc54af8066dfa 100644 --- a/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll +++ b/llvm/test/CodeGen/RISCV/spill-fpr-scalar.ll @@ -7,9 +7,9 @@ declare half @llvm.riscv.vfmv.f.s.nxv1f16() declare float @llvm.riscv.vfmv.f.s.nxv1f32() declare double @llvm.riscv.vfmv.f.s.nxv1f64() -declare @llvm.riscv.vfmv.v.f.nxv1f16(half, i64); -declare @llvm.riscv.vfmv.v.f.nxv1f32(float, i64); -declare @llvm.riscv.vfmv.v.f.nxv1f64(double, i64); +declare @llvm.riscv.vfmv.v.f.nxv1f16(, half, i64); +declare @llvm.riscv.vfmv.v.f.nxv1f32(, float, i64); +declare @llvm.riscv.vfmv.v.f.nxv1f64(, double, i64); define @intrinsic_vfmv.f.s_s_nxv1f16( %0, i64 %1) nounwind { ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f16: @@ -28,7 +28,7 @@ define @intrinsic_vfmv.f.s_s_nxv1f16( %0, entry: %a = call half @llvm.riscv.vfmv.f.s.nxv1f16( %0) tail call void asm sideeffect "", 
"~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"() - %b = call @llvm.riscv.vfmv.v.f.nxv1f16(half %a, i64 %1) + %b = call @llvm.riscv.vfmv.v.f.nxv1f16( undef, half %a, i64 %1) ret %b } @@ -49,7 +49,7 @@ define @intrinsic_vfmv.f.s_s_nxv1f32( % entry: %a = call float @llvm.riscv.vfmv.f.s.nxv1f32( %0) tail call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"() - %b = call @llvm.riscv.vfmv.v.f.nxv1f32(float %a, i64 %1) + %b = call @llvm.riscv.vfmv.v.f.nxv1f32( undef, float %a, i64 %1) ret %b } @@ -70,6 +70,6 @@ define @intrinsic_vfmv.f.s_s_nxv1f64( entry: %a = call double @llvm.riscv.vfmv.f.s.nxv1f64( %0) tail call void asm sideeffect "", "~{f0_d},~{f1_d},~{f2_d},~{f3_d},~{f4_d},~{f5_d},~{f6_d},~{f7_d},~{f8_d},~{f9_d},~{f10_d},~{f11_d},~{f12_d},~{f13_d},~{f14_d},~{f15_d},~{f16_d},~{f17_d},~{f18_d},~{f19_d},~{f20_d},~{f21_d},~{f22_d},~{f23_d},~{f24_d},~{f25_d},~{f26_d},~{f27_d},~{f28_d},~{f29_d},~{f30_d},~{f31_d}"() - %b = call @llvm.riscv.vfmv.v.f.nxv1f64(double %a, i64 %1) + %b = call @llvm.riscv.vfmv.v.f.nxv1f64( undef, double %a, i64 %1) ret %b } From 3af7bbca4a0ef64de64b8bb38d3b167673ec60f0 Mon Sep 17 00:00:00 2001 From: fourdim Date: Thu, 17 Feb 2022 23:00:55 +0800 Subject: [PATCH 103/748] [JITLink][RISCV] fix the extractBits behavior and add R_RISCV_JAL relocation. This patch supports the R_RISCV_JAL relocation. Moreover, it will fix the extractBits function's behavior as it extracts Size + 1 bits. 
In the test ELF_jal.s: Before: ``` Hi: 4294836480 extractBits(Hi, 12, 8): 480 ``` After: ``` Hi: 4294836480 extractBits(Hi, 12, 8): 224 ``` Reviewed By: StephenFan Differential Revision: https://reviews.llvm.org/D117975 --- .../llvm/ExecutionEngine/JITLink/riscv.h | 7 ++++ .../lib/ExecutionEngine/JITLink/ELF_riscv.cpp | 16 ++++++++ llvm/lib/ExecutionEngine/JITLink/riscv.cpp | 2 + .../ExecutionEngine/JITLink/RISCV/ELF_jal.s | 37 +++++++++++++++++++ 4 files changed, 62 insertions(+) create mode 100644 llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h index d0d3a3786e55d..2d32a749111d1 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h @@ -44,6 +44,13 @@ enum EdgeKind_riscv : Edge::Kind { /// R_RISCV_BRANCH, + /// High 20 bits of PC-relative jump pointer value relocation + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend + /// + R_RISCV_JAL, + /// High 20 bits of 32-bit pointer value relocation /// /// Fixup expression diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp index 90f3a38b81d53..469a81d882aea 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -220,6 +220,20 @@ class ELFJITLinker_riscv : public JITLinker { *(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7; break; } + case R_RISCV_JAL: { + int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; + Error AlignmentIssue = checkAlignment(FixupAddress, Value, 2, E); + if (AlignmentIssue) { + return AlignmentIssue; + } + uint32_t Imm20 = extractBits(Value, 20, 1) << 31; + uint32_t Imm10_1 = extractBits(Value, 1, 10) << 21; + uint32_t Imm11 = extractBits(Value, 11, 1) << 20; + uint32_t Imm19_12 = extractBits(Value, 12, 8) << 12; + uint32_t RawInstr = 
*(little32_t *)FixupPtr; + *(little32_t *)FixupPtr = RawInstr | Imm20 | Imm10_1 | Imm11 | Imm19_12; + break; + } case R_RISCV_HI20: { int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); int64_t Hi = Value + 0x800; @@ -409,6 +423,8 @@ class ELFLinkGraphBuilder_riscv : public ELFLinkGraphBuilder { return EdgeKind_riscv::R_RISCV_64; case ELF::R_RISCV_BRANCH: return EdgeKind_riscv::R_RISCV_BRANCH; + case ELF::R_RISCV_JAL: + return EdgeKind_riscv::R_RISCV_JAL; case ELF::R_RISCV_HI20: return EdgeKind_riscv::R_RISCV_HI20; case ELF::R_RISCV_LO12_I: diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp index 3ce2cf10a24cb..0bd57b654d402 100644 --- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp @@ -26,6 +26,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "R_RISCV_64"; case R_RISCV_BRANCH: return "R_RISCV_BRANCH"; + case R_RISCV_JAL: + return "R_RISCV_JAL"; case R_RISCV_HI20: return "R_RISCV_HI20"; case R_RISCV_LO12_I: diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s new file mode 100644 index 0000000000000..82f12358cb6b6 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s @@ -0,0 +1,37 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=riscv64 -filetype=obj \ +# RUN: -o %t/elf_riscv64_jal.o %s +# RUN: llvm-mc -triple=riscv32 -filetype=obj \ +# RUN: -o %t/elf_riscv32_jal.o %s +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x1ff00000 -slab-page-size 4096 \ +# RUN: -define-abs external_func=0x1fe000fe \ +# RUN: -check %s %t/elf_riscv64_jal.o +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x1ff00000 -slab-page-size 4096 \ +# RUN: -define-abs external_func=0x1fe000fe \ +# RUN: -check %s %t/elf_riscv32_jal.o +# + + .text + .file "testcase.c" + +# Empty main entry point. 
+ .globl main + .p2align 1 + .type main,@function +main: + ret + + .size main, .-main + +# Test R_RISCV_JAL + +# jitlink-check: decode_operand(test_jal, 1)[31:12] = (external_func - test_jal)[31:12] + .globl test_jal + .p2align 1 + .type test_jal,@function +test_jal: + jal x0, external_func + + .size test_jal, .-test_jal From 092a5bb72ba8cc5a6cec02cfe61f70130a2c1282 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 17 Feb 2022 10:22:16 -0500 Subject: [PATCH 104/748] [OpenMP][Offloading] Fix test case issues in bug49334.cpp `bug49334.cpp` has one issue that causes flaky result reported in #53730. The root cause is `BlockedC` is never initialized but in `BlockMatMul_TargetNowait` it is directly read and written (via `+=`). Fixes #53730. Reviewed By: jhuber6 Differential Revision: https://reviews.llvm.org/D119988 --- .../libomptarget/test/offloading/bug49334.cpp | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/openmp/libomptarget/test/offloading/bug49334.cpp b/openmp/libomptarget/test/offloading/bug49334.cpp index cd0b185219940..047a78c11ac8b 100644 --- a/openmp/libomptarget/test/offloading/bug49334.cpp +++ b/openmp/libomptarget/test/offloading/bug49334.cpp @@ -50,8 +50,7 @@ class BlockMatrix { } } - long Compare(const std::vector &matrix) const { - long fail = 0; + void Compare(const std::vector &matrix) const { for (int i = 0; i < nBlocksPerCol; i++) for (int j = 0; j < nBlocksPerRow; j++) { float *CurrBlock = GetBlock(i, j); @@ -61,13 +60,10 @@ class BlockMatrix { int currj = j * rowsPerBlock + jj; float m_value = matrix[curri + currj * nCols]; float bm_value = CurrBlock[ii + jj * colsPerBlock]; - if (std::fabs(bm_value - m_value) > - std::numeric_limits::epsilon()) { - fail++; - } + assert(std::fabs(bm_value - m_value) < + std::numeric_limits::epsilon()); } } - return fail; } float *GetBlock(int i, int j) const { @@ -77,7 +73,7 @@ class BlockMatrix { }; constexpr const int BS = 16; -constexpr const int N = 256; +constexpr 
const int N = 16; int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) { #pragma omp parallel @@ -130,20 +126,19 @@ int main(int argc, char *argv[]) { } auto BlockedA = BlockMatrix(BS, BS, N, N); - BlockedA.Initialize(a); - BlockedA.Compare(a); auto BlockedB = BlockMatrix(BS, BS, N, N); + auto BlockedC = BlockMatrix(BS, BS, N, N); + BlockedA.Initialize(a); BlockedB.Initialize(b); + BlockedC.Initialize(c); + BlockedA.Compare(a); BlockedB.Compare(b); + BlockedC.Compare(c); Matmul(a, b, c); - - auto BlockedC = BlockMatrix(BS, BS, N, N); BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC); - if (BlockedC.Compare(c) > 0) { - return 1; - } + BlockedC.Compare(c); std::cout << "PASS\n"; From 234a8422c912ec102e23cc06999245945b53182f Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 16 Feb 2022 13:57:38 -0500 Subject: [PATCH 105/748] [InstCombine] add test for min/max intrinsic with constant expression; NFC --- llvm/test/Transforms/InstCombine/minmax-intrinsics.ll | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index b264cbbc1f10a..4c95d45063f67 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -2352,3 +2352,14 @@ define i8 @smin_smin_smin_reassoc_constants(i8 %x, i8 %y) { %m3 = call i8 @llvm.smin.i8(i8 %m2, i8 126) ret i8 %m3 } + +define i8 @umax_umax_reassoc_constantexpr_sink(i8 %x, i8 %y) { +; CHECK-LABEL: @umax_umax_reassoc_constantexpr_sink( +; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 42) +; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.umax.i8(i8 [[M1]], i8 ptrtoint (i8 (i8, i8)* @umax_umax_reassoc_constantexpr_sink to i8)) +; CHECK-NEXT: ret i8 [[M2]] +; + %m1 = call i8 @llvm.umax.i8(i8 %x, i8 42) + %m2 = call i8 @llvm.umax.i8(i8 %m1, i8 ptrtoint (i8 (i8, i8)* @umax_umax_reassoc_constantexpr_sink to i8)) + ret i8 %m2 +} 
From 58df2da0540c0ae0bb4f72c382fae0b4fbedae1c Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 17 Feb 2022 10:34:48 -0500 Subject: [PATCH 106/748] [InstCombine] push constant operand down/outside in sequence of min/max intrinsics A generalization like this was suggested in D119754. This is the inverse direction of D119851, and we get all of the folds there plus the one that was missed. There is precedence for this kind of transform in instcombine with "or" instructions (but strangely only with that one opcode AFAICT). Similar justification as in the other patch: The line between instcombine and reassociate for these kinds of folds is blurry. This doesn't appear to have much cost and gives us the expected wins from repeated folds as seen in the last set of test diffs. Differential Revision: https://reviews.llvm.org/D119955 --- .../InstCombine/InstCombineCalls.cpp | 33 +++++++++++++++++ .../InstCombine/minmax-intrinsics.ll | 37 +++++++++---------- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index d2a104be802af..eecd583740c30 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -902,6 +902,36 @@ static Instruction *reassociateMinMaxWithConstants(IntrinsicInst *II) { return CallInst::Create(MinMax, {LHS->getArgOperand(0), NewC}); } +/// If this min/max has a matching min/max operand with a constant, try to push +/// the constant operand into this instruction. This can enable more folds. +static Instruction * +reassociateMinMaxWithConstantInOperand(IntrinsicInst *II, + InstCombiner::BuilderTy &Builder) { + // Match and capture a min/max operand candidate. 
+ Value *X, *Y; + Constant *C; + Instruction *Inner; + if (!match(II, m_c_MaxOrMin(m_OneUse(m_CombineAnd( + m_Instruction(Inner), + m_MaxOrMin(m_Value(X), m_ImmConstant(C)))), + m_Value(Y)))) + return nullptr; + + // The inner op must match. Check for constants to avoid infinite loops. + Intrinsic::ID MinMaxID = II->getIntrinsicID(); + auto *InnerMM = dyn_cast(Inner); + if (!InnerMM || InnerMM->getIntrinsicID() != MinMaxID || + match(X, m_ImmConstant()) || match(Y, m_ImmConstant())) + return nullptr; + + // max (max X, C), Y --> max (max X, Y), C + Function *MinMax = + Intrinsic::getDeclaration(II->getModule(), MinMaxID, II->getType()); + Value *NewInner = Builder.CreateBinaryIntrinsic(MinMaxID, X, Y); + NewInner->takeName(Inner); + return CallInst::Create(MinMax, {NewInner, C}); +} + /// Reduce a sequence of min/max intrinsics with a common operand. static Instruction *factorizeMinMaxTree(IntrinsicInst *II) { // Match 3 of the same min/max ops. Example: umin(umin(), umin()). @@ -1250,6 +1280,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *NewMinMax = reassociateMinMaxWithConstants(II)) return NewMinMax; + if (Instruction *R = reassociateMinMaxWithConstantInOperand(II, Builder)) + return R; + if (Instruction *NewMinMax = factorizeMinMaxTree(II)) return NewMinMax; diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index 4c95d45063f67..bc4da868e25a2 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -2259,8 +2259,8 @@ define i8 @umin_umin_reassoc_constant_use(i8 %x, i8 %y) { define i8 @smax_smax_reassoc_constant_sink(i8 %x, i8 %y) { ; CHECK-LABEL: @smax_smax_reassoc_constant_sink( -; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 42) -; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smax.i8(i8 [[M1]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smax.i8(i8 
[[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smax.i8(i8 [[M1]], i8 42) ; CHECK-NEXT: ret i8 [[M2]] ; %m1 = call i8 @llvm.smax.i8(i8 %x, i8 42) @@ -2270,8 +2270,8 @@ define i8 @smax_smax_reassoc_constant_sink(i8 %x, i8 %y) { define <3 x i8> @smin_smin_reassoc_constant_sink(<3 x i8> %x, <3 x i8> %y) { ; CHECK-LABEL: @smin_smin_reassoc_constant_sink( -; CHECK-NEXT: [[M1:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[X:%.*]], <3 x i8> ) -; CHECK-NEXT: [[M2:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[M1]], <3 x i8> [[Y:%.*]]) +; CHECK-NEXT: [[M1:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) +; CHECK-NEXT: [[M2:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[M1]], <3 x i8> ) ; CHECK-NEXT: ret <3 x i8> [[M2]] ; %m1 = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %x, <3 x i8> ) @@ -2281,8 +2281,8 @@ define <3 x i8> @smin_smin_reassoc_constant_sink(<3 x i8> %x, <3 x i8> %y) { define i8 @umax_umax_reassoc_constant_sink(i8 %x, i8 %y) { ; CHECK-LABEL: @umax_umax_reassoc_constant_sink( -; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 42) -; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.umax.i8(i8 [[M1]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.umax.i8(i8 [[M1]], i8 42) ; CHECK-NEXT: ret i8 [[M2]] ; %m1 = call i8 @llvm.umax.i8(i8 %x, i8 42) @@ -2292,8 +2292,8 @@ define i8 @umax_umax_reassoc_constant_sink(i8 %x, i8 %y) { define <3 x i8> @umin_umin_reassoc_constant_sink(<3 x i8> %x, <3 x i8> %y) { ; CHECK-LABEL: @umin_umin_reassoc_constant_sink( -; CHECK-NEXT: [[M1:%.*]] = call <3 x i8> @llvm.umin.v3i8(<3 x i8> [[X:%.*]], <3 x i8> ) -; CHECK-NEXT: [[M2:%.*]] = call <3 x i8> @llvm.umin.v3i8(<3 x i8> [[M1]], <3 x i8> [[Y:%.*]]) +; CHECK-NEXT: [[M1:%.*]] = call <3 x i8> @llvm.umin.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) +; CHECK-NEXT: [[M2:%.*]] = call <3 x i8> @llvm.umin.v3i8(<3 x i8> [[M1]], <3 x i8> ) ; CHECK-NEXT: ret 
<3 x i8> [[M2]] ; %m1 = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %x, <3 x i8> ) @@ -2316,9 +2316,8 @@ define i8 @umin_umin_reassoc_constant_sink_use(i8 %x, i8 %y) { define i8 @smax_smax_smax_reassoc_constants(i8 %x, i8 %y) { ; CHECK-LABEL: @smax_smax_smax_reassoc_constants( -; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 42) -; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smax.i8(i8 [[Y:%.*]], i8 [[M1]]) -; CHECK-NEXT: [[M3:%.*]] = call i8 @llvm.smax.i8(i8 [[M2]], i8 126) +; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M3:%.*]] = call i8 @llvm.smax.i8(i8 [[M1]], i8 126) ; CHECK-NEXT: ret i8 [[M3]] ; %m1 = call i8 @llvm.smax.i8(i8 %x, i8 42) @@ -2329,9 +2328,8 @@ define i8 @smax_smax_smax_reassoc_constants(i8 %x, i8 %y) { define i8 @smax_smax_smax_reassoc_constants_swap(i8 %x, i8 %y) { ; CHECK-LABEL: @smax_smax_smax_reassoc_constants_swap( -; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 42) -; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smax.i8(i8 [[M1]], i8 [[Y:%.*]]) -; CHECK-NEXT: [[M3:%.*]] = call i8 @llvm.smax.i8(i8 [[M2]], i8 126) +; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M3:%.*]] = call i8 @llvm.smax.i8(i8 [[M1]], i8 126) ; CHECK-NEXT: ret i8 [[M3]] ; %m1 = call i8 @llvm.smax.i8(i8 %x, i8 42) @@ -2342,10 +2340,9 @@ define i8 @smax_smax_smax_reassoc_constants_swap(i8 %x, i8 %y) { define i8 @smin_smin_smin_reassoc_constants(i8 %x, i8 %y) { ; CHECK-LABEL: @smin_smin_smin_reassoc_constants( -; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smin.i8(i8 [[X:%.*]], i8 42) -; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smin.i8(i8 [[Y:%.*]], i8 [[M1]]) -; CHECK-NEXT: [[M3:%.*]] = call i8 @llvm.smin.i8(i8 [[M2]], i8 126) -; CHECK-NEXT: ret i8 [[M3]] +; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.smin.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.smin.i8(i8 [[M1]], i8 42) +; CHECK-NEXT: ret i8 [[M2]] ; %m1 = call i8 @llvm.smin.i8(i8 %x, i8 
42) %m2 = call i8 @llvm.smin.i8(i8 %y, i8 %m1) @@ -2355,8 +2352,8 @@ define i8 @smin_smin_smin_reassoc_constants(i8 %x, i8 %y) { define i8 @umax_umax_reassoc_constantexpr_sink(i8 %x, i8 %y) { ; CHECK-LABEL: @umax_umax_reassoc_constantexpr_sink( -; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 42) -; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.umax.i8(i8 [[M1]], i8 ptrtoint (i8 (i8, i8)* @umax_umax_reassoc_constantexpr_sink to i8)) +; CHECK-NEXT: [[M1:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 ptrtoint (i8 (i8, i8)* @umax_umax_reassoc_constantexpr_sink to i8)) +; CHECK-NEXT: [[M2:%.*]] = call i8 @llvm.umax.i8(i8 [[M1]], i8 42) ; CHECK-NEXT: ret i8 [[M2]] ; %m1 = call i8 @llvm.umax.i8(i8 %x, i8 42) From 051f7cdcd2ccfb54f5030e16d0026057d203ddd2 Mon Sep 17 00:00:00 2001 From: fourdim Date: Thu, 17 Feb 2022 23:40:32 +0800 Subject: [PATCH 107/748] Revert "[JITLink][RISCV] fix the extractBits behavior and add R_RISCV_JAL relocation." This reverts commit 3af7bbca4a0ef64de64b8bb38d3b167673ec60f0. 
--- .../llvm/ExecutionEngine/JITLink/riscv.h | 7 ---- .../lib/ExecutionEngine/JITLink/ELF_riscv.cpp | 16 -------- llvm/lib/ExecutionEngine/JITLink/riscv.cpp | 2 - .../ExecutionEngine/JITLink/RISCV/ELF_jal.s | 37 ------------------- 4 files changed, 62 deletions(-) delete mode 100644 llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h index 2d32a749111d1..d0d3a3786e55d 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h @@ -44,13 +44,6 @@ enum EdgeKind_riscv : Edge::Kind { /// R_RISCV_BRANCH, - /// High 20 bits of PC-relative jump pointer value relocation - /// - /// Fixup expression: - /// Fixup <- Target - Fixup + Addend - /// - R_RISCV_JAL, - /// High 20 bits of 32-bit pointer value relocation /// /// Fixup expression diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp index 469a81d882aea..90f3a38b81d53 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -220,20 +220,6 @@ class ELFJITLinker_riscv : public JITLinker { *(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7; break; } - case R_RISCV_JAL: { - int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; - Error AlignmentIssue = checkAlignment(FixupAddress, Value, 2, E); - if (AlignmentIssue) { - return AlignmentIssue; - } - uint32_t Imm20 = extractBits(Value, 20, 1) << 31; - uint32_t Imm10_1 = extractBits(Value, 1, 10) << 21; - uint32_t Imm11 = extractBits(Value, 11, 1) << 20; - uint32_t Imm19_12 = extractBits(Value, 12, 8) << 12; - uint32_t RawInstr = *(little32_t *)FixupPtr; - *(little32_t *)FixupPtr = RawInstr | Imm20 | Imm10_1 | Imm11 | Imm19_12; - break; - } case R_RISCV_HI20: { int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); int64_t Hi = Value + 
0x800; @@ -423,8 +409,6 @@ class ELFLinkGraphBuilder_riscv : public ELFLinkGraphBuilder { return EdgeKind_riscv::R_RISCV_64; case ELF::R_RISCV_BRANCH: return EdgeKind_riscv::R_RISCV_BRANCH; - case ELF::R_RISCV_JAL: - return EdgeKind_riscv::R_RISCV_JAL; case ELF::R_RISCV_HI20: return EdgeKind_riscv::R_RISCV_HI20; case ELF::R_RISCV_LO12_I: diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp index 0bd57b654d402..3ce2cf10a24cb 100644 --- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp @@ -26,8 +26,6 @@ const char *getEdgeKindName(Edge::Kind K) { return "R_RISCV_64"; case R_RISCV_BRANCH: return "R_RISCV_BRANCH"; - case R_RISCV_JAL: - return "R_RISCV_JAL"; case R_RISCV_HI20: return "R_RISCV_HI20"; case R_RISCV_LO12_I: diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s deleted file mode 100644 index 82f12358cb6b6..0000000000000 --- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s +++ /dev/null @@ -1,37 +0,0 @@ -# RUN: rm -rf %t && mkdir -p %t -# RUN: llvm-mc -triple=riscv64 -filetype=obj \ -# RUN: -o %t/elf_riscv64_jal.o %s -# RUN: llvm-mc -triple=riscv32 -filetype=obj \ -# RUN: -o %t/elf_riscv32_jal.o %s -# RUN: llvm-jitlink -noexec \ -# RUN: -slab-allocate 100Kb -slab-address 0x1ff00000 -slab-page-size 4096 \ -# RUN: -define-abs external_func=0x1fe000fe \ -# RUN: -check %s %t/elf_riscv64_jal.o -# RUN: llvm-jitlink -noexec \ -# RUN: -slab-allocate 100Kb -slab-address 0x1ff00000 -slab-page-size 4096 \ -# RUN: -define-abs external_func=0x1fe000fe \ -# RUN: -check %s %t/elf_riscv32_jal.o -# - - .text - .file "testcase.c" - -# Empty main entry point. 
- .globl main - .p2align 1 - .type main,@function -main: - ret - - .size main, .-main - -# Test R_RISCV_JAL - -# jitlink-check: decode_operand(test_jal, 1)[31:12] = (external_func - test_jal)[31:12] - .globl test_jal - .p2align 1 - .type test_jal,@function -test_jal: - jal x0, external_func - - .size test_jal, .-test_jal From 2aa624a94fa00ac5e9d34dbf549b016a8afd4d37 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 17 Feb 2022 09:42:15 -0600 Subject: [PATCH 108/748] [polly] Fix regression test after D110620. --- polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll index 6164f5cdcb1d2..48cdcabd1dcb6 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll @@ -73,7 +73,7 @@ ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] ; CHECK-NEXT: [N] -> { Stmt_bb18[i0] -> MemRef_j_2__phi[] }; ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_bb18[i0] -> MemRef_A[o0] }; +; CHECK-NEXT: [N] -> { Stmt_bb18[i0] -> MemRef_A[o0] : 0 <= o0 <= 2147483647 }; ; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [N] -> { Stmt_bb18[i0] -> MemRef_A[i0] }; ; CHECK-NEXT: Stmt_bb23 From 2f2dcb4fb134a7e06d99ef62ca512c8307187207 Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Tue, 8 Feb 2022 14:24:03 +0000 Subject: [PATCH 109/748] [AArch64][SVE] Invert VSelect operand order and condition for predicated arithmetic operations (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b))) => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a)) As a follow up to D117689, invert the operand order and condition in order to fold vselects into predicated instructions. 
Differential Revision: https://reviews.llvm.org/D119424 --- .../Target/AArch64/AArch64ISelLowering.cpp | 39 +++++++ .../test/CodeGen/AArch64/sve-fp-reciprocal.ll | 15 +-- llvm/test/CodeGen/AArch64/sve-select.ll | 108 ++++++++++++++++++ 3 files changed, 153 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6e763202ce917..810cdb748b3bf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17152,12 +17152,51 @@ static SDValue performTBZCombine(SDNode *N, DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); } +// Swap vselect operands where it may allow a predicated operation to achieve +// the `sel`. +// +// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b))) +// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a)) +static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) { + auto SelectA = N->getOperand(1); + auto SelectB = N->getOperand(2); + auto NTy = N->getValueType(0); + + if (!NTy.isScalableVector()) + return SDValue(); + SDValue SetCC = N->getOperand(0); + if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse()) + return SDValue(); + + switch (SelectB.getOpcode()) { + default: + return SDValue(); + case ISD::FMUL: + case ISD::FSUB: + case ISD::FADD: + break; + } + if (SelectA != SelectB.getOperand(0)) + return SDValue(); + + ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); + auto InverseSetCC = DAG.getSetCC( + SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0), + SetCC.getOperand(1), ISD::getSetCCInverse(CC, SetCC.getValueType())); + + return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy, + {InverseSetCC, SelectB, SelectA}); +} + // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. 
If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + if (auto SwapResult = trySwapVSelectOperands(N, DAG)) + return SwapResult; + SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll index 2385436cfe587..1b2e0be6111c0 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll @@ -95,14 +95,13 @@ define @fsqrt_recip_8f16( %a) #0 { ; CHECK-NEXT: frsqrte z1.h, z0.h ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fmul z2.h, z1.h, z1.h -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h ; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: fmul z2.h, z1.h, z1.h ; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h ; CHECK-NEXT: fmul z1.h, z1.h, z2.h -; CHECK-NEXT: fmul z1.h, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %fsqrt = call fast @llvm.sqrt.nxv8f16( %a) ret %fsqrt @@ -124,14 +123,13 @@ define @fsqrt_recip_4f32( %a) #0 { ; CHECK-NEXT: frsqrte z1.s, z0.s ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmul z2.s, z1.s, z1.s -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s ; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: fmul z2.s, z1.s, z1.s ; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s ; CHECK-NEXT: fmul z1.s, z1.s, z2.s -; CHECK-NEXT: fmul z1.s, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %fsqrt = call fast @llvm.sqrt.nxv4f32( %a) ret %fsqrt @@ -153,7 +151,7 @@ define @fsqrt_recip_2f64( %a) #0 { ; CHECK-NEXT: frsqrte z1.d, z0.d ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmul z2.d, z1.d, z1.d -; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: fcmne p0.d, 
p0/z, z0.d, #0.0 ; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d ; CHECK-NEXT: fmul z1.d, z1.d, z2.d ; CHECK-NEXT: fmul z2.d, z1.d, z1.d @@ -162,8 +160,7 @@ define @fsqrt_recip_2f64( %a) #0 { ; CHECK-NEXT: fmul z2.d, z1.d, z1.d ; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d ; CHECK-NEXT: fmul z1.d, z1.d, z2.d -; CHECK-NEXT: fmul z1.d, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %fsqrt = call fast @llvm.sqrt.nxv2f64( %a) ret %fsqrt diff --git a/llvm/test/CodeGen/AArch64/sve-select.ll b/llvm/test/CodeGen/AArch64/sve-select.ll index 819620a299f00..3183e1e54f081 100644 --- a/llvm/test/CodeGen/AArch64/sve-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-select.ll @@ -542,3 +542,111 @@ define @icmp_select_nxv16i1( %a, %a, %b ret %sel } + +define @select_f32_invert_fmul( %a, %b) #0 { +; CHECK-LABEL: select_f32_invert_fmul: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %p = fcmp oeq %a, zeroinitializer + %fmul = fmul %a, %b + %sel = select %p, %a, %fmul + ret %sel +} + +define @select_f32_invert_fadd( %a, %b) { +; CHECK-LABEL: select_f32_invert_fadd: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %p = fcmp oeq %a, zeroinitializer + %fadd = fadd %a, %b + %sel = select %p, %a, %fadd + ret %sel +} + +define @select_f32_invert_fsub( %a, %b) { +; CHECK-LABEL: select_f32_invert_fsub: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %p = fcmp oeq %a, zeroinitializer + %fsub = fsub %a, %b + %sel = select %p, %a, %fsub + ret %sel +} + +define @select_f32_no_invert_op_lhs( %a, %b) { +; CHECK-LABEL: select_f32_no_invert_op_lhs: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; 
CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %p = fcmp oeq %a, zeroinitializer + %fmul = fmul %a, %b + %sel = select %p, %fmul, %a + ret %sel +} + +define @select_f32_no_invert_2_op( %a, %b, %c, %d) { +; CHECK-LABEL: select_f32_no_invert_2_op: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmul z2.s, z2.s, z3.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fmul z0.s, z0.s, z1.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s +; CHECK-NEXT: ret + %p = fcmp oeq %a, zeroinitializer + %fmul1 = fmul %a, %b + %fmul2 = fmul %c, %d + %sel = select %p, %fmul1, %fmul2 + ret %sel +} + +define @select_f32_no_invert_equal_ops( %a, %b) { +; CHECK-LABEL: select_f32_no_invert_equal_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %m = fmul %a, %b + %p = fcmp oeq %m, zeroinitializer + %sel = select %p, %m, %m + ret %sel +} + +define @select_f32_no_invert_fmul_two_setcc_uses( %a, %b, %c, i32 %len) #0 { +; CHECK-LABEL: select_f32_no_invert_fmul_two_setcc_uses: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fadd z1.s, z0.s, z1.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: mov z0.s, p0/m, z2.s +; CHECK-NEXT: ret + %p = fcmp oeq %a, zeroinitializer + %fadd = fadd %a, %b + %sel = select %p, %a, %fadd + %sel2 = select %p, %c, %sel + ret %sel2 +} + +define <4 x float> @select_f32_no_invert_not_scalable(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: select_f32_no_invert_not_scalable: +; CHECK: // %bb.0: +; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 +; CHECK-NEXT: fmul v1.4s, v0.4s, v1.4s +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %p = fcmp oeq <4 x float> %a, zeroinitializer + %fmul = fmul <4 x float> %a, %b + %sel = select <4 x i1> %p, <4 x float> %a, <4 x float> %fmul + ret <4 x float> %sel +} From 9071393c18e5264e3bbf3ca3f3584fa5f45be6c2 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 17 Feb 2022 14:17:36 +0000 Subject: [PATCH 
110/748] [GlobalDCE] Simplify and return Changed = true less often Removing dead constants should not count as making a change to the module. This means that RemoveUnusedGlobalValue simplifies to just calling removeDeadConstantUsers, so inline it. Differential Revision: https://reviews.llvm.org/D120052 --- llvm/lib/Transforms/IPO/GlobalDCE.cpp | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 5e5d2086adc2e..e375504099610 100644 --- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -317,7 +317,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Loop over the module, adding globals which are obviously necessary. for (GlobalObject &GO : M.global_objects()) { - Changed |= RemoveUnusedGlobalValue(GO); + GO.removeDeadConstantUsers(); // Functions with external linkage are needed if they have a body. // Externally visible & appending globals are needed, if they have an // initializer. @@ -330,7 +330,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Compute direct dependencies of aliases. for (GlobalAlias &GA : M.aliases()) { - Changed |= RemoveUnusedGlobalValue(GA); + GA.removeDeadConstantUsers(); // Externally visible aliases are needed. if (!GA.isDiscardableIfUnused()) MarkLive(GA); @@ -340,7 +340,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Compute direct dependencies of ifuncs. for (GlobalIFunc &GIF : M.ifuncs()) { - Changed |= RemoveUnusedGlobalValue(GIF); + GIF.removeDeadConstantUsers(); // Externally visible ifuncs are needed. if (!GIF.isDiscardableIfUnused()) MarkLive(GIF); @@ -403,7 +403,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { // Now that all interferences have been dropped, delete the actual objects // themselves. 
auto EraseUnusedGlobalValue = [&](GlobalValue *GV) { - RemoveUnusedGlobalValue(*GV); + GV->removeDeadConstantUsers(); GV->eraseFromParent(); Changed = true; }; @@ -455,16 +455,3 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { return PreservedAnalyses::none(); return PreservedAnalyses::all(); } - -// RemoveUnusedGlobalValue - Loop over all of the uses of the specified -// GlobalValue, looking for the constant pointer ref that may be pointing to it. -// If found, check to see if the constant pointer ref is safe to destroy, and if -// so, nuke it. This will reduce the reference count on the global value, which -// might make it deader. -// -bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) { - if (GV.use_empty()) - return false; - GV.removeDeadConstantUsers(); - return GV.use_empty(); -} From 8e17c9613f36f23b5a9d2720f330a37e54c6924f Mon Sep 17 00:00:00 2001 From: John Brawn Date: Tue, 25 Jan 2022 17:53:58 +0000 Subject: [PATCH 111/748] [AArch64] Add some missing strict FP vector lowering Also add a test for the codegen of strict FP vector operations so these changes get tested. Differential Revision: https://reviews.llvm.org/D117795 --- .../Target/AArch64/AArch64ISelLowering.cpp | 63 +- .../CodeGen/AArch64/fp-intrinsics-vector.ll | 886 ++++++++++++++++++ 2 files changed, 947 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 810cdb748b3bf..58a853c2ddf8c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3404,7 +3404,8 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. 
- EVT InVT = Op.getOperand(0).getValueType(); + bool IsStrict = Op->isStrictFPOpcode(); + EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType(); EVT VT = Op.getValueType(); if (VT.isScalableVector()) { @@ -3424,6 +3425,12 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, !Subtarget->hasFullFP16()) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); + if (IsStrict) { + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); @@ -3433,6 +3440,13 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, uint64_t InVTSize = InVT.getFixedSizeInBits(); if (VTSize < InVTSize) { SDLoc dl(Op); + if (IsStrict) { + InVT = InVT.changeVectorElementTypeToInteger(); + SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); + return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl); + } SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); @@ -3444,10 +3458,32 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); + if (IsStrict) { + SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, + {ExtVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Ext.getValue(1), Ext.getValue(0)}); + } SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } + // Use a scalar operation for conversions between single-element vectors of + // the same size. 
+ if (NumElts == 1) { + SDLoc dl(Op); + SDValue Extract = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), + Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64)); + EVT ScalarVT = VT.getScalarType(); + SDValue ScalarCvt; + if (IsStrict) + return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, + {Op.getOperand(0), Extract}); + return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); + } + // Type changing conversions are illegal. return Op; } @@ -3610,9 +3646,10 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. + bool IsStrict = Op->isStrictFPOpcode(); EVT VT = Op.getValueType(); SDLoc dl(Op); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); EVT InVT = In.getValueType(); unsigned Opc = Op.getOpcode(); bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; @@ -3640,6 +3677,13 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); + if (IsStrict) { + In = DAG.getNode(Opc, dl, {CastVT, MVT::Other}, + {Op.getOperand(0), In}); + return DAG.getNode( + ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, + {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)}); + } In = DAG.getNode(Opc, dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } @@ -3648,9 +3692,24 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, unsigned CastOpc = IsSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); + if (IsStrict) + return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In}); return DAG.getNode(Opc, dl, VT, In); } + // Use a scalar operation for conversions between single-element vectors of + // the same size. + if (VT.getVectorNumElements() == 1) { + SDValue Extract = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), + In, DAG.getConstant(0, dl, MVT::i64)); + EVT ScalarVT = VT.getScalarType(); + if (IsStrict) + return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, + {Op.getOperand(0), Extract}); + return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract); + } + return Op; } diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll new file mode 100644 index 0000000000000..b78798531e01f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll @@ -0,0 +1,886 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-eabi %s -disable-strictnode-mutation -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=true -global-isel-abort=2 -disable-strictnode-mutation %s -o - | FileCheck %s + +; Check that constrained fp vector intrinsics are correctly lowered. 
+ + +; Single-precision intrinsics + +define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: add_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %x, <4 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: sub_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %x, <4 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: mul_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %x, <4 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: div_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> %x, <4 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) #0 { +; CHECK-LABEL: fma_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla v2.4s, v1.4s, v0.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x i32> @fptosi_v4i32_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: 
fptosi_v4i32_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x i32> %val +} + +define <4 x i32> @fptoui_v4i32_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: fptoui_v4i32_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x i32> %val +} + +define <4 x i64> @fptosi_v4i64_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: fptosi_v4i64_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fcvtl v0.2d, v0.2s +; CHECK-NEXT: fcvtl v1.2d, v1.2s +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-NEXT: ret + %val = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x i64> %val +} + +define <4 x i64> @fptoui_v4i64_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: fptoui_v4i64_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fcvtl v0.2d, v0.2s +; CHECK-NEXT: fcvtl v1.2d, v1.2s +; CHECK-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-NEXT: fcvtzu v1.2d, v1.2d +; CHECK-NEXT: ret + %val = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x i64> %val +} + +define <4 x float> @sitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: sitofp_v4f32_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @uitofp_v4f32_v4i32(<4 x i32> %x) #0 { +; CHECK-LABEL: uitofp_v4f32_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf v0.4s, v0.4s +; 
CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: sitofp_v4f32_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf v1.2d, v1.2d +; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @uitofp_v4f32_v4i64(<4 x i64> %x) #0 { +; CHECK-LABEL: uitofp_v4f32_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @sqrt_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: sqrt_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fsqrt v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @rint_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: rint_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frintx v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.rint.v4f32(<4 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @nearbyint_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: nearbyint_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frinti v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> 
%x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @maxnum_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: maxnum_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.maxnum.v4f32(<4 x float> %x, <4 x float> %y, metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @minnum_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: minnum_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.minnum.v4f32(<4 x float> %x, <4 x float> %y, metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @ceil_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: ceil_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frintp v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.ceil.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @floor_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: floor_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frintm v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @round_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: round_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frinta v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x float> @roundeven_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: roundeven_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frintn v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + 
+define <4 x float> @trunc_v4f32(<4 x float> %x) #0 { +; CHECK-LABEL: trunc_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frintz v0.4s, v0.4s +; CHECK-NEXT: ret + %val = call <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float> %x, metadata !"fpexcept.strict") #0 + ret <4 x float> %val +} + +define <4 x i1> @fcmp_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: fcmp_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s2, v1.s[1] +; CHECK-NEXT: mov s3, v0.s[1] +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: mov s4, v1.s[2] +; CHECK-NEXT: mov s5, v0.s[2] +; CHECK-NEXT: mov s1, v1.s[3] +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: mov v2.s[1], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: mov v2.s[2], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: mov v2.s[3], w8 +; CHECK-NEXT: xtn v0.4h, v2.4s +; CHECK-NEXT: ret +entry: + %val = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x float> %x, <4 x float> %y, metadata !"oeq", metadata !"fpexcept.strict") + ret <4 x i1> %val +} + +define <4 x i1> @fcmps_v4f32(<4 x float> %x, <4 x float> %y) #0 { +; CHECK-LABEL: fcmps_v4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov s2, v1.s[1] +; CHECK-NEXT: mov s3, v0.s[1] +; CHECK-NEXT: fcmpe s0, s1 +; CHECK-NEXT: mov s4, v1.s[2] +; CHECK-NEXT: mov s5, v0.s[2] +; CHECK-NEXT: mov s1, v1.s[3] +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmpe s3, s2 +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmpe s5, s4 +; CHECK-NEXT: mov v2.s[1], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmpe s0, s1 +; CHECK-NEXT: mov v2.s[2], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: mov v2.s[3], w8 +; CHECK-NEXT: xtn v0.4h, v2.4s +; CHECK-NEXT: ret +entry: + %val = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x float> %x, <4 x float> %y, 
metadata !"oeq", metadata !"fpexcept.strict") + ret <4 x i1> %val +} + + +; Double-precision intrinsics + +define <2 x double> @add_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: add_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @sub_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: sub_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fsub v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @mul_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: mul_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @div_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: div_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fdiv v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 { +; CHECK-LABEL: fma_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmla v2.2d, v1.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret 
<2 x double> %val +} + +define <2 x i32> @fptosi_v2i32_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: fptosi_v2i32_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %val = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x i32> %val +} + +define <2 x i32> @fptoui_v2i32_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: fptoui_v2i32_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %val = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x i32> %val +} + +define <2 x i64> @fptosi_v2i64_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: fptosi_v2i64_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x i64> %val +} + +define <2 x i64> @fptoui_v2i64_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: fptoui_v2i64_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x i64> %val +} + +define <2 x double> @sitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: sitofp_v2f64_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @uitofp_v2f64_v2i32(<2 x i32> %x) #0 { +; CHECK-LABEL: uitofp_v2f64_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> 
@llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @sitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: sitofp_v2f64_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @uitofp_v2f64_v2i64(<2 x i64> %x) #0 { +; CHECK-LABEL: uitofp_v2f64_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @sqrt_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: sqrt_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fsqrt v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @rint_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: rint_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintx v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @nearbyint_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: nearbyint_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frinti v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @maxnum_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: maxnum_v2f64: +; CHECK: // %bb.0: +; 
CHECK-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64(<2 x double> %x, <2 x double> %y, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @minnum_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: minnum_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnm v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.minnum.v2f64(<2 x double> %x, <2 x double> %y, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @ceil_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: ceil_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintp v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @floor_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: floor_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintm v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @round_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: round_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frinta v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @roundeven_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: roundeven_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintn v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x double> @trunc_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: trunc_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintz v0.2d, v0.2d +; CHECK-NEXT: ret + %val = call <2 x double> 
@llvm.experimental.constrained.trunc.v2f64(<2 x double> %x, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + +define <2 x i1> @fcmp_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: fcmp_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov d2, v1.d[1] +; CHECK-NEXT: mov d3, v0.d[1] +; CHECK-NEXT: fcmp d0, d1 +; CHECK-NEXT: csetm x8, eq +; CHECK-NEXT: fcmp d3, d2 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: csetm x8, eq +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret +entry: + %val = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> %x, <2 x double> %y, metadata !"oeq", metadata !"fpexcept.strict") + ret <2 x i1> %val +} + +define <2 x i1> @fcmps_v2f64(<2 x double> %x, <2 x double> %y) #0 { +; CHECK-LABEL: fcmps_v2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov d2, v1.d[1] +; CHECK-NEXT: mov d3, v0.d[1] +; CHECK-NEXT: fcmpe d0, d1 +; CHECK-NEXT: csetm x8, eq +; CHECK-NEXT: fcmpe d3, d2 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: csetm x8, eq +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret +entry: + %val = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %x, <2 x double> %y, metadata !"oeq", metadata !"fpexcept.strict") + ret <2 x i1> %val +} + + +; Double-precision single element intrinsics + +define <1 x double> @add_v1f64(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: add_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd d0, d0, d1 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> %x, <1 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @sub_v1f64(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: sub_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fsub d0, d0, d1 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> %x, <1 x double> %y, metadata 
!"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @mul_v1f64(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: mul_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %x, <1 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @div_v1f64(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: div_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fdiv d0, d0, d1 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double> %x, <1 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @fma_v1f64(<1 x double> %x, <1 x double> %y, <1 x double> %z) #0 { +; CHECK-LABEL: fma_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmadd d0, d0, d1, d2 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %x, <1 x double> %y, <1 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x i32> @fptosi_v1i32_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: fptosi_v1i32_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs w8, d0 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %val = call <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x i32> %val +} + +define <1 x i32> @fptoui_v1i32_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: fptoui_v1i32_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu w8, d0 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %val = call <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x i32> %val +} + +define <1 x i64> @fptosi_v1i64_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: fptosi_v1i64_v1f64: +; CHECK: // %bb.0: 
+; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %val = call <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x i64> %val +} + +define <1 x i64> @fptoui_v1i64_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: fptoui_v1i64_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %val = call <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x i64> %val +} + +define <1 x double> @sitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: sitofp_v1f64_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @uitofp_v1f64_v1i32(<1 x i32> %x) #0 { +; CHECK-LABEL: uitofp_v1f64_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: ucvtf d0, w8 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @sitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: sitofp_v1f64_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @uitofp_v1f64_v1i64(<1 x i64> %x) #0 { +; CHECK-LABEL: uitofp_v1f64_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; 
CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ucvtf d0, x8 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @sqrt_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: sqrt_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fsqrt d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @rint_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: rint_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintx d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @nearbyint_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: nearbyint_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frinti d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @maxnum_v1f64(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: maxnum_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxnm d0, d0, d1 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.maxnum.v1f64(<1 x double> %x, <1 x double> %y, metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @minnum_v1f64(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: minnum_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fminnm d0, d0, d1 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.minnum.v1f64(<1 x double> %x, <1 x double> %y, metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @ceil_v1f64(<1 x double> %x) #0 { +; 
CHECK-LABEL: ceil_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintp d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @floor_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: floor_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintm d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @round_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: round_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frinta d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @roundeven_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: roundeven_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintn d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.roundeven.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x double> @trunc_v1f64(<1 x double> %x) #0 { +; CHECK-LABEL: trunc_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frintz d0, d0 +; CHECK-NEXT: ret + %val = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> %x, metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + +define <1 x i1> @fcmp_v1f61(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: fcmp_v1f61: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmp d0, d1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %val = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %x, <1 x double> %y, metadata !"oeq", metadata !"fpexcept.strict") + ret <1 x i1> %val +} + +define <1 x i1> @fcmps_v1f61(<1 x double> %x, <1 x double> %y) #0 { +; CHECK-LABEL: fcmps_v1f61: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmpe d0, d1 +; 
CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %val = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %x, <1 x double> %y, metadata !"oeq", metadata !"fpexcept.strict") + ret <1 x i1> %val +} + + +; Intrinsics to convert between floating-point types + +define <2 x float> @fptrunc_v2f32_v2f64(<2 x double> %x) #0 { +; CHECK-LABEL: fptrunc_v2f32_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: ret + %val = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <2 x float> %val +} + +define <2 x double> @fpext_v2f64_v2f32(<2 x float> %x) #0 { +; CHECK-LABEL: fpext_v2f64_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtl v0.2d, v0.2s +; CHECK-NEXT: ret + %val = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float> %x, metadata !"fpexcept.strict") #0 + ret <2 x double> %val +} + + +attributes #0 = { strictfp } + +declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float>, <4 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float>, <4 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float>, <4 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata) +declare <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float>, metadata) +declare <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float>, metadata) +declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float>, metadata) +declare <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float>, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32>, 
metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.rint.v4f32(<4 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float>, metadata, metadata) +declare <4 x float> @llvm.experimental.constrained.maxnum.v4f32(<4 x float>, <4 x float>, metadata) +declare <4 x float> @llvm.experimental.constrained.minnum.v4f32(<4 x float>, <4 x float>, metadata) +declare <4 x float> @llvm.experimental.constrained.ceil.v4f32(<4 x float>, metadata) +declare <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float>, metadata) +declare <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float>, metadata) +declare <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float>, metadata) +declare <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float>, metadata) +declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x float>, <4 x float>, metadata, metadata) +declare <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x float>, <4 x float>, metadata, metadata) + +declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, 
<2 x double>, <2 x double>, metadata, metadata) +declare <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64(<2 x double>, metadata) +declare <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(<2 x double>, metadata) +declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double>, metadata) +declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double>, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.maxnum.v2f64(<2 x double>, <2 x double>, metadata) +declare <2 x double> @llvm.experimental.constrained.minnum.v2f64(<2 x double>, <2 x double>, metadata) +declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, metadata) +declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata) +declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata) +declare <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double>, metadata) +declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata) +declare <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double>, <2 x double>, metadata, metadata) +declare <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double>, <2 x double>, metadata, 
metadata) + +declare <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double>, <1 x double>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double>, <1 x double>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double>, <1 x double>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double>, <1 x double>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double>, <1 x double>, <1 x double>, metadata, metadata) +declare <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f64(<1 x double>, metadata) +declare <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f64(<1 x double>, metadata) +declare <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f64(<1 x double>, metadata) +declare <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f64(<1 x double>, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double>, metadata, metadata) +declare <1 x double> @llvm.experimental.constrained.maxnum.v1f64(<1 x double>, <1 x double>, metadata) +declare <1 x double> @llvm.experimental.constrained.minnum.v1f64(<1 x double>, <1 x double>, metadata) +declare <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double>, metadata) +declare <1 x double> 
@llvm.experimental.constrained.floor.v1f64(<1 x double>, metadata) +declare <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double>, metadata) +declare <1 x double> @llvm.experimental.constrained.roundeven.v1f64(<1 x double>, metadata) +declare <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double>, metadata) +declare <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double>, <1 x double>, metadata, metadata) +declare <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double>, <1 x double>, metadata, metadata) + +declare <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata) From 954fe404ab7f5dab917fe7987f68a3095ba10413 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 17 Feb 2022 08:10:20 -0800 Subject: [PATCH 112/748] [RISCV] Fix incorrect MemOperand copy converting splat+load to vlse. Due to an incorrect copy/paste from load intrinsic handling we checked if the splat node was a MemSDNode which of course it isn't. Instead get the MemOperand from the LoadSDNode for the source of the splat. This enables LICM to see the load is loop invariant and hoist it out of the loop. 
Reviewed By: frasercrmck Differential Revision: https://reviews.llvm.org/D120014 --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 3 +- .../RISCV/rvv/fixed-vectors-fp-buildvec.ll | 48 +++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 8b315960eff9b..fbf5eb2f3f980 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -1667,8 +1667,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { MachineSDNode *Load = CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands); - if (auto *MemOp = dyn_cast(Node)) - CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()}); + CurDAG->setNodeMemRefs(Load, {Ld->getMemOperand()}); ReplaceNode(Node, Load); return; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 80abc47a23bb7..af74ea9f50543 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -277,3 +277,51 @@ define <8 x float> @splat_idx_v8f32(<8 x float> %v, i64 %idx) { %splat = shufflevector <8 x float> %ins, <8 x float> poison, <8 x i32> zeroinitializer ret <8 x float> %splat } + +; Test that we pull the vlse of the constant pool out of the loop. 
+define dso_local void @splat_load_licm(float* %0) { +; RV32-LABEL: splat_load_licm: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI12_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI12_0) +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vlse32.v v8, (a1), zero +; RV32-NEXT: li a1, 1024 +; RV32-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: addi a1, a1, -4 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: bnez a1, .LBB12_1 +; RV32-NEXT: # %bb.2: +; RV32-NEXT: ret +; +; RV64-LABEL: splat_load_licm: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, %hi(.LCPI12_0) +; RV64-NEXT: addi a1, a1, %lo(.LCPI12_0) +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vlse32.v v8, (a1), zero +; RV64-NEXT: li a1, 0 +; RV64-NEXT: li a2, 1024 +; RV64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: slli a3, a1, 2 +; RV64-NEXT: add a3, a0, a3 +; RV64-NEXT: addiw a1, a1, 4 +; RV64-NEXT: vse32.v v8, (a3) +; RV64-NEXT: bne a1, a2, .LBB12_1 +; RV64-NEXT: # %bb.2: +; RV64-NEXT: ret + br label %2 + +2: ; preds = %2, %1 + %3 = phi i32 [ 0, %1 ], [ %6, %2 ] + %4 = getelementptr inbounds float, float* %0, i32 %3 + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> , <4 x float>* %5, align 4 + %6 = add nuw i32 %3, 4 + %7 = icmp eq i32 %6, 1024 + br i1 %7, label %8, label %2 + +8: ; preds = %2 + ret void +} From bbee9e77f34c073642c73c2a423c284cca7444d6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 17 Feb 2022 08:17:41 -0800 Subject: [PATCH 113/748] [RISCV] Match shufflevector corresponding to slideup. This generalizes isElementRotate to work when there's only a single slide needed. I've removed matchShuffleAsSlideDown which is now redundant. 
Reviewed By: frasercrmck, khchen Differential Revision: https://reviews.llvm.org/D119759 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 139 ++++++++---------- .../RISCV/rvv/fixed-vectors-fp-shuffles.ll | 22 +++ .../RISCV/rvv/fixed-vectors-int-shuffles.ll | 22 +++ 3 files changed, 102 insertions(+), 81 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c20d2d1ef710a..ca11d0d431ffe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2456,35 +2456,6 @@ static SDValue lowerScalarSplat(SDValue Passthru, SDValue Scalar, SDValue VL, return splatSplitI64WithVL(DL, VT, Passthru, Scalar, VL, DAG); } -// Is the mask a slidedown that shifts in undefs. -static int matchShuffleAsSlideDown(ArrayRef Mask) { - int Size = Mask.size(); - - // Elements shifted in should be undef. - auto CheckUndefs = [&](int Shift) { - for (int i = Size - Shift; i != Size; ++i) - if (Mask[i] >= 0) - return false; - return true; - }; - - // Elements should be shifted or undef. - auto MatchShift = [&](int Shift) { - for (int i = 0; i != Size - Shift; ++i) - if (Mask[i] >= 0 && Mask[i] != Shift + i) - return false; - return true; - }; - - // Try all possible shifts. - for (int Shift = 1; Shift != Size; ++Shift) - if (CheckUndefs(Shift) && MatchShift(Shift)) - return Shift; - - // No match. - return -1; -} - static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, const RISCVSubtarget &Subtarget) { // We need to be able to widen elements to the next larger integer type. @@ -2527,7 +2498,19 @@ static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, bool &SwapSources, return true; } -static int isElementRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { +/// Match shuffles that concatenate two vectors, rotate the concatenation, +/// and then extract the original number of elements from the rotated result. 
+/// This is equivalent to vector.splice or X86's PALIGNR instruction. The +/// returned rotation amount is for a rotate right, where elements move from +/// higher elements to lower elements. \p LoSrc indicates the first source +/// vector of the rotate or -1 for undef. \p HiSrc indicates the second vector +/// of the rotate or -1 for undef. At least one of \p LoSrc and \p HiSrc will be +/// 0 or 1 if a rotation is found. +/// +/// NOTE: We talk about rotate to the right which matches how bit shift and +/// rotate instructions are described where LSBs are on the right, but LLVM IR +/// and the table below write vectors with the lowest elements on the left. +static int isElementRotate(int &LoSrc, int &HiSrc, ArrayRef Mask) { int Size = Mask.size(); // We need to detect various ways of spelling a rotation: @@ -2538,7 +2521,8 @@ static int isElementRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; - SDValue Lo, Hi; + LoSrc = -1; + HiSrc = -1; for (int i = 0; i != Size; ++i) { int M = Mask[i]; if (M < 0) @@ -2562,18 +2546,18 @@ static int isElementRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { return -1; // Compute which value this mask is pointing at. - SDValue MaskV = M < Size ? V1 : V2; + int MaskSrc = M < Size ? 0 : 1; // Compute which of the two target values this index should be assigned to. // This reflects whether the high elements are remaining or the low elemnts // are remaining. - SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + int &TargetSrc = StartIdx < 0 ? HiSrc : LoSrc; // Either set up this value if we've not encountered it before, or check // that it remains consistent. - if (!TargetV) - TargetV = MaskV; - else if (TargetV != MaskV) + if (TargetSrc < 0) + TargetSrc = MaskSrc; + else if (TargetSrc != MaskSrc) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. 
return -1; @@ -2581,14 +2565,8 @@ static int isElementRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); - assert((Lo || Hi) && "Failed to find a rotated input vector!"); - - // Make sure we've found a value for both halves. - if (!Lo || !Hi) - return -1; - - V1 = Lo; - V2 = Hi; + assert((LoSrc >= 0 || HiSrc >= 0) && + "Failed to find a rotated input vector!"); return Rotation; } @@ -2685,45 +2663,43 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, ArrayRef Mask = SVN->getMask(); - // Try to match as a slidedown. - int SlideAmt = matchShuffleAsSlideDown(Mask); - if (SlideAmt >= 0) { - // TODO: Should we reduce the VL to account for the upper undef elements? - // Requires additional vsetvlis, but might be faster to execute. - V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); - SDValue SlideDown = - DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT, - DAG.getUNDEF(ContainerVT), V1, - DAG.getConstant(SlideAmt, DL, XLenVT), - TrueMask, VL); - return convertFromScalableVector(VT, SlideDown, DAG, Subtarget); - } - - // Match shuffles that concatenate two vectors, rotate the concatenation, - // and then extract the original number of elements from the rotated result. - // This is equivalent to vector.splice or X86's PALIGNR instruction. Lower - // it to a SLIDEDOWN and a SLIDEUP. - // FIXME: We don't really need it to be a concatenation. We just need two - // regions with contiguous elements that need to be shifted down and up. - int Rotation = isElementRotate(V1, V2, Mask); + // Lower rotations to a SLIDEDOWN and a SLIDEUP. One of the source vectors may + // be undef which can be handled with a single SLIDEDOWN/UP. + int LoSrc, HiSrc; + int Rotation = isElementRotate(LoSrc, HiSrc, Mask); if (Rotation > 0) { - // We found a rotation. We need to slide V1 down by Rotation. 
Using - // (NumElts - Rotation) for VL. Then we need to slide V2 up by - // (NumElts - Rotation) using NumElts for VL. - V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); - V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); + SDValue LoV, HiV; + if (LoSrc >= 0) { + LoV = LoSrc == 0 ? V1 : V2; + LoV = convertToScalableVector(ContainerVT, LoV, DAG, Subtarget); + } + if (HiSrc >= 0) { + HiV = HiSrc == 0 ? V1 : V2; + HiV = convertToScalableVector(ContainerVT, HiV, DAG, Subtarget); + } + // We found a rotation. We need to slide HiV down by Rotation. Then we need + // to slide LoV up by (NumElts - Rotation). unsigned InvRotate = NumElts - Rotation; - SDValue SlideDown = - DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT, - DAG.getUNDEF(ContainerVT), V2, - DAG.getConstant(Rotation, DL, XLenVT), - TrueMask, DAG.getConstant(InvRotate, DL, XLenVT)); - SDValue SlideUp = - DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, SlideDown, V1, - DAG.getConstant(InvRotate, DL, XLenVT), - TrueMask, VL); - return convertFromScalableVector(VT, SlideUp, DAG, Subtarget); + + SDValue Res = DAG.getUNDEF(ContainerVT); + if (HiV) { + // If we are doing a SLIDEDOWN+SLIDEUP, reduce the VL for the SLIDEDOWN. 
+ // FIXME: If we are only doing a SLIDEDOWN, don't reduce the VL as it + // causes multiple vsetvlis in some test cases such as lowering + // reduce.mul + SDValue DownVL = VL; + if (LoV) + DownVL = DAG.getConstant(InvRotate, DL, XLenVT); + Res = + DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT, Res, HiV, + DAG.getConstant(Rotation, DL, XLenVT), TrueMask, DownVL); + } + if (LoV) + Res = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Res, LoV, + DAG.getConstant(InvRotate, DL, XLenVT), TrueMask, VL); + + return convertFromScalableVector(VT, Res, DAG, Subtarget); } // Detect an interleave shuffle and lower to @@ -2947,7 +2923,8 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { MVT SVT = VT.getSimpleVT(); bool SwapSources; - return (matchShuffleAsSlideDown(M) >= 0) || + int LoSrc, HiSrc; + return (isElementRotate(LoSrc, HiSrc, M) > 0) || isInterleaveShuffle(M, SVT, SwapSources, Subtarget); } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll index 83f5832b33cce..cd8f3721b546e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -256,6 +256,28 @@ define <8 x float> @slidedown_v8f32(<8 x float> %x) { ret <8 x float> %s } +define <4 x half> @slideup_v4f16(<4 x half> %x) { +; CHECK-LABEL: slideup_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %s = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32> + ret <4 x half> %s +} + +define <8 x float> @slideup_v8f32(<8 x float> %x) { +; CHECK-LABEL: slideup_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %s = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> + ret <8 x float> %s +} + define 
<8 x float> @splice_unary(<8 x float> %x) { ; CHECK-LABEL: splice_unary: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 0b4e7ac65bbce..061d48b7add0b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -555,6 +555,28 @@ define <8 x i32> @slidedown_v8i32(<8 x i32> %x) { ret <8 x i32> %s } +define <4 x i16> @slideup_v4i16(<4 x i16> %x) { +; CHECK-LABEL: slideup_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %s = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> + ret <4 x i16> %s +} + +define <8 x i32> @slideup_v8i32(<8 x i32> %x) { +; CHECK-LABEL: slideup_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %s = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> + ret <8 x i32> %s +} + define <8 x i16> @splice_unary(<8 x i16> %x) { ; CHECK-LABEL: splice_unary: ; CHECK: # %bb.0: From 0b57e6c46b707c0e7a123efe82abf3c1e7b5a503 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Thu, 17 Feb 2022 19:04:51 +0300 Subject: [PATCH 114/748] [objcopy] followup patch after f75da0c8e65cf1b09012a8b62cd7f3e9a646bbc9 --- clang/docs/tools/clang-formatted-files.txt | 78 ++++++++++++---------- llvm/unittests/CMakeLists.txt | 1 + 2 files changed, 44 insertions(+), 35 deletions(-) diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt index 4a2c28bcf32ae..231877144d28a 100644 --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -5063,6 +5063,18 @@ llvm/include/llvm/MCA/Stages/InstructionTables.h llvm/include/llvm/MCA/Stages/MicroOpQueueStage.h 
llvm/include/llvm/MCA/Stages/RetireStage.h llvm/include/llvm/MCA/Stages/Stage.h +llvm/include/llvm/ObjCopy/MultiFormatConfig.h +llvm/include/llvm/ObjCopy/ConfigManager.h +llvm/include/llvm/ObjCopy/CommonConfig.h +llvm/include/llvm/ObjCopy/ObjCopy.h +llvm/include/llvm/ObjCopy/wasm/WasmConfig.h +llvm/include/llvm/ObjCopy/wasm/WasmObjcopy.h +llvm/include/llvm/ObjCopy/ELF/ELFConfig.h +llvm/include/llvm/ObjCopy/ELF/ELFObjcopy.h +llvm/include/llvm/ObjCopy/MachO/MachOObjcopy.h +llvm/include/llvm/ObjCopy/MachO/MachOConfig.h +llvm/include/llvm/ObjCopy/COFF/COFFConfig.h +llvm/include/llvm/ObjCopy/COFF/COFFObjcopy.h llvm/include/llvm/Object/Archive.h llvm/include/llvm/Object/COFFModuleDefinition.h llvm/include/llvm/Object/Decompressor.h @@ -5766,6 +5778,34 @@ llvm/lib/MCA/Stages/InOrderIssueStage.cpp llvm/lib/MCA/Stages/MicroOpQueueStage.cpp llvm/lib/MCA/Stages/RetireStage.cpp llvm/lib/MCA/Stages/Stage.cpp +llvm/lib/ObjCopy/Archive.cpp +llvm/lib/ObjCopy/ConfigManager.cpp +llvm/lib/ObjCopy/ObjCopy.cpp +llvm/lib/ObjCopy/Archive.h +llvm/lib/ObjCopy/wasm/Reader.cpp +llvm/lib/ObjCopy/wasm/Reader.h +llvm/lib/ObjCopy/wasm/Object.cpp +llvm/lib/ObjCopy/wasm/Writer.cpp +llvm/lib/ObjCopy/wasm/Writer.h +llvm/lib/ObjCopy/wasm/Object.h +llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp +llvm/lib/ObjCopy/ELF/Object.cpp +llvm/lib/ObjCopy/MachO/MachOWriter.cpp +llvm/lib/ObjCopy/MachO/Object.cpp +llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h +llvm/lib/ObjCopy/MachO/MachOWriter.h +llvm/lib/ObjCopy/MachO/MachOReader.h +llvm/lib/ObjCopy/MachO/MachOReader.cpp +llvm/lib/ObjCopy/MachO/Object.h +llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp +llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp +llvm/lib/ObjCopy/COFF/Reader.cpp +llvm/lib/ObjCopy/COFF/Reader.h +llvm/lib/ObjCopy/COFF/Object.cpp +llvm/lib/ObjCopy/COFF/Writer.cpp +llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp +llvm/lib/ObjCopy/COFF/Writer.h +llvm/lib/ObjCopy/COFF/Object.h llvm/lib/Object/Archive.cpp llvm/lib/Object/Binary.cpp llvm/lib/Object/Decompressor.cpp @@ -6629,42 
+6669,9 @@ llvm/tools/llvm-microsoft-demangle-fuzzer/DummyDemanglerFuzzer.cpp llvm/tools/llvm-microsoft-demangle-fuzzer/llvm-microsoft-demangle-fuzzer.cpp llvm/tools/llvm-ml/Disassembler.h llvm/tools/llvm-modextract/llvm-modextract.cpp -llvm/tools/llvm-objcopy/CommonConfig.h -llvm/tools/llvm-objcopy/ConfigManager.h llvm/tools/llvm-objcopy/llvm-objcopy.cpp -llvm/tools/llvm-objcopy/llvm-objcopy.h -llvm/tools/llvm-objcopy/MultiFormatConfig.h -llvm/tools/llvm-objcopy/COFF/COFFConfig.h -llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp -llvm/tools/llvm-objcopy/COFF/COFFObjcopy.h -llvm/tools/llvm-objcopy/COFF/Object.cpp -llvm/tools/llvm-objcopy/COFF/Object.h -llvm/tools/llvm-objcopy/COFF/Reader.cpp -llvm/tools/llvm-objcopy/COFF/Reader.h -llvm/tools/llvm-objcopy/COFF/Writer.cpp -llvm/tools/llvm-objcopy/COFF/Writer.h -llvm/tools/llvm-objcopy/ELF/ELFConfig.h -llvm/tools/llvm-objcopy/ELF/ELFObjcopy.h -llvm/tools/llvm-objcopy/MachO/MachOConfig.h -llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.cpp -llvm/tools/llvm-objcopy/MachO/MachOLayoutBuilder.h -llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp -llvm/tools/llvm-objcopy/MachO/MachOObjcopy.h -llvm/tools/llvm-objcopy/MachO/MachOReader.cpp -llvm/tools/llvm-objcopy/MachO/MachOReader.h -llvm/tools/llvm-objcopy/MachO/MachOWriter.cpp -llvm/tools/llvm-objcopy/MachO/MachOWriter.h -llvm/tools/llvm-objcopy/MachO/Object.cpp -llvm/tools/llvm-objcopy/MachO/Object.h -llvm/tools/llvm-objcopy/wasm/Object.cpp -llvm/tools/llvm-objcopy/wasm/Object.h -llvm/tools/llvm-objcopy/wasm/Reader.cpp -llvm/tools/llvm-objcopy/wasm/Reader.h -llvm/tools/llvm-objcopy/wasm/WasmConfig.h -llvm/tools/llvm-objcopy/wasm/WasmObjcopy.cpp -llvm/tools/llvm-objcopy/wasm/WasmObjcopy.h -llvm/tools/llvm-objcopy/wasm/Writer.cpp -llvm/tools/llvm-objcopy/wasm/Writer.h +llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +llvm/tools/llvm-objcopy/ObjcopyOptions.h llvm/tools/llvm-objdump/COFFDump.h llvm/tools/llvm-objdump/ELFDump.h llvm/tools/llvm-objdump/MachODump.h @@ -6916,6 +6923,7 @@ 
llvm/unittests/MC/MCInstPrinter.cpp llvm/unittests/MC/TargetRegistry.cpp llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp llvm/unittests/MC/SystemZ/SystemZAsmLexerTest.cpp +llvm/unittests/ObjCopy/ObjCopyTest.cpp llvm/unittests/Object/ArchiveTest.cpp llvm/unittests/Object/ELFTest.cpp llvm/unittests/Object/ELFTypesTest.cpp diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt index 6f4df5b585bf6..99782949b2559 100644 --- a/llvm/unittests/CMakeLists.txt +++ b/llvm/unittests/CMakeLists.txt @@ -35,6 +35,7 @@ add_subdirectory(Linker) add_subdirectory(MC) add_subdirectory(MI) add_subdirectory(MIR) +add_subdirectory(ObjCopy) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) From fd3ba1f862f54811ff9f4663ff298ff02d9c3b70 Mon Sep 17 00:00:00 2001 From: zhijian Date: Thu, 17 Feb 2022 11:37:33 -0500 Subject: [PATCH 115/748] Title: Export unique symbol list with llvm-nm new option "--export-symbols" Summary: the patch implement of following functionality. 1. export the symbols from archive or object files. 2. sort the export symbols. (based on same symbol name and visibility) 3. delete the duplicate export symbols (based on same symbol name and visibility) 4. print out the unique and sorted export symbols (print the symbol name and visibility). there are two new options are add in the patch 1. --export-symbols (enable the functionality of export unique symbol) 2. 
--no-rsrc (exclude the symbol name begin with "__rsrc" from be exporting from xcoff object file) Export symbol list for xcoff object file has the same functionality as The patch has the same functionality as https://www.ibm.com/docs/en/xl-c-aix/13.1.0?topic=library-exporting-symbols-createexportlist-utility Reviewers: James Henderson,Fangrui Song Differential Revision: https://reviews.llvm.org/D112735 --- llvm/docs/CommandGuide/llvm-nm.rst | 12 + llvm/include/llvm/BinaryFormat/XCOFF.h | 30 ++ llvm/include/llvm/Object/XCOFFObjectFile.h | 3 + llvm/lib/Object/XCOFFObjectFile.cpp | 10 + .../tools/llvm-nm/XCOFF/export-symbols.test | 340 ++++++++++++++++++ .../tools/llvm-nm/bitcode-export-sym.test | 12 + llvm/tools/llvm-nm/Opts.td | 6 + llvm/tools/llvm-nm/llvm-nm.cpp | 185 ++++++++-- 8 files changed, 574 insertions(+), 24 deletions(-) create mode 100644 llvm/test/tools/llvm-nm/XCOFF/export-symbols.test create mode 100644 llvm/test/tools/llvm-nm/bitcode-export-sym.test diff --git a/llvm/docs/CommandGuide/llvm-nm.rst b/llvm/docs/CommandGuide/llvm-nm.rst index 4b8db71e6a148..8fc2b214cf05a 100644 --- a/llvm/docs/CommandGuide/llvm-nm.rst +++ b/llvm/docs/CommandGuide/llvm-nm.rst @@ -157,6 +157,11 @@ OPTIONS Display dynamic symbols instead of normal symbols. +.. option:: --export-symbols + + Print sorted symbols with their visibility (if applicable), with duplicates + removed. + .. option:: --extern-only, -g Print only symbols whose definitions are external; that is, accessible from @@ -282,6 +287,13 @@ MACH-O SPECIFIC OPTIONS Print symbol entry in hex. +XCOFF SPECIFIC OPTIONS +---------------------- + +.. option:: --no-rsrc + + Exclude resource file symbols (``__rsrc``) from export symbol list. 
+ BUGS ---- diff --git a/llvm/include/llvm/BinaryFormat/XCOFF.h b/llvm/include/llvm/BinaryFormat/XCOFF.h index cffd8618f1e3a..5d23ec5cd911f 100644 --- a/llvm/include/llvm/BinaryFormat/XCOFF.h +++ b/llvm/include/llvm/BinaryFormat/XCOFF.h @@ -54,6 +54,34 @@ enum AuxHeaderFlags64 : uint16_t { ///< future use and should be set to 0. }; +enum XCOFFInterpret : uint16_t { + OLD_XCOFF_INTERPRET = 1, + NEW_XCOFF_INTERPRET = 2 +}; + +enum FileFlag : uint16_t { + F_RELFLG = 0x0001, ///< relocation info stripped from file + F_EXEC = 0x0002, ///< file is executable (i.e., it + ///< has a loader section) + F_LNNO = 0x0004, ///< line numbers stripped from file + F_LSYMS = 0x0008, ///< local symbols stripped from file + F_FDPR_PROF = 0x0010, ///< file was profiled with FDPR + F_FDPR_OPTI = 0x0020, ///< file was reordered with FDPR + F_DSA = 0x0040, ///< file uses Dynamic Segment Allocation (32-bit + ///< only) + F_DEP_1 = 0x0080, ///< Data Execution Protection bit 1 + F_VARPG = 0x0100, ///< executable requests using variable size pages + F_LPTEXT = 0x0400, ///< executable requires large pages for text + F_LPDATA = 0x0800, ///< executable requires large pages for data + F_DYNLOAD = 0x1000, ///< file is dynamically loadable and + ///< executable (equivalent to F_EXEC on AIX) + F_SHROBJ = 0x2000, ///< file is a shared object + F_LOADONLY = + 0x4000, ///< file can be loaded by the system loader, but it is + ///< ignored by the linker if it is a member of an archive. + F_DEP_2 = 0x8000 ///< Data Execution Protection bit 2 +}; + // x_smclas field of x_csect from system header: /usr/include/syms.h /// Storage Mapping Class definitions. enum StorageMappingClass : uint8_t { @@ -212,6 +240,8 @@ enum VisibilityType : uint16_t { SYM_V_EXPORTED = 0x4000 }; +constexpr uint16_t VISIBILITY_MASK = 0x7000; + // Relocation types, defined in `/usr/include/reloc.h`. enum RelocationType : uint8_t { R_POS = 0x00, ///< Positive relocation. 
Provides the address of the referenced diff --git a/llvm/include/llvm/Object/XCOFFObjectFile.h b/llvm/include/llvm/Object/XCOFFObjectFile.h index ac911e534f341..5aad03b888fc2 100644 --- a/llvm/include/llvm/Object/XCOFFObjectFile.h +++ b/llvm/include/llvm/Object/XCOFFObjectFile.h @@ -60,10 +60,13 @@ template struct XCOFFAuxiliaryHeader { return static_cast(this)->FlagAndTDataAlignment & AuxiHeaderFlagMask; } + uint8_t getTDataAlignment() const { return static_cast(this)->FlagAndTDataAlignment & AuxiHeaderTDataAlignmentMask; } + + uint16_t getVersion() const { return static_cast(this)->Version; } }; struct XCOFFAuxiliaryHeader32 : XCOFFAuxiliaryHeader { diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp index f2f6d700ddd8c..d9ecb0aff6bdc 100644 --- a/llvm/lib/Object/XCOFFObjectFile.cpp +++ b/llvm/lib/Object/XCOFFObjectFile.cpp @@ -615,6 +615,16 @@ Expected XCOFFObjectFile::getSymbolFlags(DataRefImpl Symb) const { if (XCOFFSym.getSectionNumber() == XCOFF::N_UNDEF) Result |= SymbolRef::SF_Undefined; + // There is no visibility in old 32 bit XCOFF object file interpret. + if (is64Bit() || (auxiliaryHeader32() && (auxiliaryHeader32()->getVersion() == + NEW_XCOFF_INTERPRET))) { + uint16_t SymType = XCOFFSym.getSymbolType(); + if ((SymType & VISIBILITY_MASK) == SYM_V_HIDDEN) + Result |= SymbolRef::SF_Hidden; + + if ((SymType & VISIBILITY_MASK) == SYM_V_EXPORTED) + Result |= SymbolRef::SF_Exported; + } return Result; } diff --git a/llvm/test/tools/llvm-nm/XCOFF/export-symbols.test b/llvm/test/tools/llvm-nm/XCOFF/export-symbols.test new file mode 100644 index 0000000000000..8cedd2e204007 --- /dev/null +++ b/llvm/test/tools/llvm-nm/XCOFF/export-symbols.test @@ -0,0 +1,340 @@ +## Test the "--export-symbols" option. +## The option merges all the output of input files, sorts and prints out unique symbols from the input files. 
+ +# RUN: yaml2obj --docnum=1 -DFLAG=0x0002 %s -o %t1.o +# RUN: yaml2obj --docnum=2 -DFLAG=0x0002 %s -o %t2.o +# RUN: yaml2obj --docnum=2 -DFLAG=0x0002 -DSECT=26 %s -o %t2_invalid.o + +## Test the following cases: +## Do not export global symbols beginning with "__sinit" , "__sterm" , "." , "(". or regular expression "^__[0-9]+__". +## Do not export hidden and internal symbols. +## Remove name prefixes of global symbols beginning with "__tf1" and "__tf9". +# RUN: llvm-nm --export-symbols %t1.o | FileCheck %s --check-prefixes=COMMON,WEAK,RSRC --implicit-check-not={{.}} + +## Show that weak symbols are not exported when using the "--no-weak" option. +# RUN: llvm-nm --export-symbols --no-weak %t1.o | FileCheck --check-prefixes=COMMON,RSRC %s --implicit-check-not={{.}} + +## Show that only unique symbols (with a different name or visibility) are exported. +## RUN: llvm-nm --export-symbols %t1.o %t2.o | FileCheck --check-prefixes=COMMON,WEAK,OBJ2,RSRC %s --implicit-check-not={{.}} + +## Show that __rsrc symbols are not exported when using the "--no-rsrc" option. +# RUN: llvm-nm --export-symbols --no-rsrc %t1.o | FileCheck --check-prefixes=COMMON,WEAK %s --implicit-check-not={{.}} + +# COMMON: 023__ +# COMMON-NEXT: __023 +# COMMON-NEXT: __02er02__ +# COMMON-NEXT: ____ +# RSRC-NEXT: __rsrc +# COMMON-NEXT: __rsrc export +# COMMON-NEXT: __tf2value +# COMMON-NEXT: export_protected_var export +# COMMON-NEXT: export_protected_var protected +# OBJ2-NEXT: export_var_in_sec_obj export +# COMMON-NEXT: protected_var protected +# OBJ2-NEXT: protected_var_in_sec_obj protected +# COMMON-NEXT: tf1value +# COMMON-NEXT: tf9value +# OBJ2-NEXT: var1_in_sec_obj +# WEAK-NEXT: weak_func + +## Test the behavior of the symbol reference section. 
+# RUN: llvm-nm --export-symbols --no-rsrc %t2_invalid.o 2>&1 | \ +# RUN: FileCheck -DFILE=%t2_invalid.o --check-prefixes=INVALID %s + +# INVALID: llvm-nm{{(\.exe)?}}: warning: [[FILE]]: for symbol with index 8: the section index (26) is invalid +# INVALID-NEXT: export_protected_var export +# INVALID-NEXT: export_protected_var protected +# INVALID-NEXT: protected_var_in_sec_obj protected +# INVALID-NEXT: var1_in_sec_obj + +## Show that symbols in shared object files are not exported. +## Generate XCOFF shared object file. +# RUN: yaml2obj -DFLAG=0x2000 --docnum=2 %s -o %t_shared.o +# RUN: llvm-nm --export-symbols %t_shared.o | count 0 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF + Flags: [[FLAG]] +AuxiliaryHeader: + Magic: 0x10B + Version: 0x2 +Sections: + - Name: .text + Flags: [ STYP_TEXT ] + - Name: .data + Flags: [ STYP_DATA ] + - Name: .bss + Flags: [ STYP_DATA ] + - Name: .debug + Flags: [ STYP_DEBUG ] +Symbols: + - Name: export_protected_var + Section: .data +## Exported visibility. + Type: 0x4000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: export_protected_var + Section: .data +## Protected visibility. + Type: 0x3000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: __rsrc + Section: .data +## No visibility. 
+ Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 + - Name: __sinit + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x9 + StorageMappingClass: XMC_RW + SectionOrLength: 0xC + - Name: __sterm + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0xC + - Name: .func + Section: .text + Type: 0x20 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x02 + StorageMappingClass: XMC_PR + - Name: (func) + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0xC + - Name: __023__ + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 + - Name: __tf1_tf1value + Section: .text + Type: 0x00 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RO + - Name: __tf9_12345678tf9value + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: __tf2value + Section: .data + Type: 0x0 + StorageClass: C_HIDEXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x21 + StorageMappingClass: XMC_TC + - Name: __tf2value + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: weak_func + Section: .data + Type: 0x0 + StorageClass: C_WEAKEXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: protected_var + Section: .bss + Type: 0x3000 + StorageClass: C_EXT + 
AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x02 + StorageMappingClass: XMC_RW + - Name: hidden_var + Section: .data +## Hidden visibility. + Type: 0x2000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: internal_var + Section: .data +## Internal visibility. + Type: 0x1000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW +## A symbol that is neither text, nor data, nor bss. + - Name: debug + Section: .debug +## Empty symbol name. + - Name: "" + Section: .data + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_TC + - Name: undef_var + SectionIndex: 0 + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x20 + StorageMappingClass: XMC_UA +## Do not export not global symbol. + - Name: hidext_var + Section: .data +## Protected visibility. + Type: 0x3000 + StorageClass: C_HIDEXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW +## Symbol should not be filtered out by option --no-rsrc. + - Name: __tf1___rsrc + Section: .data + Type: 0x4000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 +## Following symbols should not be filtered out by regular expression "^__[0-9]+__". 
+ - Name: __023 + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 + - Name: 023__ + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 + - Name: ____ + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 + - Name: __02er02__ + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF + Flags: [[FLAG]] +AuxiliaryHeader: + Magic: 0x10B + Version: 0x2 + TextSectionSize: 0x280 + DataSectionSize: 0x90 +Sections: + - Name: .text + Flags: [ STYP_TEXT ] + - Name: .data + Flags: [ STYP_DATA ] +Symbols: + - Name: export_protected_var + Section: .data + Type: 0x4000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: export_protected_var + Section: .data + Type: 0x3000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: var1_in_sec_obj + Section: .data + Type: 0x0 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + SectionOrLength: 0x4 + - Name: protected_var_in_sec_obj + Section: .data + Type: 0x3000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW + - Name: export_var_in_sec_obj + SectionIndex: [[SECT=2]] + Type: 0x4000 + StorageClass: C_EXT + AuxEntries: + - Type: AUX_CSECT + SymbolAlignmentAndType: 0x09 + StorageMappingClass: XMC_RW diff --git a/llvm/test/tools/llvm-nm/bitcode-export-sym.test 
b/llvm/test/tools/llvm-nm/bitcode-export-sym.test new file mode 100644 index 0000000000000..9866666ffa775 --- /dev/null +++ b/llvm/test/tools/llvm-nm/bitcode-export-sym.test @@ -0,0 +1,12 @@ +# REQUIRES: powerpc-registered-target +## Test the "--export-symbols" option with bitcode input files. + +# RUN: llvm-as -o %t32.bc %p/Inputs/bitcode-sym32.ll +# RUN: llvm-as -o %t64.bc %p/Inputs/bitcode-sym64.ll + +# RUN: llvm-nm --export-symbols %t32.bc %t64.bc | FileCheck %s --check-prefixes=CHECK --implicit-check-not={{.}} + +# CHECK: C32 +# CHECK-NEXT: C64 +# CHECK-NEXT: bar64 +# CHECK-NEXT: foo32 diff --git a/llvm/tools/llvm-nm/Opts.td b/llvm/tools/llvm-nm/Opts.td index 434a70b1fbc90..6cb530db72f4d 100644 --- a/llvm/tools/llvm-nm/Opts.td +++ b/llvm/tools/llvm-nm/Opts.td @@ -18,6 +18,7 @@ def debug_syms : FF<"debug-syms", "Show all symbols, even debugger only">; def defined_only : FF<"defined-only", "Show only defined symbols">; defm demangle : BB<"demangle", "Demangle C++ symbol names", "Don't demangle symbol names">; def dynamic : FF<"dynamic", "Display dynamic symbols instead of normal symbols">; +def export_symbols : FF<"export-symbols", "Export symbol list for all inputs">; def extern_only : FF<"extern-only", "Show only external symbols">; defm format : Eq<"format", "Specify output format: bsd (default), posix, sysv, darwin, just-symbols">, MetaVarName<"">; def help : FF<"help", "Display this help">; @@ -49,6 +50,11 @@ def no_dyldinfo : FF<"no-dyldinfo", "Don't add any symbols from the dyldinfo">, def s : F<"s", "Dump only symbols from this segment and section name">, Group; def x : F<"x", "Print symbol entry in hex">, Group; +// XCOFF specific options. 
+def grp_xcoff_o : OptionGroup<"kind">, HelpText<"llvm-nm XCOFF Specific Options">; + +def no_rsrc : FF<"no-rsrc", "Exclude resource file symbols (__rsrc) from the export symbol list.">, Group; + def : FF<"just-symbol-name", "Alias for --format=just-symbols">, Alias, AliasArgs<["just-symbols"]>, Flags<[HelpHidden]>; def : FF<"portability", "Alias for --format=posix">, Alias, AliasArgs<["posix"]>; diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index 2196faf933b8d..9324cafa7f11b 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/Demangle/Demangle.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -92,6 +93,7 @@ static bool DebugSyms; static bool DefinedOnly; static bool Demangle; static bool DynamicSyms; +static bool ExportSymbols; static bool ExternalOnly; static OutputFormatTy OutputFormat; static bool NoLLVMBitcode; @@ -107,6 +109,9 @@ static bool SizeSort; static bool UndefinedOnly; static bool WithoutAliases; +// XCOFF-specific options. +static bool NoRsrc; + namespace { enum Radix { d, o, x }; } // namespace @@ -130,7 +135,8 @@ static bool HadError = false; static StringRef ToolName; -static void warn(Error Err, Twine FileName, Twine Context = Twine()) { +static void warn(Error Err, Twine FileName, Twine Context = Twine(), + Twine Archive = Twine()) { assert(Err); // Flush the standard output so that the warning isn't interleaved with other @@ -139,8 +145,9 @@ static void warn(Error Err, Twine FileName, Twine Context = Twine()) { handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { WithColor::warning(errs(), ToolName) - << FileName << ": " << (Context.str().empty() ? "" : Context + ": ") - << EI.message() << "\n"; + << (Archive.str().empty() ? FileName : Archive + "(" + FileName + ")") + << ": " << (Context.str().empty() ? 
"" : Context + ": ") << EI.message() + << "\n"; }); } @@ -213,6 +220,8 @@ struct NMSymbol { StringRef SectionName; StringRef TypeName; BasicSymbolRef Sym; + StringRef Visibility; + // The Sym field above points to the native symbol in the object file, // for Mach-O when we are creating symbols from the dyld info the above // pointer is null as there is no native symbol. In these cases the fields @@ -232,6 +241,29 @@ struct NMSymbol { } return TypeChar != 'U'; } + + bool initializeFlags(const SymbolicFile &Obj) { + Expected SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) { + // TODO: Test this error. + error(SymFlagsOrErr.takeError(), Obj.getFileName()); + return false; + } + SymFlags = *SymFlagsOrErr; + return true; + } + + bool shouldPrint() const { + bool Undefined = SymFlags & SymbolRef::SF_Undefined; + bool Global = SymFlags & SymbolRef::SF_Global; + bool Weak = SymFlags & SymbolRef::SF_Weak; + bool FormatSpecific = SymFlags & SymbolRef::SF_FormatSpecific; + if ((!Undefined && UndefinedOnly) || (Undefined && DefinedOnly) || + (!Global && ExternalOnly) || (Weak && NoWeakSymbols) || + (FormatSpecific && !(SpecialSyms || DebugSyms))) + return false; + return true; + } }; bool operator<(const NMSymbol &A, const NMSymbol &B) { @@ -241,11 +273,17 @@ bool operator<(const NMSymbol &A, const NMSymbol &B) { if (SizeSort) return std::make_tuple(A.Size, A.Name, A.Address) < std::make_tuple(B.Size, B.Name, B.Address); + if (ExportSymbols) + return std::make_tuple(A.Name, A.Visibility) < + std::make_tuple(B.Name, B.Visibility); return std::make_tuple(A.Name, A.Size, A.Address) < std::make_tuple(B.Name, B.Size, B.Address); } bool operator>(const NMSymbol &A, const NMSymbol &B) { return B < A; } +bool operator==(const NMSymbol &A, const NMSymbol &B) { + return !(A < B) && !(B < A); +} } // anonymous namespace static char isSymbolList64Bit(SymbolicFile &Obj) { @@ -659,6 +697,15 @@ static void sortSymbolList() { llvm::sort(SymbolList); } +static void 
printExportSymbolList() { + for (const NMSymbol &Sym : SymbolList) { + outs() << Sym.Name; + if (!Sym.Visibility.empty()) + outs() << ' ' << Sym.Visibility; + outs() << '\n'; + } +} + static void printSymbolList(SymbolicFile &Obj, bool printName, StringRef ArchiveName, StringRef ArchitectureName) { if (!PrintFileName) { @@ -707,25 +754,7 @@ static void printSymbolList(SymbolicFile &Obj, bool printName, } for (const NMSymbol &S : SymbolList) { - uint32_t SymFlags; - if (S.Sym.getRawDataRefImpl().p) { - Expected SymFlagsOrErr = S.Sym.getFlags(); - if (!SymFlagsOrErr) { - // TODO: Test this error. - error(SymFlagsOrErr.takeError(), Obj.getFileName()); - return; - } - SymFlags = *SymFlagsOrErr; - } else - SymFlags = S.SymFlags; - - bool Undefined = SymFlags & SymbolRef::SF_Undefined; - bool Global = SymFlags & SymbolRef::SF_Global; - bool Weak = SymFlags & SymbolRef::SF_Weak; - bool FormatSpecific = SymFlags & SymbolRef::SF_FormatSpecific; - if ((!Undefined && UndefinedOnly) || (Undefined && DefinedOnly) || - (!Global && ExternalOnly) || (Weak && NoWeakSymbols) || - (FormatSpecific && !(SpecialSyms || DebugSyms))) + if (!S.shouldPrint()) continue; std::string Name = S.Name; @@ -1638,11 +1667,93 @@ static bool shouldDump(SymbolicFile &Obj) { : BitMode != BitModeTy::Bit64; } +static void getXCOFFExports(XCOFFObjectFile *XCOFFObj, StringRef ArchiveName) { + // Skip Shared object file. + if (XCOFFObj->getFlags() & XCOFF::F_SHROBJ) + return; + + for (SymbolRef Sym : XCOFFObj->symbols()) { + // There is no visibility in old 32 bit XCOFF object file interpret. 
+ bool HasVisibilityAttr = + XCOFFObj->is64Bit() || (XCOFFObj->auxiliaryHeader32() && + (XCOFFObj->auxiliaryHeader32()->getVersion() == + XCOFF::NEW_XCOFF_INTERPRET)); + + if (HasVisibilityAttr) { + XCOFFSymbolRef XCOFFSym = XCOFFObj->toSymbolRef(Sym.getRawDataRefImpl()); + uint16_t SymType = XCOFFSym.getSymbolType(); + if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_INTERNAL) + continue; + if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_HIDDEN) + continue; + } + + Expected SymSecOrErr = Sym.getSection(); + if (!SymSecOrErr) { + warn(SymSecOrErr.takeError(), XCOFFObj->getFileName(), + "for symbol with index " + + Twine(XCOFFObj->getSymbolIndex(Sym.getRawDataRefImpl().p)), + ArchiveName); + continue; + } + section_iterator SecIter = *SymSecOrErr; + // If the symbol is not in a text or data section, it is not exported. + if (SecIter == XCOFFObj->section_end()) + continue; + if (!(SecIter->isText() || SecIter->isData() || SecIter->isBSS())) + continue; + + StringRef SymName = cantFail(Sym.getName()); + if (SymName.empty()) + continue; + if (SymName.startswith("__sinit") || SymName.startswith("__sterm") || + SymName.front() == '.' || SymName.front() == '(') + continue; + + // Check the SymName regex matching with "^__[0-9]+__". 
+ if (SymName.size() > 4 && SymName.startswith("__") && + SymName.endswith("__")) { + if (std::all_of(SymName.begin() + 2, SymName.end() - 2, isDigit)) + continue; + } + + if (SymName == "__rsrc" && NoRsrc) + continue; + + if (SymName.startswith("__tf1")) + SymName = SymName.substr(6); + else if (SymName.startswith("__tf9")) + SymName = SymName.substr(14); + + NMSymbol S = {}; + S.Name = SymName.str(); + S.Sym = Sym; + + if (HasVisibilityAttr) { + XCOFFSymbolRef XCOFFSym = XCOFFObj->toSymbolRef(Sym.getRawDataRefImpl()); + uint16_t SymType = XCOFFSym.getSymbolType(); + if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_PROTECTED) + S.Visibility = "protected"; + else if ((SymType & XCOFF::VISIBILITY_MASK) == XCOFF::SYM_V_EXPORTED) + S.Visibility = "export"; + } + if (S.initializeFlags(*XCOFFObj)) + SymbolList.push_back(S); + } +} + static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, StringRef ArchiveName = {}, StringRef ArchitectureName = {}) { if (!shouldDump(Obj)) return; + + if (ExportSymbols && Obj.isXCOFF()) { + XCOFFObjectFile *XCOFFObj = cast(&Obj); + getXCOFFExports(XCOFFObj, ArchiveName); + return; + } + auto Symbols = Obj.symbols(); std::vector SymbolVersions; if (DynamicSyms) { @@ -1672,6 +1783,7 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, if (Nsect == 0) return; } + if (!(MachO && DyldInfoOnly)) { size_t I = -1; for (BasicSymbolRef Sym : Symbols) { @@ -1732,7 +1844,8 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, (SymbolVersions[I].IsVerDef ? 
"@@" : "@") + SymbolVersions[I].Name; S.Sym = Sym; - SymbolList.push_back(S); + if (S.initializeFlags(Obj)) + SymbolList.push_back(S); } } @@ -1745,6 +1858,9 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, if (MachO && !NoDyldInfo) dumpSymbolsFromDLInfoMachO(*MachO); + if (ExportSymbols) + return; + CurrentFilename = Obj.getFileName(); if (Symbols.empty() && SymbolList.empty() && !Quiet) { @@ -1846,7 +1962,7 @@ static void dumpSymbolNamesFromFile(std::string &Filename) { } if (!checkMachOAndArchFlags(O, Filename)) return; - if (!PrintFileName && shouldDump(*O)) { + if (!PrintFileName && shouldDump(*O) && !ExportSymbols) { outs() << "\n"; if (isa(O)) { outs() << Filename << "(" << O->getFileName() << ")"; @@ -2168,6 +2284,12 @@ int main(int argc, char **argv) { PrintFileName = Args.hasArg(OPT_print_file_name); PrintSize = Args.hasArg(OPT_print_size); ReverseSort = Args.hasArg(OPT_reverse_sort); + ExportSymbols = Args.hasArg(OPT_export_symbols); + if (ExportSymbols) { + ExternalOnly = true; + DefinedOnly = true; + } + Quiet = Args.hasArg(OPT_quiet); V = Args.getLastArgValue(OPT_radix_EQ, "x"); if (V == "o") @@ -2203,6 +2325,9 @@ int main(int argc, char **argv) { DyldInfoOnly = Args.hasArg(OPT_dyldinfo_only); NoDyldInfo = Args.hasArg(OPT_no_dyldinfo); + // XCOFF specific options. + NoRsrc = Args.hasArg(OPT_no_rsrc); + // llvm-nm only reads binary files. if (error(sys::ChangeStdinToBinary())) return 1; @@ -2262,6 +2387,18 @@ int main(int argc, char **argv) { llvm::for_each(InputFilenames, dumpSymbolNamesFromFile); + if (ExportSymbols) { + // Delete symbols which should not be printed from SymolList. 
+ SymbolList.erase( + std::remove_if(SymbolList.begin(), SymbolList.end(), + [](const NMSymbol &s) { return !s.shouldPrint(); }), + SymbolList.end()); + sortSymbolList(); + SymbolList.erase(std::unique(SymbolList.begin(), SymbolList.end()), + SymbolList.end()); + printExportSymbolList(); + } + if (HadError) return 1; } From bbd7eac800e6505f10a0931da8c5b1ff2ac66bef Mon Sep 17 00:00:00 2001 From: John Brawn Date: Thu, 17 Feb 2022 16:37:38 +0000 Subject: [PATCH 116/748] [AArch64] Remove an unused variable in my previous patch --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 58a853c2ddf8c..185e18e884fa7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3477,7 +3477,6 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), Op.getOperand(IsStrict ? 
1 : 0), DAG.getConstant(0, dl, MVT::i64)); EVT ScalarVT = VT.getScalarType(); - SDValue ScalarCvt; if (IsStrict) return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other}, {Op.getOperand(0), Extract}); From edde46b5d0120b14c8e005d96b92ea28f8c562f0 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 16 Feb 2022 20:34:08 -0800 Subject: [PATCH 117/748] [test][IndVarSimplify][OpaquePtr] Precommit test --- .../Transforms/IndVarSimplify/opaque-ptr.ll | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll diff --git a/llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll b/llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll new file mode 100644 index 0000000000000..c6af7014ef531 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=indvars -S < %s -opaque-pointers | FileCheck %s + +declare void @c(ptr) + +define void @test(ptr %arg) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[O:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i64 16 +; CHECK-NEXT: call void @c(ptr [[O]]) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IDX]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[INC]], 16 +; CHECK-NEXT: call void @c(ptr [[O]]) +; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %o = getelementptr ptr, ptr %arg, i64 16 + call void @c(ptr %o) + br label %loop +loop: + %idx = phi i32 [ 1, %entry ], [ %inc, %loop ] + %inc = add i32 %idx, 1 + %c = icmp ne i32 %inc, 16 + %p = getelementptr ptr, ptr %arg, i32 2 + %p1 = getelementptr ptr, ptr %p, i32 %idx + %neg = sub i32 0, %idx + %p2 = getelementptr ptr, ptr %p1, i32 %neg + call void @c(ptr %p2) + br i1 %c, label %loop, label %end 
+end: + ret void +} From 129af4daa7ce6ca8d1d649de4eba76129c199399 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 16 Feb 2022 20:34:55 -0800 Subject: [PATCH 118/748] [SCEVExpander][OpaquePtr] Check GEP source type when finding identical GEP Fixes an opaque pointers miscompile. Reviewed By: #opaque-pointers, nikic Differential Revision: https://reviews.llvm.org/D120004 --- llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp | 4 +++- llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 007578d9ada7e..754382aa0845e 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -591,7 +591,9 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, if (isa(IP)) ScanLimit++; if (IP->getOpcode() == Instruction::GetElementPtr && - IP->getOperand(0) == V && IP->getOperand(1) == Idx) + IP->getOperand(0) == V && IP->getOperand(1) == Idx && + cast(&*IP)->getSourceElementType() == + Type::getInt8Ty(Ty->getContext())) return &*IP; if (IP == BlockBegin) break; } diff --git a/llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll b/llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll index c6af7014ef531..3b36930b68baa 100644 --- a/llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll +++ b/llvm/test/Transforms/IndVarSimplify/opaque-ptr.ll @@ -8,12 +8,13 @@ define void @test(ptr %arg) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[O:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i64 16 ; CHECK-NEXT: call void @c(ptr [[O]]) +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[ARG]], i64 16 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IDX]], 1 ; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[INC]], 16 -; 
CHECK-NEXT: call void @c(ptr [[O]]) +; CHECK-NEXT: call void @c(ptr [[UGLYGEP]]) ; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void From 515c617003bb340a72aeab148ea705c53950c44d Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Wed, 16 Feb 2022 12:56:43 -0800 Subject: [PATCH 119/748] [mlir][linalg][sparse] add linalg optimization passes "upstream" It is time to compose Linalg related optimizations with SparseTensor related optimizations. This is a careful first start by adding some general Linalg optimizations "upstream" of the sparse compiler in the full sparse compiler pipeline. Some minor changes were needed to make those optimizations aware of sparsity. Note that after this, we will add a sparse specific fusion rule, just to demonstrate the power of the new composition. Reviewed By: bixia Differential Revision: https://reviews.llvm.org/D119971 --- mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 1 + mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 16 ++++++++--- .../Dialect/Linalg/Transforms/CMakeLists.txt | 1 + .../Linalg/Transforms/ElementwiseOpFusion.cpp | 7 ++++- .../Pipelines/SparseTensorPipelines.cpp | 2 ++ .../SparseTensor/CPU/dense_output.mlir | 27 ++++++++----------- .../CPU/sparse_filter_conv2d.mlir | 14 +++------- .../SparseTensor/CPU/sparse_matmul.mlir | 8 ++---- .../CPU/sparse_quantized_matmul.mlir | 14 +++------- .../SparseTensor/CPU/sparse_reductions.mlir | 10 +++---- .../CPU/sparse_sampled_mm_fusion.mlir | 14 +++------- .../Dialect/SparseTensor/python/test_SpMM.py | 1 - .../SparseTensor/python/test_output.py | 1 - .../SparseTensor/python/test_stress.py | 1 - .../llvm-project-overlay/mlir/BUILD.bazel | 2 ++ 15 files changed, 53 insertions(+), 66 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index 2ff2cc8de338a..e310f58a2e3c0 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -20,6 +20,7 @@ 
add_mlir_dialect_library(MLIRLinalg MLIRIR MLIRParser MLIRSideEffectInterfaces + MLIRSparseTensor MLIRSCF MLIRMath MLIRMemRef diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 87278dcba0896..c148ab9bcfa7f 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/Arithmetic/Utils/Utils.h" #include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExprVisitor.h" @@ -819,9 +820,18 @@ struct EraseIdentityGenericOp : public OpRewritePattern { Type resultType = genericOp->getResult(yieldVal.index()).getType(); // The input can have a different type than the result, e.g. a dynamic // input dimension can be turned into a static output dimension. - if (returnedArg.getType() != resultType) - returnedArg = rewriter.create(genericOp.getLoc(), - resultType, returnedArg); + Type returnType = returnedArg.getType(); + if (returnType != resultType) { + // Distinguish between sparse conversion or dense tensor casting. + // TODO: unify the two ops? 
+ if (sparse_tensor::getSparseTensorEncoding(returnType) || + sparse_tensor::getSparseTensorEncoding(resultType)) + returnedArg = rewriter.create( + genericOp.getLoc(), resultType, returnedArg); + else + returnedArg = rewriter.create( + genericOp.getLoc(), resultType, returnedArg); + } returnedArgs.push_back(returnedArg); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 6897cb9d00d05..57bef39d65338 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -50,6 +50,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms MLIRSCFTransforms MLIRSCFUtils MLIRPass + MLIRSparseTensor MLIRStandard MLIRStandardOpsTransforms MLIRStandardToLLVM diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 570e844878d79..7e0e857643eb6 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Matchers.h" @@ -2184,6 +2185,10 @@ struct RemoveOutsDependency : public OpRewritePattern { if (!operandType) continue; + // If outs is sparse, leave it to the sparse compiler. + if (sparse_tensor::getSparseTensorEncoding(operandVal.getType())) + continue; + // If outs is already an `init_tensor` operation, nothing to do. 
auto definingOp = operandVal.getDefiningOp(); if (definingOp) @@ -2213,7 +2218,7 @@ struct RemoveOutsDependency : public OpRewritePattern { } // namespace //===---------------------------------------------------------------------===// -// Methods that add patterns descrined in this file to a pattern list. +// Methods that add patterns described in this file to a pattern list. //===---------------------------------------------------------------------===// void mlir::linalg::populateFoldReshapeOpsByLinearizationPatterns( diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index 25487e431708b..ff6577abaa019 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -29,6 +29,8 @@ using namespace mlir::sparse_tensor; void mlir::sparse_tensor::buildSparseCompiler( OpPassManager &pm, const SparseCompilerOptions &options) { // TODO(wrengr): ensure the original `pm` is for ModuleOp + pm.addNestedPass(createLinalgGeneralizationPass()); + pm.addPass(createLinalgElementwiseOpFusionPass()); pm.addPass(createSparsificationPass(options.sparsificationOptions())); pm.addPass(createSparseTensorConversionPass()); pm.addNestedPass(createLinalgBufferizePass()); diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir index 2d8898e9ec45d..a263972587efb 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir @@ -1,6 +1,5 @@ // RUN: mlir-opt %s --sparse-compiler | \ // RUN: TENSOR0="%mlir_integration_test_dir/data/test.mtx" \ -// RUN: TENSOR1="%mlir_integration_test_dir/data/zero.mtx" \ // RUN: mlir-cpu-runner \ // RUN: -e entry -entry-point-result=void \ // RUN: 
-shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ @@ -40,15 +39,17 @@ // library. module { // - // A kernel that assigns elements from A to an initially zero X. + // A kernel that assigns elements from A to X. // - func @dense_output(%arga: tensor, - %argx: tensor - {linalg.inplaceable = true}) - -> tensor { + func @dense_output(%arga: tensor) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %d0 = tensor.dim %arga, %c0 : tensor + %d1 = tensor.dim %arga, %c1 : tensor + %init = sparse_tensor.init [%d0, %d1] : tensor %0 = linalg.generic #trait_assign ins(%arga: tensor) - outs(%argx: tensor) { + outs(%init: tensor) { ^bb(%a: f64, %x: f64): linalg.yield %a : f64 } -> tensor @@ -70,15 +71,9 @@ module { %a = sparse_tensor.new %fileName : !Filename to tensor - // Initialize all-dense annotated "sparse" matrix to all zeros. - %fileZero = call @getTensorFilename(%c1) : (index) -> (!Filename) - %x = sparse_tensor.new %fileZero - : !Filename to tensor - // Call the kernel. - %0 = call @dense_output(%a, %x) - : (tensor, - tensor) -> tensor + %0 = call @dense_output(%a) + : (tensor) -> tensor // // Print the linearized 5x5 result for verification. @@ -92,7 +87,7 @@ module { // Release the resources. 
sparse_tensor.release %a : tensor - sparse_tensor.release %x : tensor + sparse_tensor.release %0 : tensor return } diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir index a758a891658d5..02d5cc0b51881 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir @@ -1,18 +1,12 @@ -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-opt %s --sparse-compiler | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s // // Do the same run, but now with SIMDization as well. This should not change the outcome. // -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler="vectorization-strategy=2 vl=2" | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=2" | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir index 6c207d7e59a33..f2a35efafabfd 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir @@ -1,11 +1,7 @@ -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry 
-entry-point-result=void \ +// RUN: mlir-opt %s --sparse-compiler | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s -// #CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ], diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir index c4cb95af45994..1db865b3c2bff 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_quantized_matmul.mlir @@ -1,18 +1,12 @@ -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-opt %s --sparse-compiler | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s // // Do the same run, but now with SIMDization as well. This should not change the outcome. 
// -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler="vectorization-strategy=2 vl=2" | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=2" | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir index 8f409fe8ecff0..b0fde087d8085 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir @@ -1,16 +1,12 @@ // RUN: mlir-opt %s --sparse-compiler | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s // // Do the same run, but now with SIMDization as well. This should not change the outcome. 
// -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler="vectorization-strategy=2 vl=8" | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=2 vl=8" | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir index 87092754224a0..017950391e39f 100755 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sampled_mm_fusion.mlir @@ -1,18 +1,12 @@ -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-opt %s --sparse-compiler | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s // // Do the same run, but now with SIMDization as well. This should not change the outcome. 
// -// RUN: mlir-opt %s \ -// RUN: --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \ -// RUN: --sparse-compiler="vectorization-strategy=2 vl=8" | \ -// RUN: mlir-cpu-runner \ -// RUN: -e entry -entry-point-result=void \ +// RUN: mlir-opt %s -sparse-compiler="vectorization-strategy=2 vl=8" | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py index a79c4b4cf595a..1b66628ad7bda 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py @@ -113,7 +113,6 @@ class SparseCompiler: def __init__(self, options: str): pipeline = ( - f'builtin.func(linalg-generalize-named-ops,linalg-fuse-elementwise-ops),' f'sparse-compiler{{{options} reassociate-fp-reductions=1 enable-index-optimizations=1}}') self.pipeline = pipeline diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py index f03756b8b1294..c29f618e26980 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py @@ -73,7 +73,6 @@ class SparseCompiler: def __init__(self): pipeline = ( - f'builtin.func(linalg-generalize-named-ops,linalg-fuse-elementwise-ops),' f'sparse-compiler{{reassociate-fp-reductions=1 enable-index-optimizations=1}}') self.pipeline = pipeline diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py index f18655ea3ba5c..ccf1ffd6cd263 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py @@ -171,7 
+171,6 @@ class SparseCompiler: def __init__(self, sparsification_options: str, support_lib: str): self._support_lib = support_lib self._pipeline = ( - f'builtin.func(linalg-generalize-named-ops,linalg-fuse-elementwise-ops),' f'sparse-compiler{{{sparsification_options} reassociate-fp-reductions=1 enable-index-optimizations=1}}') # Must be in the scope of a `with ir.Context():` self._passmanager = PassManager.parse(self._pipeline) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 511c22132ad67..32d78f225a05b 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6997,6 +6997,7 @@ cc_library( ":Parser", ":SCFDialect", ":SideEffectInterfaces", + ":SparseTensor", ":StandardOps", ":Support", ":TensorDialect", @@ -7083,6 +7084,7 @@ cc_library( ":SCFDialect", ":SCFTransforms", ":SCFUtils", + ":SparseTensor", ":StandardOps", ":StandardOpsTransforms", ":Support", From 7adb85884b35be033b6c54d5916aed5edcb354fb Mon Sep 17 00:00:00 2001 From: Arthur O'Dwyer Date: Mon, 14 Feb 2022 15:11:47 -0500 Subject: [PATCH 120/748] [clang] [NFC] More exhaustive tests for deducing void return types Differential Revision: https://reviews.llvm.org/D119772 --- clang/test/SemaCXX/deduced-return-void.cpp | 135 +++++++++++++++++++-- 1 file changed, 126 insertions(+), 9 deletions(-) diff --git a/clang/test/SemaCXX/deduced-return-void.cpp b/clang/test/SemaCXX/deduced-return-void.cpp index 7b6c514eab71b..2cd518d426e5c 100644 --- a/clang/test/SemaCXX/deduced-return-void.cpp +++ b/clang/test/SemaCXX/deduced-return-void.cpp @@ -4,20 +4,137 @@ // Check that we don't get any extra warning for "return" without an // expression, in a function that might have been intended to return // void all along. 
-auto f1() { - return 1; - return; // expected-error {{deduced as 'void' here but deduced as 'int' in earlier return statement}} +decltype(h1) h1() { // expected-error {{use of undeclared identifier 'h1'}} + return; +} + +namespace JustAuto { +int i; +auto f1() { } +auto f2() { return; } +auto f3() { return void(); } +auto f4() { + return i; + return; // expected-error {{'auto' in return type deduced as 'void' here but deduced as 'int' in earlier return statement}} +} +auto f5() { + return i; + return void(); // expected-error {{'auto' in return type deduced as 'void' here but deduced as 'int' in earlier return statement}} } -decltype(auto) f2() { - return 1; - return; // expected-error {{deduced as 'void' here but deduced as 'int' in earlier return statement}} +auto l1 = []() { }; +auto l2 = []() { return; }; +auto l3 = []() { return void(); }; +auto l4 = []() { + return i; + return; // expected-error {{return type 'void' must match previous return type 'int' when lambda expression has unspecified explicit return type}} +}; +auto l5 = []() { + return i; + return void(); // expected-error {{return type 'void' must match previous return type 'int' when lambda expression has unspecified explicit return type}} +}; + +} // namespace JustAuto + +namespace DecltypeAuto { +int i; +decltype(auto) f1() { } +decltype(auto) f2() { return; } +decltype(auto) f3() { return void(); } +decltype(auto) f4() { + return i; + return; // expected-error {{'decltype(auto)' in return type deduced as 'void' here but deduced as 'int' in earlier return statement}} +} +decltype(auto) f5() { + return i; + return void(); // expected-error {{'decltype(auto)' in return type deduced as 'void' here but deduced as 'int' in earlier return statement}} } -auto *g() { +auto l1 = []() -> decltype(auto) { }; +auto l2 = []() -> decltype(auto) { return; }; +auto l3 = []() -> decltype(auto) { return void(); }; +auto l4 = []() -> decltype(auto) { + return i; + return; // expected-error {{'decltype(auto)' in 
return type deduced as 'void' here but deduced as 'int' in earlier return statement}} +}; +auto l5 = []() -> decltype(auto) { + return i; + return void(); // expected-error {{'decltype(auto)' in return type deduced as 'void' here but deduced as 'int' in earlier return statement}} +}; + +} // namespace DecltypeAuto + +namespace AutoPtr { +int i; +auto *f1() { } // expected-error {{cannot deduce return type 'auto *' for function with no return statements}} +auto *f2() { + return; // expected-error {{cannot deduce return type 'auto *' from omitted return expression}} +} +auto *f3() { + return void(); // expected-error {{cannot deduce return type 'auto *' from returned value of type 'void'}} +} +auto *f4() { + return &i; return; // expected-error {{cannot deduce return type 'auto *' from omitted return expression}} } +auto *f5() { + return &i; + return void(); // expected-error {{cannot deduce return type 'auto *' from returned value of type 'void'}} +} -decltype(h1) h1() { // expected-error {{use of undeclared identifier 'h1'}} - return; +auto l1 = []() -> auto* { }; // expected-error {{cannot deduce return type 'auto *' for function with no return statements}} +auto l2 = []() -> auto* { + return; // expected-error {{cannot deduce return type 'auto *' from omitted return expression}} +}; +auto l3 = []() -> auto* { + return void(); // expected-error {{cannot deduce return type 'auto *' from returned value of type 'void'}} +}; +auto l4 = []() -> auto* { + return &i; + return; // expected-error {{cannot deduce return type 'auto *' from omitted return expression}} +}; +auto l5 = []() -> auto* { + return &i; + return void(); // expected-error {{cannot deduce return type 'auto *' from returned value of type 'void'}} +}; +} // namespace AutoPtr + +namespace AutoRef { +int i; +auto& f1() { // expected-error {{cannot deduce return type 'auto &' for function with no return statements}} +} +auto& f2() { + return; // expected-error {{cannot deduce return type 'auto &' from 
omitted return expression}} +} +auto& f3() { + return void(); // expected-error@-1 {{cannot form a reference to 'void'}} } +auto& f4() { + return i; + return; // expected-error {{cannot deduce return type 'auto &' from omitted return expression}} +} +auto& f5() { + return i; + return void(); // expected-error@-2 {{cannot form a reference to 'void'}} +} +auto& f6() { return 42; } // expected-error {{non-const lvalue reference to type 'int' cannot bind to a temporary of type 'int'}} + +auto l1 = []() -> auto& { }; // expected-error {{cannot deduce return type 'auto &' for function with no return statements}} +auto l2 = []() -> auto& { + return; // expected-error {{cannot deduce return type 'auto &' from omitted return expression}} +}; +auto l3 = []() -> auto& { // expected-error {{cannot form a reference to 'void'}} + return void(); +}; +auto l4 = []() -> auto& { + return i; + return; // expected-error {{cannot deduce return type 'auto &' from omitted return expression}} +}; +auto l5 = []() -> auto& { // expected-error {{cannot form a reference to 'void'}} + return i; + return void(); +}; +auto l6 = []() -> auto& { + return 42; // expected-error {{non-const lvalue reference to type 'int' cannot bind to a temporary of type 'int'}} +}; +} // namespace AutoRef From 32b73bc6ab8234b670c34d5ef999300e072cc706 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Tue, 19 Oct 2021 09:12:57 -0700 Subject: [PATCH 121/748] Add support for floating-point option `ffp-eval-method` and for `pragma clang fp eval_method`. 
https://reviews.llvm.org/D109239 --- clang/docs/LanguageExtensions.rst | 32 ++++ clang/docs/UsersManual.rst | 27 +++ .../include/clang/Basic/DiagnosticLexKinds.td | 4 + .../clang/Basic/DiagnosticParseKinds.td | 3 + clang/include/clang/Basic/FPOptions.def | 1 + clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Basic/LangOptions.h | 18 ++ clang/include/clang/Basic/TargetInfo.h | 6 +- clang/include/clang/Driver/Options.td | 5 + clang/include/clang/Lex/Preprocessor.h | 41 +++++ clang/include/clang/Parse/Parser.h | 1 + clang/include/clang/Sema/Sema.h | 14 +- clang/lib/Basic/Targets/OSTargets.h | 4 +- clang/lib/Basic/Targets/X86.h | 14 +- clang/lib/Driver/ToolChains/Clang.cpp | 17 ++ clang/lib/Frontend/InitPreprocessor.cpp | 1 - clang/lib/Lex/PPMacroExpansion.cpp | 12 ++ clang/lib/Parse/ParsePragma.cpp | 25 ++- clang/lib/Parse/ParseStmt.cpp | 10 ++ clang/lib/Sema/Sema.cpp | 21 +++ clang/lib/Sema/SemaAttr.cpp | 21 +++ clang/lib/Sema/SemaExpr.cpp | 34 ++++ .../test/CodeGen/X86/32bit-behavior-no-eval.c | 30 ++++ clang/test/CodeGen/X86/32bit-behavior.c | 109 ++++++++++++ clang/test/CodeGen/X86/fp-eval-method.c | 20 +++ clang/test/CodeGen/flt_eval_macro.cpp | 79 +++++++++ clang/test/CodeGen/fp-floatcontrol-pragma.cpp | 166 +++++++++++++++++- clang/test/Preprocessor/flt_eval_macro.cpp | 59 +++++++ clang/test/Preprocessor/init-aarch64.c | 3 - clang/test/Preprocessor/init-arm.c | 5 - clang/test/Preprocessor/init-mips.c | 6 - clang/test/Preprocessor/init-ppc.c | 5 - clang/test/Preprocessor/init-ppc64.c | 4 - clang/test/Preprocessor/init-s390x.c | 1 - clang/test/Preprocessor/init-v7k-compat.c | 1 - clang/test/Preprocessor/init-x86.c | 15 -- clang/test/Preprocessor/init.c | 11 -- clang/test/Sema/fp-eval-pragma.cpp | 87 +++++++++ clang/test/Sema/x86-eval-method.c | 18 ++ clang/test/Sema/x86_64-eval-method.c | 13 ++ 40 files changed, 872 insertions(+), 72 deletions(-) create mode 100644 clang/test/CodeGen/X86/32bit-behavior-no-eval.c create mode 100644 
clang/test/CodeGen/X86/32bit-behavior.c create mode 100644 clang/test/CodeGen/X86/fp-eval-method.c create mode 100644 clang/test/CodeGen/flt_eval_macro.cpp create mode 100644 clang/test/Preprocessor/flt_eval_macro.cpp create mode 100644 clang/test/Sema/fp-eval-pragma.cpp create mode 100644 clang/test/Sema/x86-eval-method.c create mode 100644 clang/test/Sema/x86_64-eval-method.c diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index f45d88092eb4a..5249d3f3f7930 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3907,6 +3907,38 @@ A ``#pragma clang fp`` pragma may contain any number of options: ... } +``#pragma clang fp eval_method`` allows floating-point behavior to be specified +for a section of the source code. This pragma can appear at file or namespace +scope, or at the start of a compound statement (excluding comments). +The pragma is active within the scope of the compound statement. + +When ``pragma clang fp eval_method(source)`` is enabled, the section of code +governed by the pragma behaves as though the command-line option +``-ffp-eval-method=source`` is enabled. Rounds intermediate results to +source-defined precision. + +When ``pragma clang fp eval_method(double)`` is enabled, the section of code +governed by the pragma behaves as though the command-line option +``-ffp-eval-method=double`` is enabled. Rounds intermediate results to +``double`` precision. + +When ``pragma clang fp eval_method(extended)`` is enabled, the section of code +governed by the pragma behaves as though the command-line option +``-ffp-eval-method=extended`` is enabled. Rounds intermediate results to +target-dependent ``long double`` precision. In Win32 programming, for instance, +the long double data type maps to the double, 64-bit precision data type. + +The full syntax this pragma supports is +``#pragma clang fp eval_method(source|double|extended)``. + +.. code-block:: c++ + + for(...) 
{ + // The compiler will use long double as the floating-point evaluation + // method. + #pragma clang fp eval_method(extended) + a = b[i] * c[i] + e; + } The ``#pragma float_control`` pragma allows precise floating-point semantics and floating-point exception behavior to be specified diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 981909aa16eaf..4a776eb86775c 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1566,6 +1566,22 @@ Note that floating-point operations performed as part of constant initialization * ``maytrap`` The compiler avoids transformations that may raise exceptions that would not have been raised by the original code. Constant folding performed by the compiler is exempt from this option. * ``strict`` The compiler ensures that all transformations strictly preserve the floating point exception semantics of the original code. +.. option:: -ffp-eval-method= + + Specify the floating-point evaluation method for intermediate results within + a single expression of the code. + + Valid values are: ``source``, ``double``, and ``extended``. + For 64-bit targets, the default value is ``source``. For 32-bit x86 targets + however, in the case of NETBSD 6.99.26 and under, the default value is + ``double``; in the case of NETBSD greater than 6.99.26, with NoSSE, the + default value is ``extended``, with SSE the default value is ``source``. + Details: + + * ``source`` The compiler uses the floating-point type declared in the source program as the evaluation method. + * ``double`` The compiler uses ``double`` as the floating-point evaluation method for all float expressions of type that is narrower than ``double``. + * ``extended`` The compiler uses ``long double`` as the floating-point evaluation method for all float expressions of type that is narrower than ``long double``. + .. 
option:: -f[no-]protect-parens: This option pertains to floating-point types, complex types with @@ -1587,6 +1603,17 @@ Note that floating-point operations performed as part of constant initialization has no effect because the optimizer is prohibited from making unsafe transformations. +.. _FLT_EVAL_METHOD: + +A note about ``__FLT_EVAL_METHOD__`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The macro ``__FLT_EVAL_METHOD__`` will expand to either the value set from the +command line option ``ffp-eval-method`` or to the value from the target info +setting. The ``__FLT_EVAL_METHOD__`` macro cannot expand to the correct +evaluation method in the presence of a ``#pragma`` which alters the evaluation +method. An error is issued if ``__FLT_EVAL_METHOD__`` is expanded inside a scope +modified by ``#pragma clang fp eval_method``. + .. _fp-constant-eval: A note about Floating Point Constant Evaluation diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index a4436208799f9..0f424b02c812a 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -321,6 +321,10 @@ def err_pragma_include_instead_system_reserved : Error< "header '%0' is an implementation detail; #include %select{'%2'|either '%2' " "or '%3'|one of %2}1 instead">; +def err_illegal_use_of_flt_eval_macro : Error< + "'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing " + "'#pragma clang fp eval_method'">; + def pp_poisoning_existing_macro : Warning<"poisoning existing macro">; def pp_out_of_date_dependency : Warning< "current file is older than dependency %0">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e23810f402365..bcf8186896303 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1267,6 +1267,9 @@ def 
err_pragma_attribute_namespace_on_attribute : Error< def note_pragma_attribute_namespace_on_attribute : Note< "omit the namespace to add attributes to the most-recently" " pushed attribute group">; +def warn_no_support_for_eval_method_source_on_m32 : Warning< + "Setting the floating point evaluation method to `source` on a target" + " without SSE is not supported.">, InGroup; // OpenCL EXTENSION pragma (OpenCL 1.1 [9.1]) def warn_pragma_expected_colon : Warning< diff --git a/clang/include/clang/Basic/FPOptions.def b/clang/include/clang/Basic/FPOptions.def index a93fa475cd5f6..224c1827144f5 100644 --- a/clang/include/clang/Basic/FPOptions.def +++ b/clang/include/clang/Basic/FPOptions.def @@ -23,4 +23,5 @@ OPTION(NoHonorInfs, bool, 1, NoHonorNaNs) OPTION(NoSignedZero, bool, 1, NoHonorInfs) OPTION(AllowReciprocal, bool, 1, NoSignedZero) OPTION(AllowApproxFunc, bool, 1, AllowReciprocal) +OPTION(FPEvalMethod, LangOptions::FPEvalMethodKind, 2, AllowApproxFunc) #undef OPTION diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 4651f4fff6aa0..89b11fdea89b2 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -301,6 +301,7 @@ BENIGN_ENUM_LANGOPT(DefaultFPContractMode, FPModeKind, 2, FPM_Off, "FP contracti COMPATIBLE_LANGOPT(ExpStrictFP, 1, false, "Enable experimental strict floating point") BENIGN_ENUM_LANGOPT(FPRoundingMode, RoundingMode, 3, RoundingMode::NearestTiesToEven, "FP Rounding Mode type") BENIGN_ENUM_LANGOPT(FPExceptionMode, FPExceptionModeKind, 2, FPE_Ignore, "FP Exception Behavior Mode type") +BENIGN_ENUM_LANGOPT(FPEvalMethod, FPEvalMethodKind, 2, FEM_UnsetOnCommandLine, "FP type used for floating point arithmetic") LANGOPT(NoBitFieldTypeAlign , 1, 0, "bit-field type alignment") LANGOPT(HexagonQdsp6Compat , 1, 0, "hexagon-qdsp6 backward compatibility") LANGOPT(ObjCAutoRefCount , 1, 0, "Objective-C automated reference counting") diff --git 
a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 50c7f038fc6be..2e334e375950e 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -235,6 +235,24 @@ class LangOptions : public LangOptionsBase { FPE_Strict }; + /// Possible float expression evaluation method choices. + enum FPEvalMethodKind { + /// The evaluation method cannot be determined or is inconsistent for this + /// target. + FEM_Indeterminable = -1, + /// Use the declared type for fp arithmetic. + FEM_Source = 0, + /// Use the type double for fp arithmetic. + FEM_Double = 1, + /// Use extended type for fp arithmetic. + FEM_Extended = 2, + /// Used only for FE option processing; this is only used to indicate that + /// the user did not specify an explicit evaluation method on the command + /// line and so the target should be queried for its default evaluation + /// method instead. + FEM_UnsetOnCommandLine = 3 + }; + /// Possible exception handling behavior. enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 22918f7e12e84..8e18ded7d3765 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -726,7 +726,11 @@ class TargetInfo : public virtual TransferrableTargetInfo, } /// Return the value for the C99 FLT_EVAL_METHOD macro. - virtual unsigned getFloatEvalMethod() const { return 0; } + virtual LangOptions::FPEvalMethodKind getFPEvalMethod() const { + return LangOptions::FPEvalMethodKind::FEM_Source; + } + + virtual bool supportSourceEvalMethod() const { return true; } // getLargeArrayMinWidth/Align - Return the minimum array size that is // 'large' and its alignment. 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 37a8e9b77bbfb..10a98f637cddd 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1495,6 +1495,11 @@ def : Flag<["-"], "fextended-identifiers">, Group; def : Flag<["-"], "fno-extended-identifiers">, Group, Flags<[Unsupported]>; def fhosted : Flag<["-"], "fhosted">, Group; def fdenormal_fp_math_EQ : Joined<["-"], "fdenormal-fp-math=">, Group, Flags<[CC1Option]>; +def ffp_eval_method_EQ : Joined<["-"], "ffp-eval-method=">, Group, Flags<[CC1Option]>, + HelpText<"Specifies the evaluation method to use for floating-point arithmetic.">, + Values<"source,double,extended">, NormalizedValuesScope<"LangOptions">, + NormalizedValues<["FEM_Source", "FEM_Double", "FEM_Extended"]>, + MarshallingInfoEnum, "FEM_UnsetOnCommandLine">; def ffp_model_EQ : Joined<["-"], "ffp-model=">, Group, Flags<[NoXarchOption]>, HelpText<"Controls the semantics of floating-point calculations.">; def ffp_exception_behavior_EQ : Joined<["-"], "ffp-exception-behavior=">, Group, Flags<[CC1Option]>, diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 2802329a60220..f2c84e43ddca3 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -179,12 +179,27 @@ class Preprocessor { IdentifierInfo *Ident__is_target_vendor; // __is_target_vendor IdentifierInfo *Ident__is_target_os; // __is_target_os IdentifierInfo *Ident__is_target_environment; // __is_target_environment + IdentifierInfo *Ident__FLT_EVAL_METHOD__; // __FLT_EVAL_METHOD // Weak, only valid (and set) while InMacroArgs is true. Token* ArgMacro; SourceLocation DATELoc, TIMELoc; + // FEM_UnsetOnCommandLine means that an explicit evaluation method was + // not specified on the command line. The target is queried to set the + // default evaluation method. 
+ LangOptions::FPEvalMethodKind CurrentFPEvalMethod = + LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine; + + // The most recent pragma location where the floating point evaluation + // method was modified. This is used to determine whether the + // 'pragma clang fp eval_method' was used within the current scope. + SourceLocation LastFPEvalPragmaLocation; + + LangOptions::FPEvalMethodKind TUFPEvalMethod = + LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine; + + // Next __COUNTER__ value, starts at 0. unsigned CounterValue = 0; @@ -2048,6 +2063,32 @@ class Preprocessor { unsigned getCounterValue() const { return CounterValue; } void setCounterValue(unsigned V) { CounterValue = V; } + LangOptions::FPEvalMethodKind getCurrentFPEvalMethod() const { + assert(CurrentFPEvalMethod != LangOptions::FEM_UnsetOnCommandLine && + "FPEvalMethod should be set either from command line or from the " + "target info"); + return CurrentFPEvalMethod; + } + + LangOptions::FPEvalMethodKind getTUFPEvalMethod() const { + return TUFPEvalMethod; + } + + SourceLocation getLastFPEvalPragmaLocation() const { + return LastFPEvalPragmaLocation; + } + + void setCurrentFPEvalMethod(SourceLocation PragmaLoc, + LangOptions::FPEvalMethodKind Val) { + assert(Val != LangOptions::FEM_UnsetOnCommandLine && + "FPEvalMethod should never be set to FEM_UnsetOnCommandLine"); + // This is the location of the '#pragma clang fp eval_method' where the + // execution state is modified. + LastFPEvalPragmaLocation = PragmaLoc; + CurrentFPEvalMethod = Val; + TUFPEvalMethod = Val; + } + /// Retrieves the module that we're currently building, if any.
Module *getCurrentModule(); diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 981800a7e2356..d2e588992238d 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -184,6 +184,7 @@ class Parser : public CodeCompletionHandler { std::unique_ptr PCSectionHandler; std::unique_ptr MSCommentHandler; std::unique_ptr MSDetectMismatchHandler; + std::unique_ptr FPEvalMethodHandler; std::unique_ptr FloatControlHandler; std::unique_ptr MSPointersToMembers; std::unique_ptr MSVtorDisp; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index c1e846c55dee7..60ee577fca06a 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1541,19 +1541,16 @@ class Sema final { /// statements. class FPFeaturesStateRAII { public: - FPFeaturesStateRAII(Sema &S) : S(S), OldFPFeaturesState(S.CurFPFeatures) { - OldOverrides = S.FpPragmaStack.CurrentValue; - } - ~FPFeaturesStateRAII() { - S.CurFPFeatures = OldFPFeaturesState; - S.FpPragmaStack.CurrentValue = OldOverrides; - } + FPFeaturesStateRAII(Sema &S); + ~FPFeaturesStateRAII(); FPOptionsOverride getOverrides() { return OldOverrides; } private: Sema& S; FPOptions OldFPFeaturesState; FPOptionsOverride OldOverrides; + LangOptions::FPEvalMethodKind OldEvalMethod; + SourceLocation OldFPPragmaLocation; }; void addImplicitTypedef(StringRef Name, QualType T); @@ -10131,6 +10128,9 @@ class Sema final { !CurFPFeatures.getAllowApproxFunc(); } + void ActOnPragmaFPEvalMethod(SourceLocation Loc, + LangOptions::FPEvalMethodKind Value); + /// ActOnPragmaFloatControl - Call on well-formed \#pragma float_control void ActOnPragmaFloatControl(SourceLocation Loc, PragmaMsStackAction Action, PragmaFloatControlKind Value); diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 3c1830d5f8e89..f61652d285a89 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ 
-749,7 +749,9 @@ class AIXTargetInfo : public OSTargetInfo { } // AIX sets FLT_EVAL_METHOD to be 1. - unsigned getFloatEvalMethod() const override { return 1; } + LangOptions::FPEvalMethodKind getFPEvalMethod() const override { + return LangOptions::FPEvalMethodKind::FEM_Double; + } bool defaultsToAIXPowerAlignment() const override { return true; } }; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index d1b66432e38b4..e0bb3c344c5b6 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -168,11 +168,15 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { return LongDoubleFormat == &llvm::APFloat::IEEEquad() ? "g" : "e"; } - unsigned getFloatEvalMethod() const override { + LangOptions::FPEvalMethodKind getFPEvalMethod() const override { // X87 evaluates with 80 bits "long double" precision. - return SSELevel == NoSSE ? 2 : 0; + return SSELevel == NoSSE ? LangOptions::FPEvalMethodKind::FEM_Extended + : LangOptions::FPEvalMethodKind::FEM_Source; } + // EvalMethod `source` is not supported for targets with `NoSSE` feature. + bool supportSourceEvalMethod() const override { return SSELevel > NoSSE; } + ArrayRef getGCCRegNames() const override; ArrayRef getGCCRegAliases() const override { @@ -471,13 +475,13 @@ class LLVM_LIBRARY_VISIBILITY NetBSDI386TargetInfo NetBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : NetBSDTargetInfo(Triple, Opts) {} - unsigned getFloatEvalMethod() const override { + LangOptions::FPEvalMethodKind getFPEvalMethod() const override { VersionTuple OsVersion = getTriple().getOSVersion(); // New NetBSD uses the default rounding mode. if (OsVersion >= VersionTuple(6, 99, 26) || OsVersion.getMajor() == 0) - return X86_32TargetInfo::getFloatEvalMethod(); + return X86_32TargetInfo::getFPEvalMethod(); // NetBSD before 6.99.26 defaults to "double" rounding. 
- return 1; + return LangOptions::FPEvalMethodKind::FEM_Double; } }; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a16175ebebbca..5877a33df1017 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2726,6 +2726,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, StringRef FPModel = ""; // -ffp-exception-behavior options: strict, maytrap, ignore StringRef FPExceptionBehavior = ""; + // -ffp-eval-method options: double, extended, source + StringRef FPEvalMethod = ""; const llvm::DenormalMode DefaultDenormalFPMath = TC.getDefaultDenormalModeForType(Args, JA); const llvm::DenormalMode DefaultDenormalFP32Math = @@ -2921,6 +2923,18 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, break; } + // Validate and pass through -ffp-eval-method option. + case options::OPT_ffp_eval_method_EQ: { + StringRef Val = A->getValue(); + if (Val.equals("double") || Val.equals("extended") || + Val.equals("source")) + FPEvalMethod = Val; + else + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getOption().getName() << Val; + break; + } + case options::OPT_ffinite_math_only: HonorINFs = false; HonorNaNs = false; @@ -3076,6 +3090,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, CmdArgs.push_back(Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior)); + if (!FPEvalMethod.empty()) + CmdArgs.push_back(Args.MakeArgString("-ffp-eval-method=" + FPEvalMethod)); + ParseMRecip(D, Args, CmdArgs); // -ffast-math enables the __FAST_MATH__ preprocessor macro, but check for the diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index bf8a0b2abe22e..ff507e2c00aaa 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1136,7 +1136,6 @@ static void InitializePredefinedMacros(const TargetInfo &TI, } // Macros 
to control C99 numerics and - Builder.defineMacro("__FLT_EVAL_METHOD__", Twine(TI.getFloatEvalMethod())); Builder.defineMacro("__FLT_RADIX__", "2"); Builder.defineMacro("__DECIMAL_DIG__", "__LDBL_DECIMAL_DIG__"); diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp index a29ff215d7ea0..82fc57c8f2e88 100644 --- a/clang/lib/Lex/PPMacroExpansion.cpp +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -342,6 +342,7 @@ void Preprocessor::RegisterBuiltinMacros() { Ident__TIME__ = RegisterBuiltinMacro(*this, "__TIME__"); Ident__COUNTER__ = RegisterBuiltinMacro(*this, "__COUNTER__"); Ident_Pragma = RegisterBuiltinMacro(*this, "_Pragma"); + Ident__FLT_EVAL_METHOD__ = RegisterBuiltinMacro(*this, "__FLT_EVAL_METHOD__"); // C++ Standing Document Extensions. if (getLangOpts().CPlusPlus) @@ -1574,6 +1575,17 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) { // Surround the string with " and strip the trailing newline. OS << '"' << StringRef(Result).drop_back() << '"'; Tok.setKind(tok::string_literal); + } else if (II == Ident__FLT_EVAL_METHOD__) { + // __FLT_EVAL_METHOD__ is set to the default value. + OS << getTUFPEvalMethod(); + // __FLT_EVAL_METHOD__ expands to a simple numeric value. + Tok.setKind(tok::numeric_constant); + if (getLastFPEvalPragmaLocation().isValid()) { + // The program is ill-formed. The value of __FLT_EVAL_METHOD__ is altered + // by the pragma. + Diag(Tok, diag::err_illegal_use_of_flt_eval_macro); + Diag(getLastFPEvalPragmaLocation(), diag::note_pragma_entered_here); + } } else if (II == Ident__COUNTER__) { // __COUNTER__ expands to a simple numeric value. OS << CounterValue++; diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp index 27e8501278626..5c6aa0e47635b 100644 --- a/clang/lib/Parse/ParsePragma.cpp +++ b/clang/lib/Parse/ParsePragma.cpp @@ -3028,12 +3028,13 @@ void PragmaOptimizeHandler::HandlePragma(Preprocessor &PP, namespace { /// Used as the annotation value for tok::annot_pragma_fp. 
struct TokFPAnnotValue { - enum FlagKinds { Contract, Reassociate, Exceptions }; + enum FlagKinds { Contract, Reassociate, Exceptions, EvalMethod }; enum FlagValues { On, Off, Fast }; llvm::Optional ContractValue; llvm::Optional ReassociateValue; llvm::Optional ExceptionsValue; + llvm::Optional EvalMethodValue; }; } // end anonymous namespace @@ -3060,6 +3061,7 @@ void PragmaFPHandler::HandlePragma(Preprocessor &PP, .Case("contract", TokFPAnnotValue::Contract) .Case("reassociate", TokFPAnnotValue::Reassociate) .Case("exceptions", TokFPAnnotValue::Exceptions) + .Case("eval_method", TokFPAnnotValue::EvalMethod) .Default(None); if (!FlagKind) { PP.Diag(Tok.getLocation(), diag::err_pragma_fp_invalid_option) << @@ -3074,8 +3076,11 @@ void PragmaFPHandler::HandlePragma(Preprocessor &PP, return; } PP.Lex(Tok); + bool isEvalMethodDouble = + Tok.is(tok::kw_double) && FlagKind == TokFPAnnotValue::EvalMethod; - if (Tok.isNot(tok::identifier)) { + // Don't diagnose if we have an eval_method pragma with "double" kind. 
+ if (Tok.isNot(tok::identifier) && !isEvalMethodDouble) { PP.Diag(Tok.getLocation(), diag::err_pragma_fp_invalid_argument) << PP.getSpelling(Tok) << OptionInfo->getName() << static_cast(*FlagKind); @@ -3121,6 +3126,19 @@ void PragmaFPHandler::HandlePragma(Preprocessor &PP, << PP.getSpelling(Tok) << OptionInfo->getName() << *FlagKind; return; } + } else if (FlagKind == TokFPAnnotValue::EvalMethod) { + AnnotValue->EvalMethodValue = + llvm::StringSwitch>( + II->getName()) + .Case("source", LangOptions::FPEvalMethodKind::FEM_Source) + .Case("double", LangOptions::FPEvalMethodKind::FEM_Double) + .Case("extended", LangOptions::FPEvalMethodKind::FEM_Extended) + .Default(llvm::None); + if (!AnnotValue->EvalMethodValue) { + PP.Diag(Tok.getLocation(), diag::err_pragma_fp_invalid_argument) + << PP.getSpelling(Tok) << OptionInfo->getName() << *FlagKind; + return; + } } PP.Lex(Tok); @@ -3223,6 +3241,9 @@ void Parser::HandlePragmaFP() { if (AnnotValue->ExceptionsValue) Actions.ActOnPragmaFPExceptions(Tok.getLocation(), *AnnotValue->ExceptionsValue); + if (AnnotValue->EvalMethodValue) + Actions.ActOnPragmaFPEvalMethod(Tok.getLocation(), + *AnnotValue->EvalMethodValue); ConsumeAnnotationToken(); } diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index ee07775b6346f..cadedf6d98dbd 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -1153,6 +1153,16 @@ StmtResult Parser::ParseCompoundStatementBody(bool isStmtExpr) { if (R.isUsable()) Stmts.push_back(R.get()); } + // Warn the user that using option `-ffp-eval-method=source` on a + // 32-bit target and feature `sse` disabled, or using + // `pragma clang fp eval_method=source` and feature `sse` disabled, is not + // supported. 
+ if (!PP.getTargetInfo().supportSourceEvalMethod() && + (PP.getLastFPEvalPragmaLocation().isValid() || + PP.getCurrentFPEvalMethod() == + LangOptions::FPEvalMethodKind::FEM_Source)) + Diag(Tok.getLocation(), + diag::warn_no_support_for_eval_method_source_on_m32); SourceLocation CloseLoc = Tok.getLocation(); diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 7b57c8da4e9cc..db3eda622639f 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -242,6 +242,15 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, SemaPPCallbackHandler = Callbacks.get(); PP.addPPCallbacks(std::move(Callbacks)); SemaPPCallbackHandler->set(*this); + if (getLangOpts().getFPEvalMethod() == LangOptions::FEM_UnsetOnCommandLine) + // Use setting from TargetInfo. + PP.setCurrentFPEvalMethod(SourceLocation(), + ctxt.getTargetInfo().getFPEvalMethod()); + else + // Set initial value of __FLT_EVAL_METHOD__ from the command line. + PP.setCurrentFPEvalMethod(SourceLocation(), + getLangOpts().getFPEvalMethod()); + CurFPFeatures.setFPEvalMethod(PP.getCurrentFPEvalMethod()); } // Anchor Sema's type info to this TU. 
@@ -2630,3 +2639,15 @@ const llvm::MapVector & Sema::getMismatchingDeleteExpressions() const { return DeleteExprs; } + +Sema::FPFeaturesStateRAII::FPFeaturesStateRAII(Sema &S) + : S(S), OldFPFeaturesState(S.CurFPFeatures), + OldOverrides(S.FpPragmaStack.CurrentValue), + OldEvalMethod(S.PP.getCurrentFPEvalMethod()), + OldFPPragmaLocation(S.PP.getLastFPEvalPragmaLocation()) {} + +Sema::FPFeaturesStateRAII::~FPFeaturesStateRAII() { + S.CurFPFeatures = OldFPFeaturesState; + S.FpPragmaStack.CurrentValue = OldOverrides; + S.PP.setCurrentFPEvalMethod(OldFPPragmaLocation, OldEvalMethod); +} diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp index 38e6e60af90db..d623060fd10cf 100644 --- a/clang/lib/Sema/SemaAttr.cpp +++ b/clang/lib/Sema/SemaAttr.cpp @@ -470,6 +470,27 @@ void Sema::ActOnPragmaDetectMismatch(SourceLocation Loc, StringRef Name, Consumer.HandleTopLevelDecl(DeclGroupRef(PDMD)); } +void Sema::ActOnPragmaFPEvalMethod(SourceLocation Loc, + LangOptions::FPEvalMethodKind Value) { + FPOptionsOverride NewFPFeatures = CurFPFeatureOverrides(); + switch (Value) { + default: + llvm_unreachable("invalid pragma eval_method kind"); + case LangOptions::FEM_Source: + NewFPFeatures.setFPEvalMethodOverride(LangOptions::FEM_Source); + break; + case LangOptions::FEM_Double: + NewFPFeatures.setFPEvalMethodOverride(LangOptions::FEM_Double); + break; + case LangOptions::FEM_Extended: + NewFPFeatures.setFPEvalMethodOverride(LangOptions::FEM_Extended); + break; + } + FpPragmaStack.Act(Loc, PSK_Set, StringRef(), NewFPFeatures); + CurFPFeatures = NewFPFeatures.applyOverrides(getLangOpts()); + PP.setCurrentFPEvalMethod(Loc, Value); +} + void Sema::ActOnPragmaFloatControl(SourceLocation Loc, PragmaMsStackAction Action, PragmaFloatControlKind Value) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 22b3f371afe79..88fc89bec629a 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -773,6 +773,40 @@ ExprResult 
Sema::UsualUnaryConversions(Expr *E) { QualType Ty = E->getType(); assert(!Ty.isNull() && "UsualUnaryConversions - missing type"); + LangOptions::FPEvalMethodKind EvalMethod = CurFPFeatures.getFPEvalMethod(); + if (EvalMethod != LangOptions::FEM_Source && Ty->isFloatingType() && + (getLangOpts().getFPEvalMethod() != + LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine || + PP.getLastFPEvalPragmaLocation().isValid())) { + switch (EvalMethod) { + default: + llvm_unreachable("Unrecognized float evaluation method"); + break; + case LangOptions::FEM_UnsetOnCommandLine: + llvm_unreachable("Float evaluation method should be set by now"); + break; + case LangOptions::FEM_Double: + if (Context.getFloatingTypeOrder(Context.DoubleTy, Ty) > 0) + // Widen the expression to double. + return Ty->isComplexType() + ? ImpCastExprToType(E, + Context.getComplexType(Context.DoubleTy), + CK_FloatingComplexCast) + : ImpCastExprToType(E, Context.DoubleTy, CK_FloatingCast); + break; + case LangOptions::FEM_Extended: + if (Context.getFloatingTypeOrder(Context.LongDoubleTy, Ty) > 0) + // Widen the expression to long double. + return Ty->isComplexType() + ? 
ImpCastExprToType( + E, Context.getComplexType(Context.LongDoubleTy), + CK_FloatingComplexCast) + : ImpCastExprToType(E, Context.LongDoubleTy, + CK_FloatingCast); + break; + } + } + // Half FP have to be promoted to float unless it is natively supported if (Ty->isHalfType() && !getLangOpts().NativeHalfType) return ImpCastExprToType(Res.get(), Context.FloatTy, CK_FloatingCast); diff --git a/clang/test/CodeGen/X86/32bit-behavior-no-eval.c b/clang/test/CodeGen/X86/32bit-behavior-no-eval.c new file mode 100644 index 0000000000000..d040e827ce31c --- /dev/null +++ b/clang/test/CodeGen/X86/32bit-behavior-no-eval.c @@ -0,0 +1,30 @@ +// SSE +// RUN: %clang_cc1 \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=CHECK %s + +// NO SSE +// RUN: %clang_cc1 \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=CHECK %s + +// NO SSE Fast Math +// RUN: %clang_cc1 \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -ffast-math -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-FM %s + +float addit(float a, float b, float c) { + // CHECK: load float, float* + // CHECK: load float, float* + // CHECK: fadd float + // CHECK: load float, float* + // CHECK: fadd float + + // CHECK-FM: load float, float* + // CHECK-FM: load float, float* + // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn float + // CHECK-FM: load float, float* + // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn float + + return a + b + c; +} diff --git a/clang/test/CodeGen/X86/32bit-behavior.c b/clang/test/CodeGen/X86/32bit-behavior.c new file mode 100644 index 0000000000000..a7e0f008c9f35 --- /dev/null +++ b/clang/test/CodeGen/X86/32bit-behavior.c @@ -0,0 +1,109 @@ +// SSE +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ +// RUN: -emit-llvm -o 
- %s -ffp-eval-method=source \ +// RUN: | FileCheck -check-prefix=CHECK-SRC %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ +// RUN: | FileCheck -check-prefix=CHECK-DBL %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=extended \ +// RUN: | FileCheck -check-prefix=CHECK-DBL %s + +// SSE Fast Math +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=source \ +// RUN: -ffast-math | FileCheck -check-prefix=CHECK-FM-SRC %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ +// RUN: -ffast-math | FileCheck -check-prefix=CHECK-FM %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=extended \ +// RUN: -ffast-math | FileCheck -check-prefix=CHECK-FM %s + +// NO SSE +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=source \ +// RUN: | FileCheck -check-prefix=CHECK-SRC %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ +// RUN: | FileCheck -check-prefix=CHECK-DBL %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -o - %s 
-ffp-eval-method=extended \ +// RUN: | FileCheck -check-prefix=CHECK-DBL %s + +// NO SSE Fast Math +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=source \ +// RUN: -ffast-math | FileCheck -check-prefix=CHECK %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ +// RUN: -ffast-math | FileCheck -check-prefix=CHECK-DBL-FM %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=extended \ +// RUN: -ffast-math | FileCheck -check-prefix=CHECK-DBL-FM %s + +float addit(float a, float b, float c) { + // CHECK-SRC: load float, float* + // CHECK-SRC: load float, float* + // CHECK-SRC: fadd float + // CHECK-SRC: load float, float* + // CHECK-SRC: fadd float + + // CHECK-FM-SRC: load float, float* + // CHECK-FM-SRC: load float, float* + // CHECK-FM-SRC: fadd reassoc nnan ninf nsz arcp afn float + // CHECK-FM-SRC: load float, float* + // CHECK-FM-SRC: fadd reassoc nnan ninf nsz arcp afn float + + // CHECK-FM: load float, float* + // CHECK-FM: fpext float {{.*}} to double + // CHECK-FM: load float, float* + // CHECK-FM: fpext float {{.*}} to double + // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn double + // CHECK-FM: load float, float* + // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn double + // CHECK-FM: fptrunc double {{.*}} to float + + // CHECK-DBL: load float, float* + // CHECK-DBL: fpext float {{.*}} to double + // CHECK-DBL: load float, float* + // CHECK-DBL: fpext float {{.*}} to double + // CHECK-DBL: fadd double + // CHECK-DBL: load float, float* + // CHECK-DBL: fpext float {{.*}} to double + // CHECK-DBL: fadd double + // CHECK-DBL: fptrunc double {{.*}} to float + 
+ // CHECK-DBL-FM: load float, float* + // CHECK-DBL-FM: fpext float {{.*}} to double + // CHECK-DBL-FM: load float, float* + // CHECK-DBL-FM: fpext float {{.*}} to double + // CHECK-DBL-FM: fadd reassoc nnan ninf nsz arcp afn double + // CHECK-DBL-FM: load float, float* + // CHECK-DBL-FM: fpext float {{.*}} to double + // CHECK-DBL-FM: fadd reassoc nnan ninf nsz arcp afn double + // CHECK-DBL-FM: fptrunc double {{.*}} to float + + // CHECK: ret float + return a + b + c; +} diff --git a/clang/test/CodeGen/X86/fp-eval-method.c b/clang/test/CodeGen/X86/fp-eval-method.c new file mode 100644 index 0000000000000..5bfc3701050f5 --- /dev/null +++ b/clang/test/CodeGen/X86/fp-eval-method.c @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -triple i386-unknown-netbsd6 -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefixes=CHECK + +// RUN: %clang_cc1 -triple i386-unknown-netbsd7 -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefixes=CHECK-EXT + +// RUN: %clang_cc1 -triple i386--linux -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefixes=CHECK-EXT + +float f(float x, float y) { + // CHECK: define{{.*}} float @f + // CHECK: fadd float + return 2.0f + x + y; +} + +int getEvalMethod() { + // CHECK: ret i32 1 + // CHECK-EXT: ret i32 2 + return __FLT_EVAL_METHOD__; +} diff --git a/clang/test/CodeGen/flt_eval_macro.cpp b/clang/test/CodeGen/flt_eval_macro.cpp new file mode 100644 index 0000000000000..aa7455f0efe0b --- /dev/null +++ b/clang/test/CodeGen/flt_eval_macro.cpp @@ -0,0 +1,79 @@ +// RUN: %clang_cc1 -fexperimental-strict-floating-point -DEXCEPT=1 \ +// RUN: -fcxx-exceptions -triple x86_64-linux-gnu -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-SRC %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=source \ +// RUN: | FileCheck -check-prefix=CHECK-SRC %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s 
-ffp-eval-method=double \ +// RUN: | FileCheck -check-prefixes=CHECK-DBL %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=extended \ +// RUN: | FileCheck -check-prefixes=CHECK-EXT-FLT %s + +// RUN: %clang_cc1 -triple powerpc-unknown-aix -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=CHECK-DBL-PPC + +// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple i386-linux-gnu \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=extended -mlong-double-80 \ +// RUN: | FileCheck %s -check-prefix=CHECK-EXT-FLT + +int getFEM() { + // LABEL: define {{.*}}getFEM{{.*}} + return __FLT_EVAL_METHOD__; + // CHECK-SRC: ret {{.*}} 0 + // CHECK-DBL: ret {{.*}} 1 + // CHECK-DBL-PPC: ret {{.*}} 1 + // CHECK-EXT-FLT: ret {{.*}} 2 +} + +float func() { + // LABEL: define {{.*}}@_Z4func{{.*}} + float X = 100.0f; + float Y = -45.3f; + float Z = 393.78f; + float temp; +#if __FLT_EVAL_METHOD__ == 0 + temp = X + Y + Z; +#elif __FLT_EVAL_METHOD__ == 1 + temp = X * Y * Z; +#elif __FLT_EVAL_METHOD__ == 2 + temp = X * Y - Z; +#endif + // CHECK-SRC: load float, float* + // CHECK-SRC: load float, float* + // CHECK-SRC: fadd float + // CHECK-SRC: load float, float* + // CHECK-SRC: fadd float + + // CHECK-DBL: load float, float* + // CHECK-DBL: fpext float + // CHECK-DBL: load float, float* + // CHECK-DBL: fpext float + // CHECK-DBL: fmul double + // CHECK-DBL: load float, float* + // CHECK-DBL: fpext float + // CHECK-DBL: fmul double + // CHECK-DBL: fptrunc double + + // CHECK-EXT-FLT: load float, float* + // CHECK-EXT-FLT: fpext float + // CHECK-EXT-FLT: load float, float* + // CHECK-EXT-FLT: fpext float + // CHECK-EXT-FLT: fmul x86_fp80 + // CHECK-EXT-FLT: load float, float* + // CHECK-EXT-FLT: fpext float + // CHECK-EXT-FLT: fsub x86_fp80 + // CHECK-EXT-FLT: fptrunc x86_fp80 + + // CHECK-DBL-PPC: load float, float* + // CHECK-DBL-PPC: load float, float* + // CHECK-DBL-PPC: fmul float + // CHECK-DBL-PPC: 
load float, float* + // CHECK-DBL-PPC: fmul float + + return temp; +} diff --git a/clang/test/CodeGen/fp-floatcontrol-pragma.cpp b/clang/test/CodeGen/fp-floatcontrol-pragma.cpp index ef29d24de1dbc..966eaf6053970 100644 --- a/clang/test/CodeGen/fp-floatcontrol-pragma.cpp +++ b/clang/test/CodeGen/fp-floatcontrol-pragma.cpp @@ -1,7 +1,53 @@ -// RUN: %clang_cc1 -fexperimental-strict-floating-point -DEXCEPT=1 -fcxx-exceptions -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NS %s -// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fexperimental-strict-floating-point -DFENV_ON=1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-FENV %s -// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple %itanium_abi_triple -O3 -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-O3 %s +// RUN: %clang_cc1 -fexperimental-strict-floating-point -DEXCEPT=1 \ +// RUN: -fcxx-exceptions -triple x86_64-linux-gnu -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-NS %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s \ +// RUN: -check-prefixes=CHECK-DEFAULT,CHECK-CONST-ARGS + +// RUN: %clang_cc1 -fexperimental-strict-floating-point -DFENV_ON=1 \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-FENV %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point -DNF128 \ +// RUN: -triple %itanium_abi_triple -O3 -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=CHECK-O3 %s + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=source \ +// RUN: | FileCheck %s -check-prefixes=CHECK-SOURCE,CHECK-CONST-ARGS + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s 
-ffp-eval-method=double \ +// RUN: | FileCheck %s -check-prefixes=CHECK-DOUBLE,CHECK-CONST-ARGS + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=extended \ +// RUN: -mlong-double-80 | FileCheck %s \ +// RUN: -check-prefixes=CHECK-EXTENDED,CHECK-CONST-ARGS + +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-linux-gnu -emit-llvm -o - %s -ffp-eval-method=source \ +// RUN: | FileCheck %s -check-prefix=CHECK-SOURCE + +// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple i386-linux-gnu \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=double | FileCheck %s \ +// RUN: -check-prefix=CHECK-DOUBLE + +// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple i386-linux-gnu \ +// RUN: -emit-llvm -o - %s -ffp-eval-method=extended -mlong-double-80 \ +// RUN: | FileCheck %s -check-prefix=CHECK-EXTENDED + +// RUN: %clang_cc1 -triple powerpc-unknown-aix -DNF128 -emit-llvm -o - %s \ +// RUN: | FileCheck %s -check-prefix=CHECK-AIX + +bool f() { + // CHECK: define {{.*}}f{{.*}} + return __FLT_EVAL_METHOD__ < 0 && + __FLT_EVAL_METHOD__ == -1; + // CHECK: ret {{.*}} true +} // Verify float_control(precise, off) enables fast math flags on fp operations. 
float fp_precise_1(float a, float b, float c) { @@ -229,3 +275,115 @@ float try_lam(float x, unsigned n) { result = x + t; return result; } + +float mySub(float x, float y) { + // CHECK: define {{.*}}float {{.*}}mySub{{.*}} + // CHECK-NS: fsub float + // CHECK-SOURCE: fsub float + // CHECK-DOUBLE: fpext float + // CHECK-DOUBLE: fpext float + // CHECK-DOUBLE: fsub double + // CHECK-DOUBLE: fptrunc double {{.*}} to float + // CHECK-EXTENDED: fpext float + // CHECK-EXTENDED: fpext float + // CHECK-EXTENDED: fsub double + // CHECK-EXTENDED: fptrunc double {{.*}} to float + return x - y; +} + +float mySubSource(float x, float y) { +// CHECK: define {{.*}}float {{.*}}mySubSource{{.*}} +#pragma clang fp eval_method(source) + return x - y; + // CHECK: fsub float +} + +float mySubExtended(float x, float y) { +// CHECK: define {{.*}}float {{.*}}mySubExtended{{.*}} +#pragma clang fp eval_method(extended) + return x - y; + // CHECK: fpext float + // CHECK: fpext float + // CHECK: fsub x86_fp80 + // CHECK: fptrunc x86_fp80 {{.*}} to float + // CHECK-AIX: fsub double + // CHECK-AIX: fptrunc double +} + +float mySubDouble(float x, float y) { +// CHECK: define {{.*}}float {{.*}}mySubDouble{{.*}} +#pragma clang fp eval_method(double) + return x - y; + // CHECK: fpext float + // CHECK: fpext float + // CHECK: fsub double + // CHECK: fptrunc double {{.*}} to float +} + +#ifndef NF128 +__float128 mySub128(__float128 x, __float128 y) { + // CHECK: define {{.*}}mySub128{{.*}} + // Expect no fpext since fp128 is already widest + // CHECK: load fp128 + // CHECK-NEXT: load fp128 + // CHECK-NEXT: fsub fp128 + // CHECK-NEXT: ret fp128 + return x - y; +} +#endif + +void mySubfp16(__fp16 *res, __fp16 *x, __fp16 *y) { + // CHECK: define {{.*}}mySubfp16{{.*}} + *res = *x - *y; + // CHECK: load half + // CHECK-NEXT: load half + // CHECK-NEXT: fpext half{{.*}} + // CHECK-NEXT: load half + // CHECK-NEXT: load half + // CHECK-NS: fpext half{{.*}} to float + // CHECK-DEFAULT: fpext half{{.*}} to 
float + // CHECK-DOUBLE: fpext half{{.*}} to float + // CHECK-EXTENDED: fpext half{{.*}} to float + // CHECK-NEXT: fsub + // CHECK-NEXT: fptrunc {{.*}}to half + // CHECK-NS: fptrunc float {{.*}} to half + // CHECK-DOUBLE: fptrunc float {{.*}} to half + // CHECK-EXTENDED: fptrunc float {{.*}} to half +} + +float Div(float x, float y, float z) { + // CHECK: define{{.*}}float {{.*}}Div{{.*}} + // CHECK-CONST-ARGS: fdiv float + return x / (y / z); +} + +float DivExtended(float x, float y, float z) { +// CHECK: define{{.*}}float {{.*}}DivExtended{{.*}} +#pragma clang fp eval_method(extended) + // CHECK-CONST-ARGS: fdiv x86_fp80 + // CHECK-CONST-ARGS: fptrunc x86_fp80 + return x / (y / z); +} + +float DivDouble(float x, float y, float z) { +// CHECK: define{{.*}}float {{.*}}DivDouble{{.*}} +#pragma clang fp eval_method(double) + // CHECK-CONST-ARGS: fdiv double + // CHECK-CONST-ARGS: fptrunc double + return x / (y / z); +} + +float DivSource(float x, float y, float z) { +// CHECK: define{{.*}}float {{.*}}DivSource{{.*}} +#pragma clang fp eval_method(source) + // CHECK-CONST-ARGS: fdiv float + return x / (y / z); +} + +int main() { + float f = Div(4.2f, 1.0f, 3.0f); + float fextended = DivExtended(4.2f, 1.0f, 3.0f); + float fdouble = DivDouble(4.2f, 1.0f, 3.0f); + float fsource = DivSource(4.2f, 1.0f, 3.0f); + // CHECK: store float +} diff --git a/clang/test/Preprocessor/flt_eval_macro.cpp b/clang/test/Preprocessor/flt_eval_macro.cpp new file mode 100644 index 0000000000000..47f2592e261bd --- /dev/null +++ b/clang/test/Preprocessor/flt_eval_macro.cpp @@ -0,0 +1,59 @@ +// RUN: %clang_cc1 -E -dM %s -o - | FileCheck %s -strict-whitespace + +#ifdef __FLT_EVAL_METHOD__ +#if __FLT_EVAL_METHOD__ == 3 +#define __GLIBC_FLT_EVAL_METHOD 2 +#else +#define __GLIBC_FLT_EVAL_METHOD __FLT_EVAL_METHOD__ +#endif +#elif defined __x86_64__ +#define __GLIBC_FLT_EVAL_METHOD 0 +#else +#define __GLIBC_FLT_EVAL_METHOD 2 +#endif + +#if __GLIBC_FLT_EVAL_METHOD == 0 || __GLIBC_FLT_EVAL_METHOD == 16 
+#define Name "One" +#elif __GLIBC_FLT_EVAL_METHOD == 1 +#define Name "Two" +#elif __GLIBC_FLT_EVAL_METHOD == 2 +#define Name "Unset on command line" +#elif __GLIBC_FLT_EVAL_METHOD == 32 +#define Name "Four" +#elif __GLIBC_FLT_EVAL_METHOD == 33 +#define Name "Five" +#elif __GLIBC_FLT_EVAL_METHOD == 64 +#define Name "Six" +#elif __GLIBC_FLT_EVAL_METHOD == 65 +#define Name "Seven" +#elif __GLIBC_FLT_EVAL_METHOD == 128 +#define Name "Eight" +#elif __GLIBC_FLT_EVAL_METHOD == 129 +#define Name "Nine" +#else +#error "Unknown __GLIBC_FLT_EVAL_METHOD" +#endif + +int foo() { + // CHECK: #define Name "Unset on command line" + return Name; +} + +#if __FLT_EVAL_METHOD__ == 3 +#define Val "val0" +#endif + +#pragma fp eval_method(double) + +#if __FLT_EVAL_METHOD__ == 0 +#define Val "val1" +#elif __FLT_EVAL_METHOD__ == 1 +#define Val "val2" +#elif __FLT_EVAL_METHOD__ == 2 +#define Val "val3" +#endif + +int goo() { + // CHECK: #define Val "val0" + return Name; +} diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index f6809d8d9b48f..66cab8b1f8d04 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -93,7 +93,6 @@ // AARCH64-NEXT: #define __FLT_DENORM_MIN__ 1.40129846e-45F // AARCH64-NEXT: #define __FLT_DIG__ 6 // AARCH64-NEXT: #define __FLT_EPSILON__ 1.19209290e-7F -// AARCH64-NEXT: #define __FLT_EVAL_METHOD__ 0 // AARCH64-NEXT: #define __FLT_HAS_DENORM__ 1 // AARCH64-NEXT: #define __FLT_HAS_INFINITY__ 1 // AARCH64-NEXT: #define __FLT_HAS_QUIET_NAN__ 1 @@ -388,7 +387,6 @@ // AARCH64-DARWIN: #define __FLT_DENORM_MIN__ 1.40129846e-45F // AARCH64-DARWIN: #define __FLT_DIG__ 6 // AARCH64-DARWIN: #define __FLT_EPSILON__ 1.19209290e-7F -// AARCH64-DARWIN: #define __FLT_EVAL_METHOD__ 0 // AARCH64-DARWIN: #define __FLT_HAS_DENORM__ 1 // AARCH64-DARWIN: #define __FLT_HAS_INFINITY__ 1 // AARCH64-DARWIN: #define __FLT_HAS_QUIET_NAN__ 1 @@ -604,7 +602,6 @@ // AARCH64-MSVC: #define __FLT_DENORM_MIN__ 
1.40129846e-45F // AARCH64-MSVC: #define __FLT_DIG__ 6 // AARCH64-MSVC: #define __FLT_EPSILON__ 1.19209290e-7F -// AARCH64-MSVC: #define __FLT_EVAL_METHOD__ 0 // AARCH64-MSVC: #define __FLT_HAS_DENORM__ 1 // AARCH64-MSVC: #define __FLT_HAS_INFINITY__ 1 // AARCH64-MSVC: #define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-arm.c b/clang/test/Preprocessor/init-arm.c index 32eb2c513f8b0..2d1503c18560e 100644 --- a/clang/test/Preprocessor/init-arm.c +++ b/clang/test/Preprocessor/init-arm.c @@ -35,7 +35,6 @@ // ARM:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARM:#define __FLT_DIG__ 6 // ARM:#define __FLT_EPSILON__ 1.19209290e-7F -// ARM:#define __FLT_EVAL_METHOD__ 0 // ARM:#define __FLT_HAS_DENORM__ 1 // ARM:#define __FLT_HAS_INFINITY__ 1 // ARM:#define __FLT_HAS_QUIET_NAN__ 1 @@ -235,7 +234,6 @@ // ARM-BE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARM-BE:#define __FLT_DIG__ 6 // ARM-BE:#define __FLT_EPSILON__ 1.19209290e-7F -// ARM-BE:#define __FLT_EVAL_METHOD__ 0 // ARM-BE:#define __FLT_HAS_DENORM__ 1 // ARM-BE:#define __FLT_HAS_INFINITY__ 1 // ARM-BE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -428,7 +426,6 @@ // ARMEABISOFTFP:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARMEABISOFTFP:#define __FLT_DIG__ 6 // ARMEABISOFTFP:#define __FLT_EPSILON__ 1.19209290e-7F -// ARMEABISOFTFP:#define __FLT_EVAL_METHOD__ 0 // ARMEABISOFTFP:#define __FLT_HAS_DENORM__ 1 // ARMEABISOFTFP:#define __FLT_HAS_INFINITY__ 1 // ARMEABISOFTFP:#define __FLT_HAS_QUIET_NAN__ 1 @@ -623,7 +620,6 @@ // ARMEABIHARDFP:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARMEABIHARDFP:#define __FLT_DIG__ 6 // ARMEABIHARDFP:#define __FLT_EPSILON__ 1.19209290e-7F -// ARMEABIHARDFP:#define __FLT_EVAL_METHOD__ 0 // ARMEABIHARDFP:#define __FLT_HAS_DENORM__ 1 // ARMEABIHARDFP:#define __FLT_HAS_INFINITY__ 1 // ARMEABIHARDFP:#define __FLT_HAS_QUIET_NAN__ 1 @@ -821,7 +817,6 @@ // ARM-NETBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARM-NETBSD:#define __FLT_DIG__ 6 // ARM-NETBSD:#define 
__FLT_EPSILON__ 1.19209290e-7F -// ARM-NETBSD:#define __FLT_EVAL_METHOD__ 0 // ARM-NETBSD:#define __FLT_HAS_DENORM__ 1 // ARM-NETBSD:#define __FLT_HAS_INFINITY__ 1 // ARM-NETBSD:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-mips.c b/clang/test/Preprocessor/init-mips.c index d76396aa35c91..a07cee64e6848 100644 --- a/clang/test/Preprocessor/init-mips.c +++ b/clang/test/Preprocessor/init-mips.c @@ -37,7 +37,6 @@ // MIPS32BE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS32BE:#define __FLT_DIG__ 6 // MIPS32BE:#define __FLT_EPSILON__ 1.19209290e-7F -// MIPS32BE:#define __FLT_EVAL_METHOD__ 0 // MIPS32BE:#define __FLT_HAS_DENORM__ 1 // MIPS32BE:#define __FLT_HAS_INFINITY__ 1 // MIPS32BE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -247,7 +246,6 @@ // MIPS32EL:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS32EL:#define __FLT_DIG__ 6 // MIPS32EL:#define __FLT_EPSILON__ 1.19209290e-7F -// MIPS32EL:#define __FLT_EVAL_METHOD__ 0 // MIPS32EL:#define __FLT_HAS_DENORM__ 1 // MIPS32EL:#define __FLT_HAS_INFINITY__ 1 // MIPS32EL:#define __FLT_HAS_QUIET_NAN__ 1 @@ -467,7 +465,6 @@ // MIPSN32BE: #define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPSN32BE: #define __FLT_DIG__ 6 // MIPSN32BE: #define __FLT_EPSILON__ 1.19209290e-7F -// MIPSN32BE: #define __FLT_EVAL_METHOD__ 0 // MIPSN32BE: #define __FLT_HAS_DENORM__ 1 // MIPSN32BE: #define __FLT_HAS_INFINITY__ 1 // MIPSN32BE: #define __FLT_HAS_QUIET_NAN__ 1 @@ -774,7 +771,6 @@ // MIPSN32EL: #define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPSN32EL: #define __FLT_DIG__ 6 // MIPSN32EL: #define __FLT_EPSILON__ 1.19209290e-7F -// MIPSN32EL: #define __FLT_EVAL_METHOD__ 0 // MIPSN32EL: #define __FLT_HAS_DENORM__ 1 // MIPSN32EL: #define __FLT_HAS_INFINITY__ 1 // MIPSN32EL: #define __FLT_HAS_QUIET_NAN__ 1 @@ -1074,7 +1070,6 @@ // MIPS64BE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS64BE:#define __FLT_DIG__ 6 // MIPS64BE:#define __FLT_EPSILON__ 1.19209290e-7F -// MIPS64BE:#define __FLT_EVAL_METHOD__ 0 // MIPS64BE:#define 
__FLT_HAS_DENORM__ 1 // MIPS64BE:#define __FLT_HAS_INFINITY__ 1 // MIPS64BE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1284,7 +1279,6 @@ // MIPS64EL:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS64EL:#define __FLT_DIG__ 6 // MIPS64EL:#define __FLT_EPSILON__ 1.19209290e-7F -// MIPS64EL:#define __FLT_EVAL_METHOD__ 0 // MIPS64EL:#define __FLT_HAS_DENORM__ 1 // MIPS64EL:#define __FLT_HAS_INFINITY__ 1 // MIPS64EL:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-ppc.c b/clang/test/Preprocessor/init-ppc.c index 611b16dfb8f7e..45c8a5e53ad4f 100644 --- a/clang/test/Preprocessor/init-ppc.c +++ b/clang/test/Preprocessor/init-ppc.c @@ -30,7 +30,6 @@ // PPC603E:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC603E:#define __FLT_DIG__ 6 // PPC603E:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC603E:#define __FLT_EVAL_METHOD__ 0 // PPC603E:#define __FLT_HAS_DENORM__ 1 // PPC603E:#define __FLT_HAS_INFINITY__ 1 // PPC603E:#define __FLT_HAS_QUIET_NAN__ 1 @@ -224,7 +223,6 @@ // PPC:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC:#define __FLT_DIG__ 6 // PPC:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC:#define __FLT_EVAL_METHOD__ 0 // PPC:#define __FLT_HAS_DENORM__ 1 // PPC:#define __FLT_HAS_INFINITY__ 1 // PPC:#define __FLT_HAS_QUIET_NAN__ 1 @@ -425,7 +423,6 @@ // PPC-AIX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC-AIX:#define __FLT_DIG__ 6 // PPC-AIX:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC-AIX:#define __FLT_EVAL_METHOD__ 1 // PPC-AIX:#define __FLT_HAS_DENORM__ 1 // PPC-AIX:#define __FLT_HAS_INFINITY__ 1 // PPC-AIX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -798,7 +795,6 @@ // PPC-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC-LINUX:#define __FLT_DIG__ 6 // PPC-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC-LINUX:#define __FLT_EVAL_METHOD__ 0 // PPC-LINUX:#define __FLT_HAS_DENORM__ 1 // PPC-LINUX:#define __FLT_HAS_INFINITY__ 1 // PPC-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1006,7 +1002,6 @@ // PPC-DARWIN:#define __FLT_DENORM_MIN__ 
1.40129846e-45F // PPC-DARWIN:#define __FLT_DIG__ 6 // PPC-DARWIN:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC-DARWIN:#define __FLT_EVAL_METHOD__ 0 // PPC-DARWIN:#define __FLT_HAS_DENORM__ 1 // PPC-DARWIN:#define __FLT_HAS_INFINITY__ 1 // PPC-DARWIN:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index 7a9525228c3b6..f0ccd1638c04d 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -35,7 +35,6 @@ // PPC64:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64:#define __FLT_DIG__ 6 // PPC64:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC64:#define __FLT_EVAL_METHOD__ 0 // PPC64:#define __FLT_HAS_DENORM__ 1 // PPC64:#define __FLT_HAS_INFINITY__ 1 // PPC64:#define __FLT_HAS_QUIET_NAN__ 1 @@ -240,7 +239,6 @@ // PPC64LE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64LE:#define __FLT_DIG__ 6 // PPC64LE:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC64LE:#define __FLT_EVAL_METHOD__ 0 // PPC64LE:#define __FLT_HAS_DENORM__ 1 // PPC64LE:#define __FLT_HAS_INFINITY__ 1 // PPC64LE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -703,7 +701,6 @@ // PPC64-AIX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64-AIX:#define __FLT_DIG__ 6 // PPC64-AIX:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC64-AIX:#define __FLT_EVAL_METHOD__ 1 // PPC64-AIX:#define __FLT_HAS_DENORM__ 1 // PPC64-AIX:#define __FLT_HAS_INFINITY__ 1 // PPC64-AIX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -902,7 +899,6 @@ // PPC64-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64-LINUX:#define __FLT_DIG__ 6 // PPC64-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F -// PPC64-LINUX:#define __FLT_EVAL_METHOD__ 0 // PPC64-LINUX:#define __FLT_HAS_DENORM__ 1 // PPC64-LINUX:#define __FLT_HAS_INFINITY__ 1 // PPC64-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c index b0e45b5348ce9..6c646527f50f7 100644 --- a/clang/test/Preprocessor/init-s390x.c 
+++ b/clang/test/Preprocessor/init-s390x.c @@ -23,7 +23,6 @@ // S390X:#define __FLT_DENORM_MIN__ 1.40129846e-45F // S390X:#define __FLT_DIG__ 6 // S390X:#define __FLT_EPSILON__ 1.19209290e-7F -// S390X:#define __FLT_EVAL_METHOD__ 0 // S390X:#define __FLT_HAS_DENORM__ 1 // S390X:#define __FLT_HAS_INFINITY__ 1 // S390X:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-v7k-compat.c b/clang/test/Preprocessor/init-v7k-compat.c index 482c7ad6ff687..ff5d4bbdea53a 100644 --- a/clang/test/Preprocessor/init-v7k-compat.c +++ b/clang/test/Preprocessor/init-v7k-compat.c @@ -28,7 +28,6 @@ // CHECK: #define __FLT_DENORM_MIN__ 1.40129846e-45F // CHECK: #define __FLT_DIG__ 6 // CHECK: #define __FLT_EPSILON__ 1.19209290e-7F -// CHECK: #define __FLT_EVAL_METHOD__ 0 // CHECK: #define __FLT_HAS_DENORM__ 1 // CHECK: #define __FLT_HAS_INFINITY__ 1 // CHECK: #define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-x86.c b/clang/test/Preprocessor/init-x86.c index 527cd39508889..aa2e05ec807c7 100644 --- a/clang/test/Preprocessor/init-x86.c +++ b/clang/test/Preprocessor/init-x86.c @@ -24,7 +24,6 @@ // I386:#define __FLT_DENORM_MIN__ 1.40129846e-45F // I386:#define __FLT_DIG__ 6 // I386:#define __FLT_EPSILON__ 1.19209290e-7F -// I386:#define __FLT_EVAL_METHOD__ 2 // I386:#define __FLT_HAS_DENORM__ 1 // I386:#define __FLT_HAS_INFINITY__ 1 // I386:#define __FLT_HAS_QUIET_NAN__ 1 @@ -213,7 +212,6 @@ // I386-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // I386-LINUX:#define __FLT_DIG__ 6 // I386-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F -// I386-LINUX:#define __FLT_EVAL_METHOD__ 0 // I386-LINUX:#define __FLT_HAS_DENORM__ 1 // I386-LINUX:#define __FLT_HAS_INFINITY__ 1 // I386-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -416,7 +414,6 @@ // I386-NETBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F // I386-NETBSD:#define __FLT_DIG__ 6 // I386-NETBSD:#define __FLT_EPSILON__ 1.19209290e-7F -// I386-NETBSD:#define __FLT_EVAL_METHOD__ 2 // I386-NETBSD:#define 
__FLT_HAS_DENORM__ 1 // I386-NETBSD:#define __FLT_HAS_INFINITY__ 1 // I386-NETBSD:#define __FLT_HAS_QUIET_NAN__ 1 @@ -590,13 +587,6 @@ // I386-NETBSD:#define __i386__ 1 // I386-NETBSD:#define i386 1 -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD-SSE %s -// I386-NETBSD-SSE:#define __FLT_EVAL_METHOD__ 0 -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6 %s -// I386-NETBSD6:#define __FLT_EVAL_METHOD__ 1 -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6-SSE %s -// I386-NETBSD6-SSE:#define __FLT_EVAL_METHOD__ 1 - // RUN: %clang_cc1 -E -dM -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s // RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s // RUN: %clang_cc1 -E -dM -triple=i686-unknown-cygwin < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s @@ -631,7 +621,6 @@ // X86_64:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64:#define __FLT_DIG__ 6 // X86_64:#define __FLT_EPSILON__ 1.19209290e-7F -// X86_64:#define __FLT_EVAL_METHOD__ 0 // X86_64:#define __FLT_HAS_DENORM__ 1 // X86_64:#define __FLT_HAS_INFINITY__ 1 // X86_64:#define __FLT_HAS_QUIET_NAN__ 1 @@ -839,7 +828,6 @@ // X32:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X32:#define __FLT_DIG__ 6 // X32:#define __FLT_EPSILON__ 1.19209290e-7F -// X32:#define __FLT_EVAL_METHOD__ 0 // X32:#define __FLT_HAS_DENORM__ 1 // X32:#define __FLT_HAS_INFINITY__ 1 // X32:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1046,7 +1034,6 @@ // X86_64-CLOUDABI:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64-CLOUDABI:#define __FLT_DIG__ 6 // X86_64-CLOUDABI:#define __FLT_EPSILON__ 1.19209290e-7F -// 
X86_64-CLOUDABI:#define __FLT_EVAL_METHOD__ 0 // X86_64-CLOUDABI:#define __FLT_HAS_DENORM__ 1 // X86_64-CLOUDABI:#define __FLT_HAS_INFINITY__ 1 // X86_64-CLOUDABI:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1341,7 +1328,6 @@ // X86_64-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64-LINUX:#define __FLT_DIG__ 6 // X86_64-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F -// X86_64-LINUX:#define __FLT_EVAL_METHOD__ 0 // X86_64-LINUX:#define __FLT_HAS_DENORM__ 1 // X86_64-LINUX:#define __FLT_HAS_INFINITY__ 1 // X86_64-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1554,7 +1540,6 @@ // X86_64-NETBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64-NETBSD:#define __FLT_DIG__ 6 // X86_64-NETBSD:#define __FLT_EPSILON__ 1.19209290e-7F -// X86_64-NETBSD:#define __FLT_EVAL_METHOD__ 0 // X86_64-NETBSD:#define __FLT_HAS_DENORM__ 1 // X86_64-NETBSD:#define __FLT_HAS_INFINITY__ 1 // X86_64-NETBSD:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index dd645bf6003ce..a08e503570723 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -325,7 +325,6 @@ // MSP430:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MSP430:#define __FLT_DIG__ 6 // MSP430:#define __FLT_EPSILON__ 1.19209290e-7F -// MSP430:#define __FLT_EVAL_METHOD__ 0 // MSP430:#define __FLT_HAS_DENORM__ 1 // MSP430:#define __FLT_HAS_INFINITY__ 1 // MSP430:#define __FLT_HAS_QUIET_NAN__ 1 @@ -513,7 +512,6 @@ // NVPTX32:#define __FLT_DENORM_MIN__ 1.40129846e-45F // NVPTX32:#define __FLT_DIG__ 6 // NVPTX32:#define __FLT_EPSILON__ 1.19209290e-7F -// NVPTX32:#define __FLT_EVAL_METHOD__ 0 // NVPTX32:#define __FLT_HAS_DENORM__ 1 // NVPTX32:#define __FLT_HAS_INFINITY__ 1 // NVPTX32:#define __FLT_HAS_QUIET_NAN__ 1 @@ -702,7 +700,6 @@ // NVPTX64:#define __FLT_DENORM_MIN__ 1.40129846e-45F // NVPTX64:#define __FLT_DIG__ 6 // NVPTX64:#define __FLT_EPSILON__ 1.19209290e-7F -// NVPTX64:#define __FLT_EVAL_METHOD__ 0 // NVPTX64:#define __FLT_HAS_DENORM__ 1 // 
NVPTX64:#define __FLT_HAS_INFINITY__ 1 // NVPTX64:#define __FLT_HAS_QUIET_NAN__ 1 @@ -906,7 +903,6 @@ // SPARC:#define __FLT_DENORM_MIN__ 1.40129846e-45F // SPARC:#define __FLT_DIG__ 6 // SPARC:#define __FLT_EPSILON__ 1.19209290e-7F -// SPARC:#define __FLT_EVAL_METHOD__ 0 // SPARC:#define __FLT_HAS_DENORM__ 1 // SPARC:#define __FLT_HAS_INFINITY__ 1 // SPARC:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1107,7 +1103,6 @@ // TCE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // TCE:#define __FLT_DIG__ 6 // TCE:#define __FLT_EPSILON__ 1.19209290e-7F -// TCE:#define __FLT_EVAL_METHOD__ 0 // TCE:#define __FLT_HAS_DENORM__ 1 // TCE:#define __FLT_HAS_INFINITY__ 1 // TCE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1274,7 +1269,6 @@ // PS4:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PS4:#define __FLT_DIG__ 6 // PS4:#define __FLT_EPSILON__ 1.19209290e-7F -// PS4:#define __FLT_EVAL_METHOD__ 0 // PS4:#define __FLT_HAS_DENORM__ 1 // PS4:#define __FLT_HAS_INFINITY__ 1 // PS4:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1576,7 +1570,6 @@ // WEBASSEMBLY-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F // WEBASSEMBLY-NEXT:#define __FLT_DIG__ 6 // WEBASSEMBLY-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F -// WEBASSEMBLY-NEXT:#define __FLT_EVAL_METHOD__ 0 // WEBASSEMBLY-NEXT:#define __FLT_HAS_DENORM__ 1 // WEBASSEMBLY-NEXT:#define __FLT_HAS_INFINITY__ 1 // WEBASSEMBLY-NEXT:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1946,7 +1939,6 @@ // AVR:#define __FLT_DENORM_MIN__ 1.40129846e-45F // AVR:#define __FLT_DIG__ 6 // AVR:#define __FLT_EPSILON__ 1.19209290e-7F -// AVR:#define __FLT_EVAL_METHOD__ 0 // AVR:#define __FLT_HAS_DENORM__ 1 // AVR:#define __FLT_HAS_INFINITY__ 1 // AVR:#define __FLT_HAS_QUIET_NAN__ 1 @@ -2083,7 +2075,6 @@ // AVR:#define __WCHAR_TYPE__ int // AVR:#define __WINT_TYPE__ int - // RUN: %clang_cc1 -E -dM -ffreestanding \ // RUN: -triple i686-windows-msvc -fms-compatibility -x c++ < /dev/null \ // RUN: | FileCheck -match-full-lines -check-prefix MSVC-X32 %s @@ -2229,7 +2220,6 @@ // RISCV32: #define 
__FLT_DENORM_MIN__ 1.40129846e-45F // RISCV32: #define __FLT_DIG__ 6 // RISCV32: #define __FLT_EPSILON__ 1.19209290e-7F -// RISCV32: #define __FLT_EVAL_METHOD__ 0 // RISCV32: #define __FLT_HAS_DENORM__ 1 // RISCV32: #define __FLT_HAS_INFINITY__ 1 // RISCV32: #define __FLT_HAS_QUIET_NAN__ 1 @@ -2437,7 +2427,6 @@ // RISCV64: #define __FLT_DENORM_MIN__ 1.40129846e-45F // RISCV64: #define __FLT_DIG__ 6 // RISCV64: #define __FLT_EPSILON__ 1.19209290e-7F -// RISCV64: #define __FLT_EVAL_METHOD__ 0 // RISCV64: #define __FLT_HAS_DENORM__ 1 // RISCV64: #define __FLT_HAS_INFINITY__ 1 // RISCV64: #define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Sema/fp-eval-pragma.cpp b/clang/test/Sema/fp-eval-pragma.cpp new file mode 100644 index 0000000000000..42d88fd438e81 --- /dev/null +++ b/clang/test/Sema/fp-eval-pragma.cpp @@ -0,0 +1,87 @@ +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify %s +// +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify %s \ +// RUN: -ffp-eval-method=source +// +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify %s \ +// RUN: -ffp-eval-method=double + +extern "C" int printf(const char *, ...); + +void foo1() { + printf("FP: %d\n", __FLT_EVAL_METHOD__); +} + +void apply_pragma() { + // expected-note@+1{{#pragma entered here}} +#pragma clang fp eval_method(double) + // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} + printf("FP: %d\n", __FLT_EVAL_METHOD__); +} + +int foo2() { + apply_pragma(); + return 0; +} + +void foo() { + auto a = __FLT_EVAL_METHOD__; + { + // expected-note@+1{{#pragma entered here}} +#pragma clang fp eval_method(double) + // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} + auto b = 
__FLT_EVAL_METHOD__; + } + auto c = __FLT_EVAL_METHOD__; +} + +void func() { + { + { +#pragma clang fp eval_method(source) + } + int i = __FLT_EVAL_METHOD__; // ok, not in a scope changed by the pragma + } + { + // expected-note@+1{{#pragma entered here}} +#pragma clang fp eval_method(source) + // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} + int i = __FLT_EVAL_METHOD__; + } +} + +float G; + +int f(float x, float y, float z) { + G = x * y + z; + return __FLT_EVAL_METHOD__; +} + +int foo(int flag, float x, float y, float z) { + if (flag) { + // expected-note@+1{{#pragma entered here}} +#pragma clang fp eval_method(double) + G = x + y + z; + // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} + return __FLT_EVAL_METHOD__; + } else { + // expected-note@+1{{#pragma entered here}} +#pragma clang fp eval_method(extended) + G = x + y + z; + // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} + return __FLT_EVAL_METHOD__; + } +} + +#if __FLT_EVAL_METHOD__ == 1 +#endif +#pragma clang fp eval_method(source) + +// expected-note@+1{{#pragma entered here}} +#pragma clang fp eval_method(double) +// expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} +#if __FLT_EVAL_METHOD__ == 1 +#endif diff --git a/clang/test/Sema/x86-eval-method.c b/clang/test/Sema/x86-eval-method.c new file mode 100644 index 0000000000000..f475b0d1b29bc --- /dev/null +++ b/clang/test/Sema/x86-eval-method.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ +// RUN: -emit-llvm -ffp-eval-method=source -o - -verify=warn %s +// +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple i386-pc-windows -target-cpu 
pentium4 \ +// RUN: -emit-llvm -ffp-eval-method=source -o - -verify=no-warn %s + +// no-warn-no-diagnostics + +float add1(float a, float b, float c) { + return a + b + c; +} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}} + +float add2(float a, float b, float c) { +#pragma clang fp eval_method(source) + return a + b + c; +} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}} diff --git a/clang/test/Sema/x86_64-eval-method.c b/clang/test/Sema/x86_64-eval-method.c new file mode 100644 index 0000000000000..dbdc1f881b4a8 --- /dev/null +++ b/clang/test/Sema/x86_64-eval-method.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -target-feature -sse -emit-llvm \ +// RUN: -o - -verify=warn %s +// +// RUN: %clang_cc1 -fexperimental-strict-floating-point \ +// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify=no-warn %s + +// no-warn-no-diagnostics + +float add2(float a, float b, float c) { +#pragma clang fp eval_method(source) + return a + b + c; +} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}} From 5fe64d238b8b5ed1861de63a2072ddf3e81af806 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Feb 2022 16:59:41 +0000 Subject: [PATCH 122/748] [clang] Sema::CheckEquivalentExceptionSpec - remove useless nullptr test We use castAs<> for NewProto/OldProto, which would assert if the cast failed. 
--- clang/lib/Sema/SemaExceptionSpec.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 29cb4be7b1ba5..151fbb48651db 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -342,8 +342,7 @@ bool Sema::CheckEquivalentExceptionSpec(FunctionDecl *Old, FunctionDecl *New) { if (!MissingExceptionSpecification) return ReturnValueOnError; - const FunctionProtoType *NewProto = - New->getType()->castAs<FunctionProtoType>(); + const auto *NewProto = New->getType()->castAs<FunctionProtoType>(); // The new function declaration is only missing an empty exception // specification "throw()". If the throw() specification came from a @@ -353,7 +352,7 @@ bool Sema::CheckEquivalentExceptionSpec(FunctionDecl *Old, FunctionDecl *New) { // specifications. // // Likewise if the old function is a builtin. - if (MissingEmptyExceptionSpecification && NewProto && + if (MissingEmptyExceptionSpecification && (Old->getLocation().isInvalid() || Context.getSourceManager().isInSystemHeader(Old->getLocation()) || Old->getBuiltinID()) && @@ -364,8 +363,7 @@ bool Sema::CheckEquivalentExceptionSpec(FunctionDecl *Old, FunctionDecl *New) { return false; } - const FunctionProtoType *OldProto = - Old->getType()->castAs<FunctionProtoType>(); + const auto *OldProto = Old->getType()->castAs<FunctionProtoType>(); FunctionProtoType::ExceptionSpecInfo ESI = OldProto->getExceptionSpecType(); if (ESI.Type == EST_Dynamic) { From ca7f06fcb60ce7730e7768aa3409387e6aa4efaf Mon Sep 17 00:00:00 2001 From: Adrian Prantl <aprantl@apple.com> Date: Thu, 17 Feb 2022 09:02:19 -0800 Subject: [PATCH 123/748] add missing include --- lldb/tools/debugserver/source/RNBRemote.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index 8b83506e9660b..cb0409afa6214 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -48,6 +48,7 @@
#include #include +#include #include #include #include From 08361bb3c88a4001349bc6cd1406214a22ca7fa7 Mon Sep 17 00:00:00 2001 From: zhijian Date: Thu, 17 Feb 2022 12:04:04 -0500 Subject: [PATCH 124/748] [NFC][llvm-nm] refactor function dumpSymbolNamesFromFile Summary: split the function into several small functions. Reviewers: James Henderson,Fangrui Song Differential Revision: https://reviews.llvm.org/D119974 --- llvm/tools/llvm-nm/llvm-nm.cpp | 573 +++++++++++++++++---------------- 1 file changed, 297 insertions(+), 276 deletions(-) diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index 9324cafa7f11b..5e67e76447cd1 100644 --- a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -1904,282 +1904,191 @@ static bool checkMachOAndArchFlags(SymbolicFile *O, std::string &Filename) { return true; } -static void dumpSymbolNamesFromFile(std::string &Filename) { - ErrorOr> BufferOrErr = - MemoryBuffer::getFileOrSTDIN(Filename); - if (error(BufferOrErr.getError(), Filename)) - return; - - LLVMContext Context; - LLVMContext *ContextPtr = NoLLVMBitcode ? 
nullptr : &Context; - Expected> BinaryOrErr = - createBinary(BufferOrErr.get()->getMemBufferRef(), ContextPtr); - if (!BinaryOrErr) { - error(BinaryOrErr.takeError(), Filename); - return; +static void dumpArchiveMap(Archive *A, std::string &Filename) { + Archive::symbol_iterator I = A->symbol_begin(); + Archive::symbol_iterator E = A->symbol_end(); + if (I != E) { + outs() << "Archive map\n"; + for (; I != E; ++I) { + Expected C = I->getMember(); + if (!C) { + error(C.takeError(), Filename); + break; + } + Expected FileNameOrErr = C->getName(); + if (!FileNameOrErr) { + error(FileNameOrErr.takeError(), Filename); + break; + } + StringRef SymName = I->getName(); + outs() << SymName << " in " << FileNameOrErr.get() << "\n"; + } + outs() << "\n"; } - Binary &Bin = *BinaryOrErr.get(); +} - if (Archive *A = dyn_cast(&Bin)) { - if (ArchiveMap) { - Archive::symbol_iterator I = A->symbol_begin(); - Archive::symbol_iterator E = A->symbol_end(); - if (I != E) { - outs() << "Archive map\n"; - for (; I != E; ++I) { - Expected C = I->getMember(); - if (!C) { - error(C.takeError(), Filename); - break; - } - Expected FileNameOrErr = C->getName(); - if (!FileNameOrErr) { - error(FileNameOrErr.takeError(), Filename); - break; - } - StringRef SymName = I->getName(); - outs() << SymName << " in " << FileNameOrErr.get() << "\n"; - } +static void dumpArchive(Archive *A, std::string &Filename, + LLVMContext *ContextPtr) { + if (ArchiveMap) + dumpArchiveMap(A, Filename); + + Error Err = Error::success(); + for (auto &C : A->children(Err)) { + Expected> ChildOrErr = C.getAsBinary(ContextPtr); + if (!ChildOrErr) { + if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + error(std::move(E), Filename, C); + continue; + } + if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { + if (!MachOPrintSizeWarning && PrintSize && isa(O)) { + WithColor::warning(errs(), ToolName) + << "sizes with -print-size for Mach-O files are always zero.\n"; + MachOPrintSizeWarning = true; + } + if 
(!checkMachOAndArchFlags(O, Filename)) + return; + if (!PrintFileName && shouldDump(*O) && !ExportSymbols) { outs() << "\n"; + if (isa(O)) { + outs() << Filename << "(" << O->getFileName() << ")"; + } else + outs() << O->getFileName(); + outs() << ":\n"; } + dumpSymbolNamesFromObject(*O, false, Filename); } + } + if (Err) + error(std::move(Err), A->getFileName()); +} - { - Error Err = Error::success(); - for (auto &C : A->children(Err)) { - Expected> ChildOrErr = - C.getAsBinary(ContextPtr); - if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) - error(std::move(E), Filename, C); - continue; - } - if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { - if (!MachOPrintSizeWarning && PrintSize && isa(O)) { - WithColor::warning(errs(), ToolName) - << "sizes with -print-size for Mach-O files are always zero.\n"; - MachOPrintSizeWarning = true; - } - if (!checkMachOAndArchFlags(O, Filename)) - return; - if (!PrintFileName && shouldDump(*O) && !ExportSymbols) { - outs() << "\n"; - if (isa(O)) { - outs() << Filename << "(" << O->getFileName() << ")"; - } else - outs() << O->getFileName(); - outs() << ":\n"; +static void dumpMachOUniversalBinaryMatchArchFlags(MachOUniversalBinary *UB, + std::string &Filename, + LLVMContext *ContextPtr) { + // Look for a slice in the universal binary that matches each ArchFlag. 
+ bool ArchFound; + for (unsigned i = 0; i < ArchFlags.size(); ++i) { + ArchFound = false; + for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), + E = UB->end_objects(); + I != E; ++I) { + if (ArchFlags[i] == I->getArchFlagName()) { + ArchFound = true; + Expected> ObjOrErr = I->getAsObjectFile(); + std::string ArchiveName; + std::string ArchitectureName; + ArchiveName.clear(); + ArchitectureName.clear(); + if (ObjOrErr) { + ObjectFile &Obj = *ObjOrErr.get(); + if (ArchFlags.size() > 1) { + if (PrintFileName) + ArchitectureName = I->getArchFlagName(); + else + outs() << "\n" + << Obj.getFileName() << " (for architecture " + << I->getArchFlagName() << ")" + << ":\n"; } - dumpSymbolNamesFromObject(*O, false, Filename); - } - } - if (Err) - error(std::move(Err), A->getFileName()); - } - return; - } - if (MachOUniversalBinary *UB = dyn_cast(&Bin)) { - // If we have a list of architecture flags specified dump only those. - if (!ArchAll && !ArchFlags.empty()) { - // Look for a slice in the universal binary that matches each ArchFlag. - bool ArchFound; - for (unsigned i = 0; i < ArchFlags.size(); ++i) { - ArchFound = false; - for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), - E = UB->end_objects(); - I != E; ++I) { - if (ArchFlags[i] == I->getArchFlagName()) { - ArchFound = true; - Expected> ObjOrErr = - I->getAsObjectFile(); - std::string ArchiveName; - std::string ArchitectureName; - ArchiveName.clear(); - ArchitectureName.clear(); - if (ObjOrErr) { - ObjectFile &Obj = *ObjOrErr.get(); - if (ArchFlags.size() > 1) { - if (PrintFileName) - ArchitectureName = I->getArchFlagName(); - else - outs() << "\n" << Obj.getFileName() << " (for architecture " - << I->getArchFlagName() << ")" - << ":\n"; + dumpSymbolNamesFromObject(Obj, false, ArchiveName, ArchitectureName); + } else if (auto E = + isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { + error(std::move(E), Filename, + ArchFlags.size() > 1 ? 
StringRef(I->getArchFlagName()) + : StringRef()); + continue; + } else if (Expected> AOrErr = + I->getAsArchive()) { + std::unique_ptr &A = *AOrErr; + Error Err = Error::success(); + for (auto &C : A->children(Err)) { + Expected> ChildOrErr = + C.getAsBinary(ContextPtr); + if (!ChildOrErr) { + if (auto E = + isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) { + error(std::move(E), Filename, C, + ArchFlags.size() > 1 ? StringRef(I->getArchFlagName()) + : StringRef()); } - dumpSymbolNamesFromObject(Obj, false, ArchiveName, - ArchitectureName); - } else if (auto E = isNotObjectErrorInvalidFileType( - ObjOrErr.takeError())) { - error(std::move(E), Filename, ArchFlags.size() > 1 ? - StringRef(I->getArchFlagName()) : StringRef()); continue; - } else if (Expected> AOrErr = - I->getAsArchive()) { - std::unique_ptr &A = *AOrErr; - Error Err = Error::success(); - for (auto &C : A->children(Err)) { - Expected> ChildOrErr = - C.getAsBinary(ContextPtr); - if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType( - ChildOrErr.takeError())) { - error(std::move(E), Filename, C, ArchFlags.size() > 1 ? 
- StringRef(I->getArchFlagName()) : StringRef()); - } - continue; - } - if (SymbolicFile *O = - dyn_cast(&*ChildOrErr.get())) { - if (PrintFileName) { - ArchiveName = std::string(A->getFileName()); - if (ArchFlags.size() > 1) - ArchitectureName = I->getArchFlagName(); - } else { - outs() << "\n" << A->getFileName(); - outs() << "(" << O->getFileName() << ")"; - if (ArchFlags.size() > 1) { - outs() << " (for architecture " << I->getArchFlagName() - << ")"; - } - outs() << ":\n"; - } - dumpSymbolNamesFromObject(*O, false, ArchiveName, - ArchitectureName); + } + if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { + if (PrintFileName) { + ArchiveName = std::string(A->getFileName()); + if (ArchFlags.size() > 1) + ArchitectureName = I->getArchFlagName(); + } else { + outs() << "\n" << A->getFileName(); + outs() << "(" << O->getFileName() << ")"; + if (ArchFlags.size() > 1) { + outs() << " (for architecture " << I->getArchFlagName() + << ")"; } + outs() << ":\n"; } - if (Err) - error(std::move(Err), A->getFileName()); - } else { - consumeError(AOrErr.takeError()); - error(Filename + " for architecture " + - StringRef(I->getArchFlagName()) + - " is not a Mach-O file or an archive file", - "Mach-O universal file"); + dumpSymbolNamesFromObject(*O, false, ArchiveName, + ArchitectureName); } } - } - if (!ArchFound) { - error(ArchFlags[i], - "file: " + Filename + " does not contain architecture"); - return; + if (Err) + error(std::move(Err), A->getFileName()); + } else { + consumeError(AOrErr.takeError()); + error(Filename + " for architecture " + + StringRef(I->getArchFlagName()) + + " is not a Mach-O file or an archive file", + "Mach-O universal file"); } } - return; } - // No architecture flags were specified so if this contains a slice that - // matches the host architecture dump only that. 
- if (!ArchAll) { - Triple HostTriple = MachOObjectFile::getHostArch(); - StringRef HostArchName = HostTriple.getArchName(); - for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), - E = UB->end_objects(); - I != E; ++I) { - if (HostArchName == I->getArchFlagName()) { - Expected> ObjOrErr = I->getAsObjectFile(); - std::string ArchiveName; - if (ObjOrErr) { - ObjectFile &Obj = *ObjOrErr.get(); - dumpSymbolNamesFromObject(Obj, false); - } else if (auto E = isNotObjectErrorInvalidFileType( - ObjOrErr.takeError())) { - error(std::move(E), Filename); - return; - } else if (Expected> AOrErr = - I->getAsArchive()) { - std::unique_ptr &A = *AOrErr; - Error Err = Error::success(); - for (auto &C : A->children(Err)) { - Expected> ChildOrErr = - C.getAsBinary(ContextPtr); - if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType( - ChildOrErr.takeError())) - error(std::move(E), Filename, C); - continue; - } - if (SymbolicFile *O = - dyn_cast(&*ChildOrErr.get())) { - if (PrintFileName) - ArchiveName = std::string(A->getFileName()); - else - outs() << "\n" << A->getFileName() << "(" << O->getFileName() - << ")" - << ":\n"; - dumpSymbolNamesFromObject(*O, false, ArchiveName); - } - } - if (Err) - error(std::move(Err), A->getFileName()); - } else { - consumeError(AOrErr.takeError()); - error(Filename + " for architecture " + - StringRef(I->getArchFlagName()) + - " is not a Mach-O file or an archive file", - "Mach-O universal file"); - } - return; - } - } + if (!ArchFound) { + error(ArchFlags[i], + "file: " + Filename + " does not contain architecture"); + return; } - // Either all architectures have been specified or none have been specified - // and this does not contain the host architecture so dump all the slices. 
- bool moreThanOneArch = UB->getNumberOfObjects() > 1; - for (const MachOUniversalBinary::ObjectForArch &O : UB->objects()) { - Expected> ObjOrErr = O.getAsObjectFile(); + } +} + +// Returns true If the binary contains a slice that matches the host +// architecture, or false otherwise. +static bool dumpMachOUniversalBinaryMatchHost(MachOUniversalBinary *UB, + std::string &Filename, + LLVMContext *ContextPtr) { + Triple HostTriple = MachOObjectFile::getHostArch(); + StringRef HostArchName = HostTriple.getArchName(); + for (MachOUniversalBinary::object_iterator I = UB->begin_objects(), + E = UB->end_objects(); + I != E; ++I) { + if (HostArchName == I->getArchFlagName()) { + Expected> ObjOrErr = I->getAsObjectFile(); std::string ArchiveName; - std::string ArchitectureName; - ArchiveName.clear(); - ArchitectureName.clear(); if (ObjOrErr) { ObjectFile &Obj = *ObjOrErr.get(); - if (PrintFileName) { - if (isa(Obj) && moreThanOneArch) - ArchitectureName = O.getArchFlagName(); - } else { - if (moreThanOneArch) - outs() << "\n"; - outs() << Obj.getFileName(); - if (isa(Obj) && moreThanOneArch) - outs() << " (for architecture " << O.getArchFlagName() << ")"; - outs() << ":\n"; - } - dumpSymbolNamesFromObject(Obj, false, ArchiveName, ArchitectureName); - } else if (auto E = isNotObjectErrorInvalidFileType( - ObjOrErr.takeError())) { - error(std::move(E), Filename, moreThanOneArch ? 
- StringRef(O.getArchFlagName()) : StringRef()); - continue; - } else if (Expected> AOrErr = - O.getAsArchive()) { + dumpSymbolNamesFromObject(Obj, false); + } else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) + error(std::move(E), Filename); + else if (Expected> AOrErr = I->getAsArchive()) { std::unique_ptr &A = *AOrErr; Error Err = Error::success(); for (auto &C : A->children(Err)) { Expected> ChildOrErr = - C.getAsBinary(ContextPtr); + C.getAsBinary(ContextPtr); if (!ChildOrErr) { - if (auto E = isNotObjectErrorInvalidFileType( - ChildOrErr.takeError())) - error(std::move(E), Filename, C, moreThanOneArch ? - StringRef(ArchitectureName) : StringRef()); + if (auto E = + isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + error(std::move(E), Filename, C); continue; } - if (SymbolicFile *F = dyn_cast(&*ChildOrErr.get())) { - if (PrintFileName) { + if (SymbolicFile *O = dyn_cast(&*ChildOrErr.get())) { + if (PrintFileName) ArchiveName = std::string(A->getFileName()); - if (isa(F) && moreThanOneArch) - ArchitectureName = O.getArchFlagName(); - } else { - outs() << "\n" << A->getFileName(); - if (isa(F)) { - outs() << "(" << F->getFileName() << ")"; - if (moreThanOneArch) - outs() << " (for architecture " << O.getArchFlagName() - << ")"; - } else - outs() << ":" << F->getFileName(); - outs() << ":\n"; - } - dumpSymbolNamesFromObject(*F, false, ArchiveName, ArchitectureName); + else + outs() << "\n" + << A->getFileName() << "(" << O->getFileName() << ")" + << ":\n"; + dumpSymbolNamesFromObject(*O, false, ArchiveName); } } if (Err) @@ -2187,49 +2096,161 @@ static void dumpSymbolNamesFromFile(std::string &Filename) { } else { consumeError(AOrErr.takeError()); error(Filename + " for architecture " + - StringRef(O.getArchFlagName()) + - " is not a Mach-O file or an archive file", + StringRef(I->getArchFlagName()) + + " is not a Mach-O file or an archive file", "Mach-O universal file"); } + return true; } - return; } + return false; +} - if 
(TapiUniversal *TU = dyn_cast(&Bin)) { - for (const TapiUniversal::ObjectForArch &I : TU->objects()) { - StringRef ArchName = I.getArchFlagName(); - const bool ShowArch = - ArchFlags.empty() || llvm::is_contained(ArchFlags, ArchName); - if (!ShowArch) - continue; - if (!AddInlinedInfo && !I.isTopLevelLib()) - continue; - if (auto ObjOrErr = I.getAsObjectFile()) { - outs() << "\n" - << I.getInstallName() << " (for architecture " << ArchName << ")" - << ":\n"; - dumpSymbolNamesFromObject(*ObjOrErr.get(), false, {}, ArchName); - } else if (Error E = - isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { - error(std::move(E), Filename, ArchName); +static void dumpMachOUniversalBinaryArchAll(MachOUniversalBinary *UB, + std::string &Filename, + LLVMContext *ContextPtr) { + bool moreThanOneArch = UB->getNumberOfObjects() > 1; + for (const MachOUniversalBinary::ObjectForArch &O : UB->objects()) { + Expected> ObjOrErr = O.getAsObjectFile(); + std::string ArchiveName; + std::string ArchitectureName; + ArchiveName.clear(); + ArchitectureName.clear(); + if (ObjOrErr) { + ObjectFile &Obj = *ObjOrErr.get(); + if (PrintFileName) { + if (isa(Obj) && moreThanOneArch) + ArchitectureName = O.getArchFlagName(); + } else { + if (moreThanOneArch) + outs() << "\n"; + outs() << Obj.getFileName(); + if (isa(Obj) && moreThanOneArch) + outs() << " (for architecture " << O.getArchFlagName() << ")"; + outs() << ":\n"; + } + dumpSymbolNamesFromObject(Obj, false, ArchiveName, ArchitectureName); + } else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { + error(std::move(E), Filename, + moreThanOneArch ? 
StringRef(O.getArchFlagName()) : StringRef()); + continue; + } else if (Expected> AOrErr = O.getAsArchive()) { + std::unique_ptr &A = *AOrErr; + Error Err = Error::success(); + for (auto &C : A->children(Err)) { + Expected> ChildOrErr = + C.getAsBinary(ContextPtr); + if (!ChildOrErr) { + if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) + error(std::move(E), Filename, C, + moreThanOneArch ? StringRef(ArchitectureName) : StringRef()); + continue; + } + if (SymbolicFile *F = dyn_cast(&*ChildOrErr.get())) { + if (PrintFileName) { + ArchiveName = std::string(A->getFileName()); + if (isa(F) && moreThanOneArch) + ArchitectureName = O.getArchFlagName(); + } else { + outs() << "\n" << A->getFileName(); + if (isa(F)) { + outs() << "(" << F->getFileName() << ")"; + if (moreThanOneArch) + outs() << " (for architecture " << O.getArchFlagName() << ")"; + } else + outs() << ":" << F->getFileName(); + outs() << ":\n"; + } + dumpSymbolNamesFromObject(*F, false, ArchiveName, ArchitectureName); + } } + if (Err) + error(std::move(Err), A->getFileName()); + } else { + consumeError(AOrErr.takeError()); + error(Filename + " for architecture " + StringRef(O.getArchFlagName()) + + " is not a Mach-O file or an archive file", + "Mach-O universal file"); } + } +} +static void dumpMachOUniversalBinary(MachOUniversalBinary *UB, + std::string &Filename, + LLVMContext *ContextPtr) { + // If we have a list of architecture flags specified dump only those. + if (!ArchAll && !ArchFlags.empty()) { + dumpMachOUniversalBinaryMatchArchFlags(UB, Filename, ContextPtr); return; } - if (SymbolicFile *O = dyn_cast(&Bin)) { - if (!MachOPrintSizeWarning && PrintSize && isa(O)) { - WithColor::warning(errs(), ToolName) - << "sizes with --print-size for Mach-O files are always zero.\n"; - MachOPrintSizeWarning = true; + // No architecture flags were specified so if this contains a slice that + // matches the host architecture dump only that. 
+ if (!ArchAll && dumpMachOUniversalBinaryMatchHost(UB, Filename, ContextPtr)) + return; + + // Either all architectures have been specified or none have been specified + // and this does not contain the host architecture so dump all the slices. + dumpMachOUniversalBinaryArchAll(UB, Filename, ContextPtr); +} + +static void dumpTapiUniversal(TapiUniversal *TU, std::string &Filename) { + for (const TapiUniversal::ObjectForArch &I : TU->objects()) { + StringRef ArchName = I.getArchFlagName(); + const bool ShowArch = + ArchFlags.empty() || llvm::is_contained(ArchFlags, ArchName); + if (!ShowArch) + continue; + if (!AddInlinedInfo && !I.isTopLevelLib()) + continue; + if (auto ObjOrErr = I.getAsObjectFile()) { + outs() << "\n" + << I.getInstallName() << " (for architecture " << ArchName << ")" + << ":\n"; + dumpSymbolNamesFromObject(*ObjOrErr.get(), false, {}, ArchName); + } else if (Error E = + isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) { + error(std::move(E), Filename, ArchName); } - if (!checkMachOAndArchFlags(O, Filename)) - return; - dumpSymbolNamesFromObject(*O, true); } } +static void dumpSymbolicFile(SymbolicFile *O, std::string &Filename) { + if (!MachOPrintSizeWarning && PrintSize && isa(O)) { + WithColor::warning(errs(), ToolName) + << "sizes with --print-size for Mach-O files are always zero.\n"; + MachOPrintSizeWarning = true; + } + if (!checkMachOAndArchFlags(O, Filename)) + return; + dumpSymbolNamesFromObject(*O, true); +} + +static void dumpSymbolNamesFromFile(std::string &Filename) { + ErrorOr> BufferOrErr = + MemoryBuffer::getFileOrSTDIN(Filename); + if (error(BufferOrErr.getError(), Filename)) + return; + + LLVMContext Context; + LLVMContext *ContextPtr = NoLLVMBitcode ? 
nullptr : &Context; + Expected> BinaryOrErr = + createBinary(BufferOrErr.get()->getMemBufferRef(), ContextPtr); + if (!BinaryOrErr) { + error(BinaryOrErr.takeError(), Filename); + return; + } + Binary &Bin = *BinaryOrErr.get(); + if (Archive *A = dyn_cast(&Bin)) + dumpArchive(A, Filename, ContextPtr); + else if (MachOUniversalBinary *UB = dyn_cast(&Bin)) + dumpMachOUniversalBinary(UB, Filename, ContextPtr); + else if (TapiUniversal *TU = dyn_cast(&Bin)) + dumpTapiUniversal(TU, Filename); + else if (SymbolicFile *O = dyn_cast(&Bin)) + dumpSymbolicFile(O, Filename); +} + int main(int argc, char **argv) { InitLLVM X(argc, argv); BumpPtrAllocator A; From fc1b21228e39d63f1a2ab98026d548de66cb3760 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 17 Feb 2022 16:00:36 +0000 Subject: [PATCH 125/748] [AArch64][SVE] Add structured load/store opcodes to getMemOpInfo Currently, loading from or storing to a stack location with a structured load or store crashes in isAArch64FrameOffsetLegal as the opcodes are not handled by getMemOpInfo. This patch adds the opcodes for structured load/store instructions with an immediate index to getMemOpInfo & getLoadStoreImmIdx, setting appropriate values for the scale, width & min/max offsets. 
Reviewed By: sdesmalen, david-arm Differential Revision: https://reviews.llvm.org/D119338 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 65 +++++ .../CodeGen/AArch64/sve-fixed-ld2-alloca.ll | 27 ++ llvm/test/CodeGen/AArch64/sve-ldN.mir | 261 ++++++++++++++++++ llvm/test/CodeGen/AArch64/sve-stN.mir | 261 ++++++++++++++++++ 4 files changed, 614 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-ldN.mir create mode 100644 llvm/test/CodeGen/AArch64/sve-stN.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index b2689e900f625..e80a9ae7c0eea 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2270,6 +2270,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::LD1SW_D_IMM: case AArch64::LD1D_IMM: + case AArch64::LD2B_IMM: + case AArch64::LD2H_IMM: + case AArch64::LD2W_IMM: + case AArch64::LD2D_IMM: + case AArch64::LD3B_IMM: + case AArch64::LD3H_IMM: + case AArch64::LD3W_IMM: + case AArch64::LD3D_IMM: + case AArch64::LD4B_IMM: + case AArch64::LD4H_IMM: + case AArch64::LD4W_IMM: + case AArch64::LD4D_IMM: + case AArch64::ST1B_IMM: case AArch64::ST1B_H_IMM: case AArch64::ST1B_S_IMM: @@ -2281,6 +2294,19 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::ST1W_D_IMM: case AArch64::ST1D_IMM: + case AArch64::ST2B_IMM: + case AArch64::ST2H_IMM: + case AArch64::ST2W_IMM: + case AArch64::ST2D_IMM: + case AArch64::ST3B_IMM: + case AArch64::ST3H_IMM: + case AArch64::ST3W_IMM: + case AArch64::ST3D_IMM: + case AArch64::ST4B_IMM: + case AArch64::ST4H_IMM: + case AArch64::ST4W_IMM: + case AArch64::ST4D_IMM: + case AArch64::LD1RB_IMM: case AArch64::LD1RB_H_IMM: case AArch64::LD1RB_S_IMM: @@ -2897,6 +2923,45 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MinOffset = -8; MaxOffset = 7; break; + case 
AArch64::LD2B_IMM: + case AArch64::LD2H_IMM: + case AArch64::LD2W_IMM: + case AArch64::LD2D_IMM: + case AArch64::ST2B_IMM: + case AArch64::ST2H_IMM: + case AArch64::ST2W_IMM: + case AArch64::ST2D_IMM: + Scale = TypeSize::Scalable(32); + Width = SVEMaxBytesPerVector * 2; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD3B_IMM: + case AArch64::LD3H_IMM: + case AArch64::LD3W_IMM: + case AArch64::LD3D_IMM: + case AArch64::ST3B_IMM: + case AArch64::ST3H_IMM: + case AArch64::ST3W_IMM: + case AArch64::ST3D_IMM: + Scale = TypeSize::Scalable(48); + Width = SVEMaxBytesPerVector * 3; + MinOffset = -8; + MaxOffset = 7; + break; + case AArch64::LD4B_IMM: + case AArch64::LD4H_IMM: + case AArch64::LD4W_IMM: + case AArch64::LD4D_IMM: + case AArch64::ST4B_IMM: + case AArch64::ST4H_IMM: + case AArch64::ST4W_IMM: + case AArch64::ST4D_IMM: + Scale = TypeSize::Scalable(64); + Width = SVEMaxBytesPerVector * 4; + MinOffset = -8; + MaxOffset = 7; + break; case AArch64::LD1B_H_IMM: case AArch64::LD1SB_H_IMM: case AArch64::LD1H_S_IMM: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll new file mode 100644 index 0000000000000..f59891c31e934 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define void @st1d_fixed(<8 x double>* %ptr) #0 { +; CHECK-LABEL: st1d_fixed: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x8] +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %alloc = alloca [16 x double], i32 0 + %bc = bitcast [16 x double]* %alloc to <8 x double>* + %load = load 
<8 x double>, <8 x double>* %bc + %strided.vec = shufflevector <8 x double> %load, <8 x double> poison, <4 x i32> + store <8 x double> zeroinitializer, <8 x double>* %ptr + ret void +} + +attributes #0 = { "target-features"="+sve" vscale_range(2,2) nounwind } diff --git a/llvm/test/CodeGen/AArch64/sve-ldN.mir b/llvm/test/CodeGen/AArch64/sve-ldN.mir new file mode 100644 index 0000000000000..c59c53da806ba --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-ldN.mir @@ -0,0 +1,261 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=prologepilog -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -start-before=prologepilog -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-OFFSET + +--- | + define void @testcase_valid_offset() nounwind { entry: unreachable } + define void @testcase_offset_out_of_range() nounwind { entry: unreachable } +... +--- +name: testcase_valid_offset +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: default, offset: 0, size: 512, alignment: 16, stack-id: scalable-vector } +body: | + bb.0: + liveins: $p0 + + ; CHECK-LABEL: name: testcase_valid_offset + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.1) + ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 + ; CHECK-NEXT: renamable $z0_z1 = LD2B_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1 = LD2B_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1 = LD2H_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1 = LD2H_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1 = LD2W_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1 = LD2W_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1 = LD2D_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1 = LD2D_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3B_IMM renamable $p0, $sp, 
-8 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3B_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3H_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3H_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3W_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3W_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3D_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3D_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4B_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4B_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, $sp, -8 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, $sp, 7 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 31 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1 + ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: RET_ReallyLR implicit $z0, implicit $z1, implicit $z2, implicit $z3 + + ; CHECK-OFFSET-LABEL: testcase_valid_offset: + ; CHECK-OFFSET: str x29, [sp, #-16]! 
+ ; CHECK-OFFSET-NEXT: addvl sp, sp, #-32 + ; CHECK-OFFSET-NEXT: ld2b { z0.b, z1.b }, p0/z, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: ld2b { z0.b, z1.b }, p0/z, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: ld2h { z0.h, z1.h }, p0/z, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: ld2h { z0.h, z1.h }, p0/z, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: ld2w { z0.s, z1.s }, p0/z, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: ld2w { z0.s, z1.s }, p0/z, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: ld2d { z0.d, z1.d }, p0/z, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: ld2d { z0.d, z1.d }, p0/z, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [sp, #-32, mul vl] + ; CHECK-OFFSET-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [sp, #-32, mul vl] + ; CHECK-OFFSET-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [sp, #-32, mul vl] + ; CHECK-OFFSET-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [sp, #-32, mul vl] + ; CHECK-OFFSET-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl sp, sp, #31 + ; CHECK-OFFSET-NEXT: addvl sp, sp, #1 + ; CHECK-OFFSET-NEXT: ldr x29, [sp], #16 + ; CHECK-OFFSET-NEXT: ret + + renamable 
$z0_z1 = LD2B_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1 = LD2B_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1 = LD2H_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1 = LD2H_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1 = LD2W_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1 = LD2W_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1 = LD2D_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1 = LD2D_IMM renamable $p0, %stack.0, 7 + + renamable $z0_z1_z2 = LD3B_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2 = LD3B_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1_z2 = LD3H_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2 = LD3H_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1_z2 = LD3W_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2 = LD3W_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1_z2 = LD3D_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2 = LD3D_IMM renamable $p0, %stack.0, 7 + + renamable $z0_z1_z2_z3 = LD4B_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2_z3 = LD4B_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, %stack.0, 7 + renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, %stack.0, -8 + renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, %stack.0, 7 + RET_ReallyLR implicit $z0, implicit $z1, implicit $z2, implicit $z3 +... 
+--- +name: testcase_offset_out_of_range +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: default, offset: 0, size: 512, alignment: 16, stack-id: scalable-vector } +body: | + bb.0: + liveins: $p0 + + ; CHECK-LABEL: name: testcase_offset_out_of_range + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.1) + ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: renamable $z0_z1 = LD2B_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: renamable $z0_z1 = LD2B_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: renamable $z0_z1 = LD2H_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: renamable $z0_z1 = LD2H_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: renamable $z0_z1 = LD2W_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: renamable $z0_z1 = LD2W_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: renamable $z0_z1 = LD2D_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: renamable $z0_z1 = LD2D_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3B_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3B_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3H_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3H_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3W_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; 
CHECK-NEXT: renamable $z0_z1_z2 = LD3W_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3D_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; CHECK-NEXT: renamable $z0_z1_z2 = LD3D_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4B_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4B_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 31 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1 + ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: RET_ReallyLR implicit $z0, implicit $z1, implicit $z2, implicit $z3 + + ; CHECK-OFFSET-LABEL: testcase_offset_out_of_range: + ; CHECK-OFFSET: str x29, [sp, #-16]! 
+ ; CHECK-OFFSET-NEXT: addvl sp, sp, #-32 + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: ld2b { z0.b, z1.b }, p0/z, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: ld2b { z0.b, z1.b }, p0/z, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: ld2h { z0.h, z1.h }, p0/z, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: ld2h { z0.h, z1.h }, p0/z, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: ld2w { z0.s, z1.s }, p0/z, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: ld2w { z0.s, z1.s }, p0/z, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: ld2d { z0.d, z1.d }, p0/z, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: ld2d { z0.d, z1.d }, p0/z, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: ld3h { z0.h, z1.h, z2.h }, p0/z, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: ld3w { z0.s, z1.s, z2.s }, p0/z, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: ld3d { z0.d, z1.d, z2.d }, p0/z, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; 
CHECK-OFFSET-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; CHECK-OFFSET-NEXT: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; CHECK-OFFSET-NEXT: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; CHECK-OFFSET-NEXT: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl sp, sp, #31 + ; CHECK-OFFSET-NEXT: addvl sp, sp, #1 + ; CHECK-OFFSET-NEXT: ldr x29, [sp], #16 + ; CHECK-OFFSET-NEXT: ret + + renamable $z0_z1 = LD2B_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1 = LD2B_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1 = LD2H_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1 = LD2H_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1 = LD2W_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1 = LD2W_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1 = LD2D_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1 = LD2D_IMM renamable $p0, %stack.0, 8 + + renamable $z0_z1_z2 = LD3B_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2 = LD3B_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1_z2 = LD3H_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2 = LD3H_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1_z2 = LD3W_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2 = LD3W_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1_z2 = LD3D_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2 = LD3D_IMM renamable $p0, %stack.0, 8 + + renamable $z0_z1_z2_z3 = LD4B_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2_z3 = 
LD4B_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2_z3 = LD4H_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2_z3 = LD4W_IMM renamable $p0, %stack.0, 8 + renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, %stack.0, -9 + renamable $z0_z1_z2_z3 = LD4D_IMM renamable $p0, %stack.0, 8 + RET_ReallyLR implicit $z0, implicit $z1, implicit $z2, implicit $z3 +... diff --git a/llvm/test/CodeGen/AArch64/sve-stN.mir b/llvm/test/CodeGen/AArch64/sve-stN.mir new file mode 100644 index 0000000000000..ac5c036a10bd0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-stN.mir @@ -0,0 +1,261 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=prologepilog -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -start-before=prologepilog -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-OFFSET + +--- | + define void @testcase_valid_offset() nounwind { entry: unreachable } + define void @testcase_offset_out_of_range() nounwind { entry: unreachable } +... 
+--- +name: testcase_valid_offset +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: default, offset: 0, size: 512, alignment: 16, stack-id: scalable-vector } +body: | + bb.0: + liveins: $p0, $z0 + + ; CHECK-LABEL: name: testcase_valid_offset + ; CHECK: liveins: $p0, $z0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.1) + ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 + ; CHECK-NEXT: ST2B_IMM renamable $z0_z1, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST2B_IMM renamable $z0_z1, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST2H_IMM renamable $z0_z1, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST2H_IMM renamable $z0_z1, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST2W_IMM renamable $z0_z1, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST2W_IMM renamable $z0_z1, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST2D_IMM renamable $z0_z1, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST2D_IMM renamable $z0_z1, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST3B_IMM renamable $z0_z1_z2, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST3B_IMM renamable $z0_z1_z2, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST3H_IMM renamable $z0_z1_z2, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST3H_IMM renamable $z0_z1_z2, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST3W_IMM renamable $z0_z1_z2, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST3W_IMM renamable $z0_z1_z2, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST3D_IMM renamable $z0_z1_z2, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST3D_IMM renamable $z0_z1_z2, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST4H_IMM renamable $z0_z1_z2_z3, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST4H_IMM renamable $z0_z1_z2_z3, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, $sp, 7 + ; CHECK-NEXT: ST4D_IMM 
renamable $z0_z1_z2_z3, renamable $p0, $sp, -8 + ; CHECK-NEXT: ST4D_IMM renamable $z0_z1_z2_z3, renamable $p0, $sp, 7 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 31 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1 + ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: RET_ReallyLR + + ; CHECK-OFFSET-LABEL: testcase_valid_offset: + ; CHECK-OFFSET: str x29, [sp, #-16]! + ; CHECK-OFFSET-NEXT: addvl sp, sp, #-32 + ; CHECK-OFFSET-NEXT: st2b { z0.b, z1.b }, p0, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: st2b { z0.b, z1.b }, p0, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: st2h { z0.h, z1.h }, p0, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: st2h { z0.h, z1.h }, p0, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: st2w { z0.s, z1.s }, p0, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: st2w { z0.s, z1.s }, p0, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: st2d { z0.d, z1.d }, p0, [sp, #-16, mul vl] + ; CHECK-OFFSET-NEXT: st2d { z0.d, z1.d }, p0, [sp, #14, mul vl] + ; CHECK-OFFSET-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: st3h { z0.h, z1.h, z2.h }, p0, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: st3h { z0.h, z1.h, z2.h }, p0, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: st3w { z0.s, z1.s, z2.s }, p0, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: st3w { z0.s, z1.s, z2.s }, p0, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: st3d { z0.d, z1.d, z2.d }, p0, [sp, #-24, mul vl] + ; CHECK-OFFSET-NEXT: st3d { z0.d, z1.d, z2.d }, p0, [sp, #21, mul vl] + ; CHECK-OFFSET-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [sp, #-32, mul vl] + ; CHECK-OFFSET-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [sp, #-32, mul vl] + ; CHECK-OFFSET-NEXT: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [sp, #-32, mul vl] + ; 
CHECK-OFFSET-NEXT: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [sp, #-32, mul vl] + ; CHECK-OFFSET-NEXT: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [sp, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl sp, sp, #31 + ; CHECK-OFFSET-NEXT: addvl sp, sp, #1 + ; CHECK-OFFSET-NEXT: ldr x29, [sp], #16 + ; CHECK-OFFSET-NEXT: ret + + ST2B_IMM renamable $z0_z1, renamable $p0, %stack.0, -8 + ST2B_IMM renamable $z0_z1, renamable $p0, %stack.0, 7 + ST2H_IMM renamable $z0_z1, renamable $p0, %stack.0, -8 + ST2H_IMM renamable $z0_z1, renamable $p0, %stack.0, 7 + ST2W_IMM renamable $z0_z1, renamable $p0, %stack.0, -8 + ST2W_IMM renamable $z0_z1, renamable $p0, %stack.0, 7 + ST2D_IMM renamable $z0_z1, renamable $p0, %stack.0, -8 + ST2D_IMM renamable $z0_z1, renamable $p0, %stack.0, 7 + + ST3B_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -8 + ST3B_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 7 + ST3H_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -8 + ST3H_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 7 + ST3W_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -8 + ST3W_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 7 + ST3D_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -8 + ST3D_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 7 + + ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, -8 + ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 7 + ST4H_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, -8 + ST4H_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 7 + ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, -8 + ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 7 + ST4D_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, -8 + ST4D_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 7 + RET_ReallyLR +... 
+--- +name: testcase_offset_out_of_range +tracksRegLiveness: true +stack: + - { id: 0, name: '', type: default, offset: 0, size: 512, alignment: 16, stack-id: scalable-vector } +body: | + bb.0: + liveins: $p0, $z0 + + ; CHECK-LABEL: name: testcase_offset_out_of_range + ; CHECK: liveins: $p0, $z0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.1) + ; CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: ST2B_IMM renamable $z0_z1, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: ST2B_IMM renamable $z0_z1, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: ST2H_IMM renamable $z0_z1, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: ST2H_IMM renamable $z0_z1, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: ST2W_IMM renamable $z0_z1, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: ST2W_IMM renamable $z0_z1, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -2 + ; CHECK-NEXT: ST2D_IMM renamable $z0_z1, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 2 + ; CHECK-NEXT: ST2D_IMM renamable $z0_z1, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: ST3B_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; CHECK-NEXT: ST3B_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: ST3H_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; CHECK-NEXT: ST3H_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: ST3W_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; 
CHECK-NEXT: ST3W_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -3 + ; CHECK-NEXT: ST3D_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 3 + ; CHECK-NEXT: ST3D_IMM renamable $z0_z1_z2, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: ST4H_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: ST4H_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, -4 + ; CHECK-NEXT: ST4D_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, -8 + ; CHECK-NEXT: $x8 = ADDVL_XXI $sp, 4 + ; CHECK-NEXT: ST4D_IMM renamable $z0_z1_z2_z3, renamable $p0, killed $x8, 7 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 31 + ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1 + ; CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: RET_ReallyLR + + ; CHECK-OFFSET-LABEL: testcase_offset_out_of_range + ; CHECK-OFFSET: str x29, [sp, #-16]! 
+ ; CHECK-OFFSET-NEXT: addvl sp, sp, #-32 + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: st2b { z0.b, z1.b }, p0, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: st2b { z0.b, z1.b }, p0, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: st2h { z0.h, z1.h }, p0, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: st2h { z0.h, z1.h }, p0, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: st2w { z0.s, z1.s }, p0, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: st2w { z0.s, z1.s }, p0, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-2 + ; CHECK-OFFSET-NEXT: st2d { z0.d, z1.d }, p0, [x8, #-16, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #2 + ; CHECK-OFFSET-NEXT: st2d { z0.d, z1.d }, p0, [x8, #14, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: st3b { z0.b, z1.b, z2.b }, p0, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: st3h { z0.h, z1.h, z2.h }, p0, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: st3h { z0.h, z1.h, z2.h }, p0, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: st3w { z0.s, z1.s, z2.s }, p0, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: st3w { z0.s, z1.s, z2.s }, p0, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-3 + ; CHECK-OFFSET-NEXT: st3d { z0.d, z1.d, z2.d }, p0, [x8, #-24, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #3 + ; CHECK-OFFSET-NEXT: st3d { z0.d, z1.d, z2.d }, p0, [x8, #21, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; CHECK-OFFSET-NEXT: st4b { z0.b, 
z1.b, z2.b, z3.b }, p0, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; CHECK-OFFSET-NEXT: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; CHECK-OFFSET-NEXT: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #-4 + ; CHECK-OFFSET-NEXT: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x8, #-32, mul vl] + ; CHECK-OFFSET-NEXT: addvl x8, sp, #4 + ; CHECK-OFFSET-NEXT: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x8, #28, mul vl] + ; CHECK-OFFSET-NEXT: addvl sp, sp, #31 + ; CHECK-OFFSET-NEXT: addvl sp, sp, #1 + ; CHECK-OFFSET-NEXT: ldr x29, [sp], #16 + ; CHECK-OFFSET-NEXT: ret + + ST2B_IMM renamable $z0_z1, renamable $p0, %stack.0, -9 + ST2B_IMM renamable $z0_z1, renamable $p0, %stack.0, 8 + ST2H_IMM renamable $z0_z1, renamable $p0, %stack.0, -9 + ST2H_IMM renamable $z0_z1, renamable $p0, %stack.0, 8 + ST2W_IMM renamable $z0_z1, renamable $p0, %stack.0, -9 + ST2W_IMM renamable $z0_z1, renamable $p0, %stack.0, 8 + ST2D_IMM renamable $z0_z1, renamable $p0, %stack.0, -9 + ST2D_IMM renamable $z0_z1, renamable $p0, %stack.0, 8 + + ST3B_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -9 + ST3B_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 8 + ST3H_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -9 + ST3H_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 8 + ST3W_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -9 + ST3W_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 8 + ST3D_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, -9 + ST3D_IMM renamable $z0_z1_z2, renamable $p0, %stack.0, 8 + + ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, -9 + ST4B_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 8 + ST4H_IMM renamable $z0_z1_z2_z3, 
renamable $p0, %stack.0, -9 + ST4H_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 8 + ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, -9 + ST4W_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 8 + ST4D_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, -9 + ST4D_IMM renamable $z0_z1_z2_z3, renamable $p0, %stack.0, 8 + RET_ReallyLR +... From ca783124073f8db0900a38edd2647662484be09a Mon Sep 17 00:00:00 2001 From: Zakk Chen Date: Tue, 15 Feb 2022 23:20:51 -0800 Subject: [PATCH 126/748] [RISCV] Add the policy operand for nomask vector Multiply-Add IR intrinsics. The goal is support tail and mask policy in RVV builtins. We focus on IR part first. The nomask vector Multiply-Add need a policy operand because merge value could not be undef. Reviewed By: monkchiang Differential Revision: https://reviews.llvm.org/D119727 --- clang/include/clang/Basic/riscv_vector.td | 54 +- .../RISCV/rvv-intrinsics-overloaded/vfmacc.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfmadd.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfmsac.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfmsub.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfnmacc.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfnmadd.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfnmsac.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfnmsub.c | 36 +- .../RISCV/rvv-intrinsics-overloaded/vfwmacc.c | 16 +- .../RISCV/rvv-intrinsics-overloaded/vfwmsac.c | 16 +- .../rvv-intrinsics-overloaded/vfwnmacc.c | 16 +- .../rvv-intrinsics-overloaded/vfwnmsac.c | 16 +- .../RISCV/rvv-intrinsics-overloaded/vmacc.c | 176 ++--- .../RISCV/rvv-intrinsics-overloaded/vmadd.c | 176 ++--- .../RISCV/rvv-intrinsics-overloaded/vnmsac.c | 176 ++--- .../RISCV/rvv-intrinsics-overloaded/vnmsub.c | 176 ++--- .../RISCV/rvv-intrinsics-overloaded/vwmacc.c | 210 +++--- .../CodeGen/RISCV/rvv-intrinsics/vfmacc.c | 60 +- .../CodeGen/RISCV/rvv-intrinsics/vfmadd.c | 60 +- .../CodeGen/RISCV/rvv-intrinsics/vfmsac.c | 60 +- 
.../CodeGen/RISCV/rvv-intrinsics/vfmsub.c | 60 +- .../CodeGen/RISCV/rvv-intrinsics/vfnmacc.c | 60 +- .../CodeGen/RISCV/rvv-intrinsics/vfnmadd.c | 60 +- .../CodeGen/RISCV/rvv-intrinsics/vfnmsac.c | 60 +- .../CodeGen/RISCV/rvv-intrinsics/vfnmsub.c | 60 +- .../CodeGen/RISCV/rvv-intrinsics/vfwmacc.c | 36 +- .../CodeGen/RISCV/rvv-intrinsics/vfwmsac.c | 36 +- .../CodeGen/RISCV/rvv-intrinsics/vfwnmacc.c | 36 +- .../CodeGen/RISCV/rvv-intrinsics/vfwnmsac.c | 36 +- .../test/CodeGen/RISCV/rvv-intrinsics/vmacc.c | 176 ++--- .../test/CodeGen/RISCV/rvv-intrinsics/vmadd.c | 176 ++--- .../CodeGen/RISCV/rvv-intrinsics/vnmsac.c | 176 ++--- .../CodeGen/RISCV/rvv-intrinsics/vnmsub.c | 176 ++--- .../CodeGen/RISCV/rvv-intrinsics/vwmacc.c | 210 +++--- clang/utils/TableGen/RISCVVEmitter.cpp | 63 +- llvm/include/llvm/IR/IntrinsicsRISCV.td | 10 +- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 4 +- llvm/test/CodeGen/RISCV/rvv/memory-args.ll | 3 +- llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll | 609 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vfmacc.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfmadd.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfmsac.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfmsub.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll | 72 ++- llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll | 54 +- llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll | 54 +- llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll | 54 +- llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll | 54 +- llvm/test/CodeGen/RISCV/rvv/vmacc-rv32.ll | 216 ++++--- llvm/test/CodeGen/RISCV/rvv/vmacc-rv64.ll | 108 ++-- llvm/test/CodeGen/RISCV/rvv/vmadd-rv32.ll | 108 ++-- llvm/test/CodeGen/RISCV/rvv/vmadd-rv64.ll | 108 ++-- llvm/test/CodeGen/RISCV/rvv/vnmsac-rv32.ll | 108 ++-- llvm/test/CodeGen/RISCV/rvv/vnmsac-rv64.ll | 108 ++-- llvm/test/CodeGen/RISCV/rvv/vnmsub-rv32.ll | 108 ++-- llvm/test/CodeGen/RISCV/rvv/vnmsub-rv64.ll | 108 
++-- .../RISCV/rvv/vsetvli-insert-crossbb.ll | 4 +- llvm/test/CodeGen/RISCV/rvv/vwmacc-rv32.ll | 90 ++- llvm/test/CodeGen/RISCV/rvv/vwmacc-rv64.ll | 90 ++- llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv32.ll | 90 ++- llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv64.ll | 90 ++- llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv32.ll | 90 ++- llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv64.ll | 90 ++- llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv32.ll | 45 +- llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv64.ll | 45 +- 69 files changed, 3691 insertions(+), 2254 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index 94202f6359cee..7f0ad2ee20996 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -130,6 +130,14 @@ // // There are a number of attributes that are used to constraint the number and // shape of the builtins generated. Refer to the comments below for them. + +class Policy{ + int Value = val; +} +def NonePolicy : Policy<0>; +def HasPassthruOperand : Policy<1>; +def HasPolicyOperand : Policy<2>; + class RVVBuiltin { // Base name that will be prepended in __builtin_rvv_ and appended the @@ -177,8 +185,12 @@ class RVVBuiltin; // 12. Vector Integer Arithmetic Instructions // 12.1. Vector Single-Width Integer Add and Subtract -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { defm vadd : RVVIntBinBuiltinSet; defm vsub : RVVIntBinBuiltinSet; defm vrsub : RVVOutOp1BuiltinSet<"vrsub", "csil", @@ -1597,7 +1609,7 @@ defm vneg_v : RVVPseudoUnaryBuiltin<"vrsub", "csil">; // 12.2. 
Vector Widening Integer Add/Subtract // Widening unsigned integer add/subtract, 2*SEW = SEW +/- SEW -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { defm vwaddu : RVVUnsignedWidenBinBuiltinSet; defm vwsubu : RVVUnsignedWidenBinBuiltinSet; // Widening signed integer add/subtract, 2*SEW = SEW +/- SEW @@ -1616,7 +1628,7 @@ defm vwcvt_x_x_v : RVVPseudoVWCVTBuiltin<"vwadd", "vwcvt_x", "csi", [["w", "wv"]]>; // 12.3. Vector Integer Extension -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { def vsext_vf2 : RVVIntExt<"vsext", "w", "wv", "csi">; def vzext_vf2 : RVVIntExt<"vzext", "Uw", "UwUv", "csi">; @@ -1633,7 +1645,7 @@ let Log2LMUL = [-3, -2, -1, 0] in { // 12.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions let HasMask = false, HasPolicy = false in { - let HasNoMaskPassThru = true in { + let NoMaskPolicy = HasPassthruOperand in { defm vadc : RVVCarryinBuiltinSet; defm vsbc : RVVCarryinBuiltinSet; } @@ -1644,7 +1656,7 @@ let HasMask = false, HasPolicy = false in { } // 12.5. Vector Bitwise Logical Instructions -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { defm vand : RVVIntBinBuiltinSet; defm vxor : RVVIntBinBuiltinSet; defm vor : RVVIntBinBuiltinSet; @@ -1652,7 +1664,7 @@ defm vor : RVVIntBinBuiltinSet; defm vnot_v : RVVPseudoVNotBuiltin<"vxor", "csil">; // 12.6. Vector Single-Width Bit Shift Instructions -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { defm vsll : RVVShiftBuiltinSet; defm vsrl : RVVUnsignedShiftBuiltinSet; defm vsra : RVVSignedShiftBuiltinSet; @@ -1680,7 +1692,7 @@ defm vmsge : RVVSignedMaskOutBuiltinSet; } // 12.9. 
Vector Integer Min/Max Instructions -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { defm vminu : RVVUnsignedBinBuiltinSet; defm vmin : RVVSignedBinBuiltinSet; defm vmaxu : RVVUnsignedBinBuiltinSet; @@ -1704,7 +1716,7 @@ defm vrem : RVVSignedBinBuiltinSet; } // 12.12. Vector Widening Integer Multiply Instructions -let Log2LMUL = [-3, -2, -1, 0, 1, 2], HasNoMaskPassThru = true in { +let Log2LMUL = [-3, -2, -1, 0, 1, 2], NoMaskPolicy = HasPassthruOperand in { defm vwmul : RVVOutOp0Op1BuiltinSet<"vwmul", "csi", [["vv", "w", "wvv"], ["vx", "w", "wve"]]>; @@ -1717,6 +1729,7 @@ defm vwmulsu : RVVOutOp0Op1BuiltinSet<"vwmulsu", "csi", } // 12.13. Vector Single-Width Integer Multiply-Add Instructions +let NoMaskPolicy = HasPolicyOperand in { defm vmacc : RVVIntTerBuiltinSet; defm vnmsac : RVVIntTerBuiltinSet; defm vmadd : RVVIntTerBuiltinSet; @@ -1737,6 +1750,7 @@ defm vwmaccsu : RVVOutOp1Op2BuiltinSet<"vwmaccsu", "csi", defm vwmaccus : RVVOutOp1Op2BuiltinSet<"vwmaccus", "csi", [["vx", "w", "wwUev"]]>; } +} // 12.15. Vector Integer Merge Instructions // C/C++ Operand: (mask, op1, op2, vl), Intrinsic: (op1, op2, mask, vl) @@ -1755,7 +1769,7 @@ let HasMask = false, HasPolicy = false, } // 12.16. Vector Integer Move Instructions -let HasMask = false, HasNoMaskPassThru = true, HasPolicy = false in { +let HasMask = false, NoMaskPolicy = HasPassthruOperand, HasPolicy = false in { let MangledName = "vmv_v" in { defm vmv_v : RVVOutBuiltinSet<"vmv_v_v", "csil", [["v", "Uv", "UvUv"]]>; @@ -1770,7 +1784,7 @@ let HasMask = false, HasNoMaskPassThru = true, HasPolicy = false in { // 13. Vector Fixed-Point Arithmetic Instructions // 13.1. Vector Single-Width Saturating Add and Subtract -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { defm vsaddu : RVVUnsignedBinBuiltinSet; defm vsadd : RVVSignedBinBuiltinSet; defm vssubu : RVVUnsignedBinBuiltinSet; @@ -1823,6 +1837,7 @@ let Log2LMUL = [-2, -1, 0, 1, 2] in { } // 14.6. 
Vector Single-Width Floating-Point Fused Multiply-Add Instructions +let NoMaskPolicy = HasPolicyOperand in { defm vfmacc : RVVFloatingTerBuiltinSet; defm vfnmacc : RVVFloatingTerBuiltinSet; defm vfmsac : RVVFloatingTerBuiltinSet; @@ -1837,9 +1852,10 @@ defm vfwmacc : RVVFloatingWidenTerBuiltinSet; defm vfwnmacc : RVVFloatingWidenTerBuiltinSet; defm vfwmsac : RVVFloatingWidenTerBuiltinSet; defm vfwnmsac : RVVFloatingWidenTerBuiltinSet; +} // 14.8. Vector Floating-Point Square-Root Instruction -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { def vfsqrt : RVVFloatingUnaryVVBuiltin; // 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction @@ -1871,7 +1887,7 @@ defm vmfge : RVVFloatingMaskOutBuiltinSet; } // 14.14. Vector Floating-Point Classify Instruction -let Name = "vfclass_v", HasNoMaskPassThru = true in +let Name = "vfclass_v", NoMaskPolicy = HasPassthruOperand in def vfclass : RVVOp0Builtin<"Uv", "Uvv", "xfd">; // 14.15. Vector Floating-Point Merge Instructio @@ -1890,13 +1906,13 @@ let HasMask = false, HasPolicy = false, } // 14.16. Vector Floating-Point Move Instruction -let HasMask = false, HasNoMaskPassThru = true, HasNoMaskedOverloaded = false, - HasPolicy = false in +let HasMask = false, NoMaskPolicy = HasPassthruOperand, + HasNoMaskedOverloaded = false, HasPolicy = false in defm vfmv_v : RVVOutBuiltinSet<"vfmv_v_f", "xfd", [["f", "v", "ve"]]>; // 14.17. Single-Width Floating-Point/Integer Type-Convert Instructions -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { def vfcvt_xu_f_v : RVVConvToUnsignedBuiltin<"vfcvt_xu">; def vfcvt_x_f_v : RVVConvToSignedBuiltin<"vfcvt_x">; def vfcvt_rtz_xu_f_v : RVVConvToUnsignedBuiltin<"vfcvt_rtz_xu">; @@ -1992,7 +2008,7 @@ def vmsif : RVVMaskUnaryBuiltin; // 16.6. 
vmsof.m set-only-first mask bit def vmsof : RVVMaskUnaryBuiltin; -let HasNoMaskPassThru = true, HasNoMaskedOverloaded = false in { +let NoMaskPolicy = HasPassthruOperand, HasNoMaskedOverloaded = false in { // 16.8. Vector Iota Instruction defm viota : RVVOutBuiltinSet<"viota", "csil", [["m", "Uv", "Uvm"]]>; @@ -2033,7 +2049,7 @@ defm vslideup : RVVSlideBuiltinSet; defm vslidedown : RVVSlideBuiltinSet; // 17.3.3. Vector Slide1up Instructions -let HasNoMaskPassThru = true in { +let NoMaskPolicy = HasPassthruOperand in { defm vslide1up : RVVSlideOneBuiltinSet; defm vfslide1up : RVVFloatingBinVFBuiltinSet; diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmacc.c index 963271d80ca16..e14b903211eb3 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmacc.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfmacc_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t 
test_vfmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmacc_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -47,7 +47,7 @@ vfloat32m1_t test_vfmacc_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat32m2_t test_vfmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.f32.i64( 
[[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmacc_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfmacc_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t test_vfmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmacc_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfmacc_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m8( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmacc_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfmacc_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmacc_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfmacc_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vfloat64m2_t test_vfmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmacc_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfmacc_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmacc_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfmacc_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmacc_vf_f64m8(vfloat64m8_t acc, double op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmadd.c index 09734276482dd..86a53ca2a764d 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmadd.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmadd.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfmadd_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t test_vfmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmadd_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -47,7 +47,7 @@ vfloat32m1_t test_vfmadd_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat32m2_t test_vfmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmadd_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfmadd_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t test_vfmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmadd_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfmadd_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t 
test_vfmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmadd_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfmadd_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmadd_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfmadd_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmadd_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfmadd_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmadd_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfmadd_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: 
@test_vfmadd_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmadd_vf_f64m8(vfloat64m8_t acc, double op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsac.c index 36308a35ef815..5120e756711ad 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsac.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfmsac_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t test_vfmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsac_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -47,7 +47,7 @@ vfloat32m1_t test_vfmsac_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsac_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, 
@@ -57,7 +57,7 @@ vfloat32m2_t test_vfmsac_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsac_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfmsac_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t test_vfmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsac_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfmsac_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsac_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfmsac_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsac_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfmsac_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsac_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfmsac_vf_f64m1(vfloat64m1_t acc, double op1, // 
CHECK-RV64-LABEL: @test_vfmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsac_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfmsac_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsac_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfmsac_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsac_vf_f64m8(vfloat64m8_t acc, double op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsub.c index abc60568c3557..79b6875f75745 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsub.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfmsub.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfmsub_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ 
vfloat32mf2_t test_vfmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t test_vfmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsub_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfmsub_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsub_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -47,7 +47,7 @@ vfloat32m1_t test_vfmsub_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmsub_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat32m2_t test_vfmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsub_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfmsub_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmsub_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t test_vfmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsub_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfmsub_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // 
CHECK-RV64-LABEL: @test_vfmsub_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsub_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfmsub_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], 
i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsub_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfmsub_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsub_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfmsub_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.f64.i64( [[ACC:%.*]], 
double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsub_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfmsub_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsub_vf_f64m8(vfloat64m8_t acc, double op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmacc.c index c76d6b88d0869..8749f4a5825bb 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmacc.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfnmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t test_vfnmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfnmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmacc_vf_f32m1(vfloat32m1_t acc, float op1, @@ -47,7 +47,7 @@ vfloat32m1_t test_vfnmacc_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m2( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat32m2_t test_vfnmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmacc_vf_f32m2(vfloat32m2_t acc, float op1, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfnmacc_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t test_vfnmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t 
test_vfnmacc_vf_f32m4(vfloat32m4_t acc, float op1, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfnmacc_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfnmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmacc_vf_f32m8(vfloat32m8_t acc, float op1, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfnmacc_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfnmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmacc_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfnmacc_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfnmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmacc_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfnmacc_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfnmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: 
@test_vfnmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmacc_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfnmacc_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfnmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmacc_vf_f64m8(vfloat64m8_t acc, double op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmadd.c index d1e5246cd2949..f5d49f05961ef 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmadd.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmadd.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfnmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t test_vfnmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfnmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmadd_vf_f32m1(vfloat32m1_t acc, float op1, @@ 
-47,7 +47,7 @@ vfloat32m1_t test_vfnmadd_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat32m2_t test_vfnmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmadd_vf_f32m2(vfloat32m2_t acc, float op1, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfnmadd_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t test_vfnmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmadd_vf_f32m4(vfloat32m4_t acc, float op1, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfnmadd_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfnmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmadd_vf_f32m8(vfloat32m8_t acc, float op1, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfnmadd_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfnmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m1( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmadd_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfnmadd_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfnmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmadd_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfnmadd_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t 
test_vfnmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfnmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmadd_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfnmadd_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfnmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmadd_vf_f64m8(vfloat64m8_t acc, double op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsac.c index 24c2c9abbcb42..0571382c98977 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsac.c +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsac.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfnmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t test_vfnmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfnmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsac_vf_f32m1(vfloat32m1_t acc, float op1, @@ -47,7 +47,7 @@ vfloat32m1_t test_vfnmsac_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsac_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat32m2_t test_vfnmsac_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsac_vf_f32m2(vfloat32m2_t acc, float op1, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfnmsac_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t test_vfnmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsac_vf_f32m4(vfloat32m4_t acc, float op1, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfnmsac_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfnmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsac_vf_f32m8(vfloat32m8_t acc, float op1, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfnmsac_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsac_vv_f64m1(vfloat64m1_t 
acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfnmsac_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsac_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfnmsac_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfnmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsac_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfnmsac_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vfnmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfnmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsac_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfnmsac_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfnmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsac_vf_f64m8(vfloat64m8_t acc, double op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsub.c 
index c7c85314b5699..05a7d0977eae4 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsub.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfnmsub.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat32mf2_t test_vfnmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -27,7 +27,7 @@ vfloat32mf2_t test_vfnmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsub_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat32m1_t test_vfnmsub_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.f32.i64( 
[[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsub_vf_f32m1(vfloat32m1_t acc, float op1, @@ -47,7 +47,7 @@ vfloat32m1_t test_vfnmsub_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat32m2_t test_vfnmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsub_vf_f32m2(vfloat32m2_t acc, float op1, @@ -67,7 +67,7 @@ vfloat32m2_t test_vfnmsub_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat32m4_t 
test_vfnmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsub_vf_f32m4(vfloat32m4_t acc, float op1, @@ -87,7 +87,7 @@ vfloat32m4_t test_vfnmsub_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -97,7 +97,7 @@ vfloat32m8_t test_vfnmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsub_vf_f32m8(vfloat32m8_t acc, float op1, @@ -107,7 +107,7 @@ vfloat32m8_t test_vfnmsub_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -117,7 +117,7 @@ vfloat64m1_t test_vfnmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsub_vf_f64m1(vfloat64m1_t acc, double op1, @@ -127,7 +127,7 @@ vfloat64m1_t test_vfnmsub_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -137,7 +137,7 @@ vfloat64m2_t test_vfnmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsub_vf_f64m2(vfloat64m2_t acc, double op1, @@ -147,7 +147,7 @@ vfloat64m2_t test_vfnmsub_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vfnmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -157,7 +157,7 @@ vfloat64m4_t test_vfnmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsub_vf_f64m4(vfloat64m4_t acc, double op1, @@ -167,7 +167,7 @@ vfloat64m4_t test_vfnmsub_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -177,7 +177,7 @@ vfloat64m8_t test_vfnmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsub_vf_f64m8(vfloat64m8_t acc, double op1, 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmacc.c index 462f80cc4afb1..92f329ee1de85 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmacc.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat64m1_t test_vfwmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmacc_vf_f64m1(vfloat64m1_t acc, float op1, @@ -27,7 +27,7 @@ vfloat64m1_t test_vfwmacc_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwmacc_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat64m2_t test_vfwmacc_vv_f64m2(vfloat64m2_t acc, 
vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwmacc_vf_f64m2(vfloat64m2_t acc, float op1, @@ -47,7 +47,7 @@ vfloat64m2_t test_vfwmacc_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat64m4_t test_vfwmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwmacc_vf_f64m4(vfloat64m4_t acc, float op1, @@ -67,7 +67,7 @@ vfloat64m4_t test_vfwmacc_vf_f64m4(vfloat64m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.nxv8f32.nxv8f32.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat64m8_t test_vfwmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmacc_vf_f64m8(vfloat64m8_t acc, float op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmsac.c index 1a7c37a298449..f43444ca1bc67 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwmsac.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat64m1_t test_vfwmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) 
// CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmsac_vf_f64m1(vfloat64m1_t acc, float op1, @@ -27,7 +27,7 @@ vfloat64m1_t test_vfwmsac_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat64m2_t test_vfwmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwmsac_vf_f64m2(vfloat64m2_t acc, float op1, @@ -47,7 +47,7 @@ vfloat64m2_t test_vfwmsac_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat64m4_t test_vfwmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwmsac_vf_f64m4(vfloat64m4_t acc, float op1, @@ -67,7 +67,7 @@ vfloat64m4_t test_vfwmsac_vf_f64m4(vfloat64m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat64m8_t test_vfwmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmsac_vf_f64m8(vfloat64m8_t acc, float op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmacc.c index 9ac61004637ae..6a0a8f796786c 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmacc.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat64m1_t test_vfwnmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmacc_vf_f64m1(vfloat64m1_t acc, float op1, @@ -27,7 +27,7 @@ vfloat64m1_t test_vfwnmacc_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmacc_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat64m2_t test_vfwnmacc_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmacc_vf_f64m2(vfloat64m2_t acc, float 
op1, @@ -47,7 +47,7 @@ vfloat64m2_t test_vfwnmacc_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat64m4_t test_vfwnmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmacc_vf_f64m4(vfloat64m4_t acc, float op1, @@ -67,7 +67,7 @@ vfloat64m4_t test_vfwnmacc_vf_f64m4(vfloat64m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat64m8_t test_vfwnmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmacc_vf_f64m8(vfloat64m8_t acc, float op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmsac.c index f237873e74a92..302b53b273755 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vfwnmsac.c @@ -7,7 +7,7 @@ // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -17,7 +17,7 @@ vfloat64m1_t test_vfwnmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmsac_vf_f64m1(vfloat64m1_t acc, float op1, @@ -27,7 +27,7 @@ vfloat64m1_t test_vfwnmsac_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwnmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -37,7 +37,7 @@ vfloat64m2_t test_vfwnmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmsac_vf_f64m2(vfloat64m2_t acc, float op1, @@ -47,7 +47,7 @@ vfloat64m2_t test_vfwnmsac_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -57,7 +57,7 @@ vfloat64m4_t test_vfwnmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmsac_vf_f64m4(vfloat64m4_t acc, float op1, @@ -67,7 +67,7 @@ vfloat64m4_t test_vfwnmsac_vf_f64m4(vfloat64m4_t acc, float op1, 
// CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -77,7 +77,7 @@ vfloat64m8_t test_vfwnmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmsac_vf_f64m8(vfloat64m8_t acc, float op1, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmacc.c index 569301d120b9b..ce9ef31265c4a 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmacc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vmacc_vv_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmacc_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vmacc_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i8mf8( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmacc_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vmacc_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmacc_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vmacc_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmacc_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vmacc_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmacc_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vmacc_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmacc_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vmacc_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmacc_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vmacc_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmacc_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vmacc_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i8m2( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmacc_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vmacc_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmacc_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ -96,7 +96,7 @@ vint8m2_t test_vmacc_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmacc_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vmacc_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vint8m4_t test_vmacc_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vmacc_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmacc_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vmacc_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmacc_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vmacc_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmacc_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t test_vmacc_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t o // CHECK-RV64-LABEL: @test_vmacc_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmacc_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t test_vmacc_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmacc_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vmacc_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmacc_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vmacc_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmacc_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vmacc_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmacc_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vmacc_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmacc_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vmacc_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmacc_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vmacc_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size // CHECK-RV64-LABEL: 
@test_vmacc_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmacc_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t test_vmacc_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmacc_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vmacc_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmacc_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vmacc_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( 
[[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmacc_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vmacc_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmacc_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vmacc_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmacc_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ vint32mf2_t test_vmacc_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmacc_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ vint32m1_t test_vmacc_vv_i32m1(vint32m1_t acc, 
vint32m1_t op1, vint32m1_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmacc_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vmacc_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmacc_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vmacc_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmacc_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vmacc_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmacc_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vmacc_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmacc_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vmacc_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmacc_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t test_vmacc_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmacc_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t 
test_vmacc_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmacc_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vmacc_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmacc_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vmacc_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmacc_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t test_vmacc_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmacc_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t test_vmacc_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmacc_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vmacc_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmacc_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ -384,7 +384,7 @@ vint64m4_t test_vmacc_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmacc_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ 
-393,7 +393,7 @@ vint64m8_t test_vmacc_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmacc_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vmacc_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmacc_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vmacc_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmacc_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vmacc_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vv_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmacc_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vmacc_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmacc_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vmacc_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmacc_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vmacc_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmacc_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t 
op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vmacc_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmacc_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vmacc_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmacc_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vmacc_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmacc_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vmacc_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmacc_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vmacc_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmacc_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vmacc_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmacc_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vmacc_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmacc_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, 
vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vmacc_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmacc_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vmacc_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmacc_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t test_vmacc_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4 // CHECK-RV64-LABEL: @test_vmacc_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmacc_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vmacc_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t o // CHECK-RV64-LABEL: @test_vmacc_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmacc_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vmacc_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2 // CHECK-RV64-LABEL: @test_vmacc_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmacc_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 +564,7 @@ vuint16mf2_t test_vmacc_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmacc_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vmacc_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmacc_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vmacc_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmacc_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vmacc_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmacc_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vmacc_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmacc_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vmacc_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op // CHECK-RV64-LABEL: 
@test_vmacc_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmacc_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ vuint16m4_t test_vmacc_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmacc_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vmacc_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmacc_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ -636,7 +636,7 @@ vuint16m8_t test_vmacc_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmacc_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ -645,7 +645,7 @@ vuint32mf2_t test_vmacc_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2 // CHECK-RV64-LABEL: @test_vmacc_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmacc_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vmacc_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmacc_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vmacc_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmacc_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ 
vuint32m1_t test_vmacc_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmacc_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vmacc_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmacc_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t test_vmacc_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmacc_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vmacc_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmacc_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vmacc_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmacc_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vmacc_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmacc_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ vuint32m8_t test_vmacc_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmacc_vv_u64m1(vuint64m1_t acc, 
vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vmacc_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmacc_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vmacc_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmacc_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vmacc_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmacc_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 @@ vuint64m2_t test_vmacc_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmacc_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vmacc_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmacc_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t test_vmacc_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmacc_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 @@ vuint64m8_t test_vmacc_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vuint64m8_t test_vmacc_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmadd.c index e6ece6f876b2a..204abc60a596a 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmadd.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vmadd.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vmadd_vv_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmadd_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vmadd_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmadd_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vmadd_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmadd_vv_i8mf4(vint8mf4_t acc, vint8mf4_t 
op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vmadd_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmadd_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vmadd_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmadd_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vmadd_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmadd_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vmadd_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmadd_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vmadd_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmadd_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vmadd_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmadd_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vmadd_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmadd_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ 
-96,7 +96,7 @@ vint8m2_t test_vmadd_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmadd_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vmadd_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmadd_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vmadd_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmadd_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vmadd_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmadd_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vmadd_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmadd_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t test_vmadd_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t o // CHECK-RV64-LABEL: @test_vmadd_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmadd_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t test_vmadd_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmadd_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, 
size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vmadd_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmadd_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vmadd_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmadd_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vmadd_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmadd_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vmadd_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmadd_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vmadd_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmadd_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vmadd_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmadd_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t test_vmadd_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t 
test_vmadd_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vmadd_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmadd_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vmadd_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmadd_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vmadd_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmadd_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vmadd_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmadd_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ vint32mf2_t test_vmadd_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmadd_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ vint32m1_t test_vmadd_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmadd_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vmadd_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) 
// CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmadd_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vmadd_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmadd_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vmadd_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmadd_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vmadd_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmadd_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vmadd_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32m8( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmadd_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t test_vmadd_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmadd_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t test_vmadd_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmadd_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vmadd_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmadd_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vmadd_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmadd_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t test_vmadd_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmadd_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t test_vmadd_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmadd_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vmadd_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, s // 
CHECK-RV64-LABEL: @test_vmadd_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmadd_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ -384,7 +384,7 @@ vint64m4_t test_vmadd_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmadd_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ -393,7 +393,7 @@ vint64m8_t test_vmadd_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmadd_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vmadd_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmadd_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vmadd_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmadd_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vmadd_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vv_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmadd_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vmadd_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmadd_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vmadd_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, 
vuint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmadd_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vmadd_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmadd_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vmadd_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmadd_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vmadd_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmadd_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vmadd_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmadd_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vmadd_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmadd_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vmadd_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmadd_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vmadd_vv_u8m4(vuint8m4_t 
acc, vuint8m4_t op1, vuint8m4_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmadd_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vmadd_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmadd_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vmadd_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmadd_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vmadd_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmadd_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t test_vmadd_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4 // CHECK-RV64-LABEL: @test_vmadd_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmadd_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vmadd_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t o // CHECK-RV64-LABEL: @test_vmadd_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmadd_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vmadd_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2 // CHECK-RV64-LABEL: @test_vmadd_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmadd_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 
+564,7 @@ vuint16mf2_t test_vmadd_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmadd_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vmadd_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmadd_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vmadd_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmadd_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vmadd_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmadd_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vmadd_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmadd_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vmadd_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmadd_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ vuint16m4_t test_vmadd_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t 
test_vmadd_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vmadd_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmadd_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ -636,7 +636,7 @@ vuint16m8_t test_vmadd_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmadd_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ -645,7 +645,7 @@ vuint32mf2_t test_vmadd_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2 // CHECK-RV64-LABEL: @test_vmadd_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmadd_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vmadd_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vv_u32m1( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmadd_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vmadd_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmadd_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ vuint32m1_t test_vmadd_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmadd_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vmadd_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], 
i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmadd_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t test_vmadd_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmadd_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vmadd_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmadd_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vmadd_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmadd_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vmadd_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op // 
CHECK-RV64-LABEL: @test_vmadd_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmadd_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ vuint32m8_t test_vmadd_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmadd_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vmadd_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmadd_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vmadd_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmadd_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vmadd_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmadd_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 @@ vuint64m2_t test_vmadd_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmadd_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vmadd_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmadd_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t 
test_vmadd_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmadd_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 @@ vuint64m8_t test_vmadd_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmadd_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsac.c index 211dcf4d636ba..85da2dd7dee3e 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsac.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vnmsac_vv_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsac_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vnmsac_vv_i8mf8(vint8mf8_t acc, 
vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsac_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vnmsac_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsac_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vnmsac_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsac_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vnmsac_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsac_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vnmsac_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsac_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vnmsac_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsac_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vnmsac_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsac_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vnmsac_vx_i8m1(vint8m1_t acc, int8_t 
op1, vint8m1_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsac_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vnmsac_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsac_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ -96,7 +96,7 @@ vint8m2_t test_vnmsac_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vnmsac_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vnmsac_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vnmsac_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vnmsac_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsac_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vnmsac_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsac_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vnmsac_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsac_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t 
test_vnmsac_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t // CHECK-RV64-LABEL: @test_vnmsac_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsac_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t test_vnmsac_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsac_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vnmsac_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsac_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vnmsac_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsac_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vnmsac_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsac_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vnmsac_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsac_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vnmsac_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsac_vx_i16m2(vint16m2_t acc, int16_t 
op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vnmsac_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsac_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t test_vnmsac_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsac_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vnmsac_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsac_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vnmsac_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsac_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vnmsac_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsac_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vnmsac_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsac_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ vint32mf2_t test_vnmsac_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsac_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ vint32m1_t test_vnmsac_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsac_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vnmsac_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsac_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vnmsac_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsac_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vnmsac_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, siz // CHECK-RV64-LABEL: 
@test_vnmsac_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsac_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vnmsac_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsac_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vnmsac_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsac_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t test_vnmsac_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsac_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t test_vnmsac_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsac_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vnmsac_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsac_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vnmsac_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsac_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t 
test_vnmsac_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsac_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t test_vnmsac_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vnmsac_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vnmsac_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vnmsac_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ -384,7 +384,7 @@ vint64m4_t test_vnmsac_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vnmsac_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ -393,7 +393,7 @@ vint64m8_t test_vnmsac_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vnmsac_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vnmsac_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsac_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vnmsac_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsac_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t 
op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vnmsac_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsac_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vnmsac_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsac_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vnmsac_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsac_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vnmsac_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( 
[[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsac_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vnmsac_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsac_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vnmsac_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsac_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vnmsac_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsac_vv_u8m2(vuint8m2_t 
acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vnmsac_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsac_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vnmsac_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsac_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vnmsac_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsac_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vnmsac_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsac_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vnmsac_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsac_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vnmsac_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vnmsac_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t test_vnmsac_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsac_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: 
ret [[TMP0]] // vuint16mf4_t test_vnmsac_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vnmsac_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t // CHECK-RV64-LABEL: @test_vnmsac_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsac_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vnmsac_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsac_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsac_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 +564,7 @@ vuint16mf2_t test_vnmsac_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsac_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vnmsac_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t o // CHECK-RV64-LABEL: 
@test_vnmsac_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsac_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vnmsac_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vnmsac_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vnmsac_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vnmsac_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vnmsac_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsac_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vnmsac_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsac_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ vuint16m4_t test_vnmsac_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsac_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vnmsac_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsac_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ 
-636,7 +636,7 @@ vuint16m8_t test_vnmsac_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsac_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ -645,7 +645,7 @@ vuint32mf2_t test_vnmsac_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf // CHECK-RV64-LABEL: @test_vnmsac_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsac_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vnmsac_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsac_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vnmsac_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsac_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ vuint32m1_t test_vnmsac_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsac_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vnmsac_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsac_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t test_vnmsac_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vnmsac_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vnmsac_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vnmsac_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vnmsac_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsac_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vnmsac_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsac_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ vuint32m8_t test_vnmsac_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // 
CHECK-RV64-LABEL: @test_vnmsac_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsac_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vnmsac_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsac_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vnmsac_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vnmsac_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vnmsac_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vnmsac_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 @@ vuint64m2_t test_vnmsac_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsac_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vnmsac_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsac_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t test_vnmsac_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsac_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 
@@ vuint64m8_t test_vnmsac_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsac_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsub.c index a27ef3451d600..af6ca88c5610b 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsub.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vnmsub.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vnmsub_vv_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsub_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vnmsub_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsub_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vnmsub_vx_i8mf8(vint8mf8_t acc, 
int8_t op1, vint8mf8_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsub_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vnmsub_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsub_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vnmsub_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsub_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vnmsub_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsub_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vnmsub_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsub_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vnmsub_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsub_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vnmsub_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsub_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vnmsub_vv_i8m2(vint8m2_t acc, vint8m2_t 
op1, vint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsub_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ -96,7 +96,7 @@ vint8m2_t test_vnmsub_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vnmsub_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vnmsub_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vnmsub_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vnmsub_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsub_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vnmsub_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsub_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vnmsub_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsub_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t test_vnmsub_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t // CHECK-RV64-LABEL: @test_vnmsub_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsub_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t 
test_vnmsub_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsub_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vnmsub_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsub_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vnmsub_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsub_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vnmsub_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsub_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vnmsub_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsub_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vnmsub_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsub_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vnmsub_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsub_vv_i16m4(vint16m4_t acc, 
vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t test_vnmsub_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsub_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vnmsub_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsub_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vnmsub_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsub_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vnmsub_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsub_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vnmsub_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsub_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ vint32mf2_t test_vnmsub_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsub_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ vint32m1_t test_vnmsub_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsub_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vnmsub_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsub_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vnmsub_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsub_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vnmsub_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsub_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vnmsub_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: 
@test_vnmsub_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsub_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vnmsub_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsub_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t test_vnmsub_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsub_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t test_vnmsub_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsub_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vnmsub_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsub_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vnmsub_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsub_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t test_vnmsub_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsub_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t 
test_vnmsub_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vnmsub_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vnmsub_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vnmsub_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ -384,7 +384,7 @@ vint64m4_t test_vnmsub_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vnmsub_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ -393,7 +393,7 @@ vint64m8_t test_vnmsub_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vnmsub_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vnmsub_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsub_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vnmsub_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsub_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vnmsub_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsub_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t 
op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vnmsub_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsub_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vnmsub_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsub_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vnmsub_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsub_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vnmsub_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsub_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vnmsub_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, s // CHECK-RV64-LABEL: @test_vnmsub_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsub_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vnmsub_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsub_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vnmsub_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, s // CHECK-RV64-LABEL: @test_vnmsub_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsub_vx_u8m2(vuint8m2_t acc, 
uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vnmsub_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsub_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vnmsub_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, s // CHECK-RV64-LABEL: @test_vnmsub_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsub_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vnmsub_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsub_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vnmsub_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, s // CHECK-RV64-LABEL: @test_vnmsub_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsub_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vnmsub_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vnmsub_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t test_vnmsub_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsub_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vnmsub_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vnmsub_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t // CHECK-RV64-LABEL: @test_vnmsub_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsub_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vnmsub_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsub_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsub_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 +564,7 @@ vuint16mf2_t test_vnmsub_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsub_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vnmsub_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsub_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vnmsub_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // 
CHECK-RV64-LABEL: @test_vnmsub_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vnmsub_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vnmsub_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vnmsub_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vnmsub_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsub_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vnmsub_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsub_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ vuint16m4_t test_vnmsub_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsub_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vnmsub_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsub_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ -636,7 +636,7 @@ vuint16m8_t test_vnmsub_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsub_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ 
-645,7 +645,7 @@ vuint32mf2_t test_vnmsub_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf // CHECK-RV64-LABEL: @test_vnmsub_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsub_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vnmsub_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsub_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vnmsub_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsub_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ vuint32m1_t test_vnmsub_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsub_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vnmsub_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsub_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t test_vnmsub_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vnmsub_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vnmsub_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t 
test_vnmsub_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vnmsub_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsub_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vnmsub_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsub_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ vuint32m8_t test_vnmsub_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsub_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vnmsub_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m1( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsub_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vnmsub_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vnmsub_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vnmsub_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vnmsub_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 @@ vuint64m2_t test_vnmsub_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsub_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vnmsub_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsub_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t test_vnmsub_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsub_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 @@ vuint64m8_t test_vnmsub_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsub_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vwmacc.c index 916c14745e0d7..3ae6e5fea860c 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vwmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-overloaded/vwmacc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vwmacc_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmacc_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, @@ -16,7 +16,7 @@ vint16mf4_t test_vwmacc_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmacc_vx_i16mf4(vint16mf4_t acc, int8_t op1, vint8mf8_t op2, @@ -26,7 +26,7 @@ vint16mf4_t test_vwmacc_vx_i16mf4(vint16mf4_t acc, int8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmacc_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, @@ -36,7 +36,7 @@ vint16mf2_t test_vwmacc_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, // 
CHECK-RV64-LABEL: @test_vwmacc_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmacc_vx_i16mf2(vint16mf2_t acc, int8_t op1, vint8mf4_t op2, @@ -46,7 +46,7 @@ vint16mf2_t test_vwmacc_vx_i16mf2(vint16mf2_t acc, int8_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmacc_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, vint8mf2_t op2, @@ -56,7 +56,7 @@ vint16m1_t test_vwmacc_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmacc_vx_i16m1(vint16m1_t acc, int8_t op1, vint8mf2_t op2, @@ -66,7 +66,7 @@ vint16m1_t test_vwmacc_vx_i16m1(vint16m1_t acc, int8_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmacc.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmacc_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vint8m1_t op2, @@ -76,7 +76,7 @@ vint16m2_t test_vwmacc_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmacc_vx_i16m2(vint16m2_t acc, int8_t op1, vint8m1_t op2, @@ -86,7 +86,7 @@ vint16m2_t test_vwmacc_vx_i16m2(vint16m2_t acc, int8_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmacc_vv_i16m4(vint16m4_t acc, vint8m2_t op1, vint8m2_t op2, @@ -96,7 +96,7 @@ vint16m4_t test_vwmacc_vv_i16m4(vint16m4_t acc, vint8m2_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmacc_vx_i16m4(vint16m4_t acc, int8_t op1, vint8m2_t op2, @@ -106,7 +106,7 @@ vint16m4_t test_vwmacc_vx_i16m4(vint16m4_t 
acc, int8_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmacc_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vint8m4_t op2, @@ -116,7 +116,7 @@ vint16m8_t test_vwmacc_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmacc_vx_i16m8(vint16m8_t acc, int8_t op1, vint8m4_t op2, @@ -126,7 +126,7 @@ vint16m8_t test_vwmacc_vx_i16m8(vint16m8_t acc, int8_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmacc_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, @@ -136,7 +136,7 @@ vint32mf2_t test_vwmacc_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmacc.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmacc_vx_i32mf2(vint32mf2_t acc, int16_t op1, vint16mf4_t op2, @@ -146,7 +146,7 @@ vint32mf2_t test_vwmacc_vx_i32mf2(vint32mf2_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmacc_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, @@ -156,7 +156,7 @@ vint32m1_t test_vwmacc_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmacc_vx_i32m1(vint32m1_t acc, int16_t op1, vint16mf2_t op2, @@ -166,7 +166,7 @@ vint32m1_t test_vwmacc_vx_i32m1(vint32m1_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmacc_vv_i32m2(vint32m2_t acc, vint16m1_t op1, vint16m1_t op2, @@ -176,7 +176,7 @@ vint32m2_t 
test_vwmacc_vv_i32m2(vint32m2_t acc, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmacc_vx_i32m2(vint32m2_t acc, int16_t op1, vint16m1_t op2, @@ -186,7 +186,7 @@ vint32m2_t test_vwmacc_vx_i32m2(vint32m2_t acc, int16_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmacc_vv_i32m4(vint32m4_t acc, vint16m2_t op1, vint16m2_t op2, @@ -196,7 +196,7 @@ vint32m4_t test_vwmacc_vv_i32m4(vint32m4_t acc, vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmacc_vx_i32m4(vint32m4_t acc, int16_t op1, vint16m2_t op2, @@ -206,7 +206,7 @@ vint32m4_t test_vwmacc_vx_i32m4(vint32m4_t acc, int16_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmacc_vv_i32m8(vint32m8_t acc, vint16m4_t op1, vint16m4_t op2, @@ -216,7 +216,7 @@ vint32m8_t test_vwmacc_vv_i32m8(vint32m8_t acc, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmacc_vx_i32m8(vint32m8_t acc, int16_t op1, vint16m4_t op2, @@ -226,7 +226,7 @@ vint32m8_t test_vwmacc_vx_i32m8(vint32m8_t acc, int16_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmacc_vv_i64m1(vint64m1_t acc, vint32mf2_t op1, @@ -236,7 +236,7 @@ vint64m1_t test_vwmacc_vv_i64m1(vint64m1_t acc, vint32mf2_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmacc_vx_i64m1(vint64m1_t acc, int32_t op1, 
vint32mf2_t op2, @@ -246,7 +246,7 @@ vint64m1_t test_vwmacc_vx_i64m1(vint64m1_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmacc_vv_i64m2(vint64m2_t acc, vint32m1_t op1, vint32m1_t op2, @@ -256,7 +256,7 @@ vint64m2_t test_vwmacc_vv_i64m2(vint64m2_t acc, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmacc_vx_i64m2(vint64m2_t acc, int32_t op1, vint32m1_t op2, @@ -266,7 +266,7 @@ vint64m2_t test_vwmacc_vx_i64m2(vint64m2_t acc, int32_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmacc_vv_i64m4(vint64m4_t acc, vint32m2_t op1, vint32m2_t op2, @@ -276,7 +276,7 @@ vint64m4_t test_vwmacc_vv_i64m4(vint64m4_t acc, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmacc.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmacc_vx_i64m4(vint64m4_t acc, int32_t op1, vint32m2_t op2, @@ -286,7 +286,7 @@ vint64m4_t test_vwmacc_vx_i64m4(vint64m4_t acc, int32_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmacc_vv_i64m8(vint64m8_t acc, vint32m4_t op1, vint32m4_t op2, @@ -296,7 +296,7 @@ vint64m8_t test_vwmacc_vv_i64m8(vint64m8_t acc, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmacc_vx_i64m8(vint64m8_t acc, int32_t op1, vint32m4_t op2, @@ -306,7 +306,7 @@ vint64m8_t test_vwmacc_vx_i64m8(vint64m8_t acc, int32_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwmaccu_vv_u16mf4(vuint16mf4_t acc, vuint8mf8_t op1, @@ -316,7 +316,7 @@ vuint16mf4_t test_vwmaccu_vv_u16mf4(vuint16mf4_t acc, vuint8mf8_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwmaccu_vx_u16mf4(vuint16mf4_t acc, uint8_t op1, @@ -326,7 +326,7 @@ vuint16mf4_t test_vwmaccu_vx_u16mf4(vuint16mf4_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwmaccu_vv_u16mf2(vuint16mf2_t acc, vuint8mf4_t op1, @@ -336,7 +336,7 @@ vuint16mf2_t test_vwmaccu_vv_u16mf2(vuint16mf2_t acc, vuint8mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwmaccu_vx_u16mf2(vuint16mf2_t acc, uint8_t op1, @@ -346,7 +346,7 @@ vuint16mf2_t test_vwmaccu_vx_u16mf2(vuint16mf2_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmaccu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwmaccu_vv_u16m1(vuint16m1_t acc, vuint8mf2_t op1, @@ -356,7 +356,7 @@ vuint16m1_t test_vwmaccu_vv_u16m1(vuint16m1_t acc, vuint8mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwmaccu_vx_u16m1(vuint16m1_t acc, uint8_t op1, vuint8mf2_t op2, @@ -366,7 +366,7 @@ vuint16m1_t test_vwmaccu_vx_u16m1(vuint16m1_t acc, uint8_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwmaccu_vv_u16m2(vuint16m2_t acc, vuint8m1_t op1, @@ -376,7 +376,7 @@ vuint16m2_t test_vwmaccu_vv_u16m2(vuint16m2_t acc, vuint8m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t 
test_vwmaccu_vx_u16m2(vuint16m2_t acc, uint8_t op1, vuint8m1_t op2, @@ -386,7 +386,7 @@ vuint16m2_t test_vwmaccu_vx_u16m2(vuint16m2_t acc, uint8_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwmaccu_vv_u16m4(vuint16m4_t acc, vuint8m2_t op1, @@ -396,7 +396,7 @@ vuint16m4_t test_vwmaccu_vv_u16m4(vuint16m4_t acc, vuint8m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwmaccu_vx_u16m4(vuint16m4_t acc, uint8_t op1, vuint8m2_t op2, @@ -406,7 +406,7 @@ vuint16m4_t test_vwmaccu_vx_u16m4(vuint16m4_t acc, uint8_t op1, vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwmaccu_vv_u16m8(vuint16m8_t acc, vuint8m4_t op1, @@ -416,7 +416,7 @@ vuint16m8_t test_vwmaccu_vv_u16m8(vuint16m8_t acc, vuint8m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmaccu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwmaccu_vx_u16m8(vuint16m8_t acc, uint8_t op1, vuint8m4_t op2, @@ -426,7 +426,7 @@ vuint16m8_t test_vwmaccu_vx_u16m8(vuint16m8_t acc, uint8_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwmaccu_vv_u32mf2(vuint32mf2_t acc, vuint16mf4_t op1, @@ -436,7 +436,7 @@ vuint32mf2_t test_vwmaccu_vv_u32mf2(vuint32mf2_t acc, vuint16mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwmaccu_vx_u32mf2(vuint32mf2_t acc, uint16_t op1, @@ -446,7 +446,7 @@ vuint32mf2_t test_vwmaccu_vx_u32mf2(vuint32mf2_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vuint32m1_t test_vwmaccu_vv_u32m1(vuint32m1_t acc, vuint16mf2_t op1, @@ -456,7 +456,7 @@ vuint32m1_t test_vwmaccu_vv_u32m1(vuint32m1_t acc, vuint16mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwmaccu_vx_u32m1(vuint32m1_t acc, uint16_t op1, @@ -466,7 +466,7 @@ vuint32m1_t test_vwmaccu_vx_u32m1(vuint32m1_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwmaccu_vv_u32m2(vuint32m2_t acc, vuint16m1_t op1, @@ -476,7 +476,7 @@ vuint32m2_t test_vwmaccu_vv_u32m2(vuint32m2_t acc, vuint16m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwmaccu_vx_u32m2(vuint32m2_t acc, uint16_t op1, @@ -486,7 +486,7 @@ vuint32m2_t test_vwmaccu_vx_u32m2(vuint32m2_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmaccu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwmaccu_vv_u32m4(vuint32m4_t acc, vuint16m2_t op1, @@ -496,7 +496,7 @@ vuint32m4_t test_vwmaccu_vv_u32m4(vuint32m4_t acc, vuint16m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwmaccu_vx_u32m4(vuint32m4_t acc, uint16_t op1, @@ -506,7 +506,7 @@ vuint32m4_t test_vwmaccu_vx_u32m4(vuint32m4_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwmaccu_vv_u32m8(vuint32m8_t acc, vuint16m4_t op1, @@ -516,7 +516,7 @@ vuint32m8_t test_vwmaccu_vv_u32m8(vuint32m8_t acc, vuint16m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t 
test_vwmaccu_vx_u32m8(vuint32m8_t acc, uint16_t op1, @@ -526,7 +526,7 @@ vuint32m8_t test_vwmaccu_vx_u32m8(vuint32m8_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwmaccu_vv_u64m1(vuint64m1_t acc, vuint32mf2_t op1, @@ -536,7 +536,7 @@ vuint64m1_t test_vwmaccu_vv_u64m1(vuint64m1_t acc, vuint32mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwmaccu_vx_u64m1(vuint64m1_t acc, uint32_t op1, @@ -546,7 +546,7 @@ vuint64m1_t test_vwmaccu_vx_u64m1(vuint64m1_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwmaccu_vv_u64m2(vuint64m2_t acc, vuint32m1_t op1, @@ -556,7 +556,7 @@ vuint64m2_t test_vwmaccu_vv_u64m2(vuint64m2_t acc, vuint32m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwmaccu_vx_u64m2(vuint64m2_t acc, uint32_t op1, @@ -566,7 +566,7 @@ vuint64m2_t test_vwmaccu_vx_u64m2(vuint64m2_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwmaccu_vv_u64m4(vuint64m4_t acc, vuint32m2_t op1, @@ -576,7 +576,7 @@ vuint64m4_t test_vwmaccu_vv_u64m4(vuint64m4_t acc, vuint32m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwmaccu_vx_u64m4(vuint64m4_t acc, uint32_t op1, @@ -586,7 +586,7 @@ vuint64m4_t test_vwmaccu_vx_u64m4(vuint64m4_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwmaccu_vv_u64m8(vuint64m8_t acc, vuint32m4_t op1, @@ -596,7 +596,7 @@ 
vuint64m8_t test_vwmaccu_vv_u64m8(vuint64m8_t acc, vuint32m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwmaccu_vx_u64m8(vuint64m8_t acc, uint32_t op1, @@ -606,7 +606,7 @@ vuint64m8_t test_vwmaccu_vx_u64m8(vuint64m8_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmaccsu_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, @@ -616,7 +616,7 @@ vint16mf4_t test_vwmaccsu_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmaccsu_vx_i16mf4(vint16mf4_t acc, int8_t op1, @@ -626,7 +626,7 @@ vint16mf4_t test_vwmaccsu_vx_i16mf4(vint16mf4_t acc, int8_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmaccsu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmaccsu_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, @@ -636,7 +636,7 @@ vint16mf2_t test_vwmaccsu_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmaccsu_vx_i16mf2(vint16mf2_t acc, int8_t op1, @@ -646,7 +646,7 @@ vint16mf2_t test_vwmaccsu_vx_i16mf2(vint16mf2_t acc, int8_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmaccsu_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, @@ -656,7 +656,7 @@ vint16m1_t test_vwmaccsu_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmaccsu_vx_i16m1(vint16m1_t acc, int8_t op1, vuint8mf2_t op2, @@ -666,7 +666,7 @@ vint16m1_t test_vwmaccsu_vx_i16m1(vint16m1_t acc, int8_t op1, vuint8mf2_t op2, // 
CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmaccsu_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vuint8m1_t op2, @@ -676,7 +676,7 @@ vint16m2_t test_vwmaccsu_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmaccsu_vx_i16m2(vint16m2_t acc, int8_t op1, vuint8m1_t op2, @@ -686,7 +686,7 @@ vint16m2_t test_vwmaccsu_vx_i16m2(vint16m2_t acc, int8_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmaccsu_vv_i16m4(vint16m4_t acc, vint8m2_t op1, vuint8m2_t op2, @@ -696,7 +696,7 @@ vint16m4_t test_vwmaccsu_vv_i16m4(vint16m4_t acc, vint8m2_t op1, vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmaccsu_vx_i16m4(vint16m4_t acc, int8_t op1, vuint8m2_t op2, @@ -706,7 +706,7 @@ vint16m4_t test_vwmaccsu_vx_i16m4(vint16m4_t acc, int8_t op1, vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmaccsu_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vuint8m4_t op2, @@ -716,7 +716,7 @@ vint16m8_t test_vwmaccsu_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmaccsu_vx_i16m8(vint16m8_t acc, int8_t op1, vuint8m4_t op2, @@ -726,7 +726,7 @@ vint16m8_t test_vwmaccsu_vx_i16m8(vint16m8_t acc, int8_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmaccsu_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, @@ 
-736,7 +736,7 @@ vint32mf2_t test_vwmaccsu_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmaccsu_vx_i32mf2(vint32mf2_t acc, int16_t op1, @@ -746,7 +746,7 @@ vint32mf2_t test_vwmaccsu_vx_i32mf2(vint32mf2_t acc, int16_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmaccsu_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, @@ -756,7 +756,7 @@ vint32m1_t test_vwmaccsu_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmaccsu_vx_i32m1(vint32m1_t acc, int16_t op1, vuint16mf2_t op2, @@ -766,7 +766,7 @@ vint32m1_t test_vwmaccsu_vx_i32m1(vint32m1_t acc, int16_t op1, vuint16mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmaccsu_vv_i32m2(vint32m2_t acc, vint16m1_t op1, @@ -776,7 +776,7 @@ vint32m2_t test_vwmaccsu_vv_i32m2(vint32m2_t acc, vint16m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmaccsu_vx_i32m2(vint32m2_t acc, int16_t op1, vuint16m1_t op2, @@ -786,7 +786,7 @@ vint32m2_t test_vwmaccsu_vx_i32m2(vint32m2_t acc, int16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmaccsu_vv_i32m4(vint32m4_t acc, vint16m2_t op1, @@ -796,7 +796,7 @@ vint32m4_t test_vwmaccsu_vv_i32m4(vint32m4_t acc, vint16m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmaccsu_vx_i32m4(vint32m4_t acc, int16_t op1, 
vuint16m2_t op2, @@ -806,7 +806,7 @@ vint32m4_t test_vwmaccsu_vx_i32m4(vint32m4_t acc, int16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmaccsu_vv_i32m8(vint32m8_t acc, vint16m4_t op1, @@ -816,7 +816,7 @@ vint32m8_t test_vwmaccsu_vv_i32m8(vint32m8_t acc, vint16m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmaccsu_vx_i32m8(vint32m8_t acc, int16_t op1, vuint16m4_t op2, @@ -826,7 +826,7 @@ vint32m8_t test_vwmaccsu_vx_i32m8(vint32m8_t acc, int16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmaccsu_vv_i64m1(vint64m1_t acc, vint32mf2_t op1, @@ -836,7 +836,7 @@ vint64m1_t test_vwmaccsu_vv_i64m1(vint64m1_t acc, vint32mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.i32.nxv1i32.i64( 
[[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmaccsu_vx_i64m1(vint64m1_t acc, int32_t op1, vuint32mf2_t op2, @@ -846,7 +846,7 @@ vint64m1_t test_vwmaccsu_vx_i64m1(vint64m1_t acc, int32_t op1, vuint32mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmaccsu_vv_i64m2(vint64m2_t acc, vint32m1_t op1, @@ -856,7 +856,7 @@ vint64m2_t test_vwmaccsu_vv_i64m2(vint64m2_t acc, vint32m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmaccsu_vx_i64m2(vint64m2_t acc, int32_t op1, vuint32m1_t op2, @@ -866,7 +866,7 @@ vint64m2_t test_vwmaccsu_vx_i64m2(vint64m2_t acc, int32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t 
test_vwmaccsu_vv_i64m4(vint64m4_t acc, vint32m2_t op1, @@ -876,7 +876,7 @@ vint64m4_t test_vwmaccsu_vv_i64m4(vint64m4_t acc, vint32m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmaccsu_vx_i64m4(vint64m4_t acc, int32_t op1, vuint32m2_t op2, @@ -886,7 +886,7 @@ vint64m4_t test_vwmaccsu_vx_i64m4(vint64m4_t acc, int32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmaccsu_vv_i64m8(vint64m8_t acc, vint32m4_t op1, @@ -896,7 +896,7 @@ vint64m8_t test_vwmaccsu_vv_i64m8(vint64m8_t acc, vint32m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmaccsu_vx_i64m8(vint64m8_t acc, int32_t op1, vuint32m4_t op2, @@ -906,7 +906,7 @@ vint64m8_t test_vwmaccsu_vx_i64m8(vint64m8_t acc, int32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmaccus.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmaccus_vx_i16mf4(vint16mf4_t acc, uint8_t op1, @@ -916,7 +916,7 @@ vint16mf4_t test_vwmaccus_vx_i16mf4(vint16mf4_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmaccus_vx_i16mf2(vint16mf2_t acc, uint8_t op1, @@ -926,7 +926,7 @@ vint16mf2_t test_vwmaccus_vx_i16mf2(vint16mf2_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmaccus_vx_i16m1(vint16m1_t acc, uint8_t op1, vint8mf2_t op2, @@ -936,7 +936,7 @@ vint16m1_t test_vwmaccus_vx_i16m1(vint16m1_t acc, uint8_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t 
test_vwmaccus_vx_i16m2(vint16m2_t acc, uint8_t op1, vint8m1_t op2, @@ -946,7 +946,7 @@ vint16m2_t test_vwmaccus_vx_i16m2(vint16m2_t acc, uint8_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmaccus_vx_i16m4(vint16m4_t acc, uint8_t op1, vint8m2_t op2, @@ -956,7 +956,7 @@ vint16m4_t test_vwmaccus_vx_i16m4(vint16m4_t acc, uint8_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmaccus_vx_i16m8(vint16m8_t acc, uint8_t op1, vint8m4_t op2, @@ -966,7 +966,7 @@ vint16m8_t test_vwmaccus_vx_i16m8(vint16m8_t acc, uint8_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmaccus_vx_i32mf2(vint32mf2_t acc, uint16_t op1, @@ -976,7 +976,7 @@ vint32mf2_t test_vwmaccus_vx_i32mf2(vint32mf2_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vwmaccus.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmaccus_vx_i32m1(vint32m1_t acc, uint16_t op1, vint16mf2_t op2, @@ -986,7 +986,7 @@ vint32m1_t test_vwmaccus_vx_i32m1(vint32m1_t acc, uint16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmaccus_vx_i32m2(vint32m2_t acc, uint16_t op1, vint16m1_t op2, @@ -996,7 +996,7 @@ vint32m2_t test_vwmaccus_vx_i32m2(vint32m2_t acc, uint16_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmaccus_vx_i32m4(vint32m4_t acc, uint16_t op1, vint16m2_t op2, @@ -1006,7 +1006,7 @@ vint32m4_t test_vwmaccus_vx_i32m4(vint32m4_t acc, uint16_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmaccus_vx_i32m8(vint32m8_t acc, uint16_t op1, vint16m4_t op2, @@ -1016,7 +1016,7 @@ vint32m8_t test_vwmaccus_vx_i32m8(vint32m8_t acc, uint16_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmaccus_vx_i64m1(vint64m1_t acc, uint32_t op1, vint32mf2_t op2, @@ -1026,7 +1026,7 @@ vint64m1_t test_vwmaccus_vx_i64m1(vint64m1_t acc, uint32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmaccus_vx_i64m2(vint64m2_t acc, uint32_t op1, vint32m1_t op2, @@ -1036,7 +1036,7 @@ vint64m2_t test_vwmaccus_vx_i64m2(vint64m2_t acc, uint32_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmaccus_vx_i64m4(vint64m4_t acc, uint32_t op1, vint32m2_t op2, @@ -1046,7 +1046,7 @@ vint64m4_t 
test_vwmaccus_vx_i64m4(vint64m4_t acc, uint32_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmaccus_vx_i64m8(vint64m8_t acc, uint32_t op1, vint32m4_t op2, diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmacc.c index ab4ef84ccc257..15735e25bd959 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmacc.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfmacc_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m1( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmacc_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfmacc_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t 
test_vfmacc_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfmacc_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmacc_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -88,7 +88,7 @@ vfloat32m4_t test_vfmacc_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.f32.i64( [[ACC:%.*]], float 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmacc_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfmacc_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmacc_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfmacc_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t 
test_vfmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmacc_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfmacc_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmacc_vf_f64m4(vfloat64m4_t acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t test_vfmacc_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmacc_vf_f64m8(vfloat64m8_t acc, double op1, @@ -377,7 +377,7 @@ vfloat64m8_t test_vfmacc_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmacc_vv_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmacc_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -386,7 +386,7 @@ vfloat16mf4_t test_vfmacc_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmacc_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmacc_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -395,7 +395,7 @@ vfloat16mf4_t test_vfmacc_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4 // CHECK-RV64-LABEL: 
@test_vfmacc_vv_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmacc_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -404,7 +404,7 @@ vfloat16mf2_t test_vfmacc_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmacc_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmacc_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -413,7 +413,7 @@ vfloat16mf2_t test_vfmacc_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2 // CHECK-RV64-LABEL: @test_vfmacc_vv_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmacc_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -422,7 +422,7 @@ vfloat16m1_t test_vfmacc_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1 // CHECK-RV64-LABEL: @test_vfmacc_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmacc.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmacc_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -431,7 +431,7 @@ vfloat16m1_t test_vfmacc_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t v // CHECK-RV64-LABEL: @test_vfmacc_vv_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmacc_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -440,7 +440,7 @@ vfloat16m2_t test_vfmacc_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2 // CHECK-RV64-LABEL: @test_vfmacc_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmacc_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -449,7 +449,7 @@ vfloat16m2_t test_vfmacc_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t v // CHECK-RV64-LABEL: @test_vfmacc_vv_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmacc_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ 
-458,7 +458,7 @@ vfloat16m4_t test_vfmacc_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4 // CHECK-RV64-LABEL: @test_vfmacc_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmacc_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -467,7 +467,7 @@ vfloat16m4_t test_vfmacc_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t v // CHECK-RV64-LABEL: @test_vfmacc_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmacc_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -476,7 +476,7 @@ vfloat16m8_t test_vfmacc_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8 // CHECK-RV64-LABEL: @test_vfmacc_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmacc_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmadd.c index 29351df8302b9..9a9171e039490 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmadd.c +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmadd.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfmadd_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f32.f32.i64( 
[[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmadd_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfmadd_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmadd_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfmadd_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m4( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmadd_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -88,7 +88,7 @@ vfloat32m4_t test_vfmadd_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmadd_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfmadd_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmadd_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmadd_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfmadd_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t test_vfmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmadd_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfmadd_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmadd_vf_f64m4(vfloat64m4_t acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t test_vfmadd_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmadd_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmadd_vf_f64m8(vfloat64m8_t acc, double op1, @@ -377,7 +377,7 @@ vfloat64m8_t test_vfmadd_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, 
double op1, // CHECK-RV64-LABEL: @test_vfmadd_vv_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmadd_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -386,7 +386,7 @@ vfloat16mf4_t test_vfmadd_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmadd_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmadd_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -395,7 +395,7 @@ vfloat16mf4_t test_vfmadd_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4 // CHECK-RV64-LABEL: @test_vfmadd_vv_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmadd_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -404,7 +404,7 @@ vfloat16mf2_t test_vfmadd_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmadd_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmadd_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -413,7 +413,7 @@ vfloat16mf2_t test_vfmadd_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2 // CHECK-RV64-LABEL: @test_vfmadd_vv_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmadd_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -422,7 +422,7 @@ vfloat16m1_t test_vfmadd_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1 // CHECK-RV64-LABEL: @test_vfmadd_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmadd_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -431,7 +431,7 @@ vfloat16m1_t test_vfmadd_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t v // CHECK-RV64-LABEL: @test_vfmadd_vv_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmadd_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t 
vs1, vfloat16m2_t vs2, size_t vl) { @@ -440,7 +440,7 @@ vfloat16m2_t test_vfmadd_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2 // CHECK-RV64-LABEL: @test_vfmadd_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmadd_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -449,7 +449,7 @@ vfloat16m2_t test_vfmadd_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t v // CHECK-RV64-LABEL: @test_vfmadd_vv_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmadd_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -458,7 +458,7 @@ vfloat16m4_t test_vfmadd_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4 // CHECK-RV64-LABEL: @test_vfmadd_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmadd_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -467,7 +467,7 @@ vfloat16m4_t test_vfmadd_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t v // CHECK-RV64-LABEL: @test_vfmadd_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmadd_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -476,7 +476,7 @@ vfloat16m8_t test_vfmadd_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8 // CHECK-RV64-LABEL: @test_vfmadd_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmadd_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsac.c index 39458d19f069e..9082e6280cc78 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsac.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfmsac_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsac_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfmsac_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsac_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfmsac_vv_f32m2(vfloat32m2_t acc, 
vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsac_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfmsac_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsac_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -88,7 +88,7 @@ vfloat32m4_t test_vfmsac_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.nxv16f32.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsac_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfmsac_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsac_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfmsac_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsac_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfmsac_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t test_vfmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsac_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfmsac_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsac_vf_f64m4(vfloat64m4_t 
acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t test_vfmsac_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsac_vf_f64m8(vfloat64m8_t acc, double op1, @@ -377,7 +377,7 @@ vfloat64m8_t test_vfmsac_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsac_vv_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmsac_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -386,7 +386,7 @@ vfloat16mf4_t test_vfmsac_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmsac_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmsac_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -395,7 +395,7 @@ vfloat16mf4_t test_vfmsac_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4 // CHECK-RV64-LABEL: @test_vfmsac_vv_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmsac_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -404,7 +404,7 @@ vfloat16mf2_t test_vfmsac_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmsac_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmsac_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -413,7 +413,7 @@ vfloat16mf2_t test_vfmsac_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2 // CHECK-RV64-LABEL: @test_vfmsac_vv_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmsac_vv_f16m1 
(vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -422,7 +422,7 @@ vfloat16m1_t test_vfmsac_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1 // CHECK-RV64-LABEL: @test_vfmsac_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmsac_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -431,7 +431,7 @@ vfloat16m1_t test_vfmsac_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t v // CHECK-RV64-LABEL: @test_vfmsac_vv_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmsac_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -440,7 +440,7 @@ vfloat16m2_t test_vfmsac_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2 // CHECK-RV64-LABEL: @test_vfmsac_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmsac_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -449,7 +449,7 @@ vfloat16m2_t test_vfmsac_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t v // CHECK-RV64-LABEL: @test_vfmsac_vv_f16m4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmsac_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -458,7 +458,7 @@ vfloat16m4_t test_vfmsac_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4 // CHECK-RV64-LABEL: @test_vfmsac_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmsac_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -467,7 +467,7 @@ vfloat16m4_t test_vfmsac_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t v // CHECK-RV64-LABEL: @test_vfmsac_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmsac_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -476,7 +476,7 @@ vfloat16m8_t test_vfmsac_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8 // CHECK-RV64-LABEL: @test_vfmsac_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32f16.f16.i64( [[VD:%.*]], half 
[[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmsac_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsub.c index 9bd7a68151460..e56913898824a 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsub.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfmsub.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfmsub_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsub_vv_f32m1(vfloat32m1_t acc, 
vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfmsub_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfmsub_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfmsub_vf_f32m1(vfloat32m1_t acc, float op1, vfloat32m1_t op2, // CHECK-RV64-LABEL: @test_vfmsub_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfmsub_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfmsub_vf_f32m2(vfloat32m2_t acc, float op1, vfloat32m2_t op2, // CHECK-RV64-LABEL: @test_vfmsub_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfmsub_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, @@ -88,7 +88,7 @@ vfloat32m4_t test_vfmsub_vf_f32m4(vfloat32m4_t acc, float op1, vfloat32m4_t op2, // CHECK-RV64-LABEL: @test_vfmsub_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfmsub_vf_f32m8(vfloat32m8_t acc, float op1, vfloat32m8_t op2, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfmsub_vf_f32m8(vfloat32m8_t acc, 
float op1, vfloat32m8_t op2, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfmsub_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfmsub_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t test_vfmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfmsub_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfmsub_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfmsub_vf_f64m4(vfloat64m4_t acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t test_vfmsub_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfmsub_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.f64.i64( [[ACC:%.*]], 
double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfmsub_vf_f64m8(vfloat64m8_t acc, double op1, @@ -377,7 +377,7 @@ vfloat64m8_t test_vfmsub_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, double op1, // CHECK-RV64-LABEL: @test_vfmsub_vv_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmsub_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -386,7 +386,7 @@ vfloat16mf4_t test_vfmsub_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmsub_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfmsub_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -395,7 +395,7 @@ vfloat16mf4_t test_vfmsub_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4 // CHECK-RV64-LABEL: @test_vfmsub_vv_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmsub_vv_f16mf2 
(vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -404,7 +404,7 @@ vfloat16mf2_t test_vfmsub_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat // CHECK-RV64-LABEL: @test_vfmsub_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfmsub_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -413,7 +413,7 @@ vfloat16mf2_t test_vfmsub_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2 // CHECK-RV64-LABEL: @test_vfmsub_vv_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmsub_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -422,7 +422,7 @@ vfloat16m1_t test_vfmsub_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1 // CHECK-RV64-LABEL: @test_vfmsub_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfmsub_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -431,7 +431,7 @@ vfloat16m1_t test_vfmsub_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t v // CHECK-RV64-LABEL: @test_vfmsub_vv_f16m2( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmsub_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -440,7 +440,7 @@ vfloat16m2_t test_vfmsub_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2 // CHECK-RV64-LABEL: @test_vfmsub_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfmsub_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -449,7 +449,7 @@ vfloat16m2_t test_vfmsub_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t v // CHECK-RV64-LABEL: @test_vfmsub_vv_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmsub_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -458,7 +458,7 @@ vfloat16m4_t test_vfmsub_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4 // CHECK-RV64-LABEL: @test_vfmsub_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16f16.f16.i64( [[VD:%.*]], half 
[[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfmsub_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -467,7 +467,7 @@ vfloat16m4_t test_vfmsub_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t v // CHECK-RV64-LABEL: @test_vfmsub_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmsub_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -476,7 +476,7 @@ vfloat16m8_t test_vfmsub_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8 // CHECK-RV64-LABEL: @test_vfmsub_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfmsub_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmacc.c index 703d7aa097f7c..c332da8bf5988 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmacc.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfnmacc_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfnmacc_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfnmacc_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmacc_vf_f32m1(vfloat32m1_t acc, float op1, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfnmacc_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.nxv4f32.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfnmacc_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmacc_vf_f32m2(vfloat32m2_t acc, float op1, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfnmacc_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfnmacc_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmacc_vf_f32m4(vfloat32m4_t acc, float op1, @@ -88,7 +88,7 @@ vfloat32m4_t 
test_vfnmacc_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfnmacc_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmacc_vf_f32m8(vfloat32m8_t acc, float op1, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfnmacc_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfnmacc_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f64.f64.i64( 
[[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmacc_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfnmacc_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t test_vfnmacc_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmacc_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfnmacc_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfnmacc_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmacc_vf_f64m4(vfloat64m4_t acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t test_vfnmacc_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfnmacc_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmacc_vf_f64m8(vfloat64m8_t acc, double op1, @@ -378,7 +378,7 @@ vfloat64m8_t test_vfnmacc_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, // CHECK-RV64-LABEL: @test_vfnmacc_vv_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmacc_vv_f16mf4 (vfloat16mf4_t vd, 
vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -387,7 +387,7 @@ vfloat16mf4_t test_vfnmacc_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmacc_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmacc_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -396,7 +396,7 @@ vfloat16mf4_t test_vfnmacc_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmacc_vv_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmacc_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -405,7 +405,7 @@ vfloat16mf2_t test_vfnmacc_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmacc_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmacc_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -414,7 +414,7 @@ vfloat16mf2_t test_vfnmacc_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmacc_vv_f16m1( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmacc_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -423,7 +423,7 @@ vfloat16m1_t test_vfnmacc_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmacc_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmacc_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -432,7 +432,7 @@ vfloat16m1_t test_vfnmacc_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfnmacc_vv_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmacc_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -441,7 +441,7 @@ vfloat16m2_t test_vfnmacc_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmacc_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8f16.f16.i64( 
[[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmacc_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -450,7 +450,7 @@ vfloat16m2_t test_vfnmacc_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfnmacc_vv_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmacc_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -459,7 +459,7 @@ vfloat16m4_t test_vfnmacc_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmacc_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmacc_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -468,7 +468,7 @@ vfloat16m4_t test_vfnmacc_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t // CHECK-RV64-LABEL: @test_vfnmacc_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmacc_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -477,7 +477,7 @@ 
vfloat16m8_t test_vfnmacc_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmacc_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmacc_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmadd.c index adefa17bb4f37..7f737b7815544 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmadd.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmadd.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfnmadd_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfnmadd_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: 
@test_vfnmadd_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfnmadd_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmadd_vf_f32m1(vfloat32m1_t acc, float op1, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfnmadd_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfnmadd_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vfloat32m2_t test_vfnmadd_vf_f32m2(vfloat32m2_t acc, float op1, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfnmadd_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfnmadd_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmadd_vf_f32m4(vfloat32m4_t acc, float op1, @@ -88,7 +88,7 @@ vfloat32m4_t test_vfnmadd_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfnmadd_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmadd_vf_f32m8(vfloat32m8_t acc, float op1, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfnmadd_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfnmadd_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmadd_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfnmadd_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t test_vfnmadd_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // 
CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmadd_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfnmadd_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfnmadd_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmadd_vf_f64m4(vfloat64m4_t acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t test_vfnmadd_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfnmadd_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmadd_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmadd_vf_f64m8(vfloat64m8_t acc, double op1, @@ -378,7 +378,7 @@ vfloat64m8_t test_vfnmadd_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, // CHECK-RV64-LABEL: @test_vfnmadd_vv_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmadd_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -387,7 +387,7 @@ vfloat16mf4_t test_vfnmadd_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmadd_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmadd_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -396,7 +396,7 @@ vfloat16mf4_t test_vfnmadd_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmadd_vv_f16mf2( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmadd_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -405,7 +405,7 @@ vfloat16mf2_t test_vfnmadd_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmadd_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmadd_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -414,7 +414,7 @@ vfloat16mf2_t test_vfnmadd_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmadd_vv_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmadd_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -423,7 +423,7 @@ vfloat16m1_t test_vfnmadd_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmadd_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4f16.f16.i64( 
[[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmadd_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -432,7 +432,7 @@ vfloat16m1_t test_vfnmadd_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfnmadd_vv_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmadd_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -441,7 +441,7 @@ vfloat16m2_t test_vfnmadd_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmadd_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmadd_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -450,7 +450,7 @@ vfloat16m2_t test_vfnmadd_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfnmadd_vv_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmadd_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -459,7 +459,7 @@ 
vfloat16m4_t test_vfnmadd_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmadd_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmadd_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -468,7 +468,7 @@ vfloat16m4_t test_vfnmadd_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t // CHECK-RV64-LABEL: @test_vfnmadd_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmadd_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -477,7 +477,7 @@ vfloat16m8_t test_vfnmadd_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmadd_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmadd_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsac.c index c9f805328a107..c4f28d69b427e 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsac.c +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsac.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfnmsac_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfnmsac_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfnmsac_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmsac.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsac_vf_f32m1(vfloat32m1_t acc, float op1, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfnmsac_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsac_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfnmsac_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsac_vf_f32m2(vfloat32m2_t acc, float op1, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfnmsac_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfnmsac_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsac_vf_f32m4(vfloat32m4_t acc, float op1, @@ -88,7 +88,7 @@ vfloat32m4_t test_vfnmsac_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfnmsac_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsac_vf_f32m8(vfloat32m8_t acc, float op1, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfnmsac_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsac_vv_f64m1(vfloat64m1_t 
acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfnmsac_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsac_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfnmsac_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t test_vfnmsac_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsac_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfnmsac_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vfnmsac.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfnmsac_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsac_vf_f64m4(vfloat64m4_t acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t test_vfnmsac_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfnmsac_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsac_vf_f64m8(vfloat64m8_t acc, double op1, @@ -378,7 +378,7 @@ vfloat64m8_t test_vfnmsac_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, // CHECK-RV64-LABEL: @test_vfnmsac_vv_f16mf4( 
// CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmsac_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -387,7 +387,7 @@ vfloat16mf4_t test_vfnmsac_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmsac_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmsac_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -396,7 +396,7 @@ vfloat16mf4_t test_vfnmsac_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmsac_vv_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmsac_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -405,7 +405,7 @@ vfloat16mf2_t test_vfnmsac_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmsac_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmsac.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmsac_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -414,7 +414,7 @@ vfloat16mf2_t test_vfnmsac_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmsac_vv_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmsac_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -423,7 +423,7 @@ vfloat16m1_t test_vfnmsac_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsac_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmsac_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -432,7 +432,7 @@ vfloat16m1_t test_vfnmsac_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfnmsac_vv_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmsac_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, 
size_t vl) { @@ -441,7 +441,7 @@ vfloat16m2_t test_vfnmsac_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsac_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmsac_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -450,7 +450,7 @@ vfloat16m2_t test_vfnmsac_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfnmsac_vv_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmsac_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -459,7 +459,7 @@ vfloat16m4_t test_vfnmsac_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsac_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmsac_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -468,7 +468,7 @@ vfloat16m4_t test_vfnmsac_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t // CHECK-RV64-LABEL: @test_vfnmsac_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmsac.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmsac_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -477,7 +477,7 @@ vfloat16m8_t test_vfnmsac_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsac_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmsac_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsub.c index c2254419baa54..ecba795117345 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsub.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfnmsub.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat32mf2_t test_vfnmsub_vv_f32mf2(vfloat32mf2_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfnmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, @@ -28,7 +28,7 @@ vfloat32mf2_t test_vfnmsub_vf_f32mf2(vfloat32mf2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsub_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat32m1_t test_vfnmsub_vv_f32m1(vfloat32m1_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfnmsub_vf_f32m1(vfloat32m1_t acc, float op1, @@ -48,7 +48,7 @@ vfloat32m1_t test_vfnmsub_vf_f32m1(vfloat32m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat32m2_t test_vfnmsub_vv_f32m2(vfloat32m2_t acc, vfloat32m2_t op1, 
// CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfnmsub_vf_f32m2(vfloat32m2_t acc, float op1, @@ -68,7 +68,7 @@ vfloat32m2_t test_vfnmsub_vf_f32m2(vfloat32m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat32m4_t test_vfnmsub_vv_f32m4(vfloat32m4_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfnmsub_vf_f32m4(vfloat32m4_t acc, float op1, @@ -88,7 +88,7 @@ vfloat32m4_t test_vfnmsub_vf_f32m4(vfloat32m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.nxv16f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, @@ -98,7 +98,7 @@ vfloat32m8_t test_vfnmsub_vv_f32m8(vfloat32m8_t acc, vfloat32m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f32.f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfnmsub_vf_f32m8(vfloat32m8_t acc, float op1, @@ -108,7 +108,7 @@ vfloat32m8_t test_vfnmsub_vf_f32m8(vfloat32m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.nxv1f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, @@ -118,7 +118,7 @@ vfloat64m1_t test_vfnmsub_vv_f64m1(vfloat64m1_t acc, vfloat64m1_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfnmsub_vf_f64m1(vfloat64m1_t acc, double op1, @@ -128,7 +128,7 @@ vfloat64m1_t test_vfnmsub_vf_f64m1(vfloat64m1_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.nxv2f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, @@ -138,7 +138,7 @@ vfloat64m2_t test_vfnmsub_vv_f64m2(vfloat64m2_t acc, vfloat64m2_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfnmsub_vf_f64m2(vfloat64m2_t acc, double op1, @@ -148,7 +148,7 @@ vfloat64m2_t test_vfnmsub_vf_f64m2(vfloat64m2_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f64.nxv4f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, @@ -158,7 +158,7 @@ vfloat64m4_t test_vfnmsub_vv_f64m4(vfloat64m4_t acc, vfloat64m4_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfnmsub_vf_f64m4(vfloat64m4_t acc, double op1, @@ -168,7 +168,7 @@ vfloat64m4_t 
test_vfnmsub_vf_f64m4(vfloat64m4_t acc, double op1, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.nxv8f64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, @@ -178,7 +178,7 @@ vfloat64m8_t test_vfnmsub_vv_f64m8(vfloat64m8_t acc, vfloat64m8_t op1, // CHECK-RV64-LABEL: @test_vfnmsub_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f64.f64.i64( [[ACC:%.*]], double [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfnmsub_vf_f64m8(vfloat64m8_t acc, double op1, @@ -378,7 +378,7 @@ vfloat64m8_t test_vfnmsub_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, // CHECK-RV64-LABEL: @test_vfnmsub_vv_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmsub_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -387,7 +387,7 @@ vfloat16mf4_t test_vfnmsub_vv_f16mf4 (vfloat16mf4_t vd, vfloat16mf4_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmsub_vf_f16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmsub.nxv1f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf4_t test_vfnmsub_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf4_t vs2, size_t vl) { @@ -396,7 +396,7 @@ vfloat16mf4_t test_vfnmsub_vf_f16mf4 (vfloat16mf4_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmsub_vv_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmsub_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -405,7 +405,7 @@ vfloat16mf2_t test_vfnmsub_vv_f16mf2 (vfloat16mf2_t vd, vfloat16mf2_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfnmsub_vf_f16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16mf2_t test_vfnmsub_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf2_t vs2, size_t vl) { @@ -414,7 +414,7 @@ vfloat16mf2_t test_vfnmsub_vf_f16mf2 (vfloat16mf2_t vd, _Float16 rs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfnmsub_vv_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmsub_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, 
vfloat16m1_t vs2, size_t vl) { @@ -423,7 +423,7 @@ vfloat16m1_t test_vfnmsub_vv_f16m1 (vfloat16m1_t vd, vfloat16m1_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsub_vf_f16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m1_t test_vfnmsub_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t vs2, size_t vl) { @@ -432,7 +432,7 @@ vfloat16m1_t test_vfnmsub_vf_f16m1 (vfloat16m1_t vd, _Float16 rs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfnmsub_vv_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmsub_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -441,7 +441,7 @@ vfloat16m2_t test_vfnmsub_vv_f16m2 (vfloat16m2_t vd, vfloat16m2_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsub_vf_f16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m2_t test_vfnmsub_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t vs2, size_t vl) { @@ -450,7 +450,7 @@ vfloat16m2_t test_vfnmsub_vf_f16m2 (vfloat16m2_t vd, _Float16 rs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfnmsub_vv_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmsub_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -459,7 +459,7 @@ vfloat16m4_t test_vfnmsub_vv_f16m4 (vfloat16m4_t vd, vfloat16m4_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsub_vf_f16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m4_t test_vfnmsub_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t vs2, size_t vl) { @@ -468,7 +468,7 @@ vfloat16m4_t test_vfnmsub_vf_f16m4 (vfloat16m4_t vd, _Float16 rs1, vfloat16m4_t // CHECK-RV64-LABEL: @test_vfnmsub_vv_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32f16.nxv32f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmsub_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m8_t vs2, size_t vl) { @@ -477,7 +477,7 @@ vfloat16m8_t test_vfnmsub_vv_f16m8 (vfloat16m8_t vd, vfloat16m8_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfnmsub_vf_f16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32f16.f16.i64( [[VD:%.*]], half [[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32f16.f16.i64( [[VD:%.*]], half 
[[RS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat16m8_t test_vfnmsub_vf_f16m8 (vfloat16m8_t vd, _Float16 rs1, vfloat16m8_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmacc.c index c43b9ad466f72..3f58e61461f3f 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmacc.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat64m1_t test_vfwmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmacc_vf_f64m1(vfloat64m1_t acc, float op1, @@ -28,7 +28,7 @@ vfloat64m1_t test_vfwmacc_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vfloat64m2_t test_vfwmacc_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat64m2_t test_vfwmacc_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwmacc_vf_f64m2(vfloat64m2_t acc, float op1, @@ -48,7 +48,7 @@ vfloat64m2_t test_vfwmacc_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat64m4_t test_vfwmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwmacc_vf_f64m4(vfloat64m4_t acc, float op1, @@ -68,7 +68,7 @@ vfloat64m4_t test_vfwmacc_vf_f64m4(vfloat64m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.nxv8f32.nxv8f32.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat64m8_t test_vfwmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmacc_vf_f64m8(vfloat64m8_t acc, float op1, @@ -172,7 +172,7 @@ vfloat64m8_t test_vfwmacc_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmacc_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfwmacc_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -181,7 +181,7 @@ vfloat32mf2_t test_vfwmacc_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfwmacc_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vfloat32mf2_t test_vfwmacc_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16mf4_t vs2, size_t vl) { @@ -190,7 +190,7 @@ vfloat32mf2_t test_vfwmacc_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfwmacc_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwmacc_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -199,7 +199,7 @@ vfloat32m1_t test_vfwmacc_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwmacc_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwmacc_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_t vs2, size_t vl) { @@ -208,7 +208,7 @@ vfloat32m1_t test_vfwmacc_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_t // CHECK-RV64-LABEL: @test_vfwmacc_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwmacc_vv_f32m2 (vfloat32m2_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -217,7 +217,7 @@ vfloat32m2_t test_vfwmacc_vv_f32m2 (vfloat32m2_t vd, 
vfloat16m1_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfwmacc_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwmacc_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t vs2, size_t vl) { @@ -226,7 +226,7 @@ vfloat32m2_t test_vfwmacc_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfwmacc_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwmacc_vv_f32m4 (vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -235,7 +235,7 @@ vfloat32m4_t test_vfwmacc_vv_f32m4 (vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfwmacc_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwmacc_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t vs2, size_t vl) { @@ -244,7 +244,7 @@ vfloat32m4_t test_vfwmacc_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfwmacc_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmacc.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwmacc_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -253,7 +253,7 @@ vfloat32m8_t test_vfwmacc_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfwmacc_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwmacc_vf_f32m8 (vfloat32m8_t vd, _Float16 vs1, vfloat16m4_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmsac.c index e6cb4861a4abc..23332ba44584b 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwmsac.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat64m1_t test_vfwmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwmsac_vf_f64m1(vfloat64m1_t acc, float op1, @@ -28,7 +28,7 @@ vfloat64m1_t test_vfwmsac_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat64m2_t test_vfwmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwmsac_vf_f64m2(vfloat64m2_t acc, float op1, @@ -48,7 +48,7 @@ vfloat64m2_t test_vfwmsac_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t 
test_vfwmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat64m4_t test_vfwmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwmsac_vf_f64m4(vfloat64m4_t acc, float op1, @@ -68,7 +68,7 @@ vfloat64m4_t test_vfwmsac_vf_f64m4(vfloat64m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat64m8_t test_vfwmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwmsac_vf_f64m8(vfloat64m8_t acc, float op1, @@ -172,7 +172,7 @@ vfloat64m8_t test_vfwmsac_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwmsac_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfwmsac_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -181,7 +181,7 @@ vfloat32mf2_t test_vfwmsac_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vfloa // CHECK-RV64-LABEL: @test_vfwmsac_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfwmsac_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16mf4_t vs2, size_t vl) { @@ -190,7 +190,7 @@ vfloat32mf2_t test_vfwmsac_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16mf // CHECK-RV64-LABEL: @test_vfwmsac_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwmsac_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -199,7 +199,7 @@ vfloat32m1_t test_vfwmsac_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwmsac_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwmsac_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_t vs2, size_t vl) { @@ -208,7 +208,7 @@ vfloat32m1_t test_vfwmsac_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_t // CHECK-RV64-LABEL: @test_vfwmsac_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwmsac_vv_f32m2 (vfloat32m2_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -217,7 +217,7 @@ vfloat32m2_t test_vfwmsac_vv_f32m2 (vfloat32m2_t vd, vfloat16m1_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfwmsac_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwmsac_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t vs2, size_t vl) { @@ -226,7 +226,7 @@ vfloat32m2_t test_vfwmsac_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfwmsac_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwmsac_vv_f32m4 
(vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -235,7 +235,7 @@ vfloat32m4_t test_vfwmsac_vv_f32m4 (vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfwmsac_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwmsac_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t vs2, size_t vl) { @@ -244,7 +244,7 @@ vfloat32m4_t test_vfwmsac_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfwmsac_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwmsac_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -253,7 +253,7 @@ vfloat32m8_t test_vfwmsac_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfwmsac_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwmsac_vf_f32m8 (vfloat32m8_t vd, _Float16 vs1, vfloat16m4_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmacc.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmacc.c index cff04ff4bd30d..c6463a946d4bb 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmacc.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat64m1_t test_vfwnmacc_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmacc_vf_f64m1(vfloat64m1_t acc, float op1, @@ -28,7 +28,7 @@ vfloat64m1_t test_vfwnmacc_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmacc_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat64m2_t test_vfwnmacc_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m2( // CHECK-RV64-NEXT: entry: 
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmacc_vf_f64m2(vfloat64m2_t acc, float op1, @@ -48,7 +48,7 @@ vfloat64m2_t test_vfwnmacc_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat64m4_t test_vfwnmacc_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmacc_vf_f64m4(vfloat64m4_t acc, float op1, @@ -68,7 +68,7 @@ vfloat64m4_t test_vfwnmacc_vf_f64m4(vfloat64m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat64m8_t test_vfwnmacc_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmacc_vf_f64m8(vfloat64m8_t acc, float op1, @@ -172,7 +172,7 @@ vfloat64m8_t test_vfwnmacc_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfwnmacc_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -181,7 +181,7 @@ vfloat32mf2_t test_vfwnmacc_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vflo // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfwnmacc_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16mf4_t vs2, size_t vl) { @@ -190,7 +190,7 @@ vfloat32mf2_t test_vfwnmacc_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16m // 
CHECK-RV64-LABEL: @test_vfwnmacc_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwnmacc_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -199,7 +199,7 @@ vfloat32m1_t test_vfwnmacc_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat1 // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwnmacc_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_t vs2, size_t vl) { @@ -208,7 +208,7 @@ vfloat32m1_t test_vfwnmacc_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_ // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwnmacc_vv_f32m2 (vfloat32m2_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -217,7 +217,7 @@ vfloat32m2_t test_vfwnmacc_vv_f32m2 (vfloat32m2_t vd, vfloat16m1_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], 
half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwnmacc_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t vs2, size_t vl) { @@ -226,7 +226,7 @@ vfloat32m2_t test_vfwnmacc_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwnmacc_vv_f32m4 (vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -235,7 +235,7 @@ vfloat32m4_t test_vfwnmacc_vv_f32m4 (vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwnmacc_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t vs2, size_t vl) { @@ -244,7 +244,7 @@ vfloat32m4_t test_vfwnmacc_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfwnmacc_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], 
[[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwnmacc_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -253,7 +253,7 @@ vfloat32m8_t test_vfwnmacc_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwnmacc_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwnmacc_vf_f32m8 (vfloat32m8_t vd, _Float16 vs1, vfloat16m4_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmsac.c index d4a8239312bbd..514ae5c129ff8 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vfwnmsac.c @@ -8,7 +8,7 @@ // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.nxv1f32.nxv1f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, @@ -18,7 +18,7 @@ vfloat64m1_t test_vfwnmsac_vv_f64m1(vfloat64m1_t acc, vfloat32mf2_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f64.f32.nxv1f32.i64( [[ACC:%.*]], float [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m1_t test_vfwnmsac_vf_f64m1(vfloat64m1_t acc, float op1, @@ -28,7 +28,7 @@ vfloat64m1_t test_vfwnmsac_vf_f64m1(vfloat64m1_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f64.nxv2f32.nxv2f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, @@ -38,7 +38,7 @@ vfloat64m2_t test_vfwnmsac_vv_f64m2(vfloat64m2_t acc, vfloat32m1_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f64.f32.nxv2f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m2_t test_vfwnmsac_vf_f64m2(vfloat64m2_t acc, float op1, @@ -48,7 +48,7 @@ vfloat64m2_t test_vfwnmsac_vf_f64m2(vfloat64m2_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.nxv4f32.nxv4f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, @@ -58,7 +58,7 @@ vfloat64m4_t test_vfwnmsac_vv_f64m4(vfloat64m4_t acc, vfloat32m2_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f64.f32.nxv4f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m4_t test_vfwnmsac_vf_f64m4(vfloat64m4_t acc, float op1, @@ -68,7 +68,7 @@ vfloat64m4_t test_vfwnmsac_vf_f64m4(vfloat64m4_t acc, float op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.nxv8f32.nxv8f32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, @@ -78,7 +78,7 @@ vfloat64m8_t test_vfwnmsac_vv_f64m8(vfloat64m8_t acc, vfloat32m4_t op1, // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f64.f32.nxv8f32.i64( [[ACC:%.*]], float [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat64m8_t test_vfwnmsac_vf_f64m8(vfloat64m8_t acc, float op1, @@ -172,7 +172,7 @@ vfloat64m8_t test_vfwnmsac_vf_f64m8_m(vbool8_t mask, vfloat64m8_t acc, // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16.nxv1f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfwnmsac_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vfloat16mf4_t vs2, size_t vl) { @@ -181,7 +181,7 @@ vfloat32mf2_t test_vfwnmsac_vv_f32mf2 (vfloat32mf2_t vd, vfloat16mf4_t vs1, vflo // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.f16.nxv1f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32mf2_t test_vfwnmsac_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16mf4_t vs2, size_t vl) { @@ -190,7 +190,7 @@ vfloat32mf2_t test_vfwnmsac_vf_f32mf2 (vfloat32mf2_t vd, _Float16 vs1, vfloat16m // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2f16.nxv2f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwnmsac_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat16mf2_t vs2, size_t vl) { @@ -199,7 +199,7 @@ vfloat32m1_t test_vfwnmsac_vv_f32m1 (vfloat32m1_t vd, vfloat16mf2_t vs1, vfloat1 // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.f16.nxv2f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m1_t test_vfwnmsac_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_t vs2, size_t vl) { @@ -208,7 +208,7 @@ 
vfloat32m1_t test_vfwnmsac_vf_f32m1 (vfloat32m1_t vd, _Float16 vs1, vfloat16mf2_ // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4f16.nxv4f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwnmsac_vv_f32m2 (vfloat32m2_t vd, vfloat16m1_t vs1, vfloat16m1_t vs2, size_t vl) { @@ -217,7 +217,7 @@ vfloat32m2_t test_vfwnmsac_vv_f32m2 (vfloat32m2_t vd, vfloat16m1_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.f16.nxv4f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m2_t test_vfwnmsac_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t vs2, size_t vl) { @@ -226,7 +226,7 @@ vfloat32m2_t test_vfwnmsac_vf_f32m2 (vfloat32m2_t vd, _Float16 vs1, vfloat16m1_t // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8f16.nxv8f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwnmsac_vv_f32m4 (vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16m2_t vs2, size_t vl) { @@ -235,7 +235,7 @@ vfloat32m4_t test_vfwnmsac_vv_f32m4 (vfloat32m4_t vd, vfloat16m2_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f32m4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.f16.nxv8f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m4_t test_vfwnmsac_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t vs2, size_t vl) { @@ -244,7 +244,7 @@ vfloat32m4_t test_vfwnmsac_vf_f32m4 (vfloat32m4_t vd, _Float16 vs1, vfloat16m2_t // CHECK-RV64-LABEL: @test_vfwnmsac_vv_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16f16.nxv16f16.i64( [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwnmsac_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16m4_t vs2, size_t vl) { @@ -253,7 +253,7 @@ vfloat32m8_t test_vfwnmsac_vv_f32m8 (vfloat32m8_t vd, vfloat16m4_t vs1, vfloat16 // CHECK-RV64-LABEL: @test_vfwnmsac_vf_f32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.f16.nxv16f16.i64( [[VD:%.*]], half [[VS1:%.*]], [[VS2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vfloat32m8_t test_vfwnmsac_vf_f32m8 (vfloat32m8_t vd, _Float16 vs1, vfloat16m4_t vs2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmacc.c index 243567cfab13f..9193197d60d30 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmacc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vmacc_vv_i8mf8( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmacc_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vmacc_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmacc_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vmacc_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmacc_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vmacc_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmacc_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vmacc_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmacc_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vmacc_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmacc_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vmacc_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmacc_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vmacc_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m1( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmacc_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vmacc_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmacc_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vmacc_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmacc_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ -96,7 +96,7 @@ vint8m2_t test_vmacc_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vint8m4_t test_vmacc_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vmacc_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmacc_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vmacc_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmacc_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vmacc_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t // CHECK-RV64-LABEL: @test_vmacc_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmacc_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vmacc_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmacc_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmacc_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t test_vmacc_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t o // CHECK-RV64-LABEL: @test_vmacc_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmacc_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t test_vmacc_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmacc_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vmacc_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: 
ret [[TMP0]] // vint16mf2_t test_vmacc_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vmacc_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmacc_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vmacc_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmacc_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vmacc_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmacc_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vmacc_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m2( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmacc_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vmacc_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmacc_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t test_vmacc_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmacc_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vmacc_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmacc_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vmacc_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmacc_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vmacc_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmacc_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vmacc_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmacc_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ vint32mf2_t test_vmacc_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // 
CHECK-RV64-LABEL: @test_vmacc_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmacc_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ vint32m1_t test_vmacc_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmacc_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vmacc_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmacc_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vmacc_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmacc_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vmacc_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmacc_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vmacc_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmacc_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vmacc_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmacc_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t 
test_vmacc_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmacc_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t test_vmacc_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmacc_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vmacc_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmacc_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vmacc_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmacc_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t test_vmacc_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmacc_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t test_vmacc_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmacc_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vmacc_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmacc_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ 
-384,7 +384,7 @@ vint64m4_t test_vmacc_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmacc_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ -393,7 +393,7 @@ vint64m8_t test_vmacc_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmacc_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vmacc_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size // CHECK-RV64-LABEL: @test_vmacc_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmacc_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vmacc_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmacc_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vmacc_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vv_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmacc_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vmacc_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmacc_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vmacc_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmacc_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t 
op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vmacc_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmacc_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vmacc_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmacc_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmacc_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vmacc_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmacc_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vmacc_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmacc_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vmacc_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmacc_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vmacc_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmacc_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vmacc_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmacc_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t 
vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vmacc_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmacc_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vmacc_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, si // CHECK-RV64-LABEL: @test_vmacc_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmacc_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vmacc_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vmacc_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmacc_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t test_vmacc_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4 // CHECK-RV64-LABEL: @test_vmacc_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmacc_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vmacc_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t o // CHECK-RV64-LABEL: @test_vmacc_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmacc_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vmacc_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2 // CHECK-RV64-LABEL: @test_vmacc_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmacc_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 +564,7 @@ vuint16mf2_t test_vmacc_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t 
test_vmacc_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vmacc_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmacc_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vmacc_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmacc_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vmacc_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmacc_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vmacc_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u16m4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmacc_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vmacc_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmacc_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ vuint16m4_t test_vmacc_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmacc_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vmacc_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], 
i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmacc_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ -636,7 +636,7 @@ vuint16m8_t test_vmacc_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmacc_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ -645,7 +645,7 @@ vuint32mf2_t test_vmacc_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2 // CHECK-RV64-LABEL: @test_vmacc_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmacc_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vmacc_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t o // CHECK-RV64-LABEL: @test_vmacc_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmacc_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vmacc_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op 
// CHECK-RV64-LABEL: @test_vmacc_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmacc_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ vuint32m1_t test_vmacc_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmacc_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vmacc_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmacc_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t test_vmacc_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmacc.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmacc_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vmacc_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmacc_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vmacc_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmacc_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vmacc_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmacc_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ 
vuint32m8_t test_vmacc_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmacc_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vmacc_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmacc_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vmacc_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmacc_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vmacc_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmacc_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 @@ vuint64m2_t test_vmacc_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmacc_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vmacc_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmacc_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t test_vmacc_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vmacc_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmacc_vv_u64m8(vuint64m8_t acc, vuint64m8_t 
op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 @@ vuint64m8_t test_vmacc_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op // CHECK-RV64-LABEL: @test_vmacc_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmacc.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmacc_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmadd.c index 9835a68f21651..4ec23d669735e 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vmadd.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vmadd.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vmadd_vv_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmadd_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vmadd_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vmadd_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vmadd_vx_i8mf8(vint8mf8_t acc, int8_t 
op1, vint8mf8_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmadd_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vmadd_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vmadd_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vmadd_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmadd_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vmadd_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vmadd_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vmadd_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmadd_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vmadd_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vmadd_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vmadd_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmadd_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vmadd_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t 
op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vmadd_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ -96,7 +96,7 @@ vint8m2_t test_vmadd_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmadd_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vmadd_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vmadd_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vmadd_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmadd_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vmadd_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t // CHECK-RV64-LABEL: @test_vmadd_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vmadd_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vmadd_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl // CHECK-RV64-LABEL: @test_vmadd_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmadd_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t test_vmadd_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t o // CHECK-RV64-LABEL: @test_vmadd_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vmadd_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t test_vmadd_vx_i16mf4(vint16mf4_t acc, int16_t op1, 
vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmadd_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vmadd_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vmadd_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vmadd_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmadd_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vmadd_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vmadd_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vmadd_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmadd_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vmadd_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vmadd_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vmadd_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmadd_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t 
test_vmadd_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vmadd_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vmadd_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmadd_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vmadd_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vmadd_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vmadd_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmadd_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vmadd_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vmadd_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ vint32mf2_t test_vmadd_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmadd_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ vint32m1_t test_vmadd_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vmadd_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, 
size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vmadd_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmadd_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vmadd_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vmadd_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vmadd_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmadd_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vmadd_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], 
i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vmadd_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vmadd_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmadd_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t test_vmadd_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vmadd_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t test_vmadd_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmadd_vv_i64m1(vint64m1_t acc, 
vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vmadd_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vmadd_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vmadd_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmadd_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t test_vmadd_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vmadd_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t test_vmadd_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmadd_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vmadd_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vmadd_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ -384,7 +384,7 @@ vint64m4_t test_vmadd_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vmadd_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ -393,7 +393,7 @@ vint64m8_t test_vmadd_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vint64m8_t test_vmadd_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vmadd_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size // CHECK-RV64-LABEL: @test_vmadd_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmadd_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vmadd_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vmadd_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vmadd_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vv_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmadd_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vmadd_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vmadd_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vmadd_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmadd_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vmadd_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vmadd_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vmadd_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, s // CHECK-RV64-LABEL: @test_vmadd_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmadd_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vmadd_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vmadd_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vmadd_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmadd_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vmadd_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vmadd_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vmadd_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u8m4( // CHECK-RV64-NEXT: entry: 
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmadd_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vmadd_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vmadd_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vmadd_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmadd_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vmadd_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, si // CHECK-RV64-LABEL: @test_vmadd_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vmadd_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vmadd_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vmadd_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmadd_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t test_vmadd_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4 // CHECK-RV64-LABEL: @test_vmadd_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vmadd_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vmadd_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t o // CHECK-RV64-LABEL: @test_vmadd_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmadd_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vmadd_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2 // CHECK-RV64-LABEL: 
@test_vmadd_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vmadd_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 +564,7 @@ vuint16mf2_t test_vmadd_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmadd_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vmadd_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vmadd_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vmadd_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmadd.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmadd_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vmadd_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vmadd_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vmadd_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmadd_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vmadd_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vmadd_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ 
vuint16m4_t test_vmadd_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmadd_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vmadd_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vmadd_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ -636,7 +636,7 @@ vuint16m8_t test_vmadd_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmadd_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ -645,7 +645,7 @@ vuint32mf2_t test_vmadd_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2 // CHECK-RV64-LABEL: @test_vmadd_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vmadd_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vmadd_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t o // CHECK-RV64-LABEL: @test_vmadd_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmadd_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vmadd_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vmadd_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ vuint32m1_t test_vmadd_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t 
test_vmadd_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vmadd_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vmadd_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t test_vmadd_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmadd_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vmadd_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vmadd_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vmadd_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u32m8( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmadd_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vmadd_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vmadd_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ vuint32m8_t test_vmadd_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmadd_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vmadd_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vmadd_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vmadd_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmadd_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vmadd_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vmadd_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 @@ vuint64m2_t test_vmadd_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmadd_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vmadd_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op // 
CHECK-RV64-LABEL: @test_vmadd_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vmadd_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t test_vmadd_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vmadd_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmadd_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 @@ vuint64m8_t test_vmadd_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op // CHECK-RV64-LABEL: @test_vmadd_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmadd.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vmadd_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsac.c index e0aa06746cc55..7a8ed978c425d 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsac.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsac.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vnmsac_vv_i8mf8( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsac_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vnmsac_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsac_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vnmsac_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsac_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vnmsac_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsac_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vnmsac_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsac_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vnmsac_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsac_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vnmsac_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsac_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vnmsac_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m1( // CHECK-RV64-NEXT: entry: 
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsac_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vnmsac_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsac_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vnmsac_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsac_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ -96,7 +96,7 @@ vint8m2_t test_vnmsac_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: 
ret [[TMP0]] // vint8m4_t test_vnmsac_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vnmsac_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vnmsac_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vnmsac_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsac_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vnmsac_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsac_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsac_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vnmsac_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsac_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsac_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t test_vnmsac_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t // CHECK-RV64-LABEL: @test_vnmsac_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsac_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t test_vnmsac_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsac_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vnmsac_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsac_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vnmsac_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsac_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vnmsac_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsac_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vnmsac_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsac_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vnmsac_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t 
op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsac_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vnmsac_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsac_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t test_vnmsac_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsac_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vnmsac_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsac_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vnmsac_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsac_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vnmsac_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsac_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vnmsac_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsac_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ 
vint32mf2_t test_vnmsac_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsac_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ vint32m1_t test_vnmsac_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsac_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vnmsac_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsac_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vnmsac_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsac_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vnmsac_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsac_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vnmsac_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsac_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vnmsac_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsac_vv_i32m8(vint32m8_t acc, 
vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t test_vnmsac_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsac_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t test_vnmsac_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsac_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vnmsac_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsac_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vnmsac_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsac_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t test_vnmsac_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsac_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t test_vnmsac_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vnmsac_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vnmsac_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vint64m4_t test_vnmsac_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ -384,7 +384,7 @@ vint64m4_t test_vnmsac_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vnmsac_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ -393,7 +393,7 @@ vint64m8_t test_vnmsac_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vnmsac_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vnmsac_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsac_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsac_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vnmsac_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u8mf8( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsac_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vnmsac_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsac_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vnmsac_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsac_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vnmsac_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsac_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vnmsac_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsac_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vnmsac_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsac_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vnmsac_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsac_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vnmsac_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size // 
CHECK-RV64-LABEL: @test_vnmsac_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsac_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vnmsac_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsac_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vnmsac_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsac_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vnmsac_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsac.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsac_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vnmsac_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsac_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vnmsac_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, s // CHECK-RV64-LABEL: @test_vnmsac_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsac_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vnmsac_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size // CHECK-RV64-LABEL: @test_vnmsac_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vnmsac_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t 
test_vnmsac_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsac_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vnmsac_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vnmsac_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t // CHECK-RV64-LABEL: @test_vnmsac_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsac_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vnmsac_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsac_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsac_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 +564,7 @@ vuint16mf2_t test_vnmsac_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsac_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vnmsac_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsac_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vnmsac_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vnmsac_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vnmsac_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t 
test_vnmsac_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vnmsac_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsac_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vnmsac_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsac_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ vuint16m4_t test_vnmsac_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsac_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vnmsac_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u16m8( // 
CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsac_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ -636,7 +636,7 @@ vuint16m8_t test_vnmsac_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsac_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ -645,7 +645,7 @@ vuint32mf2_t test_vnmsac_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf // CHECK-RV64-LABEL: @test_vnmsac_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsac_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vnmsac_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.nxv2i32.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsac_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vnmsac_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsac_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ vuint32m1_t test_vnmsac_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsac_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vnmsac_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsac_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t 
test_vnmsac_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vnmsac_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vnmsac_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vnmsac_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vnmsac_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsac_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vnmsac_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsac_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ vuint32m8_t test_vnmsac_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsac_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vnmsac_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsac_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vnmsac_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t 
test_vnmsac_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vnmsac_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vnmsac_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 @@ vuint64m2_t test_vnmsac_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsac_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vnmsac_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsac_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t test_vnmsac_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsac_vv_u64m8( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsac_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 @@ vuint64m8_t test_vnmsac_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t o // CHECK-RV64-LABEL: @test_vnmsac_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsac.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsac_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsub.c index 9a409c7abb662..e14753308e9b6 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsub.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vnmsub.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vnmsub_vv_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsub_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, size_t vl) { @@ -15,7 +15,7 @@ vint8mf8_t test_vnmsub_vv_i8mf8(vint8mf8_t acc, vint8mf8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( 
[[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf8_t test_vnmsub_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size_t vl) { @@ -24,7 +24,7 @@ vint8mf8_t test_vnmsub_vx_i8mf8(vint8mf8_t acc, int8_t op1, vint8mf8_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsub_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, size_t vl) { @@ -33,7 +33,7 @@ vint8mf4_t test_vnmsub_vv_i8mf4(vint8mf4_t acc, vint8mf4_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf4_t test_vnmsub_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size_t vl) { @@ -42,7 +42,7 @@ vint8mf4_t test_vnmsub_vx_i8mf4(vint8mf4_t acc, int8_t op1, vint8mf4_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsub_vv_i8mf2(vint8mf2_t acc, 
vint8mf2_t op1, vint8mf2_t op2, size_t vl) { @@ -51,7 +51,7 @@ vint8mf2_t test_vnmsub_vv_i8mf2(vint8mf2_t acc, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8mf2_t test_vnmsub_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size_t vl) { @@ -60,7 +60,7 @@ vint8mf2_t test_vnmsub_vx_i8mf2(vint8mf2_t acc, int8_t op1, vint8mf2_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsub_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_t vl) { @@ -69,7 +69,7 @@ vint8m1_t test_vnmsub_vv_i8m1(vint8m1_t acc, vint8m1_t op1, vint8m1_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m1_t test_vnmsub_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t vl) { @@ -78,7 +78,7 @@ vint8m1_t test_vnmsub_vx_i8m1(vint8m1_t acc, int8_t op1, vint8m1_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsub_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_t vl) { @@ -87,7 +87,7 @@ vint8m2_t test_vnmsub_vv_i8m2(vint8m2_t acc, vint8m2_t op1, vint8m2_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m2_t test_vnmsub_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t vl) { @@ -96,7 +96,7 @@ vint8m2_t test_vnmsub_vx_i8m2(vint8m2_t acc, int8_t op1, vint8m2_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vnmsub_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_t vl) { @@ -105,7 +105,7 @@ vint8m4_t test_vnmsub_vv_i8m4(vint8m4_t acc, vint8m4_t op1, vint8m4_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m4_t test_vnmsub_vx_i8m4(vint8m4_t acc, int8_t op1, 
vint8m4_t op2, size_t vl) { @@ -114,7 +114,7 @@ vint8m4_t test_vnmsub_vx_i8m4(vint8m4_t acc, int8_t op1, vint8m4_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsub_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_t vl) { @@ -123,7 +123,7 @@ vint8m8_t test_vnmsub_vv_i8m8(vint8m8_t acc, vint8m8_t op1, vint8m8_t op2, size_ // CHECK-RV64-LABEL: @test_vnmsub_vx_i8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint8m8_t test_vnmsub_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t vl) { @@ -132,7 +132,7 @@ vint8m8_t test_vnmsub_vx_i8m8(vint8m8_t acc, int8_t op1, vint8m8_t op2, size_t v // CHECK-RV64-LABEL: @test_vnmsub_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsub_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t op2, size_t vl) { @@ -141,7 +141,7 @@ vint16mf4_t test_vnmsub_vv_i16mf4(vint16mf4_t acc, vint16mf4_t op1, vint16mf4_t // CHECK-RV64-LABEL: @test_vnmsub_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vnmsub_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, size_t vl) { @@ -150,7 +150,7 @@ vint16mf4_t test_vnmsub_vx_i16mf4(vint16mf4_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsub_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t op2, size_t vl) { @@ -159,7 +159,7 @@ vint16mf2_t test_vnmsub_vv_i16mf2(vint16mf2_t acc, vint16mf2_t op1, vint16mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vnmsub_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, size_t vl) { @@ -168,7 +168,7 @@ vint16mf2_t test_vnmsub_vx_i16mf2(vint16mf2_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsub_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, size_t vl) { @@ -177,7 +177,7 @@ vint16m1_t test_vnmsub_vv_i16m1(vint16m1_t acc, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vnmsub_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, size_t vl) { @@ -186,7 +186,7 @@ vint16m1_t test_vnmsub_vx_i16m1(vint16m1_t acc, int16_t op1, vint16m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsub_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, size_t vl) { @@ -195,7 +195,7 @@ vint16m2_t test_vnmsub_vv_i16m2(vint16m2_t acc, vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vnmsub_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, size_t vl) { @@ -204,7 +204,7 @@ vint16m2_t test_vnmsub_vx_i16m2(vint16m2_t acc, int16_t op1, vint16m2_t op2, siz // CHECK-RV64-LABEL: 
@test_vnmsub_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsub_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, size_t vl) { @@ -213,7 +213,7 @@ vint16m4_t test_vnmsub_vv_i16m4(vint16m4_t acc, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vnmsub_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, size_t vl) { @@ -222,7 +222,7 @@ vint16m4_t test_vnmsub_vx_i16m4(vint16m4_t acc, int16_t op1, vint16m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsub_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, size_t vl) { @@ -231,7 +231,7 @@ vint16m8_t test_vnmsub_vv_i16m8(vint16m8_t acc, vint16m8_t op1, vint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vnmsub_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, size_t vl) { @@ -240,7 +240,7 @@ vint16m8_t test_vnmsub_vx_i16m8(vint16m8_t acc, int16_t op1, vint16m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsub_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t op2, size_t vl) { @@ -249,7 +249,7 @@ vint32mf2_t test_vnmsub_vv_i32mf2(vint32mf2_t acc, vint32mf2_t op1, vint32mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vnmsub_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, size_t vl) { @@ -258,7 +258,7 @@ vint32mf2_t test_vnmsub_vx_i32mf2(vint32mf2_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsub_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, size_t vl) { @@ -267,7 +267,7 @@ 
vint32m1_t test_vnmsub_vv_i32m1(vint32m1_t acc, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vnmsub_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, size_t vl) { @@ -276,7 +276,7 @@ vint32m1_t test_vnmsub_vx_i32m1(vint32m1_t acc, int32_t op1, vint32m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsub_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, size_t vl) { @@ -285,7 +285,7 @@ vint32m2_t test_vnmsub_vv_i32m2(vint32m2_t acc, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vnmsub_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, size_t vl) { @@ -294,7 +294,7 @@ vint32m2_t test_vnmsub_vx_i32m2(vint32m2_t acc, int32_t op1, vint32m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsub_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, size_t vl) { @@ -303,7 +303,7 @@ vint32m4_t test_vnmsub_vv_i32m4(vint32m4_t acc, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vnmsub_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, size_t vl) { @@ -312,7 +312,7 @@ vint32m4_t test_vnmsub_vx_i32m4(vint32m4_t acc, int32_t op1, vint32m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsub_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, size_t vl) { @@ -321,7 +321,7 @@ vint32m8_t test_vnmsub_vv_i32m8(vint32m8_t acc, vint32m8_t op1, vint32m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vnmsub_vx_i32m8(vint32m8_t acc, 
int32_t op1, vint32m8_t op2, size_t vl) { @@ -330,7 +330,7 @@ vint32m8_t test_vnmsub_vx_i32m8(vint32m8_t acc, int32_t op1, vint32m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsub_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, size_t vl) { @@ -339,7 +339,7 @@ vint64m1_t test_vnmsub_vv_i64m1(vint64m1_t acc, vint64m1_t op1, vint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vnmsub_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, size_t vl) { @@ -348,7 +348,7 @@ vint64m1_t test_vnmsub_vx_i64m1(vint64m1_t acc, int64_t op1, vint64m1_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsub_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, size_t vl) { @@ -357,7 +357,7 @@ vint64m2_t test_vnmsub_vv_i64m2(vint64m2_t acc, vint64m2_t op1, vint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vnmsub_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, size_t vl) { @@ -366,7 +366,7 @@ vint64m2_t test_vnmsub_vx_i64m2(vint64m2_t acc, int64_t op1, vint64m2_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vnmsub_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, size_t vl) { @@ -375,7 +375,7 @@ vint64m4_t test_vnmsub_vv_i64m4(vint64m4_t acc, vint64m4_t op1, vint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vnmsub_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, size_t vl) { @@ -384,7 +384,7 @@ vint64m4_t test_vnmsub_vx_i64m4(vint64m4_t acc, int64_t op1, vint64m4_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret 
[[TMP0]] // vint64m8_t test_vnmsub_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, size_t vl) { @@ -393,7 +393,7 @@ vint64m8_t test_vnmsub_vv_i64m8(vint64m8_t acc, vint64m8_t op1, vint64m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vnmsub_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, size_t vl) { @@ -402,7 +402,7 @@ vint64m8_t test_vnmsub_vx_i64m8(vint64m8_t acc, int64_t op1, vint64m8_t op2, siz // CHECK-RV64-LABEL: @test_vnmsub_vv_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsub_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t op2, size_t vl) { @@ -411,7 +411,7 @@ vuint8mf8_t test_vnmsub_vv_u8mf8(vuint8mf8_t acc, vuint8mf8_t op1, vuint8mf8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u8mf8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf8_t test_vnmsub_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, size_t vl) { @@ -420,7 +420,7 @@ vuint8mf8_t test_vnmsub_vx_u8mf8(vuint8mf8_t acc, uint8_t op1, vuint8mf8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u8mf4( // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsub_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t op2, size_t vl) { @@ -429,7 +429,7 @@ vuint8mf4_t test_vnmsub_vv_u8mf4(vuint8mf4_t acc, vuint8mf4_t op1, vuint8mf4_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u8mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf4_t test_vnmsub_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, size_t vl) { @@ -438,7 +438,7 @@ vuint8mf4_t test_vnmsub_vx_u8mf4(vuint8mf4_t acc, uint8_t op1, vuint8mf4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsub_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t op2, size_t vl) { @@ -447,7 +447,7 @@ vuint8mf2_t test_vnmsub_vv_u8mf2(vuint8mf2_t acc, vuint8mf2_t op1, vuint8mf2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u8mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 
[[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8mf2_t test_vnmsub_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, size_t vl) { @@ -456,7 +456,7 @@ vuint8mf2_t test_vnmsub_vx_u8mf2(vuint8mf2_t acc, uint8_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsub_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, size_t vl) { @@ -465,7 +465,7 @@ vuint8m1_t test_vnmsub_vv_u8m1(vuint8m1_t acc, vuint8m1_t op1, vuint8m1_t op2, s // CHECK-RV64-LABEL: @test_vnmsub_vx_u8m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m1_t test_vnmsub_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size_t vl) { @@ -474,7 +474,7 @@ vuint8m1_t test_vnmsub_vx_u8m1(vuint8m1_t acc, uint8_t op1, vuint8m1_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsub_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, size_t vl) { @@ -483,7 +483,7 @@ vuint8m2_t test_vnmsub_vv_u8m2(vuint8m2_t acc, vuint8m2_t op1, vuint8m2_t op2, s // CHECK-RV64-LABEL: 
@test_vnmsub_vx_u8m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m2_t test_vnmsub_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size_t vl) { @@ -492,7 +492,7 @@ vuint8m2_t test_vnmsub_vx_u8m2(vuint8m2_t acc, uint8_t op1, vuint8m2_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsub_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, size_t vl) { @@ -501,7 +501,7 @@ vuint8m4_t test_vnmsub_vv_u8m4(vuint8m4_t acc, vuint8m4_t op1, vuint8m4_t op2, s // CHECK-RV64-LABEL: @test_vnmsub_vx_u8m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m4_t test_vnmsub_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size_t vl) { @@ -510,7 +510,7 @@ vuint8m4_t test_vnmsub_vx_u8m4(vuint8m4_t acc, uint8_t op1, vuint8m4_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.nxv64i8.i64( [[ACC:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsub_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, size_t vl) { @@ -519,7 +519,7 @@ vuint8m8_t test_vnmsub_vv_u8m8(vuint8m8_t acc, vuint8m8_t op1, vuint8m8_t op2, s // CHECK-RV64-LABEL: @test_vnmsub_vx_u8m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv64i8.i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint8m8_t test_vnmsub_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size_t vl) { @@ -528,7 +528,7 @@ vuint8m8_t test_vnmsub_vx_u8m8(vuint8m8_t acc, uint8_t op1, vuint8m8_t op2, size // CHECK-RV64-LABEL: @test_vnmsub_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vnmsub_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf4_t op2, size_t vl) { @@ -537,7 +537,7 @@ vuint16mf4_t test_vnmsub_vv_u16mf4(vuint16mf4_t acc, vuint16mf4_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsub_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vnmsub_vx_u16mf4(vuint16mf4_t acc, uint16_t op1, vuint16mf4_t op2, size_t vl) { @@ -546,7 +546,7 @@ vuint16mf4_t test_vnmsub_vx_u16mf4(vuint16mf4_t acc, 
uint16_t op1, vuint16mf4_t // CHECK-RV64-LABEL: @test_vnmsub_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsub_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf2_t op2, size_t vl) { @@ -555,7 +555,7 @@ vuint16mf2_t test_vnmsub_vv_u16mf2(vuint16mf2_t acc, vuint16mf2_t op1, vuint16mf // CHECK-RV64-LABEL: @test_vnmsub_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vnmsub_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t op2, size_t vl) { @@ -564,7 +564,7 @@ vuint16mf2_t test_vnmsub_vx_u16mf2(vuint16mf2_t acc, uint16_t op1, vuint16mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsub_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t op2, size_t vl) { @@ -573,7 +573,7 @@ vuint16m1_t test_vnmsub_vv_u16m1(vuint16m1_t acc, vuint16m1_t op1, vuint16m1_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vnmsub_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, size_t vl) { @@ -582,7 +582,7 @@ vuint16m1_t test_vnmsub_vx_u16m1(vuint16m1_t acc, uint16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vnmsub_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t op2, size_t vl) { @@ -591,7 +591,7 @@ vuint16m2_t test_vnmsub_vv_u16m2(vuint16m2_t acc, vuint16m2_t op1, vuint16m2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vnmsub_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, size_t vl) { @@ -600,7 +600,7 @@ vuint16m2_t test_vnmsub_vx_u16m2(vuint16m2_t acc, uint16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsub_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, 
vuint16m4_t op2, size_t vl) { @@ -609,7 +609,7 @@ vuint16m4_t test_vnmsub_vv_u16m4(vuint16m4_t acc, vuint16m4_t op1, vuint16m4_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vnmsub_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, size_t vl) { @@ -618,7 +618,7 @@ vuint16m4_t test_vnmsub_vx_u16m4(vuint16m4_t acc, uint16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.nxv32i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsub_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t op2, size_t vl) { @@ -627,7 +627,7 @@ vuint16m8_t test_vnmsub_vv_u16m8(vuint16m8_t acc, vuint16m8_t op1, vuint16m8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv32i16.i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vnmsub_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, size_t vl) { @@ -636,7 +636,7 @@ vuint16m8_t test_vnmsub_vx_u16m8(vuint16m8_t acc, uint16_t op1, vuint16m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsub_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf2_t op2, size_t vl) { @@ -645,7 +645,7 @@ vuint32mf2_t test_vnmsub_vv_u32mf2(vuint32mf2_t acc, vuint32mf2_t op1, vuint32mf // CHECK-RV64-LABEL: @test_vnmsub_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vnmsub_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t op2, size_t vl) { @@ -654,7 +654,7 @@ vuint32mf2_t test_vnmsub_vx_u32mf2(vuint32mf2_t acc, uint32_t op1, vuint32mf2_t // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsub_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t op2, size_t vl) { @@ -663,7 +663,7 @@ vuint32m1_t test_vnmsub_vv_u32m1(vuint32m1_t acc, vuint32m1_t op1, vuint32m1_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vnmsub_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, size_t vl) { @@ -672,7 +672,7 @@ vuint32m1_t test_vnmsub_vx_u32m1(vuint32m1_t acc, uint32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsub_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t op2, size_t vl) { @@ -681,7 +681,7 @@ vuint32m2_t test_vnmsub_vv_u32m2(vuint32m2_t acc, vuint32m2_t op1, vuint32m2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vnmsub_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, size_t vl) { @@ -690,7 +690,7 @@ vuint32m2_t test_vnmsub_vx_u32m2(vuint32m2_t acc, uint32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vnmsub_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t op2, size_t vl) { @@ -699,7 +699,7 @@ vuint32m4_t test_vnmsub_vv_u32m4(vuint32m4_t acc, vuint32m4_t op1, vuint32m4_t o // CHECK-RV64-LABEL: 
@test_vnmsub_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vnmsub_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, size_t vl) { @@ -708,7 +708,7 @@ vuint32m4_t test_vnmsub_vx_u32m4(vuint32m4_t acc, uint32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.nxv16i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsub_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t op2, size_t vl) { @@ -717,7 +717,7 @@ vuint32m8_t test_vnmsub_vv_u32m8(vuint32m8_t acc, vuint32m8_t op1, vuint32m8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv16i32.i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vnmsub_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, size_t vl) { @@ -726,7 +726,7 @@ vuint32m8_t test_vnmsub_vx_u32m8(vuint32m8_t acc, uint32_t op1, vuint32m8_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vnmsub.nxv1i64.nxv1i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsub_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t op2, size_t vl) { @@ -735,7 +735,7 @@ vuint64m1_t test_vnmsub_vv_u64m1(vuint64m1_t acc, vuint64m1_t op1, vuint64m1_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv1i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vnmsub_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, size_t vl) { @@ -744,7 +744,7 @@ vuint64m1_t test_vnmsub_vx_u64m1(vuint64m1_t acc, uint64_t op1, vuint64m1_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.nxv2i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vnmsub_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t op2, size_t vl) { @@ -753,7 +753,7 @@ vuint64m2_t test_vnmsub_vv_u64m2(vuint64m2_t acc, vuint64m2_t op1, vuint64m2_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv2i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vnmsub_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, size_t vl) { @@ -762,7 +762,7 
@@ vuint64m2_t test_vnmsub_vx_u64m2(vuint64m2_t acc, uint64_t op1, vuint64m2_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.nxv4i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsub_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t op2, size_t vl) { @@ -771,7 +771,7 @@ vuint64m4_t test_vnmsub_vv_u64m4(vuint64m4_t acc, vuint64m4_t op1, vuint64m4_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv4i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vnmsub_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, size_t vl) { @@ -780,7 +780,7 @@ vuint64m4_t test_vnmsub_vx_u64m4(vuint64m4_t acc, uint64_t op1, vuint64m4_t op2, // CHECK-RV64-LABEL: @test_vnmsub_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.nxv8i64.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsub_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t op2, size_t vl) { @@ -789,7 +789,7 @@ vuint64m8_t test_vnmsub_vv_u64m8(vuint64m8_t acc, vuint64m8_t op1, vuint64m8_t o // CHECK-RV64-LABEL: @test_vnmsub_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vnmsub.nxv8i64.i64.i64( [[ACC:%.*]], i64 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vnmsub_vx_u64m8(vuint64m8_t acc, uint64_t op1, vuint64m8_t op2, size_t vl) { diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics/vwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics/vwmacc.c index f8eb6e6d111a1..3239f2ee4287d 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics/vwmacc.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics/vwmacc.c @@ -6,7 +6,7 @@ // CHECK-RV64-LABEL: @test_vwmacc_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmacc_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, @@ -16,7 +16,7 @@ vint16mf4_t test_vwmacc_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmacc_vx_i16mf4(vint16mf4_t acc, int8_t op1, vint8mf8_t op2, @@ -26,7 +26,7 @@ vint16mf4_t test_vwmacc_vx_i16mf4(vint16mf4_t acc, int8_t op1, vint8mf8_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmacc.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmacc_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, @@ -36,7 +36,7 @@ vint16mf2_t test_vwmacc_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmacc_vx_i16mf2(vint16mf2_t acc, int8_t op1, vint8mf4_t op2, @@ -46,7 +46,7 @@ vint16mf2_t test_vwmacc_vx_i16mf2(vint16mf2_t acc, int8_t op1, vint8mf4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmacc_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, vint8mf2_t op2, @@ -56,7 +56,7 @@ vint16m1_t test_vwmacc_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmacc_vx_i16m1(vint16m1_t acc, int8_t op1, vint8mf2_t op2, @@ -66,7 +66,7 @@ vint16m1_t test_vwmacc_vx_i16m1(vint16m1_t acc, int8_t op1, 
vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmacc_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vint8m1_t op2, @@ -76,7 +76,7 @@ vint16m2_t test_vwmacc_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmacc_vx_i16m2(vint16m2_t acc, int8_t op1, vint8m1_t op2, @@ -86,7 +86,7 @@ vint16m2_t test_vwmacc_vx_i16m2(vint16m2_t acc, int8_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmacc_vv_i16m4(vint16m4_t acc, vint8m2_t op1, vint8m2_t op2, @@ -96,7 +96,7 @@ vint16m4_t test_vwmacc_vv_i16m4(vint16m4_t acc, vint8m2_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmacc.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmacc_vx_i16m4(vint16m4_t acc, int8_t op1, vint8m2_t op2, @@ -106,7 +106,7 @@ vint16m4_t test_vwmacc_vx_i16m4(vint16m4_t acc, int8_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmacc_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vint8m4_t op2, @@ -116,7 +116,7 @@ vint16m8_t test_vwmacc_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmacc_vx_i16m8(vint16m8_t acc, int8_t op1, vint8m4_t op2, @@ -126,7 +126,7 @@ vint16m8_t test_vwmacc_vx_i16m8(vint16m8_t acc, int8_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmacc_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, @@ -136,7 +136,7 @@ vint32mf2_t 
test_vwmacc_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmacc_vx_i32mf2(vint32mf2_t acc, int16_t op1, vint16mf4_t op2, @@ -146,7 +146,7 @@ vint32mf2_t test_vwmacc_vx_i32mf2(vint32mf2_t acc, int16_t op1, vint16mf4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmacc_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, @@ -156,7 +156,7 @@ vint32m1_t test_vwmacc_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmacc_vx_i32m1(vint32m1_t acc, int16_t op1, vint16mf2_t op2, @@ -166,7 +166,7 @@ vint32m1_t test_vwmacc_vx_i32m1(vint32m1_t acc, int16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmacc_vv_i32m2(vint32m2_t acc, vint16m1_t op1, vint16m1_t op2, @@ -176,7 +176,7 @@ vint32m2_t test_vwmacc_vv_i32m2(vint32m2_t acc, vint16m1_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmacc_vx_i32m2(vint32m2_t acc, int16_t op1, vint16m1_t op2, @@ -186,7 +186,7 @@ vint32m2_t test_vwmacc_vx_i32m2(vint32m2_t acc, int16_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmacc_vv_i32m4(vint32m4_t acc, vint16m2_t op1, vint16m2_t op2, @@ -196,7 +196,7 @@ vint32m4_t test_vwmacc_vv_i32m4(vint32m4_t acc, vint16m2_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmacc_vx_i32m4(vint32m4_t acc, int16_t op1, vint16m2_t op2, 
@@ -206,7 +206,7 @@ vint32m4_t test_vwmacc_vx_i32m4(vint32m4_t acc, int16_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmacc_vv_i32m8(vint32m8_t acc, vint16m4_t op1, vint16m4_t op2, @@ -216,7 +216,7 @@ vint32m8_t test_vwmacc_vv_i32m8(vint32m8_t acc, vint16m4_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmacc_vx_i32m8(vint32m8_t acc, int16_t op1, vint16m4_t op2, @@ -226,7 +226,7 @@ vint32m8_t test_vwmacc_vx_i32m8(vint32m8_t acc, int16_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmacc_vv_i64m1(vint64m1_t acc, vint32mf2_t op1, @@ -236,7 +236,7 @@ vint64m1_t test_vwmacc_vv_i64m1(vint64m1_t acc, vint32mf2_t op1, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmacc_vx_i64m1(vint64m1_t acc, int32_t op1, vint32mf2_t op2, @@ -246,7 +246,7 @@ vint64m1_t test_vwmacc_vx_i64m1(vint64m1_t acc, int32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmacc_vv_i64m2(vint64m2_t acc, vint32m1_t op1, vint32m1_t op2, @@ -256,7 +256,7 @@ vint64m2_t test_vwmacc_vv_i64m2(vint64m2_t acc, vint32m1_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmacc_vx_i64m2(vint64m2_t acc, int32_t op1, vint32m1_t op2, @@ -266,7 +266,7 @@ vint64m2_t test_vwmacc_vx_i64m2(vint64m2_t acc, int32_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t 
test_vwmacc_vv_i64m4(vint64m4_t acc, vint32m2_t op1, vint32m2_t op2, @@ -276,7 +276,7 @@ vint64m4_t test_vwmacc_vv_i64m4(vint64m4_t acc, vint32m2_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmacc_vx_i64m4(vint64m4_t acc, int32_t op1, vint32m2_t op2, @@ -286,7 +286,7 @@ vint64m4_t test_vwmacc_vx_i64m4(vint64m4_t acc, int32_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmacc_vv_i64m8(vint64m8_t acc, vint32m4_t op1, vint32m4_t op2, @@ -296,7 +296,7 @@ vint64m8_t test_vwmacc_vv_i64m8(vint64m8_t acc, vint32m4_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vwmacc_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmacc.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmacc_vx_i64m8(vint64m8_t acc, int32_t op1, vint32m4_t op2, @@ -306,7 +306,7 @@ vint64m8_t test_vwmacc_vx_i64m8(vint64m8_t acc, int32_t op1, vint32m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwmaccu_vv_u16mf4(vuint16mf4_t acc, vuint8mf8_t op1, @@ -316,7 +316,7 @@ vuint16mf4_t test_vwmaccu_vv_u16mf4(vuint16mf4_t acc, vuint8mf8_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf4_t test_vwmaccu_vx_u16mf4(vuint16mf4_t acc, uint8_t op1, @@ -326,7 +326,7 @@ vuint16mf4_t test_vwmaccu_vx_u16mf4(vuint16mf4_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16mf2_t test_vwmaccu_vv_u16mf2(vuint16mf2_t acc, vuint8mf4_t op1, @@ -336,7 +336,7 @@ vuint16mf2_t test_vwmaccu_vv_u16mf2(vuint16mf2_t acc, vuint8mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vuint16mf2_t test_vwmaccu_vx_u16mf2(vuint16mf2_t acc, uint8_t op1, @@ -346,7 +346,7 @@ vuint16mf2_t test_vwmaccu_vx_u16mf2(vuint16mf2_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwmaccu_vv_u16m1(vuint16m1_t acc, vuint8mf2_t op1, @@ -356,7 +356,7 @@ vuint16m1_t test_vwmaccu_vv_u16m1(vuint16m1_t acc, vuint8mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m1_t test_vwmaccu_vx_u16m1(vuint16m1_t acc, uint8_t op1, vuint8mf2_t op2, @@ -366,7 +366,7 @@ vuint16m1_t test_vwmaccu_vx_u16m1(vuint16m1_t acc, uint8_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwmaccu_vv_u16m2(vuint16m2_t acc, vuint8m1_t op1, @@ -376,7 +376,7 @@ vuint16m2_t test_vwmaccu_vv_u16m2(vuint16m2_t acc, vuint8m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmaccu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m2_t test_vwmaccu_vx_u16m2(vuint16m2_t acc, uint8_t op1, vuint8m1_t op2, @@ -386,7 +386,7 @@ vuint16m2_t test_vwmaccu_vx_u16m2(vuint16m2_t acc, uint8_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwmaccu_vv_u16m4(vuint16m4_t acc, vuint8m2_t op1, @@ -396,7 +396,7 @@ vuint16m4_t test_vwmaccu_vv_u16m4(vuint16m4_t acc, vuint8m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m4_t test_vwmaccu_vx_u16m4(vuint16m4_t acc, uint8_t op1, vuint8m2_t op2, @@ -406,7 +406,7 @@ vuint16m4_t test_vwmaccu_vx_u16m4(vuint16m4_t acc, uint8_t op1, vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // 
CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwmaccu_vv_u16m8(vuint16m8_t acc, vuint8m4_t op1, @@ -416,7 +416,7 @@ vuint16m8_t test_vwmaccu_vv_u16m8(vuint16m8_t acc, vuint8m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint16m8_t test_vwmaccu_vx_u16m8(vuint16m8_t acc, uint8_t op1, vuint8m4_t op2, @@ -426,7 +426,7 @@ vuint16m8_t test_vwmaccu_vx_u16m8(vuint16m8_t acc, uint8_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwmaccu_vv_u32mf2(vuint32mf2_t acc, vuint16mf4_t op1, @@ -436,7 +436,7 @@ vuint32mf2_t test_vwmaccu_vv_u32mf2(vuint32mf2_t acc, vuint16mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32mf2_t test_vwmaccu_vx_u32mf2(vuint32mf2_t acc, uint16_t op1, @@ -446,7 +446,7 @@ vuint32mf2_t test_vwmaccu_vx_u32mf2(vuint32mf2_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwmaccu_vv_u32m1(vuint32m1_t acc, vuint16mf2_t op1, @@ -456,7 +456,7 @@ vuint32m1_t test_vwmaccu_vv_u32m1(vuint32m1_t acc, vuint16mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m1_t test_vwmaccu_vx_u32m1(vuint32m1_t acc, uint16_t op1, @@ -466,7 +466,7 @@ vuint32m1_t test_vwmaccu_vx_u32m1(vuint32m1_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m2_t test_vwmaccu_vv_u32m2(vuint32m2_t acc, vuint16m1_t op1, @@ -476,7 +476,7 @@ vuint32m2_t test_vwmaccu_vv_u32m2(vuint32m2_t acc, vuint16m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // 
vuint32m2_t test_vwmaccu_vx_u32m2(vuint32m2_t acc, uint16_t op1, @@ -486,7 +486,7 @@ vuint32m2_t test_vwmaccu_vx_u32m2(vuint32m2_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwmaccu_vv_u32m4(vuint32m4_t acc, vuint16m2_t op1, @@ -496,7 +496,7 @@ vuint32m4_t test_vwmaccu_vv_u32m4(vuint32m4_t acc, vuint16m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m4_t test_vwmaccu_vx_u32m4(vuint32m4_t acc, uint16_t op1, @@ -506,7 +506,7 @@ vuint32m4_t test_vwmaccu_vx_u32m4(vuint32m4_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwmaccu_vv_u32m8(vuint32m8_t acc, vuint16m4_t op1, @@ -516,7 +516,7 @@ vuint32m8_t test_vwmaccu_vv_u32m8(vuint32m8_t acc, vuint16m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.i16.nxv16i16.i64( 
[[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint32m8_t test_vwmaccu_vx_u32m8(vuint32m8_t acc, uint16_t op1, @@ -526,7 +526,7 @@ vuint32m8_t test_vwmaccu_vx_u32m8(vuint32m8_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwmaccu_vv_u64m1(vuint64m1_t acc, vuint32mf2_t op1, @@ -536,7 +536,7 @@ vuint64m1_t test_vwmaccu_vv_u64m1(vuint64m1_t acc, vuint32mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m1_t test_vwmaccu_vx_u64m1(vuint64m1_t acc, uint32_t op1, @@ -546,7 +546,7 @@ vuint64m1_t test_vwmaccu_vx_u64m1(vuint64m1_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwmaccu_vv_u64m2(vuint64m2_t acc, vuint32m1_t op1, @@ -556,7 
+556,7 @@ vuint64m2_t test_vwmaccu_vv_u64m2(vuint64m2_t acc, vuint32m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m2_t test_vwmaccu_vx_u64m2(vuint64m2_t acc, uint32_t op1, @@ -566,7 +566,7 @@ vuint64m2_t test_vwmaccu_vx_u64m2(vuint64m2_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwmaccu_vv_u64m4(vuint64m4_t acc, vuint32m2_t op1, @@ -576,7 +576,7 @@ vuint64m4_t test_vwmaccu_vv_u64m4(vuint64m4_t acc, vuint32m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m4_t test_vwmaccu_vx_u64m4(vuint64m4_t acc, uint32_t op1, @@ -586,7 +586,7 @@ vuint64m4_t test_vwmaccu_vx_u64m4(vuint64m4_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vv_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vwmaccu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwmaccu_vv_u64m8(vuint64m8_t acc, vuint32m4_t op1, @@ -596,7 +596,7 @@ vuint64m8_t test_vwmaccu_vv_u64m8(vuint64m8_t acc, vuint32m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccu_vx_u64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vuint64m8_t test_vwmaccu_vx_u64m8(vuint64m8_t acc, uint32_t op1, @@ -606,7 +606,7 @@ vuint64m8_t test_vwmaccu_vx_u64m8(vuint64m8_t acc, uint32_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8.nxv1i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmaccsu_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, @@ -616,7 +616,7 @@ vint16mf4_t test_vwmaccsu_vv_i16mf4(vint16mf4_t acc, vint8mf8_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmaccsu_vx_i16mf4(vint16mf4_t acc, int8_t op1, @@ -626,7 +626,7 @@ vint16mf4_t test_vwmaccsu_vx_i16mf4(vint16mf4_t acc, int8_t op1, // 
CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i16.nxv2i8.nxv2i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmaccsu_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, @@ -636,7 +636,7 @@ vint16mf2_t test_vwmaccsu_vv_i16mf2(vint16mf2_t acc, vint8mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmaccsu_vx_i16mf2(vint16mf2_t acc, int8_t op1, @@ -646,7 +646,7 @@ vint16mf2_t test_vwmaccsu_vx_i16mf2(vint16mf2_t acc, int8_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.nxv4i8.nxv4i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmaccsu_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, @@ -656,7 +656,7 @@ vint16m1_t test_vwmaccsu_vv_i16m1(vint16m1_t acc, vint8mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], 
[[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmaccsu_vx_i16m1(vint16m1_t acc, int8_t op1, vuint8mf2_t op2, @@ -666,7 +666,7 @@ vint16m1_t test_vwmaccsu_vx_i16m1(vint16m1_t acc, int8_t op1, vuint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.nxv8i8.nxv8i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmaccsu_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vuint8m1_t op2, @@ -676,7 +676,7 @@ vint16m2_t test_vwmaccsu_vv_i16m2(vint16m2_t acc, vint8m1_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmaccsu_vx_i16m2(vint16m2_t acc, int8_t op1, vuint8m1_t op2, @@ -686,7 +686,7 @@ vint16m2_t test_vwmaccsu_vx_i16m2(vint16m2_t acc, int8_t op1, vuint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.nxv16i8.nxv16i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmaccsu_vv_i16m4(vint16m4_t acc, vint8m2_t op1, vuint8m2_t op2, @@ -696,7 +696,7 @@ vint16m4_t test_vwmaccsu_vv_i16m4(vint16m4_t acc, vint8m2_t op1, 
vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmaccsu_vx_i16m4(vint16m4_t acc, int8_t op1, vuint8m2_t op2, @@ -706,7 +706,7 @@ vint16m4_t test_vwmaccsu_vx_i16m4(vint16m4_t acc, int8_t op1, vuint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.nxv32i8.nxv32i8.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmaccsu_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vuint8m4_t op2, @@ -716,7 +716,7 @@ vint16m8_t test_vwmaccsu_vv_i16m8(vint16m8_t acc, vint8m4_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmaccsu_vx_i16m8(vint16m8_t acc, int8_t op1, vuint8m4_t op2, @@ -726,7 +726,7 @@ vint16m8_t test_vwmaccsu_vx_i16m8(vint16m8_t acc, int8_t op1, vuint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.nxv1i16.nxv1i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmaccsu_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, @@ -736,7 +736,7 @@ vint32mf2_t test_vwmaccsu_vv_i32mf2(vint32mf2_t acc, vint16mf4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t test_vwmaccsu_vx_i32mf2(vint32mf2_t acc, int16_t op1, @@ -746,7 +746,7 @@ vint32mf2_t test_vwmaccsu_vx_i32mf2(vint32mf2_t acc, int16_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.nxv2i16.nxv2i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmaccsu_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, @@ -756,7 +756,7 @@ vint32m1_t test_vwmaccsu_vv_i32m1(vint32m1_t acc, vint16mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmaccsu_vx_i32m1(vint32m1_t acc, int16_t op1, vuint16mf2_t op2, @@ -766,7 +766,7 @@ vint32m1_t 
test_vwmaccsu_vx_i32m1(vint32m1_t acc, int16_t op1, vuint16mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16.nxv4i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmaccsu_vv_i32m2(vint32m2_t acc, vint16m1_t op1, @@ -776,7 +776,7 @@ vint32m2_t test_vwmaccsu_vv_i32m2(vint32m2_t acc, vint16m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmaccsu_vx_i32m2(vint32m2_t acc, int16_t op1, vuint16m1_t op2, @@ -786,7 +786,7 @@ vint32m2_t test_vwmaccsu_vx_i32m2(vint32m2_t acc, int16_t op1, vuint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.nxv8i16.nxv8i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmaccsu_vv_i32m4(vint32m4_t acc, vint16m2_t op1, @@ -796,7 +796,7 @@ vint32m4_t test_vwmaccsu_vv_i32m4(vint32m4_t acc, vint16m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmaccsu_vx_i32m4(vint32m4_t acc, int16_t op1, vuint16m2_t op2, @@ -806,7 +806,7 @@ vint32m4_t test_vwmaccsu_vx_i32m4(vint32m4_t acc, int16_t op1, vuint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.nxv16i16.nxv16i16.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmaccsu_vv_i32m8(vint32m8_t acc, vint16m4_t op1, @@ -816,7 +816,7 @@ vint32m8_t test_vwmaccsu_vv_i32m8(vint32m8_t acc, vint16m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i32m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmaccsu_vx_i32m8(vint32m8_t acc, int16_t op1, vuint16m4_t op2, @@ -826,7 +826,7 @@ vint32m8_t test_vwmaccsu_vx_i32m8(vint32m8_t acc, int16_t op1, vuint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.nxv1i32.nxv1i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmaccsu_vv_i64m1(vint64m1_t acc, vint32mf2_t 
op1, @@ -836,7 +836,7 @@ vint64m1_t test_vwmaccsu_vv_i64m1(vint64m1_t acc, vint32mf2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmaccsu_vx_i64m1(vint64m1_t acc, int32_t op1, vuint32mf2_t op2, @@ -846,7 +846,7 @@ vint64m1_t test_vwmaccsu_vx_i64m1(vint64m1_t acc, int32_t op1, vuint32mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.nxv2i32.nxv2i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmaccsu_vv_i64m2(vint64m2_t acc, vint32m1_t op1, @@ -856,7 +856,7 @@ vint64m2_t test_vwmaccsu_vv_i64m2(vint64m2_t acc, vint32m1_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmaccsu_vx_i64m2(vint64m2_t acc, int32_t op1, vuint32m1_t op2, @@ -866,7 +866,7 @@ vint64m2_t test_vwmaccsu_vx_i64m2(vint64m2_t acc, int32_t op1, vuint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.nxv4i32.nxv4i32.i64( 
[[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.nxv4i32.nxv4i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmaccsu_vv_i64m4(vint64m4_t acc, vint32m2_t op1, @@ -876,7 +876,7 @@ vint64m4_t test_vwmaccsu_vv_i64m4(vint64m4_t acc, vint32m2_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmaccsu_vx_i64m4(vint64m4_t acc, int32_t op1, vuint32m2_t op2, @@ -886,7 +886,7 @@ vint64m4_t test_vwmaccsu_vx_i64m4(vint64m4_t acc, int32_t op1, vuint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccsu_vv_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.nxv8i32.nxv8i32.i64( [[ACC:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmaccsu_vv_i64m8(vint64m8_t acc, vint32m4_t op1, @@ -896,7 +896,7 @@ vint64m8_t test_vwmaccsu_vv_i64m8(vint64m8_t acc, vint32m4_t op1, // CHECK-RV64-LABEL: @test_vwmaccsu_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccsu.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmaccsu_vx_i64m8(vint64m8_t 
acc, int32_t op1, vuint32m4_t op2, @@ -906,7 +906,7 @@ vint64m8_t test_vwmaccsu_vx_i64m8(vint64m8_t acc, int32_t op1, vuint32m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16mf4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i16.i8.nxv1i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf4_t test_vwmaccus_vx_i16mf4(vint16mf4_t acc, uint8_t op1, @@ -916,7 +916,7 @@ vint16mf4_t test_vwmaccus_vx_i16mf4(vint16mf4_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i16.i8.nxv2i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16mf2_t test_vwmaccus_vx_i16mf2(vint16mf2_t acc, uint8_t op1, @@ -926,7 +926,7 @@ vint16mf2_t test_vwmaccus_vx_i16mf2(vint16mf2_t acc, uint8_t op1, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i16.i8.nxv4i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m1_t test_vwmaccus_vx_i16m1(vint16m1_t acc, uint8_t op1, vint8mf2_t op2, @@ -936,7 +936,7 @@ vint16m1_t test_vwmaccus_vx_i16m1(vint16m1_t acc, uint8_t op1, vint8mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 
[[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i16.i8.nxv8i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m2_t test_vwmaccus_vx_i16m2(vint16m2_t acc, uint8_t op1, vint8m1_t op2, @@ -946,7 +946,7 @@ vint16m2_t test_vwmaccus_vx_i16m2(vint16m2_t acc, uint8_t op1, vint8m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i16.i8.nxv16i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m4_t test_vwmaccus_vx_i16m4(vint16m4_t acc, uint8_t op1, vint8m2_t op2, @@ -956,7 +956,7 @@ vint16m4_t test_vwmaccus_vx_i16m4(vint16m4_t acc, uint8_t op1, vint8m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i16m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv32i16.i8.nxv32i8.i64( [[ACC:%.*]], i8 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint16m8_t test_vwmaccus_vx_i16m8(vint16m8_t acc, uint8_t op1, vint8m4_t op2, @@ -966,7 +966,7 @@ vint16m8_t test_vwmaccus_vx_i16m8(vint16m8_t acc, uint8_t op1, vint8m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32mf2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i32.i16.nxv1i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32mf2_t 
test_vwmaccus_vx_i32mf2(vint32mf2_t acc, uint16_t op1, @@ -976,7 +976,7 @@ vint32mf2_t test_vwmaccus_vx_i32mf2(vint32mf2_t acc, uint16_t op1, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i32.i16.nxv2i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m1_t test_vwmaccus_vx_i32m1(vint32m1_t acc, uint16_t op1, vint16mf2_t op2, @@ -986,7 +986,7 @@ vint32m1_t test_vwmaccus_vx_i32m1(vint32m1_t acc, uint16_t op1, vint16mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i32.i16.nxv4i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m2_t test_vwmaccus_vx_i32m2(vint32m2_t acc, uint16_t op1, vint16m1_t op2, @@ -996,7 +996,7 @@ vint32m2_t test_vwmaccus_vx_i32m2(vint32m2_t acc, uint16_t op1, vint16m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i32.i16.nxv8i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m4_t test_vwmaccus_vx_i32m4(vint32m4_t acc, uint16_t op1, vint16m2_t op2, @@ -1006,7 +1006,7 @@ vint32m4_t test_vwmaccus_vx_i32m4(vint32m4_t acc, uint16_t op1, vint16m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i32m8( // CHECK-RV64-NEXT: entry: -// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv16i32.i16.nxv16i16.i64( [[ACC:%.*]], i16 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint32m8_t test_vwmaccus_vx_i32m8(vint32m8_t acc, uint16_t op1, vint16m4_t op2, @@ -1016,7 +1016,7 @@ vint32m8_t test_vwmaccus_vx_i32m8(vint32m8_t acc, uint16_t op1, vint16m4_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m1( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv1i64.i32.nxv1i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m1_t test_vwmaccus_vx_i64m1(vint64m1_t acc, uint32_t op1, vint32mf2_t op2, @@ -1026,7 +1026,7 @@ vint64m1_t test_vwmaccus_vx_i64m1(vint64m1_t acc, uint32_t op1, vint32mf2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m2( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv2i64.i32.nxv2i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m2_t test_vwmaccus_vx_i64m2(vint64m2_t acc, uint32_t op1, vint32m1_t op2, @@ -1036,7 +1036,7 @@ vint64m2_t test_vwmaccus_vx_i64m2(vint64m2_t acc, uint32_t op1, vint32m1_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m4( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vwmaccus.nxv4i64.i32.nxv4i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m4_t test_vwmaccus_vx_i64m4(vint64m4_t acc, uint32_t op1, vint32m2_t op2, @@ -1046,7 +1046,7 @@ vint64m4_t test_vwmaccus_vx_i64m4(vint64m4_t acc, uint32_t op1, vint32m2_t op2, // CHECK-RV64-LABEL: @test_vwmaccus_vx_i64m8( // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vwmaccus.nxv8i64.i32.nxv8i32.i64( [[ACC:%.*]], i32 [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-RV64-NEXT: ret [[TMP0]] // vint64m8_t test_vwmaccus_vx_i64m8(vint64m8_t acc, uint32_t op1, vint32m4_t op2, diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index 9888e6c6862cf..dbb636c292d07 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -149,6 +149,12 @@ enum RISCVPredefinedMacro : RISCVPredefinedMacroT { VectorMaxELenFp64 = 1 << 6, }; +enum Policy : uint8_t { + None, + HasPassthruOperand, + HasPolicyOperand, +}; + // TODO refactor RVVIntrinsic class design after support all intrinsic // combination. 
This represents an instantiation of an intrinsic with a // particular type and prototype @@ -162,7 +168,7 @@ class RVVIntrinsic { bool IsMask; bool HasVL; bool HasPolicy; - bool HasNoMaskPassThru; + Policy NoMaskPolicy; bool HasNoMaskedOverloaded; bool HasAutoDef; // There is automiatic definition in header std::string ManualCodegen; @@ -178,7 +184,7 @@ class RVVIntrinsic { RVVIntrinsic(StringRef Name, StringRef Suffix, StringRef MangledName, StringRef MangledSuffix, StringRef IRName, bool IsMask, bool HasMaskedOffOperand, bool HasVL, bool HasPolicy, - bool HasNoMaskPassThru, bool HasNoMaskedOverloaded, + Policy NoMaskPolicy, bool HasNoMaskedOverloaded, bool HasAutoDef, StringRef ManualCodegen, const RVVTypes &Types, const std::vector &IntrinsicTypes, const std::vector &RequiredFeatures, unsigned NF); @@ -189,13 +195,15 @@ class RVVIntrinsic { StringRef getMangledName() const { return MangledName; } bool hasVL() const { return HasVL; } bool hasPolicy() const { return HasPolicy; } - bool hasNoMaskPassThru() const { return HasNoMaskPassThru; } + bool hasNoMaskPassthru() const { return NoMaskPolicy == HasPassthruOperand; } + bool hasNoMaskPolicy() const { return NoMaskPolicy == HasPolicyOperand; } bool hasNoMaskedOverloaded() const { return HasNoMaskedOverloaded; } bool hasManualCodegen() const { return !ManualCodegen.empty(); } bool hasAutoDef() const { return HasAutoDef; } bool isMask() const { return IsMask; } StringRef getIRName() const { return IRName; } StringRef getManualCodegen() const { return ManualCodegen; } + Policy getNoMaskPolicy() const { return NoMaskPolicy; } RISCVPredefinedMacroT getRISCVPredefinedMacros() const { return RISCVPredefinedMacros; } @@ -307,7 +315,7 @@ VScaleVal LMULType::getScale(unsigned ElementBitwidth) const { } // Illegal vscale result would be less than 1 if (Log2ScaleResult < 0) - return None; + return llvm::None; return 1 << Log2ScaleResult; } @@ -768,20 +776,16 @@ void RVVType::applyModifier(StringRef Transformer) { 
//===----------------------------------------------------------------------===// // RVVIntrinsic implementation //===----------------------------------------------------------------------===// -RVVIntrinsic::RVVIntrinsic(StringRef NewName, StringRef Suffix, - StringRef NewMangledName, StringRef MangledSuffix, - StringRef IRName, bool IsMask, - bool HasMaskedOffOperand, bool HasVL, bool HasPolicy, - bool HasNoMaskPassThru, bool HasNoMaskedOverloaded, - bool HasAutoDef, StringRef ManualCodegen, - const RVVTypes &OutInTypes, - const std::vector &NewIntrinsicTypes, - const std::vector &RequiredFeatures, - unsigned NF) +RVVIntrinsic::RVVIntrinsic( + StringRef NewName, StringRef Suffix, StringRef NewMangledName, + StringRef MangledSuffix, StringRef IRName, bool IsMask, + bool HasMaskedOffOperand, bool HasVL, bool HasPolicy, Policy NoMaskPolicy, + bool HasNoMaskedOverloaded, bool HasAutoDef, StringRef ManualCodegen, + const RVVTypes &OutInTypes, const std::vector &NewIntrinsicTypes, + const std::vector &RequiredFeatures, unsigned NF) : IRName(IRName), IsMask(IsMask), HasVL(HasVL), HasPolicy(HasPolicy), - HasNoMaskPassThru(HasNoMaskPassThru), - HasNoMaskedOverloaded(HasNoMaskedOverloaded), HasAutoDef(HasAutoDef), - ManualCodegen(ManualCodegen.str()), NF(NF) { + NoMaskPolicy(NoMaskPolicy), HasNoMaskedOverloaded(HasNoMaskedOverloaded), + HasAutoDef(HasAutoDef), ManualCodegen(ManualCodegen.str()), NF(NF) { // Init BuiltinName, Name and MangledName BuiltinName = NewName.str(); @@ -827,7 +831,7 @@ RVVIntrinsic::RVVIntrinsic(StringRef NewName, StringRef Suffix, // IntrinsicTypes is nonmasked version index. Need to update it // if there is maskedoff operand (It is always in first operand). 
IntrinsicTypes = NewIntrinsicTypes; - if ((IsMask && HasMaskedOffOperand) || (!IsMask && HasNoMaskPassThru)) { + if ((IsMask && HasMaskedOffOperand) || (!IsMask && hasNoMaskPassthru())) { for (auto &I : IntrinsicTypes) { if (I >= 0) I += NF; @@ -864,9 +868,14 @@ void RVVIntrinsic::emitCodeGenSwitchBody(raw_ostream &OS) const { } else { OS << " std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());\n"; } - } else if (hasNoMaskPassThru()) { - OS << " Ops.push_back(llvm::UndefValue::get(ResultType));\n"; - OS << " std::rotate(Ops.rbegin(), Ops.rbegin() + 1, Ops.rend());\n"; + } else { + if (hasNoMaskPolicy()) + OS << " Ops.push_back(ConstantInt::get(Ops.back()->getType(), " + "TAIL_UNDISTURBED));\n"; + else if (hasNoMaskPassthru()) { + OS << " Ops.push_back(llvm::UndefValue::get(ResultType));\n"; + OS << " std::rotate(Ops.rbegin(), Ops.rbegin() + 1, Ops.rend());\n"; + } } OS << " IntrinsicTypes = {"; @@ -1114,8 +1123,8 @@ void RVVEmitter::createCodeGen(raw_ostream &OS) { PrintFatalError("Builtin with same name has different HasPolicy"); else if (P.first->second->hasPolicy() != Def->hasPolicy()) PrintFatalError("Builtin with same name has different HasPolicy"); - else if (P.first->second->hasNoMaskPassThru() != Def->hasNoMaskPassThru()) - PrintFatalError("Builtin with same name has different HasNoMaskPassThru"); + else if (P.first->second->getNoMaskPolicy() != Def->getNoMaskPolicy()) + PrintFatalError("Builtin with same name has different getNoMaskPolicy"); else if (P.first->second->getIntrinsicTypes() != Def->getIntrinsicTypes()) PrintFatalError("Builtin with same name has different IntrinsicTypes"); } @@ -1163,7 +1172,9 @@ void RVVEmitter::createRVVIntrinsics( bool HasMaskedOffOperand = R->getValueAsBit("HasMaskedOffOperand"); bool HasVL = R->getValueAsBit("HasVL"); bool HasPolicy = R->getValueAsBit("HasPolicy"); - bool HasNoMaskPassThru = R->getValueAsBit("HasNoMaskPassThru"); + Record* NoMaskPolicyRecord = R->getValueAsDef("NoMaskPolicy"); + Policy NoMaskPolicy = + 
static_cast(NoMaskPolicyRecord->getValueAsInt("Value")); bool HasNoMaskedOverloaded = R->getValueAsBit("HasNoMaskedOverloaded"); std::vector Log2LMULList = R->getValueAsListOfInts("Log2LMUL"); StringRef ManualCodegen = R->getValueAsString("ManualCodegen"); @@ -1238,7 +1249,7 @@ void RVVEmitter::createRVVIntrinsics( Out.push_back(std::make_unique( Name, SuffixStr, MangledName, MangledSuffixStr, IRName, /*IsMask=*/false, /*HasMaskedOffOperand=*/false, HasVL, HasPolicy, - HasNoMaskPassThru, HasNoMaskedOverloaded, HasAutoDef, ManualCodegen, + NoMaskPolicy, HasNoMaskedOverloaded, HasAutoDef, ManualCodegen, Types.getValue(), IntrinsicTypes, RequiredFeatures, NF)); if (HasMask) { // Create a mask intrinsic @@ -1247,7 +1258,7 @@ void RVVEmitter::createRVVIntrinsics( Out.push_back(std::make_unique( Name, SuffixStr, MangledName, MangledSuffixStr, IRNameMask, /*IsMask=*/true, HasMaskedOffOperand, HasVL, HasPolicy, - HasNoMaskPassThru, HasNoMaskedOverloaded, HasAutoDef, + NoMaskPolicy, HasNoMaskedOverloaded, HasAutoDef, ManualCodegenMask, MaskTypes.getValue(), IntrinsicTypes, RequiredFeatures, NF)); } diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 33bbf2a2bf4c2..b588687da12ff 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -686,11 +686,12 @@ let TargetPrefix = "riscv" in { [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let VLOperand = 4; } + // Input: (vector_in, vector_in/scalar, vector_in, vl, policy) class RISCVTernaryAAXANoMask : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_any_ty, LLVMMatchType<0>, - llvm_anyint_ty], - [IntrNoMem]>, RISCVVIntrinsic { + llvm_anyint_ty, LLVMMatchType<2>], + [ImmArg>, IntrNoMem]>, RISCVVIntrinsic { let SplatOperand = 1; let VLOperand = 3; } @@ -704,11 +705,12 @@ let TargetPrefix = "riscv" in { let SplatOperand = 1; let VLOperand = 4; } + // Input: (vector_in, vector_in/scalar, vector_in, vl, policy) class RISCVTernaryWideNoMask : 
Intrinsic< [llvm_anyvector_ty], [LLVMMatchType<0>, llvm_any_ty, llvm_anyvector_ty, - llvm_anyint_ty], - [IntrNoMem] >, RISCVVIntrinsic { + llvm_anyint_ty, LLVMMatchType<3>], + [ImmArg>, IntrNoMem] >, RISCVVIntrinsic { let SplatOperand = 1; let VLOperand = 3; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index fc6ec3879c779..4e762b63d8013 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -3196,12 +3196,12 @@ class VPatTernaryNoMaskWithPolicy(inst#"_"#kind#"_"#vlmul.MX) result_reg_class:$rs3, (op1_type op1_reg_class:$rs1), op2_kind:$rs2, - GPR:$vl, sew, TAIL_UNDISTURBED)>; + GPR:$vl, sew, (XLenVT timm:$policy))>; class VPatTernaryMask @llvm.riscv.vmacc.nxv64i8.nxv64i8( , , , + i64, i64); define @callee( %arg0, %arg1, %arg2) { @@ -19,7 +20,7 @@ define @callee( %arg0, % %ret = call @llvm.riscv.vmacc.nxv64i8.nxv64i8( %arg0, %arg1, - %arg2, i64 1024) + %arg2, i64 1024, i64 0) ret %ret } diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll new file mode 100644 index 0000000000000..a13c20dfba6d4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll @@ -0,0 +1,609 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh \ +; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s --check-prefix=RV32 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh \ +; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s --check-prefix=RV64 + +declare @llvm.riscv.vfmacc.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfmacc_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfmacc_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfmacc.vv v8, v10, v9 +; 
RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmacc_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfmacc.vv v8, v10, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmacc.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfmadd.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfmadd_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfmadd_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfmadd.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmadd_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfmadd.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmadd.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfmsac.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfmsac_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfmsac_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfmsac.vv v8, v10, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmsac_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfmsac.vv v8, v10, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmsac.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfmsub.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfmsub_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfmsub_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfmsub.vv v8, v9, v10 +; RV32-NEXT: ret +; +; 
RV64-LABEL: intrinsic_vfmsub_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfmsub.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmsub.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfnmacc.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfnmacc_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfnmacc_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfnmacc.vv v8, v10, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfnmacc_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfnmacc.vv v8, v10, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfnmacc.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfnmadd.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfnmadd_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfnmadd_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfnmadd.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfnmadd_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfnmadd.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfnmadd.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfnmsac.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfnmsac_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfnmsac_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfnmsac.vv v8, v10, v9 +; RV32-NEXT: ret +; +; 
RV64-LABEL: intrinsic_vfnmsac_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfnmsac.vv v8, v10, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfnmsac.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfnmsub.nxv1f32.nxv1f32( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfnmsub_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfnmsub_vv_nxv1f32_nxv1f32_nxv1f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV32-NEXT: vfnmsub.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfnmsub_vv_nxv1f32_nxv1f32_nxv1f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, mu +; RV64-NEXT: vfnmsub.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfnmsub.nxv1f32.nxv1f32( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfwmacc.nxv1f32.nxv1f16( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfwmacc_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfwmacc_vv_nxv1f32_nxv1f16_nxv1f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfwmacc.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfwmacc_vv_nxv1f32_nxv1f16_nxv1f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfwmacc.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfwmacc.nxv1f32.nxv1f16( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfwmsac.nxv1f32.nxv1f16( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfwmsac_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfwmsac_vv_nxv1f32_nxv1f16_nxv1f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfwmsac.vv v8, v9, v10 +; RV32-NEXT: ret +; +; 
RV64-LABEL: intrinsic_vfwmsac_vv_nxv1f32_nxv1f16_nxv1f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfwmsac.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfwmsac.nxv1f32.nxv1f16( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfwnmacc_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfwnmacc_vv_nxv1f32_nxv1f16_nxv1f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfwnmacc.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfwnmacc_vv_nxv1f32_nxv1f16_nxv1f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfwnmacc.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vfwnmsac_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vfwnmsac_vv_nxv1f32_nxv1f16_nxv1f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfwnmsac.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfwnmsac_vv_nxv1f32_nxv1f16_nxv1f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfwnmsac.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vmacc.nxv1i64.i64( + , + i64, + , + iXLen, + iXLen); + +define @intrinsic_vmacc_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vmacc_vx_nxv1i64_i64_nxv1i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli 
zero, a2, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vmacc.vv v8, v9, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmacc_vx_nxv1i64_i64_nxv1i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmacc.vx v8, a0, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmacc.nxv1i64.i64( + %0, + i64 %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vmadd.nxv1i64.i64( + , + i64, + , + iXLen, + iXLen); + +define @intrinsic_vmadd_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vmadd_vx_nxv1i64_i64_nxv1i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vmadd.vv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmadd_vx_nxv1i64_i64_nxv1i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmadd.vx v8, a0, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmadd.nxv1i64.i64( + %0, + i64 %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vnmsac.nxv1i64.i64( + , + i64, + , + iXLen, + iXLen); + +define @intrinsic_vnmsac_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vnmsac_vx_nxv1i64_i64_nxv1i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vnmsac.vv v8, v9, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vnmsac_vx_nxv1i64_i64_nxv1i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vnmsac.vx v8, a0, v9 
+; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vnmsac.nxv1i64.i64( + %0, + i64 %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vnmsub.nxv1i64.i64( + , + i64, + , + iXLen, + iXLen); + +define @intrinsic_vnmsub_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vnmsub_vx_nxv1i64_i64_nxv1i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vnmsub.vv v8, v10, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vnmsub_vx_nxv1i64_i64_nxv1i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vnmsub.vx v8, a0, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vnmsub.nxv1i64.i64( + %0, + i64 %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vwmacc.nxv1i16.nxv1i8( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vwmacc_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vwmacc_vv_nxv1i16_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vwmacc.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vwmacc_vv_nxv1i16_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vwmacc.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vwmacc.nxv1i16.nxv1i8( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vwmaccsu_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vwmaccsu_vv_nxv1i16_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vwmaccsu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: 
intrinsic_vwmaccsu_vv_nxv1i16_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vwmaccsu.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vwmaccu.nxv1i16.nxv1i8( + , + , + , + iXLen, + iXLen); + +define @intrinsic_vwmaccu_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vwmaccu_vv_nxv1i16_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vwmaccu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vwmaccu_vv_nxv1i16_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vwmaccu.vv v8, v9, v10 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vwmaccu.nxv1i16.nxv1i8( + %0, + %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} + +declare @llvm.riscv.vwmaccus.nxv1i16.i8( + , + i8, + , + iXLen, + iXLen); + +define @intrinsic_vwmaccus_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vwmaccus_vx_nxv1i16_i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; RV32-NEXT: vwmaccus.vx v8, a0, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vwmaccus_vx_nxv1i16_i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; RV64-NEXT: vwmaccus.vx v8, a0, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vwmaccus.nxv1i16.i8( + %0, + i8 %1, + %2, + iXLen %3, iXLen 1) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll index 02fa45976203b..d7a4856df49bb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmacc.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfmacc.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ 
entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfmacc.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfmacc.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfmacc.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfmacc.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfmacc.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfmacc.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfmacc.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfmacc.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen 
%3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfmacc.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfmacc.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfmacc.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfmacc_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfmacc.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfmacc.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfmacc.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfmacc.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfmacc.nxv16f16.f16( , half, , + iXLen, iXLen); define 
@intrinsic_vfmacc_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfmacc.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfmacc.nxv2f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfmacc.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfmacc.nxv8f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfmacc.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfmacc.nxv2f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfmacc.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmacc_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + 
iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll index 4b2ffbdc863e0..b826c3c61339c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfmadd.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfmadd.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfmadd.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfmadd.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfmadd.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfmadd.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfmadd.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen 
%3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfmadd.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfmadd.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfmadd.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfmadd.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfmadd.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfmadd_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfmadd.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfmadd.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfmadd.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 
+679,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfmadd.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfmadd.nxv16f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfmadd.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfmadd.nxv2f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfmadd.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfmadd.nxv8f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfmadd.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfmadd.nxv2f64.f64( , double, , + 
iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfmadd.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmadd_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll index 6e7e3acadea40..d0a0e7e1d3aa2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsac.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfmsac.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfmsac.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfmsac.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfmsac.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfmsac.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare 
@llvm.riscv.vfmsac.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfmsac.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfmsac.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfmsac.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfmsac.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfmsac.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfmsac.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfmsac_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfmsac.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 
0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfmsac.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfmsac.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfmsac.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfmsac.nxv16f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfmsac.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfmsac.nxv2f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfmsac.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfmsac.nxv8f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, 
iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfmsac.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfmsac.nxv2f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfmsac.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmsac_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll index 0f7a58a9480ac..493d8865f5198 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfmsub.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfmsub.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfmsub.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfmsub.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define 
@intrinsic_vfmsub_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfmsub.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfmsub.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfmsub.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfmsub.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfmsub.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfmsub.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfmsub.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare 
@llvm.riscv.vfmsub.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfmsub_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfmsub.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfmsub.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfmsub.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfmsub.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfmsub.nxv16f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfmsub.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfmsub.nxv2f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, 
float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfmsub.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfmsub.nxv8f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfmsub.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfmsub.nxv2f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfmsub.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfmsub_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll index 2d22dc269643c..579840216d767 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmacc.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfnmacc.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfnmacc.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define 
@intrinsic_vfnmacc_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfnmacc.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfnmacc.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfnmacc.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfnmacc.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfnmacc.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfnmacc.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfnmacc.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare 
@llvm.riscv.vfnmacc.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfnmacc.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfnmacc.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfnmacc_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfnmacc.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfnmacc.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfnmacc.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfnmacc.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfnmacc.nxv16f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, 
half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfnmacc.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfnmacc.nxv2f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfnmacc.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfnmacc.nxv8f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfnmacc.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfnmacc.nxv2f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfnmacc.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmacc_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll 
b/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll index 4a7cf4d010256..00f9e8a9adca5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfnmadd.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfnmadd.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfnmadd.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfnmadd.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfnmadd.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfnmadd.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfnmadd.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare 
@llvm.riscv.vfnmadd.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfnmadd.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfnmadd.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfnmadd.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfnmadd.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfnmadd_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfnmadd.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfnmadd.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfnmadd.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, half %1, %2, - 
iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfnmadd.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfnmadd.nxv16f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfnmadd.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfnmadd.nxv2f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfnmadd.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfnmadd.nxv8f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfnmadd.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfnmadd.nxv2f64.f64( , double, , + iXLen, iXLen); define 
@intrinsic_vfnmadd_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfnmadd.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmadd_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll index dc77c03041dbd..37ae825f1873d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsac.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfnmsac.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfnmsac.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfnmsac.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfnmsac.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfnmsac.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare 
@llvm.riscv.vfnmsac.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfnmsac.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfnmsac.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfnmsac.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfnmsac.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfnmsac.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfnmsac.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfnmsac_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfnmsac.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + 
iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfnmsac.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfnmsac.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfnmsac.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfnmsac.nxv16f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfnmsac.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfnmsac.nxv2f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfnmsac.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfnmsac.nxv8f32.f32( , float, , + iXLen, iXLen); define 
@intrinsic_vfnmsac_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfnmsac.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfnmsac.nxv2f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfnmsac.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmsac_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll index 6a3a7bbf37607..4c5d368a7abe2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfnmsub.nxv1f16.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv1f16_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfnmsub.nxv2f16.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv2f16_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfnmsub.nxv4f16.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv4f16_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare 
@llvm.riscv.vfnmsub.nxv8f16.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv8f16_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfnmsub.nxv16f16.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv16f16_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfnmsub.nxv1f32.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv1f32_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfnmsub.nxv2f32.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv2f32_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfnmsub.nxv4f32.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv4f32_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfnmsub.nxv8f32.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv8f32_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfnmsub.nxv1f64.nxv1f64( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv1f64_nxv1f64_nxv1f64( %0, %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfnmsub.nxv2f64.nxv2f64( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv2f64_nxv2f64_nxv2f64( %0, %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, %1, %2, - iXLen %3) + 
iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfnmsub.nxv4f64.nxv4f64( , , , + iXLen, iXLen); define @intrinsic_vfnmsub_vv_nxv4f64_nxv4f64_nxv4f64( %0, %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfnmsub.nxv1f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv1f16_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfnmsub.nxv2f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv2f16_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfnmsub.nxv4f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv4f16_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfnmsub.nxv8f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv8f16_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfnmsub.nxv16f16.f16( , half, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv16f16_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfnmsub.nxv1f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv1f32_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -835,6 +853,7 @@ declare @llvm.riscv.vfnmsub.nxv2f32.f32( , float, , + iXLen, iXLen); define 
@intrinsic_vfnmsub_vf_nxv2f32_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -848,7 +867,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -881,6 +900,7 @@ declare @llvm.riscv.vfnmsub.nxv4f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv4f32_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -894,7 +914,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -927,6 +947,7 @@ declare @llvm.riscv.vfnmsub.nxv8f32.f32( , float, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv8f32_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -940,7 +961,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -973,6 +994,7 @@ declare @llvm.riscv.vfnmsub.nxv1f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv1f64_f64_nxv1f64( %0, double %1, %2, iXLen %3) nounwind { @@ -986,7 +1008,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1019,6 +1041,7 @@ declare @llvm.riscv.vfnmsub.nxv2f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv2f64_f64_nxv2f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1032,7 +1055,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -1065,6 +1088,7 @@ declare @llvm.riscv.vfnmsub.nxv4f64.f64( , double, , + iXLen, iXLen); define @intrinsic_vfnmsub_vf_nxv4f64_f64_nxv4f64( %0, double %1, %2, iXLen %3) nounwind { @@ -1078,7 +1102,7 @@ entry: %0, double %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll index 6e9eec795aa35..3108f5c9ccdfc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfwmacc.nxv1f32.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 
+54,7 @@ declare @llvm.riscv.vfwmacc.nxv2f32.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv2f32_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfwmacc.nxv4f32.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv4f32_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfwmacc.nxv8f32.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv8f32_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfwmacc.nxv16f32.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv16f32_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfwmacc.nxv1f64.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv1f64_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfwmacc.nxv2f64.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv2f64_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfwmacc.nxv4f64.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv4f64_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfwmacc.nxv8f64.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfwmacc_vv_nxv8f64_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - 
iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfwmacc.nxv1f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv1f32_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfwmacc.nxv2f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv2f32_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfwmacc.nxv4f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv4f32_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfwmacc.nxv8f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv8f32_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfwmacc.nxv16f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv16f32_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfwmacc.nxv1f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv1f64_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfwmacc.nxv2f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv2f64_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfwmacc.nxv4f64.f32( , float, , + iXLen, iXLen); define 
@intrinsic_vfwmacc_vf_nxv4f64_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfwmacc.nxv8f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwmacc_vf_nxv8f64_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll index 1e9d25d0884d3..01dba90196e90 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfwmsac.nxv1f32.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfwmsac.nxv2f32.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv2f32_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfwmsac.nxv4f32.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv4f32_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfwmsac.nxv8f32.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv8f32_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfwmsac.nxv16f32.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv16f32_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfwmsac.nxv1f64.nxv1f32( , 
, , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv1f64_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfwmsac.nxv2f64.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv2f64_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfwmsac.nxv4f64.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv4f64_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfwmsac.nxv8f64.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfwmsac_vv_nxv8f64_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfwmsac.nxv1f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv1f32_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfwmsac.nxv2f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv2f32_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfwmsac.nxv4f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv4f32_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfwmsac.nxv8f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv8f32_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret 
%a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfwmsac.nxv16f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv16f32_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfwmsac.nxv1f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv1f64_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfwmsac.nxv2f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv2f64_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfwmsac.nxv4f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv4f64_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfwmsac.nxv8f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwmsac_vf_nxv8f64_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll index 2542be6d4a813..a2190427d980b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfwnmacc.nxv1f32.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfwnmacc.nxv2f32.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv2f32_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 
+68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfwnmacc.nxv4f32.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv4f32_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfwnmacc.nxv8f32.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv8f32_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfwnmacc.nxv16f32.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv16f32_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfwnmacc.nxv1f64.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv1f64_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 +256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfwnmacc.nxv2f64.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv2f64_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfwnmacc.nxv4f64.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv4f64_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfwnmacc.nxv8f64.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmacc_vv_nxv8f64_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfwnmacc.nxv1f32.f16( , half, , + iXLen, iXLen); define 
@intrinsic_vfwnmacc_vf_nxv1f32_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfwnmacc.nxv2f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv2f32_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfwnmacc.nxv4f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv4f32_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfwnmacc.nxv8f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv8f32_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfwnmacc.nxv16f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv16f32_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfwnmacc.nxv1f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv1f64_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfwnmacc.nxv2f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv2f64_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfwnmacc.nxv4f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv4f64_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, float %1, %2, - iXLen %3) + 
iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfwnmacc.nxv8f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmacc_vf_nxv8f64_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll index ad05c7a68496e..faf6f74f9e4d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac.ll @@ -7,6 +7,7 @@ declare @llvm.riscv.vfwnmsac.nxv1f32.nxv1f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv1f32_nxv1f16_nxv1f16( %0, %1, %2, iXLen %3) nounwind { @@ -20,7 +21,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -53,6 +54,7 @@ declare @llvm.riscv.vfwnmsac.nxv2f32.nxv2f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv2f32_nxv2f16_nxv2f16( %0, %1, %2, iXLen %3) nounwind { @@ -66,7 +68,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -99,6 +101,7 @@ declare @llvm.riscv.vfwnmsac.nxv4f32.nxv4f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv4f32_nxv4f16_nxv4f16( %0, %1, %2, iXLen %3) nounwind { @@ -112,7 +115,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -145,6 +148,7 @@ declare @llvm.riscv.vfwnmsac.nxv8f32.nxv8f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv8f32_nxv8f16_nxv8f16( %0, %1, %2, iXLen %3) nounwind { @@ -158,7 +162,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -191,6 +195,7 @@ declare @llvm.riscv.vfwnmsac.nxv16f32.nxv16f16( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv16f32_nxv16f16_nxv16f16( %0, %1, %2, iXLen %3) nounwind { @@ -204,7 +209,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -237,6 +242,7 @@ declare @llvm.riscv.vfwnmsac.nxv1f64.nxv1f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv1f64_nxv1f32_nxv1f32( %0, %1, %2, iXLen %3) nounwind { @@ -250,7 
+256,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -283,6 +289,7 @@ declare @llvm.riscv.vfwnmsac.nxv2f64.nxv2f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv2f64_nxv2f32_nxv2f32( %0, %1, %2, iXLen %3) nounwind { @@ -296,7 +303,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -329,6 +336,7 @@ declare @llvm.riscv.vfwnmsac.nxv4f64.nxv4f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv4f64_nxv4f32_nxv4f32( %0, %1, %2, iXLen %3) nounwind { @@ -342,7 +350,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -375,6 +383,7 @@ declare @llvm.riscv.vfwnmsac.nxv8f64.nxv8f32( , , , + iXLen, iXLen); define @intrinsic_vfwnmsac_vv_nxv8f64_nxv8f32_nxv8f32( %0, %1, %2, iXLen %3) nounwind { @@ -388,7 +397,7 @@ entry: %0, %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -421,6 +430,7 @@ declare @llvm.riscv.vfwnmsac.nxv1f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv1f32_f16_nxv1f16( %0, half %1, %2, iXLen %3) nounwind { @@ -434,7 +444,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -467,6 +477,7 @@ declare @llvm.riscv.vfwnmsac.nxv2f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv2f32_f16_nxv2f16( %0, half %1, %2, iXLen %3) nounwind { @@ -480,7 +491,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -513,6 +524,7 @@ declare @llvm.riscv.vfwnmsac.nxv4f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv4f32_f16_nxv4f16( %0, half %1, %2, iXLen %3) nounwind { @@ -526,7 +538,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -559,6 +571,7 @@ declare @llvm.riscv.vfwnmsac.nxv8f32.f16( , half, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv8f32_f16_nxv8f16( %0, half %1, %2, iXLen %3) nounwind { @@ -572,7 +585,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -605,6 +618,7 @@ declare @llvm.riscv.vfwnmsac.nxv16f32.f16( , half, , + iXLen, iXLen); define 
@intrinsic_vfwnmsac_vf_nxv16f32_f16_nxv16f16( %0, half %1, %2, iXLen %3) nounwind { @@ -618,7 +632,7 @@ entry: %0, half %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -651,6 +665,7 @@ declare @llvm.riscv.vfwnmsac.nxv1f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv1f64_f32_nxv1f32( %0, float %1, %2, iXLen %3) nounwind { @@ -664,7 +679,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -697,6 +712,7 @@ declare @llvm.riscv.vfwnmsac.nxv2f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv2f64_f32_nxv2f32( %0, float %1, %2, iXLen %3) nounwind { @@ -710,7 +726,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -743,6 +759,7 @@ declare @llvm.riscv.vfwnmsac.nxv4f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv4f64_f32_nxv4f32( %0, float %1, %2, iXLen %3) nounwind { @@ -756,7 +773,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } @@ -789,6 +806,7 @@ declare @llvm.riscv.vfwnmsac.nxv8f64.f32( , float, , + iXLen, iXLen); define @intrinsic_vfwnmsac_vf_nxv8f64_f32_nxv8f32( %0, float %1, %2, iXLen %3) nounwind { @@ -802,7 +820,7 @@ entry: %0, float %1, %2, - iXLen %3) + iXLen %3, iXLen 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vmacc-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmacc-rv32.ll index fbdfac8c0cba8..ca1360ada55ab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmacc-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmacc-rv32.ll @@ -5,7 +5,9 @@ declare @llvm.riscv.vmacc.nxv1i8.nxv1i8( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv1i8_nxv1i8_nxv1i8: @@ -18,7 +20,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,7 +53,9 @@ declare @llvm.riscv.vmacc.nxv2i8.nxv2i8( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv2i8_nxv2i8_nxv2i8: 
@@ -64,7 +68,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,7 +101,9 @@ declare @llvm.riscv.vmacc.nxv4i8.nxv4i8( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv4i8_nxv4i8_nxv4i8: @@ -110,7 +116,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,7 +149,9 @@ declare @llvm.riscv.vmacc.nxv8i8.nxv8i8( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv8i8_nxv8i8_nxv8i8: @@ -156,7 +164,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,7 +197,9 @@ declare @llvm.riscv.vmacc.nxv16i8.nxv16i8( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv16i8_nxv16i8_nxv16i8( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv16i8_nxv16i8_nxv16i8: @@ -202,7 +212,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,7 +245,9 @@ declare @llvm.riscv.vmacc.nxv32i8.nxv32i8( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv32i8_nxv32i8_nxv32i8: @@ -248,7 +260,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,7 +293,9 @@ declare @llvm.riscv.vmacc.nxv1i16.nxv1i16( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv1i16_nxv1i16_nxv1i16: @@ -294,7 +308,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,7 +341,9 @@ declare @llvm.riscv.vmacc.nxv2i16.nxv2i16( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv2i16_nxv2i16_nxv2i16: @@ -340,7 +356,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,7 +389,9 @@ declare @llvm.riscv.vmacc.nxv4i16.nxv4i16( , , , - i32); 
+ i32, + i32 +); define @intrinsic_vmacc_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv4i16_nxv4i16_nxv4i16: @@ -386,7 +404,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,7 +437,9 @@ declare @llvm.riscv.vmacc.nxv8i16.nxv8i16( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv8i16_nxv8i16_nxv8i16: @@ -432,7 +452,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,7 +485,9 @@ declare @llvm.riscv.vmacc.nxv16i16.nxv16i16( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv16i16_nxv16i16_nxv16i16: @@ -478,7 +500,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -511,7 +533,9 @@ declare @llvm.riscv.vmacc.nxv1i32.nxv1i32( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv1i32_nxv1i32_nxv1i32: @@ -524,7 +548,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,7 +581,9 @@ declare @llvm.riscv.vmacc.nxv2i32.nxv2i32( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv2i32_nxv2i32_nxv2i32: @@ -570,7 +596,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,7 +629,9 @@ declare @llvm.riscv.vmacc.nxv4i32.nxv4i32( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv4i32_nxv4i32_nxv4i32: @@ -616,7 +644,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,7 +677,9 @@ declare @llvm.riscv.vmacc.nxv8i32.nxv8i32( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: 
intrinsic_vmacc_vv_nxv8i32_nxv8i32_nxv8i32: @@ -662,7 +692,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -695,7 +725,9 @@ declare @llvm.riscv.vmacc.nxv1i64.nxv1i64( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv1i64_nxv1i64_nxv1i64: @@ -708,7 +740,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -741,7 +773,9 @@ declare @llvm.riscv.vmacc.nxv2i64.nxv2i64( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv2i64_nxv2i64_nxv2i64: @@ -754,7 +788,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -787,7 +821,9 @@ declare @llvm.riscv.vmacc.nxv4i64.nxv4i64( , , , - i32); + i32, + i32 +); define @intrinsic_vmacc_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vv_nxv4i64_nxv4i64_nxv4i64: @@ -800,7 +836,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -833,7 +869,9 @@ declare @llvm.riscv.vmacc.nxv1i8.i8( , i8, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv1i8_i8_nxv1i8: @@ -846,7 +884,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -879,7 +917,9 @@ declare @llvm.riscv.vmacc.nxv2i8.i8( , i8, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv2i8_i8_nxv2i8: @@ -892,7 +932,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -925,7 +965,9 @@ declare @llvm.riscv.vmacc.nxv4i8.i8( , i8, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv4i8_i8_nxv4i8: @@ -938,7 +980,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -971,7 +1013,9 @@ declare 
@llvm.riscv.vmacc.nxv8i8.i8( , i8, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv8i8_i8_nxv8i8: @@ -984,7 +1028,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1017,7 +1061,9 @@ declare @llvm.riscv.vmacc.nxv16i8.i8( , i8, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv16i8_i8_nxv16i8: @@ -1030,7 +1076,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1063,7 +1109,9 @@ declare @llvm.riscv.vmacc.nxv32i8.i8( , i8, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv32i8_i8_nxv32i8: @@ -1076,7 +1124,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1109,7 +1157,9 @@ declare @llvm.riscv.vmacc.nxv1i16.i16( , i16, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv1i16_i16_nxv1i16: @@ -1122,7 +1172,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1155,7 +1205,9 @@ declare @llvm.riscv.vmacc.nxv2i16.i16( , i16, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv2i16_i16_nxv2i16: @@ -1168,7 +1220,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1201,7 +1253,9 @@ declare @llvm.riscv.vmacc.nxv4i16.i16( , i16, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv4i16_i16_nxv4i16: @@ -1214,7 +1268,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1247,7 +1301,9 @@ declare @llvm.riscv.vmacc.nxv8i16.i16( , i16, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i32 %3) 
nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv8i16_i16_nxv8i16: @@ -1260,7 +1316,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1293,7 +1349,9 @@ declare @llvm.riscv.vmacc.nxv16i16.i16( , i16, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv16i16_i16_nxv16i16: @@ -1306,7 +1364,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1339,7 +1397,9 @@ declare @llvm.riscv.vmacc.nxv1i32.i32( , i32, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv1i32_i32_nxv1i32: @@ -1352,7 +1412,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1385,7 +1445,9 @@ declare @llvm.riscv.vmacc.nxv2i32.i32( , i32, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv2i32_i32_nxv2i32: @@ -1398,7 +1460,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1431,7 +1493,9 @@ declare @llvm.riscv.vmacc.nxv4i32.i32( , i32, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv4i32_i32_nxv4i32: @@ -1444,7 +1508,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1477,7 +1541,9 @@ declare @llvm.riscv.vmacc.nxv8i32.i32( , i32, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv8i32_i32_nxv8i32: @@ -1490,7 +1556,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1523,7 +1589,9 @@ declare @llvm.riscv.vmacc.nxv1i64.i64( , i64, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv1i64_i64_nxv1i64: @@ -1543,7 +1611,7 @@ entry: %0, i64 %1, %2, 
- i32 %3) + i32 %3, i32 0) ret %a } @@ -1583,7 +1651,9 @@ declare @llvm.riscv.vmacc.nxv2i64.i64( , i64, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv2i64_i64_nxv2i64: @@ -1603,7 +1673,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1643,7 +1713,9 @@ declare @llvm.riscv.vmacc.nxv4i64.i64( , i64, , - i32); + i32, + i32 +); define @intrinsic_vmacc_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i32 %3) nounwind { ; CHECK-LABEL: intrinsic_vmacc_vx_nxv4i64_i64_nxv4i64: @@ -1663,7 +1735,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vmacc-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmacc-rv64.ll index 16ea00ad78959..4cce1910d9576 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmacc-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmacc-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vmacc.nxv1i8.nxv1i8( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vmacc.nxv2i8.nxv2i8( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vmacc.nxv4i8.nxv4i8( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vmacc.nxv8i8.nxv8i8( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vmacc.nxv16i8.nxv16i8( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv16i8_nxv16i8_nxv16i8( %0, %1, %2, i64 %3) nounwind { @@ 
-202,7 +207,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vmacc.nxv32i8.nxv32i8( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vmacc.nxv1i16.nxv1i16( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vmacc.nxv2i16.nxv2i16( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vmacc.nxv4i16.nxv4i16( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vmacc.nxv8i16.nxv8i16( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vmacc.nxv16i16.nxv16i16( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vmacc.nxv1i32.nxv1i32( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vmacc.nxv2i32.nxv2i32( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ 
-603,6 +616,7 @@ declare @llvm.riscv.vmacc.nxv4i32.nxv4i32( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vmacc.nxv8i32.nxv8i32( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vmacc.nxv1i64.nxv1i64( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i64 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vmacc.nxv2i64.nxv2i64( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, i64 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vmacc.nxv4i64.nxv4i64( , , , + i64, i64); define @intrinsic_vmacc_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i64 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vmacc.nxv1i8.i8( , i8, , + i64, i64); define @intrinsic_vmacc_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vmacc.nxv2i8.i8( , i8, , + i64, i64); define @intrinsic_vmacc_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vmacc.nxv4i8.i8( , i8, , + i64, i64); define @intrinsic_vmacc_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vmacc.nxv8i8.i8( , i8, , + i64, i64); define 
@intrinsic_vmacc_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vmacc.nxv16i8.i8( , i8, , + i64, i64); define @intrinsic_vmacc_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vmacc.nxv32i8.i8( , i8, , + i64, i64); define @intrinsic_vmacc_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vmacc.nxv1i16.i16( , i16, , + i64, i64); define @intrinsic_vmacc_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vmacc.nxv2i16.i16( , i16, , + i64, i64); define @intrinsic_vmacc_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vmacc.nxv4i16.i16( , i16, , + i64, i64); define @intrinsic_vmacc_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vmacc.nxv8i16.i16( , i16, , + i64, i64); define @intrinsic_vmacc_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vmacc.nxv16i16.i16( , i16, , + i64, i64); define @intrinsic_vmacc_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vmacc.nxv1i32.i32( , i32, , + i64, i64); define 
@intrinsic_vmacc_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1385,6 +1415,7 @@ declare @llvm.riscv.vmacc.nxv2i32.i32( , i32, , + i64, i64); define @intrinsic_vmacc_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1398,7 +1429,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1431,6 +1462,7 @@ declare @llvm.riscv.vmacc.nxv4i32.i32( , i32, , + i64, i64); define @intrinsic_vmacc_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1444,7 +1476,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1477,6 +1509,7 @@ declare @llvm.riscv.vmacc.nxv8i32.i32( , i32, , + i64, i64); define @intrinsic_vmacc_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1490,7 +1523,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1523,6 +1556,7 @@ declare @llvm.riscv.vmacc.nxv1i64.i64( , i64, , + i64, i64); define @intrinsic_vmacc_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1536,7 +1570,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1569,6 +1603,7 @@ declare @llvm.riscv.vmacc.nxv2i64.i64( , i64, , + i64, i64); define @intrinsic_vmacc_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1582,7 +1617,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1615,6 +1650,7 @@ declare @llvm.riscv.vmacc.nxv4i64.i64( , i64, , + i64, i64); define @intrinsic_vmacc_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1628,7 +1664,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-rv32.ll index e93f3bffe82a0..d6fd33f9a5651 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmadd-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-rv32.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vmadd.nxv1i8.nxv1i8( , , , + i32, i32); define 
@intrinsic_vmadd_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i32 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vmadd.nxv2i8.nxv2i8( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i32 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vmadd.nxv4i8.nxv4i8( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i32 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vmadd.nxv8i8.nxv8i8( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i32 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vmadd.nxv16i8.nxv16i8( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv16i8_nxv16i8_nxv16i8( %0, %1, %2, i32 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vmadd.nxv32i8.nxv32i8( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i32 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vmadd.nxv1i16.nxv1i16( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i32 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vmadd.nxv2i16.nxv2i16( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i32 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vmadd.nxv4i16.nxv4i16( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i32 %3) nounwind { @@ -386,7 +395,7 @@ entry: 
%0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vmadd.nxv8i16.nxv8i16( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i32 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vmadd.nxv16i16.nxv16i16( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i32 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vmadd.nxv1i32.nxv1i32( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i32 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vmadd.nxv2i32.nxv2i32( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i32 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vmadd.nxv4i32.nxv4i32( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i32 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vmadd.nxv8i32.nxv8i32( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i32 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vmadd.nxv1i64.nxv1i64( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i32 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vmadd.nxv2i64.nxv2i64( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, i32 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -787,6 +804,7 @@ 
declare @llvm.riscv.vmadd.nxv4i64.nxv4i64( , , , + i32, i32); define @intrinsic_vmadd_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i32 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vmadd.nxv1i8.i8( , i8, , + i32, i32); define @intrinsic_vmadd_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vmadd.nxv2i8.i8( , i8, , + i32, i32); define @intrinsic_vmadd_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vmadd.nxv4i8.i8( , i8, , + i32, i32); define @intrinsic_vmadd_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vmadd.nxv8i8.i8( , i8, , + i32, i32); define @intrinsic_vmadd_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vmadd.nxv16i8.i8( , i8, , + i32, i32); define @intrinsic_vmadd_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vmadd.nxv32i8.i8( , i8, , + i32, i32); define @intrinsic_vmadd_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vmadd.nxv1i16.i16( , i16, , + i32, i32); define @intrinsic_vmadd_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vmadd.nxv2i16.i16( , i16, , + i32, i32); define 
@intrinsic_vmadd_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vmadd.nxv4i16.i16( , i16, , + i32, i32); define @intrinsic_vmadd_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vmadd.nxv8i16.i16( , i16, , + i32, i32); define @intrinsic_vmadd_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vmadd.nxv16i16.i16( , i16, , + i32, i32); define @intrinsic_vmadd_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vmadd.nxv1i32.i32( , i32, , + i32, i32); define @intrinsic_vmadd_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1385,6 +1415,7 @@ declare @llvm.riscv.vmadd.nxv2i32.i32( , i32, , + i32, i32); define @intrinsic_vmadd_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1398,7 +1429,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1431,6 +1462,7 @@ declare @llvm.riscv.vmadd.nxv4i32.i32( , i32, , + i32, i32); define @intrinsic_vmadd_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1444,7 +1476,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1477,6 +1509,7 @@ declare @llvm.riscv.vmadd.nxv8i32.i32( , i32, , + i32, i32); define @intrinsic_vmadd_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1490,7 +1523,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1523,6 +1556,7 @@ declare @llvm.riscv.vmadd.nxv1i64.i64( , i64, , + i32, i32); define 
@intrinsic_vmadd_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1543,7 +1577,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1583,6 +1617,7 @@ declare @llvm.riscv.vmadd.nxv2i64.i64( , i64, , + i32, i32); define @intrinsic_vmadd_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1603,7 +1638,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1643,6 +1678,7 @@ declare @llvm.riscv.vmadd.nxv4i64.i64( , i64, , + i32, i32); define @intrinsic_vmadd_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1663,7 +1699,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-rv64.ll index 63695f673f302..f1dd69ba98e9e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmadd-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vmadd.nxv1i8.nxv1i8( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vmadd.nxv2i8.nxv2i8( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vmadd.nxv4i8.nxv4i8( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vmadd.nxv8i8.nxv8i8( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vmadd.nxv16i8.nxv16i8( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv16i8_nxv16i8_nxv16i8( %0, %1, %2, i64 %3) nounwind { @@ -202,7 +207,7 
@@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vmadd.nxv32i8.nxv32i8( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vmadd.nxv1i16.nxv1i16( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vmadd.nxv2i16.nxv2i16( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vmadd.nxv4i16.nxv4i16( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vmadd.nxv8i16.nxv8i16( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vmadd.nxv16i16.nxv16i16( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vmadd.nxv1i32.nxv1i32( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vmadd.nxv2i32.nxv2i32( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -603,6 +616,7 
@@ declare @llvm.riscv.vmadd.nxv4i32.nxv4i32( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vmadd.nxv8i32.nxv8i32( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vmadd.nxv1i64.nxv1i64( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i64 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vmadd.nxv2i64.nxv2i64( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, i64 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vmadd.nxv4i64.nxv4i64( , , , + i64, i64); define @intrinsic_vmadd_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i64 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vmadd.nxv1i8.i8( , i8, , + i64, i64); define @intrinsic_vmadd_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vmadd.nxv2i8.i8( , i8, , + i64, i64); define @intrinsic_vmadd_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vmadd.nxv4i8.i8( , i8, , + i64, i64); define @intrinsic_vmadd_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vmadd.nxv8i8.i8( , i8, , + i64, i64); define 
@intrinsic_vmadd_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vmadd.nxv16i8.i8( , i8, , + i64, i64); define @intrinsic_vmadd_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vmadd.nxv32i8.i8( , i8, , + i64, i64); define @intrinsic_vmadd_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vmadd.nxv1i16.i16( , i16, , + i64, i64); define @intrinsic_vmadd_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vmadd.nxv2i16.i16( , i16, , + i64, i64); define @intrinsic_vmadd_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vmadd.nxv4i16.i16( , i16, , + i64, i64); define @intrinsic_vmadd_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vmadd.nxv8i16.i16( , i16, , + i64, i64); define @intrinsic_vmadd_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vmadd.nxv16i16.i16( , i16, , + i64, i64); define @intrinsic_vmadd_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vmadd.nxv1i32.i32( , i32, , + i64, i64); define 
@intrinsic_vmadd_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1385,6 +1415,7 @@ declare @llvm.riscv.vmadd.nxv2i32.i32( , i32, , + i64, i64); define @intrinsic_vmadd_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1398,7 +1429,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1431,6 +1462,7 @@ declare @llvm.riscv.vmadd.nxv4i32.i32( , i32, , + i64, i64); define @intrinsic_vmadd_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1444,7 +1476,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1477,6 +1509,7 @@ declare @llvm.riscv.vmadd.nxv8i32.i32( , i32, , + i64, i64); define @intrinsic_vmadd_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1490,7 +1523,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1523,6 +1556,7 @@ declare @llvm.riscv.vmadd.nxv1i64.i64( , i64, , + i64, i64); define @intrinsic_vmadd_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1536,7 +1570,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1569,6 +1603,7 @@ declare @llvm.riscv.vmadd.nxv2i64.i64( , i64, , + i64, i64); define @intrinsic_vmadd_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1582,7 +1617,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1615,6 +1650,7 @@ declare @llvm.riscv.vmadd.nxv4i64.i64( , i64, , + i64, i64); define @intrinsic_vmadd_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1628,7 +1664,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv32.ll index 45a6d986a2abc..e0930323ab928 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv32.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vnmsac.nxv1i8.nxv1i8( , , , + i32, i32); define 
@intrinsic_vnmsac_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i32 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vnmsac.nxv2i8.nxv2i8( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i32 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vnmsac.nxv4i8.nxv4i8( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i32 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vnmsac.nxv8i8.nxv8i8( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i32 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vnmsac.nxv16i8.nxv16i8( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv16i8_nxv16i8_nxv16i8( %0, %1, %2, i32 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vnmsac.nxv32i8.nxv32i8( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i32 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vnmsac.nxv1i16.nxv1i16( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i32 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vnmsac.nxv2i16.nxv2i16( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i32 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vnmsac.nxv4i16.nxv4i16( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i32 %3) nounwind { @@ -386,7 
+395,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vnmsac.nxv8i16.nxv8i16( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i32 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vnmsac.nxv16i16.nxv16i16( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i32 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vnmsac.nxv1i32.nxv1i32( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i32 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vnmsac.nxv2i32.nxv2i32( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i32 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vnmsac.nxv4i32.nxv4i32( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i32 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vnmsac.nxv8i32.nxv8i32( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i32 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vnmsac.nxv1i64.nxv1i64( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i32 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vnmsac.nxv2i64.nxv2i64( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, i32 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) 
ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vnmsac.nxv4i64.nxv4i64( , , , + i32, i32); define @intrinsic_vnmsac_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i32 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vnmsac.nxv1i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsac_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vnmsac.nxv2i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsac_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vnmsac.nxv4i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsac_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vnmsac.nxv8i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsac_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vnmsac.nxv16i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsac_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vnmsac.nxv32i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsac_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vnmsac.nxv1i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsac_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vnmsac.nxv2i16.i16( 
, i16, , + i32, i32); define @intrinsic_vnmsac_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vnmsac.nxv4i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsac_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vnmsac.nxv8i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsac_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vnmsac.nxv16i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsac_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vnmsac.nxv1i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsac_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1385,6 +1415,7 @@ declare @llvm.riscv.vnmsac.nxv2i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsac_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1398,7 +1429,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1431,6 +1462,7 @@ declare @llvm.riscv.vnmsac.nxv4i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsac_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1444,7 +1476,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1477,6 +1509,7 @@ declare @llvm.riscv.vnmsac.nxv8i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsac_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1490,7 +1523,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1523,6 +1556,7 @@ declare @llvm.riscv.vnmsac.nxv1i64.i64( , 
i64, , + i32, i32); define @intrinsic_vnmsac_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1543,7 +1577,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1583,6 +1617,7 @@ declare @llvm.riscv.vnmsac.nxv2i64.i64( , i64, , + i32, i32); define @intrinsic_vnmsac_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1603,7 +1638,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1643,6 +1678,7 @@ declare @llvm.riscv.vnmsac.nxv4i64.i64( , i64, , + i32, i32); define @intrinsic_vnmsac_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1663,7 +1699,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv64.ll index a7173a53263ad..3a97666afebf4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vnmsac-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vnmsac.nxv1i8.nxv1i8( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vnmsac.nxv2i8.nxv2i8( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vnmsac.nxv4i8.nxv4i8( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vnmsac.nxv8i8.nxv8i8( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vnmsac.nxv16i8.nxv16i8( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv16i8_nxv16i8_nxv16i8( %0, 
%1, %2, i64 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vnmsac.nxv32i8.nxv32i8( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vnmsac.nxv1i16.nxv1i16( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vnmsac.nxv2i16.nxv2i16( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vnmsac.nxv4i16.nxv4i16( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vnmsac.nxv8i16.nxv8i16( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vnmsac.nxv16i16.nxv16i16( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vnmsac.nxv1i32.nxv1i32( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vnmsac.nxv2i32.nxv2i32( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, 
%1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vnmsac.nxv4i32.nxv4i32( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vnmsac.nxv8i32.nxv8i32( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vnmsac.nxv1i64.nxv1i64( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i64 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vnmsac.nxv2i64.nxv2i64( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, i64 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vnmsac.nxv4i64.nxv4i64( , , , + i64, i64); define @intrinsic_vnmsac_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i64 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vnmsac.nxv1i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsac_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vnmsac.nxv2i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsac_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vnmsac.nxv4i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsac_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -971,6 +992,7 @@ declare 
@llvm.riscv.vnmsac.nxv8i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsac_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vnmsac.nxv16i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsac_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vnmsac.nxv32i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsac_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vnmsac.nxv1i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsac_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vnmsac.nxv2i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsac_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vnmsac.nxv4i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsac_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vnmsac.nxv8i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsac_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vnmsac.nxv16i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsac_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1339,6 +1368,7 @@ declare 
@llvm.riscv.vnmsac.nxv1i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsac_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1385,6 +1415,7 @@ declare @llvm.riscv.vnmsac.nxv2i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsac_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1398,7 +1429,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1431,6 +1462,7 @@ declare @llvm.riscv.vnmsac.nxv4i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsac_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1444,7 +1476,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1477,6 +1509,7 @@ declare @llvm.riscv.vnmsac.nxv8i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsac_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1490,7 +1523,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1523,6 +1556,7 @@ declare @llvm.riscv.vnmsac.nxv1i64.i64( , i64, , + i64, i64); define @intrinsic_vnmsac_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1536,7 +1570,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1569,6 +1603,7 @@ declare @llvm.riscv.vnmsac.nxv2i64.i64( , i64, , + i64, i64); define @intrinsic_vnmsac_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1582,7 +1617,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1615,6 +1650,7 @@ declare @llvm.riscv.vnmsac.nxv4i64.i64( , i64, , + i64, i64); define @intrinsic_vnmsac_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1628,7 +1664,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv32.ll index 8440b0610872d..d96cc487f92cd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv32.ll @@ -5,6 +5,7 @@ declare 
@llvm.riscv.vnmsub.nxv1i8.nxv1i8( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i32 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vnmsub.nxv2i8.nxv2i8( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i32 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vnmsub.nxv4i8.nxv4i8( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i32 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vnmsub.nxv8i8.nxv8i8( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i32 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vnmsub.nxv16i8.nxv16i8( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv16i8_nxv16i8_nxv16i8( %0, %1, %2, i32 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vnmsub.nxv32i8.nxv32i8( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i32 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vnmsub.nxv1i16.nxv1i16( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i32 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vnmsub.nxv2i16.nxv2i16( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i32 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vnmsub.nxv4i16.nxv4i16( , , , + i32, i32); define 
@intrinsic_vnmsub_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i32 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vnmsub.nxv8i16.nxv8i16( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i32 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vnmsub.nxv16i16.nxv16i16( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i32 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vnmsub.nxv1i32.nxv1i32( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i32 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vnmsub.nxv2i32.nxv2i32( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i32 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vnmsub.nxv4i32.nxv4i32( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i32 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vnmsub.nxv8i32.nxv8i32( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i32 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vnmsub.nxv1i64.nxv1i64( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i32 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vnmsub.nxv2i64.nxv2i64( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, 
i32 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vnmsub.nxv4i64.nxv4i64( , , , + i32, i32); define @intrinsic_vnmsub_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i32 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vnmsub.nxv1i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsub_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vnmsub.nxv2i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsub_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vnmsub.nxv4i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsub_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vnmsub.nxv8i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsub_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vnmsub.nxv16i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsub_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vnmsub.nxv32i8.i8( , i8, , + i32, i32); define @intrinsic_vnmsub_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vnmsub.nxv1i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsub_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i32 %3) + 
i32 %3, i32 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vnmsub.nxv2i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsub_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vnmsub.nxv4i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsub_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vnmsub.nxv8i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsub_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vnmsub.nxv16i16.i16( , i16, , + i32, i32); define @intrinsic_vnmsub_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vnmsub.nxv1i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsub_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1385,6 +1415,7 @@ declare @llvm.riscv.vnmsub.nxv2i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsub_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1398,7 +1429,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1431,6 +1462,7 @@ declare @llvm.riscv.vnmsub.nxv4i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsub_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1444,7 +1476,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1477,6 +1509,7 @@ declare @llvm.riscv.vnmsub.nxv8i32.i32( , i32, , + i32, i32); define @intrinsic_vnmsub_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1490,7 +1523,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 
%3, i32 0) ret %a } @@ -1523,6 +1556,7 @@ declare @llvm.riscv.vnmsub.nxv1i64.i64( , i64, , + i32, i32); define @intrinsic_vnmsub_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1543,7 +1577,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1583,6 +1617,7 @@ declare @llvm.riscv.vnmsub.nxv2i64.i64( , i64, , + i32, i32); define @intrinsic_vnmsub_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1603,7 +1638,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1643,6 +1678,7 @@ declare @llvm.riscv.vnmsub.nxv4i64.i64( , i64, , + i32, i32); define @intrinsic_vnmsub_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i32 %3) nounwind { @@ -1663,7 +1699,7 @@ entry: %0, i64 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv64.ll index b18086afca243..75659d8a5ef41 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vnmsub-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vnmsub.nxv1i8.nxv1i8( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv1i8_nxv1i8_nxv1i8( %0, %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vnmsub.nxv2i8.nxv2i8( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv2i8_nxv2i8_nxv2i8( %0, %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vnmsub.nxv4i8.nxv4i8( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv4i8_nxv4i8_nxv4i8( %0, %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vnmsub.nxv8i8.nxv8i8( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv8i8_nxv8i8_nxv8i8( %0, %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare 
@llvm.riscv.vnmsub.nxv16i8.nxv16i8( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv16i8_nxv16i8_nxv16i8( %0, %1, %2, i64 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vnmsub.nxv32i8.nxv32i8( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv32i8_nxv32i8_nxv32i8( %0, %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vnmsub.nxv1i16.nxv1i16( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv1i16_nxv1i16_nxv1i16( %0, %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vnmsub.nxv2i16.nxv2i16( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv2i16_nxv2i16_nxv2i16( %0, %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vnmsub.nxv4i16.nxv4i16( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv4i16_nxv4i16_nxv4i16( %0, %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vnmsub.nxv8i16.nxv8i16( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv8i16_nxv8i16_nxv8i16( %0, %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vnmsub.nxv16i16.nxv16i16( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv16i16_nxv16i16_nxv16i16( %0, %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vnmsub.nxv1i32.nxv1i32( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv1i32_nxv1i32_nxv1i32( %0, %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vnmsub.nxv2i32.nxv2i32( , , , + i64, i64); 
define @intrinsic_vnmsub_vv_nxv2i32_nxv2i32_nxv2i32( %0, %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vnmsub.nxv4i32.nxv4i32( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv4i32_nxv4i32_nxv4i32( %0, %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vnmsub.nxv8i32.nxv8i32( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv8i32_nxv8i32_nxv8i32( %0, %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vnmsub.nxv1i64.nxv1i64( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv1i64_nxv1i64_nxv1i64( %0, %1, %2, i64 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vnmsub.nxv2i64.nxv2i64( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv2i64_nxv2i64_nxv2i64( %0, %1, %2, i64 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vnmsub.nxv4i64.nxv4i64( , , , + i64, i64); define @intrinsic_vnmsub_vv_nxv4i64_nxv4i64_nxv4i64( %0, %1, %2, i64 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vnmsub.nxv1i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsub_vx_nxv1i8_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vnmsub.nxv2i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsub_vx_nxv2i8_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vnmsub.nxv4i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsub_vx_nxv4i8_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind 
{ @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vnmsub.nxv8i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsub_vx_nxv8i8_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vnmsub.nxv16i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsub_vx_nxv16i8_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vnmsub.nxv32i8.i8( , i8, , + i64, i64); define @intrinsic_vnmsub_vx_nxv32i8_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vnmsub.nxv1i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsub_vx_nxv1i16_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vnmsub.nxv2i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsub_vx_nxv2i16_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vnmsub.nxv4i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsub_vx_nxv4i16_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vnmsub.nxv8i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsub_vx_nxv8i16_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vnmsub.nxv16i16.i16( , i16, , + i64, i64); define @intrinsic_vnmsub_vx_nxv16i16_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1306,7 +1335,7 @@ 
entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vnmsub.nxv1i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsub_vx_nxv1i32_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1385,6 +1415,7 @@ declare @llvm.riscv.vnmsub.nxv2i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsub_vx_nxv2i32_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1398,7 +1429,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1431,6 +1462,7 @@ declare @llvm.riscv.vnmsub.nxv4i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsub_vx_nxv4i32_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1444,7 +1476,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1477,6 +1509,7 @@ declare @llvm.riscv.vnmsub.nxv8i32.i32( , i32, , + i64, i64); define @intrinsic_vnmsub_vx_nxv8i32_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1490,7 +1523,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1523,6 +1556,7 @@ declare @llvm.riscv.vnmsub.nxv1i64.i64( , i64, , + i64, i64); define @intrinsic_vnmsub_vx_nxv1i64_i64_nxv1i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1536,7 +1570,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1569,6 +1603,7 @@ declare @llvm.riscv.vnmsub.nxv2i64.i64( , i64, , + i64, i64); define @intrinsic_vnmsub_vx_nxv2i64_i64_nxv2i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1582,7 +1617,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1615,6 +1650,7 @@ declare @llvm.riscv.vnmsub.nxv4i64.i64( , i64, , + i64, i64); define @intrinsic_vnmsub_vx_nxv4i64_i64_nxv4i64( %0, i64 %1, %2, i64 %3) nounwind { @@ -1628,7 +1664,7 @@ entry: %0, i64 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll index f93b0a5812d44..fdd30fbdab3bf 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll @@ -479,7 +479,7 @@ for.body: ; preds = %for.body, %entry %add.ptr = getelementptr inbounds float, float* %x.addr.015, i64 %1 %4 = bitcast float* %y.addr.014 to * %5 = tail call @llvm.riscv.vle.nxv16f32.i64( undef, * %4, i64 %1) - %6 = tail call @llvm.riscv.vfmacc.nxv16f32.f32.i64( %5, float %a, %3, i64 %1) + %6 = tail call @llvm.riscv.vfmacc.nxv16f32.f32.i64( %5, float %a, %3, i64 %1, i64 0) tail call void @llvm.riscv.vse.nxv16f32.i64( %6, * %4, i64 %1) %add.ptr1 = getelementptr inbounds float, float* %y.addr.014, i64 %1 %sub = sub i64 %n.addr.016, %1 @@ -493,7 +493,7 @@ for.end: ; preds = %for.body, %entry declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) declare @llvm.riscv.vle.nxv16f32.i64(, * nocapture, i64) -declare @llvm.riscv.vfmacc.nxv16f32.f32.i64(, float, , i64) +declare @llvm.riscv.vfmacc.nxv16f32.f32.i64(, float, , i64, i64) declare void @llvm.riscv.vse.nxv16f32.i64(, * nocapture, i64) ; We need a vsetvli in the last block because the predecessors have different diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv32.ll index 5fb3a63b137fb..2dc32e6eb146e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv32.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmacc.nxv1i16.nxv1i8( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, i32 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmacc.nxv2i16.nxv2i8( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv2i16_nxv2i8_nxv2i8( %0, %1, %2, i32 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmacc.nxv4i16.nxv4i8( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv4i16_nxv4i8_nxv4i8( %0, %1, %2, i32 %3) nounwind { @@ 
-110,7 +113,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmacc.nxv8i16.nxv8i8( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv8i16_nxv8i8_nxv8i8( %0, %1, %2, i32 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmacc.nxv16i16.nxv16i8( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv16i16_nxv16i8_nxv16i8( %0, %1, %2, i32 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmacc.nxv32i16.nxv32i8( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv32i16_nxv32i8_nxv32i8( %0, %1, %2, i32 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmacc.nxv1i32.nxv1i16( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv1i32_nxv1i16_nxv1i16( %0, %1, %2, i32 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmacc.nxv2i32.nxv2i16( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv2i32_nxv2i16_nxv2i16( %0, %1, %2, i32 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmacc.nxv4i32.nxv4i16( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv4i32_nxv4i16_nxv4i16( %0, %1, %2, i32 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmacc.nxv8i32.nxv8i16( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv8i32_nxv8i16_nxv8i16( %0, %1, %2, i32 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vwmacc.nxv16i32.nxv16i16( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv16i32_nxv16i16_nxv16i16( %0, %1, %2, i32 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, 
i32 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmacc.nxv1i64.nxv1i32( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv1i64_nxv1i32_nxv1i32( %0, %1, %2, i32 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmacc.nxv2i64.nxv2i32( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv2i64_nxv2i32_nxv2i32( %0, %1, %2, i32 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmacc.nxv4i64.nxv4i32( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv4i64_nxv4i32_nxv4i32( %0, %1, %2, i32 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmacc.nxv8i64.nxv8i32( , , , + i32, i32); define @intrinsic_vwmacc_vv_nxv8i64_nxv8i32_nxv8i32( %0, %1, %2, i32 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vwmacc.nxv1i16.i8( , i8, , + i32, i32); define @intrinsic_vwmacc_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vwmacc.nxv2i16.i8( , i8, , + i32, i32); define @intrinsic_vwmacc_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vwmacc.nxv4i16.i8( , i8, , + i32, i32); define @intrinsic_vwmacc_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vwmacc.nxv8i16.i8( , i8, , + i32, i32); define @intrinsic_vwmacc_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vwmacc.nxv16i16.i8( 
, i8, , + i32, i32); define @intrinsic_vwmacc_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vwmacc.nxv32i16.i8( , i8, , + i32, i32); define @intrinsic_vwmacc_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vwmacc.nxv1i32.i16( , i16, , + i32, i32); define @intrinsic_vwmacc_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vwmacc.nxv2i32.i16( , i16, , + i32, i32); define @intrinsic_vwmacc_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vwmacc.nxv4i32.i16( , i16, , + i32, i32); define @intrinsic_vwmacc_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vwmacc.nxv8i32.i16( , i16, , + i32, i32); define @intrinsic_vwmacc_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vwmacc.nxv16i32.i16( , i16, , + i32, i32); define @intrinsic_vwmacc_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vwmacc.nxv1i64.i32( , i32, , + i32, i32); define @intrinsic_vwmacc_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vwmacc.nxv2i64.i32( , i32, , + i32, 
i32); define @intrinsic_vwmacc_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vwmacc.nxv4i64.i32( , i32, , + i32, i32); define @intrinsic_vwmacc_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vwmacc.nxv8i64.i32( , i32, , + i32, i32); define @intrinsic_vwmacc_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv64.ll index 2b547165170f8..7b919972e9a4d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmacc-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmacc.nxv1i16.nxv1i8( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmacc.nxv2i16.nxv2i8( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv2i16_nxv2i8_nxv2i8( %0, %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmacc.nxv4i16.nxv4i8( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv4i16_nxv4i8_nxv4i8( %0, %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmacc.nxv8i16.nxv8i8( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv8i16_nxv8i8_nxv8i8( %0, %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmacc.nxv16i16.nxv16i8( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv16i16_nxv16i8_nxv16i8( %0, %1, 
%2, i64 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmacc.nxv32i16.nxv32i8( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv32i16_nxv32i8_nxv32i8( %0, %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmacc.nxv1i32.nxv1i16( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv1i32_nxv1i16_nxv1i16( %0, %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmacc.nxv2i32.nxv2i16( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv2i32_nxv2i16_nxv2i16( %0, %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmacc.nxv4i32.nxv4i16( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv4i32_nxv4i16_nxv4i16( %0, %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmacc.nxv8i32.nxv8i16( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv8i32_nxv8i16_nxv8i16( %0, %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vwmacc.nxv16i32.nxv16i16( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv16i32_nxv16i16_nxv16i16( %0, %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmacc.nxv1i64.nxv1i32( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv1i64_nxv1i32_nxv1i32( %0, %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmacc.nxv2i64.nxv2i32( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv2i64_nxv2i32_nxv2i32( %0, %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, 
%1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmacc.nxv4i64.nxv4i32( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv4i64_nxv4i32_nxv4i32( %0, %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmacc.nxv8i64.nxv8i32( , , , + i64, i64); define @intrinsic_vwmacc_vv_nxv8i64_nxv8i32_nxv8i32( %0, %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vwmacc.nxv1i16.i8( , i8, , + i64, i64); define @intrinsic_vwmacc_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vwmacc.nxv2i16.i8( , i8, , + i64, i64); define @intrinsic_vwmacc_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vwmacc.nxv4i16.i8( , i8, , + i64, i64); define @intrinsic_vwmacc_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vwmacc.nxv8i16.i8( , i8, , + i64, i64); define @intrinsic_vwmacc_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vwmacc.nxv16i16.i8( , i8, , + i64, i64); define @intrinsic_vwmacc_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vwmacc.nxv32i16.i8( , i8, , + i64, i64); define @intrinsic_vwmacc_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -971,6 +992,7 @@ declare 
@llvm.riscv.vwmacc.nxv1i32.i16( , i16, , + i64, i64); define @intrinsic_vwmacc_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vwmacc.nxv2i32.i16( , i16, , + i64, i64); define @intrinsic_vwmacc_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vwmacc.nxv4i32.i16( , i16, , + i64, i64); define @intrinsic_vwmacc_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vwmacc.nxv8i32.i16( , i16, , + i64, i64); define @intrinsic_vwmacc_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vwmacc.nxv16i32.i16( , i16, , + i64, i64); define @intrinsic_vwmacc_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vwmacc.nxv1i64.i32( , i32, , + i64, i64); define @intrinsic_vwmacc_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vwmacc.nxv2i64.i32( , i32, , + i64, i64); define @intrinsic_vwmacc_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vwmacc.nxv4i64.i32( , i32, , + i64, i64); define @intrinsic_vwmacc_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1339,6 +1368,7 @@ declare 
@llvm.riscv.vwmacc.nxv8i64.i32( , i32, , + i64, i64); define @intrinsic_vwmacc_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv32.ll index 1522eb1e86b21..20a79177fe623 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv32.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, i32 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i16.nxv2i8( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv2i16_nxv2i8_nxv2i8( %0, %1, %2, i32 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i16.nxv4i8( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv4i16_nxv4i8_nxv4i8( %0, %1, %2, i32 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i16.nxv8i8( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv8i16_nxv8i8_nxv8i8( %0, %1, %2, i32 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i16.nxv16i8( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv16i16_nxv16i8_nxv16i8( %0, %1, %2, i32 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmaccsu.nxv32i16.nxv32i8( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv32i16_nxv32i8_nxv32i8( %0, %1, %2, i32 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i32.nxv1i16( , , , + 
i32, i32); define @intrinsic_vwmaccsu_vv_nxv1i32_nxv1i16_nxv1i16( %0, %1, %2, i32 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i32.nxv2i16( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv2i32_nxv2i16_nxv2i16( %0, %1, %2, i32 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv4i32_nxv4i16_nxv4i16( %0, %1, %2, i32 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i32.nxv8i16( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv8i32_nxv8i16_nxv8i16( %0, %1, %2, i32 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i32.nxv16i16( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv16i32_nxv16i16_nxv16i16( %0, %1, %2, i32 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i64.nxv1i32( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv1i64_nxv1i32_nxv1i32( %0, %1, %2, i32 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i64.nxv2i32( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv2i64_nxv2i32_nxv2i32( %0, %1, %2, i32 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i64.nxv4i32( , , , + i32, i32); define @intrinsic_vwmaccsu_vv_nxv4i64_nxv4i32_nxv4i32( %0, %1, %2, i32 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i64.nxv8i32( , , , + i32, i32); define 
@intrinsic_vwmaccsu_vv_nxv8i64_nxv8i32_nxv8i32( %0, %1, %2, i32 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vwmaccsu.nxv32i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i32.i16( , i16, , + i32, i32); define 
@intrinsic_vwmaccsu_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccsu_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv64.ll 
b/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv64.ll index ab86a96db8bd0..ab2d5e29399a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccsu-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i16.nxv1i8( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i16.nxv2i8( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv2i16_nxv2i8_nxv2i8( %0, %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i16.nxv4i8( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv4i16_nxv4i8_nxv4i8( %0, %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i16.nxv8i8( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv8i16_nxv8i8_nxv8i8( %0, %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i16.nxv16i8( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv16i16_nxv16i8_nxv16i8( %0, %1, %2, i64 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmaccsu.nxv32i16.nxv32i8( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv32i16_nxv32i8_nxv32i8( %0, %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i32.nxv1i16( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv1i32_nxv1i16_nxv1i16( %0, %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i32.nxv2i16( , , , + i64, i64); define 
@intrinsic_vwmaccsu_vv_nxv2i32_nxv2i16_nxv2i16( %0, %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv4i32_nxv4i16_nxv4i16( %0, %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i32.nxv8i16( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv8i32_nxv8i16_nxv8i16( %0, %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i32.nxv16i16( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv16i32_nxv16i16_nxv16i16( %0, %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i64.nxv1i32( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv1i64_nxv1i32_nxv1i32( %0, %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i64.nxv2i32( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv2i64_nxv2i32_nxv2i32( %0, %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i64.nxv4i32( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv4i64_nxv4i32_nxv4i32( %0, %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i64.nxv8i32( , , , + i64, i64); define @intrinsic_vwmaccsu_vv_nxv8i64_nxv8i32_nxv8i32( %0, %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i16.i8( , i8, , + i64, i64); define 
@intrinsic_vwmaccsu_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vwmaccsu.nxv32i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i32.i16( , i16, , + i64, i64); define 
@intrinsic_vwmaccsu_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vwmaccsu.nxv16i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vwmaccsu.nxv1i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vwmaccsu.nxv2i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vwmaccsu.nxv4i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vwmaccsu.nxv8i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccsu_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv32.ll index 183fd36188a90..7a3487acc8796 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv32.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmaccu.nxv1i16.nxv1i8( , , , + i32, 
i32); define @intrinsic_vwmaccu_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, i32 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmaccu.nxv2i16.nxv2i8( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv2i16_nxv2i8_nxv2i8( %0, %1, %2, i32 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmaccu.nxv4i16.nxv4i8( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv4i16_nxv4i8_nxv4i8( %0, %1, %2, i32 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmaccu.nxv8i16.nxv8i8( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv8i16_nxv8i8_nxv8i8( %0, %1, %2, i32 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmaccu.nxv16i16.nxv16i8( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv16i16_nxv16i8_nxv16i8( %0, %1, %2, i32 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmaccu.nxv32i16.nxv32i8( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv32i16_nxv32i8_nxv32i8( %0, %1, %2, i32 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmaccu.nxv1i32.nxv1i16( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv1i32_nxv1i16_nxv1i16( %0, %1, %2, i32 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmaccu.nxv2i32.nxv2i16( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv2i32_nxv2i16_nxv2i16( %0, %1, %2, i32 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmaccu.nxv4i32.nxv4i16( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv4i32_nxv4i16_nxv4i16( 
%0, %1, %2, i32 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmaccu.nxv8i32.nxv8i16( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv8i32_nxv8i16_nxv8i16( %0, %1, %2, i32 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vwmaccu.nxv16i32.nxv16i16( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv16i32_nxv16i16_nxv16i16( %0, %1, %2, i32 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmaccu.nxv1i64.nxv1i32( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv1i64_nxv1i32_nxv1i32( %0, %1, %2, i32 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmaccu.nxv2i64.nxv2i32( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv2i64_nxv2i32_nxv2i32( %0, %1, %2, i32 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmaccu.nxv4i64.nxv4i32( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv4i64_nxv4i32_nxv4i32( %0, %1, %2, i32 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmaccu.nxv8i64.nxv8i32( , , , + i32, i32); define @intrinsic_vwmaccu_vv_nxv8i64_nxv8i32_nxv8i32( %0, %1, %2, i32 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vwmaccu.nxv1i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vwmaccu.nxv2i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -754,7 +771,7 @@ 
entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vwmaccu.nxv4i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vwmaccu.nxv8i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vwmaccu.nxv16i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vwmaccu.nxv32i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vwmaccu.nxv1i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vwmaccu.nxv2i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vwmaccu.nxv4i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vwmaccu.nxv8i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i32 
%3) + i32 %3, i32 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vwmaccu.nxv16i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vwmaccu.nxv1i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vwmaccu.nxv2i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vwmaccu.nxv4i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vwmaccu.nxv8i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccu_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv64.ll index 49b2fec00220a..d4ff4ae333889 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmaccu.nxv1i16.nxv1i8( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv1i16_nxv1i8_nxv1i8( %0, %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmaccu.nxv2i16.nxv2i8( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv2i16_nxv2i8_nxv2i8( %0, %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret 
%a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmaccu.nxv4i16.nxv4i8( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv4i16_nxv4i8_nxv4i8( %0, %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmaccu.nxv8i16.nxv8i8( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv8i16_nxv8i8_nxv8i8( %0, %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmaccu.nxv16i16.nxv16i8( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv16i16_nxv16i8_nxv16i8( %0, %1, %2, i64 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmaccu.nxv32i16.nxv32i8( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv32i16_nxv32i8_nxv32i8( %0, %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmaccu.nxv1i32.nxv1i16( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv1i32_nxv1i16_nxv1i16( %0, %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmaccu.nxv2i32.nxv2i16( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv2i32_nxv2i16_nxv2i16( %0, %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmaccu.nxv4i32.nxv4i16( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv4i32_nxv4i16_nxv4i16( %0, %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmaccu.nxv8i32.nxv8i16( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv8i32_nxv8i16_nxv8i16( %0, %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare 
@llvm.riscv.vwmaccu.nxv16i32.nxv16i16( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv16i32_nxv16i16_nxv16i16( %0, %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmaccu.nxv1i64.nxv1i32( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv1i64_nxv1i32_nxv1i32( %0, %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmaccu.nxv2i64.nxv2i32( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv2i64_nxv2i32_nxv2i32( %0, %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmaccu.nxv4i64.nxv4i32( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv4i64_nxv4i32_nxv4i32( %0, %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmaccu.nxv8i64.nxv8i32( , , , + i64, i64); define @intrinsic_vwmaccu_vv_nxv8i64_nxv8i32_nxv8i32( %0, %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -695,6 +710,7 @@ declare @llvm.riscv.vwmaccu.nxv1i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -708,7 +724,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -741,6 +757,7 @@ declare @llvm.riscv.vwmaccu.nxv2i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -754,7 +771,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -787,6 +804,7 @@ declare @llvm.riscv.vwmaccu.nxv4i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -800,7 +818,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -833,6 +851,7 @@ declare @llvm.riscv.vwmaccu.nxv8i16.i8( , i8, , + i64, 
i64); define @intrinsic_vwmaccu_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -846,7 +865,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -879,6 +898,7 @@ declare @llvm.riscv.vwmaccu.nxv16i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -892,7 +912,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -925,6 +945,7 @@ declare @llvm.riscv.vwmaccu.nxv32i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -938,7 +959,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -971,6 +992,7 @@ declare @llvm.riscv.vwmaccu.nxv1i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -984,7 +1006,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1017,6 +1039,7 @@ declare @llvm.riscv.vwmaccu.nxv2i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1030,7 +1053,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1063,6 +1086,7 @@ declare @llvm.riscv.vwmaccu.nxv4i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1076,7 +1100,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1109,6 +1133,7 @@ declare @llvm.riscv.vwmaccu.nxv8i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1122,7 +1147,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1155,6 +1180,7 @@ declare @llvm.riscv.vwmaccu.nxv16i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -1168,7 +1194,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1201,6 +1227,7 @@ declare @llvm.riscv.vwmaccu.nxv1i64.i32( , i32, , + i64, i64); 
define @intrinsic_vwmaccu_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1214,7 +1241,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1247,6 +1274,7 @@ declare @llvm.riscv.vwmaccu.nxv2i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1260,7 +1288,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1293,6 +1321,7 @@ declare @llvm.riscv.vwmaccu.nxv4i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1306,7 +1335,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -1339,6 +1368,7 @@ declare @llvm.riscv.vwmaccu.nxv8i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccu_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -1352,7 +1382,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv32.ll index ccde867da07c6..80a8b858e1647 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv32.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmaccus.nxv1i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmaccus.nxv2i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmaccus.nxv4i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmaccus.nxv8i16.i8( , i8, , + i32, i32); define 
@intrinsic_vwmaccus_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmaccus.nxv16i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmaccus.nxv32i16.i8( , i8, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i32 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, i8 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmaccus.nxv1i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmaccus.nxv2i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmaccus.nxv4i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmaccus.nxv8i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vwmaccus.nxv16i32.i16( , i16, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i32 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, i16 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmaccus.nxv1i64.i32( , i32, , + i32, i32); define 
@intrinsic_vwmaccus_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmaccus.nxv2i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmaccus.nxv4i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmaccus.nxv8i64.i32( , i32, , + i32, i32); define @intrinsic_vwmaccus_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i32 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, i32 %1, %2, - i32 %3) + i32 %3, i32 0) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv64.ll index 95b164270ced5..c44fae76c4fb3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccus-rv64.ll @@ -5,6 +5,7 @@ declare @llvm.riscv.vwmaccus.nxv1i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv1i16_i8_nxv1i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -18,7 +19,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -51,6 +52,7 @@ declare @llvm.riscv.vwmaccus.nxv2i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv2i16_i8_nxv2i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -64,7 +66,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -97,6 +99,7 @@ declare @llvm.riscv.vwmaccus.nxv4i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv4i16_i8_nxv4i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -110,7 +113,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -143,6 +146,7 @@ declare @llvm.riscv.vwmaccus.nxv8i16.i8( , i8, , + i64, i64); define 
@intrinsic_vwmaccus_vx_nxv8i16_i8_nxv8i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -156,7 +160,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -189,6 +193,7 @@ declare @llvm.riscv.vwmaccus.nxv16i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv16i16_i8_nxv16i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -202,7 +207,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -235,6 +240,7 @@ declare @llvm.riscv.vwmaccus.nxv32i16.i8( , i8, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv32i16_i8_nxv32i8( %0, i8 %1, %2, i64 %3) nounwind { @@ -248,7 +254,7 @@ entry: %0, i8 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -281,6 +287,7 @@ declare @llvm.riscv.vwmaccus.nxv1i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv1i32_i16_nxv1i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -294,7 +301,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -327,6 +334,7 @@ declare @llvm.riscv.vwmaccus.nxv2i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv2i32_i16_nxv2i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -340,7 +348,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -373,6 +381,7 @@ declare @llvm.riscv.vwmaccus.nxv4i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv4i32_i16_nxv4i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -386,7 +395,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -419,6 +428,7 @@ declare @llvm.riscv.vwmaccus.nxv8i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv8i32_i16_nxv8i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -432,7 +442,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -465,6 +475,7 @@ declare @llvm.riscv.vwmaccus.nxv16i32.i16( , i16, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv16i32_i16_nxv16i16( %0, i16 %1, %2, i64 %3) nounwind { @@ -478,7 +489,7 @@ entry: %0, i16 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -511,6 +522,7 @@ declare @llvm.riscv.vwmaccus.nxv1i64.i32( , i32, , + i64, i64); define 
@intrinsic_vwmaccus_vx_nxv1i64_i32_nxv1i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -524,7 +536,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -557,6 +569,7 @@ declare @llvm.riscv.vwmaccus.nxv2i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv2i64_i32_nxv2i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -570,7 +583,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -603,6 +616,7 @@ declare @llvm.riscv.vwmaccus.nxv4i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv4i64_i32_nxv4i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -616,7 +630,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } @@ -649,6 +663,7 @@ declare @llvm.riscv.vwmaccus.nxv8i64.i32( , i32, , + i64, i64); define @intrinsic_vwmaccus_vx_nxv8i64_i32_nxv8i32( %0, i32 %1, %2, i64 %3) nounwind { @@ -662,7 +677,7 @@ entry: %0, i32 %1, %2, - i64 %3) + i64 %3, i64 0) ret %a } From 7eb3ce997abec85e923f648fd8b6d5ef1c634415 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 17 Feb 2022 09:12:17 -0800 Subject: [PATCH 127/748] [instsimplify] Precommit a test showing an alloca equality miscompile --- llvm/test/Transforms/InstSimplify/compare.ll | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 571b2a4a7d051..b305296a49769 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -2700,4 +2700,26 @@ define <2 x i1> @cttz_slt_bitwidth_splat(<2 x i13> %x) { ret <2 x i1> %cmp } +; FIXME: A zero sized alloca *can* be equal to another alloca +define i1 @zero_sized_alloca1() { +; CHECK-LABEL: @zero_sized_alloca1( +; CHECK-NEXT: ret i1 true +; + %a = alloca i32, i32 0 + %b = alloca i32, i32 0 + %res = icmp ne i32* %a, %b + ret i1 %res +} + +define i1 @zero_sized_alloca2() { +; CHECK-LABEL: @zero_sized_alloca2( +; CHECK-NEXT: ret i1 true +; + %a = alloca i32, i32 0 + %b = alloca i32 + 
%res = icmp ne i32* %a, %b + ret i1 %res +} + + attributes #0 = { null_pointer_is_valid } From 7db1d4d8da4d4dfc5d0240825e8c4d536a12b19c Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Thu, 17 Feb 2022 12:09:21 -0500 Subject: [PATCH 128/748] [RuntimeDyld] Fix building on OpenBSD With https://reviews.llvm.org/D105466 the tree does not build on OpenBSD/amd64. Moritz suggested only building this code on Linux. Reviewed By: MoritzS Differential Revision: https://reviews.llvm.org/D119991 --- llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp index 21339a3f8f3d8..893d8a55c8950 100644 --- a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp +++ b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp @@ -286,7 +286,7 @@ class TrivialMemoryManager : public RTDyldMemoryManager { uintptr_t SlabSize = 0; uintptr_t CurrentSlabOffset = 0; SectionIDMap *SecIDMap = nullptr; -#if defined(__x86_64__) && defined(__ELF__) +#if defined(__x86_64__) && defined(__ELF__) && defined(__linux__) unsigned UsedTLSStorage = 0; #endif }; @@ -350,7 +350,7 @@ uint8_t *TrivialMemoryManager::allocateDataSection(uintptr_t Size, // In case the execution needs TLS storage, we define a very small TLS memory // area here that will be used in allocateTLSSection(). 
-#if defined(__x86_64__) && defined(__ELF__) +#if defined(__x86_64__) && defined(__ELF__) && defined(__linux__) extern "C" { alignas(16) __attribute__((visibility("hidden"), tls_model("initial-exec"), used)) thread_local char LLVMRTDyldTLSSpace[16]; @@ -361,7 +361,7 @@ TrivialMemoryManager::TLSSection TrivialMemoryManager::allocateTLSSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName) { -#if defined(__x86_64__) && defined(__ELF__) +#if defined(__x86_64__) && defined(__ELF__) && defined(__linux__) if (Size + UsedTLSStorage > sizeof(LLVMRTDyldTLSSpace)) { return {}; } From f374c8ddf2dd4920190cac0ea81e18a74040ddda Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Thu, 17 Feb 2022 12:15:14 -0500 Subject: [PATCH 129/748] [clangd] Fix building SerializationTests unit test on OpenBSD This fixes building the unit tests on OpenBSD. OpenBSD does not support RLIMIT_AS. Reviewed By: kadircet Differential Revision: https://reviews.llvm.org/D119989 --- clang-tools-extra/clangd/unittests/SerializationTests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/SerializationTests.cpp b/clang-tools-extra/clangd/unittests/SerializationTests.cpp index 290e20a082d66..6070b229f31c7 100644 --- a/clang-tools-extra/clangd/unittests/SerializationTests.cpp +++ b/clang-tools-extra/clangd/unittests/SerializationTests.cpp @@ -308,9 +308,9 @@ TEST(SerializationTest, CmdlTest) { } } -// rlimit is part of POSIX. +// rlimit is part of POSIX. RLIMIT_AS does not exist in OpenBSD. // Sanitizers use a lot of address space, so we can't apply strict limits. 
-#if LLVM_ON_UNIX && !LLVM_ADDRESS_SANITIZER_BUILD && \ +#if LLVM_ON_UNIX && defined(RLIMIT_AS) && !LLVM_ADDRESS_SANITIZER_BUILD && \ !LLVM_MEMORY_SANITIZER_BUILD class ScopedMemoryLimit { struct rlimit OriginalLimit; From d66983861a66b1eb9ff6d08b57a9cdd8e2a35932 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Thu, 17 Feb 2022 17:21:55 +0000 Subject: [PATCH 130/748] [libc] Add exit and atexit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Often atexit is implemented using __cxa_atexit. I have not implemented __cxa_atexit here because it potentially requires more discussion. It is unique for llvm-libc (I think) that it is an exported symbol that wouldn’t be defined in any spec file because it doesn’t have a header. Implementing it will be trivial given what is here already, but I figured it would be more contentious so it can be implemented later. Reviewed By: lntue Differential Revision: https://reviews.llvm.org/D119512 --- libc/config/linux/api.td | 3 +- libc/config/linux/x86_64/entrypoints.txt | 2 + libc/include/CMakeLists.txt | 1 + libc/include/llvm-libc-types/CMakeLists.txt | 1 + .../llvm-libc-types/__atexithandler_t.h | 14 +++ libc/spec/spec.td | 2 + libc/spec/stdc.td | 3 + libc/src/stdlib/CMakeLists.txt | 24 +++++ libc/src/stdlib/atexit.cpp | 54 +++++++++++ libc/src/stdlib/atexit.h | 18 ++++ libc/src/stdlib/exit.cpp | 24 +++++ libc/src/stdlib/exit.h | 20 ++++ libc/test/src/stdlib/CMakeLists.txt | 73 ++++++++------ libc/test/src/stdlib/_Exit_test.cpp | 4 + libc/test/src/stdlib/atexit_test.cpp | 94 +++++++++++++++++++ 15 files changed, 307 insertions(+), 30 deletions(-) create mode 100644 libc/include/llvm-libc-types/__atexithandler_t.h create mode 100644 libc/src/stdlib/atexit.cpp create mode 100644 libc/src/stdlib/atexit.h create mode 100644 libc/src/stdlib/exit.cpp create mode 100644 libc/src/stdlib/exit.h create mode 100644 libc/test/src/stdlib/atexit_test.cpp diff --git a/libc/config/linux/api.td 
b/libc/config/linux/api.td index 69c663f388b2c..d584de545f22b 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -159,7 +159,8 @@ def StdlibAPI : PublicAPI<"stdlib.h"> { "lldiv_t", "size_t", "__bsearchcompare_t", - "__qsortcompare_t" + "__qsortcompare_t", + "__atexithandler_t", ]; } diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index f21d032464dc4..500e9cc3a43be 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -226,6 +226,8 @@ if(LLVM_LIBC_FULL_BUILD) # stdlib.h entrypoints libc.src.stdlib._Exit # libc.src.stdlib.abort + libc.src.stdlib.atexit + libc.src.stdlib.exit # signal.h entrypoints # TODO: Enable signal.h entrypoints after fixing signal.h diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 6198e90e3fd0d..9370b5a37705e 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -141,6 +141,7 @@ add_gen_header( .llvm-libc-types.ldiv_t .llvm-libc-types.lldiv_t .llvm-libc-types.size_t + .llvm-libc-types.__atexithandler_t ) add_gen_header( diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index cf70d4472c8f3..dfb34c0696773 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -23,3 +23,4 @@ add_header(struct_tm HDR struct_tm.h) add_header(thrd_start_t HDR thrd_start_t.h) add_header(thrd_t HDR thrd_t.h) add_header(time_t HDR time_t.h) +add_header(__atexithandler_t HDR __atexithandler_t.h) diff --git a/libc/include/llvm-libc-types/__atexithandler_t.h b/libc/include/llvm-libc-types/__atexithandler_t.h new file mode 100644 index 0000000000000..a9887b6abf708 --- /dev/null +++ b/libc/include/llvm-libc-types/__atexithandler_t.h @@ -0,0 +1,14 @@ +//===-- Definition of type __atexithandler_t ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __LLVM_LIBC_TYPES_ATEXITHANDLER_T_H__ +#define __LLVM_LIBC_TYPES_ATEXITHANDLER_T_H__ + +typedef void (*__atexithandler_t)(void); + +#endif // __LLVM_LIBC_TYPES_ATEXITHANDLER_T_H__ diff --git a/libc/spec/spec.td b/libc/spec/spec.td index d2f194bd966f6..183e1b7d0ecc4 100644 --- a/libc/spec/spec.td +++ b/libc/spec/spec.td @@ -94,6 +94,8 @@ def TimeTType : NamedType<"time_t">; def BSearchCompareT : NamedType<"__bsearchcompare_t">; def QSortCompareT : NamedType<"__qsortcompare_t">; +def AtexitHandlerT : NamedType<"__atexithandler_t">; + //added because __assert_fail needs it. def UnsignedType : NamedType<"unsigned">; diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 14b2d4d9f2eb7..653a8a14de37f 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -502,6 +502,7 @@ def StdC : StandardSpec<"stdc"> { SizeTType, BSearchCompareT, QSortCompareT, + AtexitHandlerT, ], // Types [], // Enumerations [ @@ -538,6 +539,8 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"free", RetValSpec, [ArgSpec]>, FunctionSpec<"_Exit", RetValSpec, [ArgSpec]>, + FunctionSpec<"exit", RetValSpec, [ArgSpec]>, + FunctionSpec<"atexit", RetValSpec, [ArgSpec]>, ] >; diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index d679b068ee313..2472466a086dc 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -270,6 +270,30 @@ add_entrypoint_object( .${LIBC_TARGET_OS}._Exit ) +add_entrypoint_object( + atexit + SRCS + atexit.cpp + HDRS + atexit.h + DEPENDS + libc.src.__support.CPP.vector + libc.src.threads.mtx_init + libc.src.threads.mtx_lock + libc.src.threads.mtx_unlock +) + +add_entrypoint_object( + exit + SRCS + exit.cpp + HDRS + exit.h + DEPENDS + ._Exit + .atexit +) + # add_entrypoint_object( # abort # ALIAS 
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp new file mode 100644 index 0000000000000..9d18cd9c3cece --- /dev/null +++ b/libc/src/stdlib/atexit.cpp @@ -0,0 +1,54 @@ +//===-- Implementation of atexit ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/atexit.h" +#include "src/__support/CPP/vector.h" +#include "src/__support/common.h" +#include "src/threads/mtx_init.h" +#include "src/threads/mtx_lock.h" +#include "src/threads/mtx_unlock.h" + +namespace __llvm_libc { + +namespace { + +mtx_t lock; +// TODO need an easier way to use mtx_t internally, or use pthread_mutex_t +// with PTHREAD_MUTEX_INITIALIZER when it lands. +struct Init { + Init() { __llvm_libc::mtx_init(&lock, mtx_plain); } +} init; + +// TOOD should we make cpp::vector like llvm::SmallVector where it will +// allocate at least N before needing dynamic allocation? 
+static cpp::vector handlers; + +} // namespace + +namespace internal { + +void call_exit_handlers() { + __llvm_libc::mtx_lock(&lock); + // TODO: implement rbegin() + rend() for cpp::vector + for (int i = handlers.size() - 1; i >= 0; i--) { + __llvm_libc::mtx_unlock(&lock); + handlers[i](); + __llvm_libc::mtx_lock(&lock); + } +} + +} // namespace internal + +LLVM_LIBC_FUNCTION(int, atexit, (void (*function)())) { + __llvm_libc::mtx_lock(&lock); + handlers.push_back(function); + __llvm_libc::mtx_unlock(&lock); + return 0; +} + +} // namespace __llvm_libc diff --git a/libc/src/stdlib/atexit.h b/libc/src/stdlib/atexit.h new file mode 100644 index 0000000000000..574549e1f32bb --- /dev/null +++ b/libc/src/stdlib/atexit.h @@ -0,0 +1,18 @@ +//===-- Implementation header for atexit ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_ATEXIT_H +#define LLVM_LIBC_SRC_STDLIB_ATEXIT_H + +namespace __llvm_libc { + +int atexit(void (*function)()); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STDLIB_ATEXIT_H diff --git a/libc/src/stdlib/exit.cpp b/libc/src/stdlib/exit.cpp new file mode 100644 index 0000000000000..5a02a45d4f18e --- /dev/null +++ b/libc/src/stdlib/exit.cpp @@ -0,0 +1,24 @@ +//===-- Implementation of exit --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/exit.h" +#include "src/__support/common.h" +#include "src/stdlib/_Exit.h" + +namespace __llvm_libc { + +namespace internal { +void call_exit_handlers(); +} + +LLVM_LIBC_FUNCTION(void, exit, (int status)) { + internal::call_exit_handlers(); + _Exit(status); +} + +} // namespace __llvm_libc diff --git a/libc/src/stdlib/exit.h b/libc/src/stdlib/exit.h new file mode 100644 index 0000000000000..7c015e3c0dae3 --- /dev/null +++ b/libc/src/stdlib/exit.h @@ -0,0 +1,20 @@ +//===-- Implementation header for exit --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#ifndef LLVM_LIBC_SRC_STDLIB_EXIT_H +#define LLVM_LIBC_SRC_STDLIB_EXIT_H + +namespace __llvm_libc { + +void exit(int status); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STDLIB_EXIT_H diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 8e1948399123a..8534692b342b1 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -110,35 +110,6 @@ add_libc_unittest( libc.src.stdlib.strtoull ) -if(NOT LLVM_LIBC_FULL_BUILD) - return() -endif() - -add_libc_unittest( - _Exit_test - SUITE - libc_stdlib_unittests - SRCS - _Exit_test.cpp - DEPENDS - libc.include.stdlib - libc.src.stdlib._Exit -) - -# add_libc_unittest( -# abort_test -# SUITE -# libc_stdlib_unittests -# SRCS -# abort_test.cpp -# DEPENDS -# libc.include.stdlib -# libc.include.signal -# libc.src.stdlib.abort -# libc.src.stdlib._Exit -# libc.src.signal.raise -# ) - add_libc_unittest( abs_test SUITE @@ -229,3 +200,47 @@ 
add_libc_unittest( libc.include.stdlib libc.src.stdlib.qsort ) + +if(LLVM_LIBC_FULL_BUILD) + + add_libc_unittest( + _Exit_test + SUITE + libc_stdlib_unittests + SRCS + _Exit_test.cpp + DEPENDS + libc.include.stdlib + libc.src.stdlib._Exit + libc.src.stdlib.exit + ) + + add_libc_unittest( + atexit_test + SUITE + libc_stdlib_unittests + SRCS + atexit_test.cpp + DEPENDS + libc.include.stdlib + libc.src.stdlib._Exit + libc.src.stdlib.exit + libc.src.stdlib.atexit + libc.src.__support.CPP.standalone_cpp + ) + + # add_libc_unittest( + # abort_test + # SUITE + # libc_stdlib_unittests + # SRCS + # abort_test.cpp + # DEPENDS + # libc.include.stdlib + # libc.include.signal + # libc.src.stdlib.abort + # libc.src.stdlib._Exit + # libc.src.signal.raise + # ) + +endif() diff --git a/libc/test/src/stdlib/_Exit_test.cpp b/libc/test/src/stdlib/_Exit_test.cpp index 03d5ffcc22366..a07034dbf310e 100644 --- a/libc/test/src/stdlib/_Exit_test.cpp +++ b/libc/test/src/stdlib/_Exit_test.cpp @@ -8,9 +8,13 @@ #include "include/stdlib.h" #include "src/stdlib/_Exit.h" +#include "src/stdlib/exit.h" #include "utils/UnitTest/Test.h" TEST(LlvmLibcStdlib, _Exit) { EXPECT_EXITS([] { __llvm_libc::_Exit(1); }, 1); EXPECT_EXITS([] { __llvm_libc::_Exit(65); }, 65); + + EXPECT_EXITS([] { __llvm_libc::exit(1); }, 1); + EXPECT_EXITS([] { __llvm_libc::exit(65); }, 65); } diff --git a/libc/test/src/stdlib/atexit_test.cpp b/libc/test/src/stdlib/atexit_test.cpp new file mode 100644 index 0000000000000..bce69d88dc26e --- /dev/null +++ b/libc/test/src/stdlib/atexit_test.cpp @@ -0,0 +1,94 @@ +//===-- Unittests for atexit ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/Array.h" +#include "src/__support/CPP/Utility.h" +#include "src/stdlib/atexit.h" +#include "src/stdlib/exit.h" +#include "utils/UnitTest/Test.h" + +static int a; +TEST(LlvmLibcAtExit, Basic) { + // In case tests ever run multiple times. + a = 0; + + auto test = [] { + int status = __llvm_libc::atexit(+[] { + if (a != 1) + __builtin_trap(); + }); + status |= __llvm_libc::atexit(+[] { a++; }); + if (status) + __builtin_trap(); + + __llvm_libc::exit(0); + }; + EXPECT_EXITS(test, 0); +} + +TEST(LlvmLibcAtExit, AtExitCallsSysExit) { + auto test = [] { + __llvm_libc::atexit(+[] { _Exit(1); }); + __llvm_libc::exit(0); + }; + EXPECT_EXITS(test, 1); +} + +static int size; +static __llvm_libc::cpp::Array arr; + +template +void register_atexit_handlers(__llvm_libc::cpp::IntegerSequence) { + (__llvm_libc::atexit(+[] { arr[size++] = Ts; }), ...); +} + +template constexpr auto getTest() { + return [] { + __llvm_libc::atexit(+[] { + if (size != count) + __builtin_trap(); + for (int i = 0; i < count; i++) + if (arr[i] != count - 1 - i) + __builtin_trap(); + }); + register_atexit_handlers( + __llvm_libc::cpp::MakeIntegerSequence{}); + __llvm_libc::exit(0); + }; +} + +TEST(LlvmLibcAtExit, ReverseOrder) { + // In case tests ever run multiple times. + size = 0; + + auto test = getTest<32>(); + EXPECT_EXITS(test, 0); +} + +TEST(LlvmLibcAtExit, Many) { + // In case tests ever run multiple times. + size = 0; + + auto test = getTest<256>(); + EXPECT_EXITS(test, 0); +} + +// POSIX doesn't specify if an atexit handler can call atexit, it only says it +// is undefined for a handler to call exit(3). The current implementation will +// end up invoking the newly registered function, although glibc does, other +// libc's do not. This just tests that we don't deadlock when an exit handler +// calls atexit. 
+TEST(LlvmLibcAtExit, HandlerCallsAtExit) { + auto test = [] { + __llvm_libc::atexit(+[] { + __llvm_libc::atexit(+[] { __builtin_trap(); }); + __llvm_libc::exit(0); + }); + }; + EXPECT_EXITS(test, 0); +} From e641c29f41971597dbe190f98784f0e4cfc220cc Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 17 Feb 2022 18:23:22 +0100 Subject: [PATCH 131/748] [flang] Lower simple scalar assignment This patch hanlde lowering of simple scalar assignment. This patch is part of the upstreaming effort from fir-dev branch. Reviewed By: PeteSteinfeld Differential Revision: https://reviews.llvm.org/D120058 Co-authored-by: Jean Perier --- flang/include/flang/Lower/ConvertExpr.h | 15 ++ flang/include/flang/Lower/ConvertType.h | 5 +- flang/include/flang/Lower/Support/Utils.h | 11 ++ flang/include/flang/Lower/Utils.h | 31 ---- .../include/flang/Optimizer/Dialect/FIROps.h | 4 + flang/lib/Lower/Bridge.cpp | 131 ++++++++++++++++- flang/lib/Lower/ConvertExpr.cpp | 139 +++++++++++++++++- flang/lib/Lower/ConvertType.cpp | 67 ++++++++- flang/lib/Lower/Mangler.cpp | 2 +- flang/test/Lower/assignment.f90 | 24 +++ 10 files changed, 381 insertions(+), 48 deletions(-) delete mode 100644 flang/include/flang/Lower/Utils.h create mode 100644 flang/test/Lower/assignment.f90 diff --git a/flang/include/flang/Lower/ConvertExpr.h b/flang/include/flang/Lower/ConvertExpr.h index fde3d612f33a1..459ab71074a0f 100644 --- a/flang/include/flang/Lower/ConvertExpr.h +++ b/flang/include/flang/Lower/ConvertExpr.h @@ -43,6 +43,21 @@ fir::ExtendedValue createSomeExtendedExpression(mlir::Location loc, const SomeExpr &expr, SymMap &symMap); +/// Create an extended expression address. +fir::ExtendedValue createSomeExtendedAddress(mlir::Location loc, + AbstractConverter &converter, + const SomeExpr &expr, + SymMap &symMap); + +// Attribute for an alloca that is a trivial adaptor for converting a value to +// pass-by-ref semantics for a VALUE parameter. The optimizer may be able to +// eliminate these. 
+inline mlir::NamedAttribute getAdaptToByRefAttr(fir::FirOpBuilder &builder) { + return {mlir::StringAttr::get(builder.getContext(), + fir::getAdaptToByRefAttrName()), + builder.getUnitAttr()}; +} + } // namespace Fortran::lower #endif // FORTRAN_LOWER_CONVERTEXPR_H diff --git a/flang/include/flang/Lower/ConvertType.h b/flang/include/flang/Lower/ConvertType.h index 6a815f5affc2e..ea931e28cb3fb 100644 --- a/flang/include/flang/Lower/ConvertType.h +++ b/flang/include/flang/Lower/ConvertType.h @@ -61,6 +61,9 @@ struct Variable; using SomeExpr = evaluate::Expr; using SymbolRef = common::Reference; +// Type for compile time constant length type parameters. +using LenParameterTy = std::int64_t; + /// Get a FIR type based on a category and kind. mlir::Type getFIRType(mlir::MLIRContext *ctxt, common::TypeCategory tc, int kind); @@ -75,7 +78,7 @@ mlir::Type translateDataRefToFIRType(Fortran::lower::AbstractConverter &, /// Translate a SomeExpr to an mlir::Type. mlir::Type translateSomeExprToFIRType(Fortran::lower::AbstractConverter &, - const SomeExpr *expr); + const SomeExpr &expr); /// Translate a Fortran::semantics::Symbol to an mlir::Type. mlir::Type translateSymbolToFIRType(Fortran::lower::AbstractConverter &, diff --git a/flang/include/flang/Lower/Support/Utils.h b/flang/include/flang/Lower/Support/Utils.h index 63b614098fbba..0acd6076ca30c 100644 --- a/flang/include/flang/Lower/Support/Utils.h +++ b/flang/include/flang/Lower/Support/Utils.h @@ -15,11 +15,16 @@ #include "flang/Common/indirection.h" #include "flang/Parser/char-block.h" +#include "flang/Semantics/tools.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/BuiltinAttributes.h" #include "llvm/ADT/StringRef.h" #include +namespace Fortran::lower { +using SomeExpr = Fortran::evaluate::Expr; +} + //===----------------------------------------------------------------------===// // Small inline helper functions to deal with repetitive, clumsy conversions. 
//===----------------------------------------------------------------------===// @@ -46,4 +51,10 @@ const A &removeIndirection(const Fortran::common::Indirection &a) { return a.value(); } +/// Clone subexpression and wrap it as a generic `Fortran::evaluate::Expr`. +template +static Fortran::lower::SomeExpr toEvExpr(const A &x) { + return Fortran::evaluate::AsGenericExpr(Fortran::common::Clone(x)); +} + #endif // FORTRAN_LOWER_SUPPORT_UTILS_H diff --git a/flang/include/flang/Lower/Utils.h b/flang/include/flang/Lower/Utils.h deleted file mode 100644 index d7c7b565dbc6a..0000000000000 --- a/flang/include/flang/Lower/Utils.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- Lower/Utils.h -- utilities ------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef FORTRAN_LOWER_UTILS_H -#define FORTRAN_LOWER_UTILS_H - -#include "flang/Common/indirection.h" -#include "flang/Parser/char-block.h" -#include "llvm/ADT/StringRef.h" - -/// Convert an F18 CharBlock to an LLVM StringRef -inline llvm::StringRef toStringRef(const Fortran::parser::CharBlock &cb) { - return {cb.begin(), cb.size()}; -} - -/// Template helper to remove Fortran::common::Indirection wrappers. 
-template -const A &removeIndirection(const A &a) { - return a; -} -template -const A &removeIndirection(const Fortran::common::Indirection &a) { - return a.value(); -} - -#endif // FORTRAN_LOWER_UTILS_H diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.h b/flang/include/flang/Optimizer/Dialect/FIROps.h index c6d60c0099847..3a67577d1c9a8 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.h +++ b/flang/include/flang/Optimizer/Dialect/FIROps.h @@ -38,6 +38,10 @@ mlir::ParseResult parseSelector(mlir::OpAsmParser &parser, mlir::OpAsmParser::OperandType &selector, mlir::Type &type); +static constexpr llvm::StringRef getAdaptToByRefAttrName() { + return "adapt.valuebyref"; +} + } // namespace fir #define GET_OP_CLASSES diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index cfb326c3af483..bf346ec6f80b2 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -22,6 +22,7 @@ #include "flang/Lower/SymbolMap.h" #include "flang/Lower/Todo.h" #include "flang/Optimizer/Support/FIRContext.h" +#include "flang/Semantics/tools.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Transforms/RegionUtils.h" @@ -77,8 +78,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::ExtendedValue genExprAddr(const Fortran::lower::SomeExpr &expr, mlir::Location *loc = nullptr) override final { - TODO_NOLOC("Not implemented genExprAddr. Needed for more complex " - "expression lowering"); + return createSomeExtendedAddress(loc ? *loc : toLocation(), *this, expr, + localSymbols); } fir::ExtendedValue genExprValue(const Fortran::lower::SomeExpr &expr, @@ -95,9 +96,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { TODO_NOLOC("Not implemented genType DataRef. Needed for more complex " "expression lowering"); } - mlir::Type genType(const Fortran::lower::SomeExpr &) override final { - TODO_NOLOC("Not implemented genType SomeExpr. 
Needed for more complex " - "expression lowering"); + mlir::Type genType(const Fortran::lower::SomeExpr &expr) override final { + return Fortran::lower::translateSomeExprToFIRType(*this, expr); } mlir::Type genType(Fortran::lower::SymbolRef sym) override final { return Fortran::lower::translateSymbolToFIRType(*this, sym); @@ -385,6 +385,19 @@ class FirConverter : public Fortran::lower::AbstractConverter { return true; } + bool isNumericScalarCategory(Fortran::common::TypeCategory cat) { + return cat == Fortran::common::TypeCategory::Integer || + cat == Fortran::common::TypeCategory::Real || + cat == Fortran::common::TypeCategory::Complex || + cat == Fortran::common::TypeCategory::Logical; + } + bool isCharacterCategory(Fortran::common::TypeCategory cat) { + return cat == Fortran::common::TypeCategory::Character; + } + bool isDerivedCategory(Fortran::common::TypeCategory cat) { + return cat == Fortran::common::TypeCategory::Derived; + } + void genFIRBranch(mlir::Block *targetBlock) { assert(targetBlock && "missing unconditional target block"); builder->create(toLocation(), targetBlock); @@ -449,6 +462,112 @@ class FirConverter : public Fortran::lower::AbstractConverter { } } + [[maybe_unused]] static bool + isFuncResultDesignator(const Fortran::lower::SomeExpr &expr) { + const Fortran::semantics::Symbol *sym = + Fortran::evaluate::GetFirstSymbol(expr); + return sym && sym->IsFuncResult(); + } + + static bool isWholeAllocatable(const Fortran::lower::SomeExpr &expr) { + const Fortran::semantics::Symbol *sym = + Fortran::evaluate::UnwrapWholeSymbolOrComponentDataRef(expr); + return sym && Fortran::semantics::IsAllocatable(*sym); + } + + void genAssignment(const Fortran::evaluate::Assignment &assign) { + mlir::Location loc = toLocation(); + + std::visit( + Fortran::common::visitors{ + // [1] Plain old assignment. 
+ [&](const Fortran::evaluate::Assignment::Intrinsic &) { + const Fortran::semantics::Symbol *sym = + Fortran::evaluate::GetLastSymbol(assign.lhs); + + if (!sym) + TODO(loc, "assignment to pointer result of function reference"); + + std::optional lhsType = + assign.lhs.GetType(); + assert(lhsType && "lhs cannot be typeless"); + // Assignment to polymorphic allocatables may require changing the + // variable dynamic type (See Fortran 2018 10.2.1.3 p3). + if (lhsType->IsPolymorphic() && isWholeAllocatable(assign.lhs)) + TODO(loc, "assignment to polymorphic allocatable"); + + // Note: No ad-hoc handling for pointers is required here. The + // target will be assigned as per 2018 10.2.1.3 p2. genExprAddr + // on a pointer returns the target address and not the address of + // the pointer variable. + + if (assign.lhs.Rank() > 0) { + // Array assignment + // See Fortran 2018 10.2.1.3 p5, p6, and p7 + TODO(toLocation(), "Array assignment"); + return; + } + + // Scalar assignment + const bool isNumericScalar = + isNumericScalarCategory(lhsType->category()); + fir::ExtendedValue rhs = isNumericScalar + ? genExprValue(assign.rhs) + : genExprAddr(assign.rhs); + + if (isNumericScalar) { + // Fortran 2018 10.2.1.3 p8 and p9 + // Conversions should have been inserted by semantic analysis, + // but they can be incorrect between the rhs and lhs. Correct + // that here. + mlir::Value addr = fir::getBase(genExprAddr(assign.lhs)); + mlir::Value val = fir::getBase(rhs); + // A function with multiple entry points returning different + // types tags all result variables with one of the largest + // types to allow them to share the same storage. Assignment + // to a result variable of one of the other types requires + // conversion to the actual type. 
+ mlir::Type toTy = genType(assign.lhs); + mlir::Value cast = + builder->convertWithSemantics(loc, toTy, val); + if (fir::dyn_cast_ptrEleTy(addr.getType()) != toTy) { + assert(isFuncResultDesignator(assign.lhs) && "type mismatch"); + addr = builder->createConvert( + toLocation(), builder->getRefType(toTy), addr); + } + builder->create(loc, cast, addr); + } else if (isCharacterCategory(lhsType->category())) { + TODO(toLocation(), "Character assignment"); + } else if (isDerivedCategory(lhsType->category())) { + TODO(toLocation(), "Derived type assignment"); + } else { + llvm_unreachable("unknown category"); + } + }, + + // [2] User defined assignment. If the context is a scalar + // expression then call the procedure. + [&](const Fortran::evaluate::ProcedureRef &procRef) { + TODO(toLocation(), "User defined assignment"); + }, + + // [3] Pointer assignment with possibly empty bounds-spec. R1035: a + // bounds-spec is a lower bound value. + [&](const Fortran::evaluate::Assignment::BoundsSpec &lbExprs) { + TODO(toLocation(), + "Pointer assignment with possibly empty bounds-spec"); + }, + + // [4] Pointer assignment with bounds-remapping. R1036: a + // bounds-remapping is a pair, lower bound and upper bound. 
+ [&](const Fortran::evaluate::Assignment::BoundsRemapping + &boundExprs) { + TODO(toLocation(), "Pointer assignment with bounds-remapping"); + }, + }, + assign.u); + } + void genFIR(const Fortran::parser::CallStmt &stmt) { TODO(toLocation(), "CallStmt lowering"); } @@ -712,7 +831,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { } void genFIR(const Fortran::parser::AssignmentStmt &stmt) { - TODO(toLocation(), "AssignmentStmt lowering"); + genAssignment(*stmt.typedAssignment->v); } void genFIR(const Fortran::parser::SyncAllStmt &stmt) { diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index f97e4409aae93..497d1eaf06a0b 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -37,6 +37,33 @@ // to the correct FIR representation in SSA form. //===----------------------------------------------------------------------===// +/// Place \p exv in memory if it is not already a memory reference. If +/// \p forceValueType is provided, the value is first casted to the provided +/// type before being stored (this is mainly intended for logicals whose value +/// may be `i1` but needed to be stored as Fortran logicals). +static fir::ExtendedValue +placeScalarValueInMemory(fir::FirOpBuilder &builder, mlir::Location loc, + const fir::ExtendedValue &exv, + mlir::Type storageType) { + mlir::Value valBase = fir::getBase(exv); + if (fir::conformsWithPassByRef(valBase.getType())) + return exv; + + assert(!fir::hasDynamicSize(storageType) && + "only expect statically sized scalars to be by value"); + + // Since `a` is not itself a valid referent, determine its value and + // create a temporary location at the beginning of the function for + // referencing. 
+ mlir::Value val = builder.createConvert(loc, storageType, valBase); + mlir::Value temp = builder.createTemporary( + loc, storageType, + llvm::ArrayRef{ + Fortran::lower::getAdaptToByRefAttr(builder)}); + builder.create(loc, val, temp); + return fir::substBase(exv, temp); +} + /// Generate a load of a value from an address. Beware that this will lose /// any dynamic type information for polymorphic entities (note that unlimited /// polymorphic cannot be loaded and must not be provided here). @@ -78,6 +105,14 @@ class ScalarExprLowering { mlir::Location getLoc() { return location; } + template + mlir::Value genunbox(const A &expr) { + ExtValue e = genval(expr); + if (const fir::UnboxedValue *r = e.getUnboxed()) + return *r; + fir::emitFatalError(getLoc(), "unboxed expression expected"); + } + /// Generate an integral constant of `value` template mlir::Value genIntegerConstant(mlir::MLIRContext *context, @@ -256,7 +291,9 @@ class ScalarExprLowering { ExtValue genval(const Fortran::evaluate::Convert, TC2> &convert) { - TODO(getLoc(), "genval convert"); + mlir::Type ty = converter.genType(TC1, KIND); + mlir::Value operand = genunbox(convert.left()); + return builder.convertWithSemantics(getLoc(), ty, operand); } template @@ -330,10 +367,16 @@ class ScalarExprLowering { TODO(getLoc(), "genval ArrayConstructor"); } + ExtValue gen(const Fortran::evaluate::ComplexPart &x) { + TODO(getLoc(), "gen ComplexPart"); + } ExtValue genval(const Fortran::evaluate::ComplexPart &x) { TODO(getLoc(), "genval ComplexPart"); } + ExtValue gen(const Fortran::evaluate::Substring &s) { + TODO(getLoc(), "gen Substring"); + } ExtValue genval(const Fortran::evaluate::Substring &ss) { TODO(getLoc(), "genval Substring"); } @@ -342,10 +385,16 @@ class ScalarExprLowering { TODO(getLoc(), "genval Subscript"); } + ExtValue gen(const Fortran::evaluate::DataRef &dref) { + TODO(getLoc(), "gen DataRef"); + } ExtValue genval(const Fortran::evaluate::DataRef &dref) { TODO(getLoc(), "genval DataRef"); } + 
ExtValue gen(const Fortran::evaluate::Component &cmpt) { + TODO(getLoc(), "gen Component"); + } ExtValue genval(const Fortran::evaluate::Component &cmpt) { TODO(getLoc(), "genval Component"); } @@ -354,19 +403,34 @@ class ScalarExprLowering { TODO(getLoc(), "genval Bound"); } + ExtValue gen(const Fortran::evaluate::ArrayRef &aref) { + TODO(getLoc(), "gen ArrayRef"); + } ExtValue genval(const Fortran::evaluate::ArrayRef &aref) { TODO(getLoc(), "genval ArrayRef"); } + ExtValue gen(const Fortran::evaluate::CoarrayRef &coref) { + TODO(getLoc(), "gen CoarrayRef"); + } ExtValue genval(const Fortran::evaluate::CoarrayRef &coref) { TODO(getLoc(), "genval CoarrayRef"); } + template + ExtValue gen(const Fortran::evaluate::Designator &des) { + return std::visit([&](const auto &x) { return gen(x); }, des.u); + } template ExtValue genval(const Fortran::evaluate::Designator &des) { return std::visit([&](const auto &x) { return genval(x); }, des.u); } + template + ExtValue gen(const Fortran::evaluate::FunctionRef &funcRef) { + TODO(getLoc(), "gen FunctionRef"); + } + template ExtValue genval(const Fortran::evaluate::FunctionRef &funcRef) { TODO(getLoc(), "genval FunctionRef"); @@ -376,11 +440,6 @@ class ScalarExprLowering { TODO(getLoc(), "genval ProcedureRef"); } - template - bool isScalar(const A &x) { - return x.Rank() == 0; - } - template ExtValue genval(const Fortran::evaluate::Expr &x) { if (isScalar(x)) @@ -388,12 +447,73 @@ class ScalarExprLowering { TODO(getLoc(), "genval Expr arrays"); } + /// Helper to detect Transformational function reference. 
+ template + bool isTransformationalRef(const T &) { + return false; + } + template + bool isTransformationalRef(const Fortran::evaluate::FunctionRef &funcRef) { + return !funcRef.IsElemental() && funcRef.Rank(); + } + template + bool isTransformationalRef(Fortran::evaluate::Expr expr) { + return std::visit([&](const auto &e) { return isTransformationalRef(e); }, + expr.u); + } + + template + ExtValue gen(const Fortran::evaluate::Expr &x) { + // Whole array symbols or components, and results of transformational + // functions already have a storage and the scalar expression lowering path + // is used to not create a new temporary storage. + if (isScalar(x) || + Fortran::evaluate::UnwrapWholeSymbolOrComponentDataRef(x) || + isTransformationalRef(x)) + return std::visit([&](const auto &e) { return genref(e); }, x.u); + TODO(getLoc(), "gen Expr non-scalar"); + } + + template + bool isScalar(const A &x) { + return x.Rank() == 0; + } + template ExtValue genval(const Fortran::evaluate::Expr> &exp) { return std::visit([&](const auto &e) { return genval(e); }, exp.u); } + using RefSet = + std::tuple; + template + static constexpr bool inRefSet = Fortran::common::HasMember; + + template >> + ExtValue genref(const A &a) { + return gen(a); + } + template + ExtValue genref(const A &a) { + mlir::Type storageType = converter.genType(toEvExpr(a)); + return placeScalarValueInMemory(builder, getLoc(), genval(a), storageType); + } + + template typename T, + typename B = std::decay_t>, + std::enable_if_t< + std::is_same_v> || + std::is_same_v> || + std::is_same_v>, + bool> = true> + ExtValue genref(const T &x) { + return gen(x); + } + private: mlir::Location location; Fortran::lower::AbstractConverter &converter; @@ -408,3 +528,10 @@ fir::ExtendedValue Fortran::lower::createSomeExtendedExpression( LLVM_DEBUG(expr.AsFortran(llvm::dbgs() << "expr: ") << '\n'); return ScalarExprLowering{loc, converter, symMap}.genval(expr); } + +fir::ExtendedValue 
Fortran::lower::createSomeExtendedAddress( + mlir::Location loc, Fortran::lower::AbstractConverter &converter, + const Fortran::lower::SomeExpr &expr, Fortran::lower::SymMap &symMap) { + LLVM_DEBUG(expr.AsFortran(llvm::dbgs() << "address: ") << '\n'); + return ScalarExprLowering{loc, converter, symMap}.gen(expr); +} diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 848f38b389cc0..39424d3ff0b0a 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -9,8 +9,8 @@ #include "flang/Lower/ConvertType.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/PFTBuilder.h" +#include "flang/Lower/Support/Utils.h" #include "flang/Lower/Todo.h" -#include "flang/Lower/Utils.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Semantics/tools.h" #include "flang/Semantics/type.h" @@ -154,6 +154,39 @@ class TypeBuilder { TypeBuilder(Fortran::lower::AbstractConverter &converter) : converter{converter}, context{&converter.getMLIRContext()} {} + mlir::Type genExprType(const Fortran::lower::SomeExpr &expr) { + std::optional dynamicType = expr.GetType(); + if (!dynamicType) + return genTypelessExprType(expr); + Fortran::common::TypeCategory category = dynamicType->category(); + + mlir::Type baseType; + if (category == Fortran::common::TypeCategory::Derived) { + TODO(converter.getCurrentLocation(), "genExprType derived"); + } else { + // LOGICAL, INTEGER, REAL, COMPLEX, CHARACTER + baseType = genFIRType(context, category, dynamicType->kind()); + } + std::optional shapeExpr = + Fortran::evaluate::GetShape(converter.getFoldingContext(), expr); + fir::SequenceType::Shape shape; + if (shapeExpr) { + translateShape(shape, std::move(*shapeExpr)); + } else { + // Shape static analysis cannot return something useful for the shape. + // Use unknown extents. 
+ int rank = expr.Rank(); + if (rank < 0) + TODO(converter.getCurrentLocation(), + "Assumed rank expression type lowering"); + for (int dim = 0; dim < rank; ++dim) + shape.emplace_back(fir::SequenceType::getUnknownExtent()); + } + if (!shape.empty()) + return fir::SequenceType::get(shape, baseType); + return baseType; + } + template void translateShape(A &shape, Fortran::evaluate::Shape &&shapeExpr) { for (Fortran::evaluate::MaybeExtentExpr extentExpr : shapeExpr) { @@ -171,6 +204,34 @@ class TypeBuilder { converter.getFoldingContext(), std::move(expr))); } + mlir::Type genTypelessExprType(const Fortran::lower::SomeExpr &expr) { + return std::visit( + Fortran::common::visitors{ + [&](const Fortran::evaluate::BOZLiteralConstant &) -> mlir::Type { + return mlir::NoneType::get(context); + }, + [&](const Fortran::evaluate::NullPointer &) -> mlir::Type { + return fir::ReferenceType::get(mlir::NoneType::get(context)); + }, + [&](const Fortran::evaluate::ProcedureDesignator &proc) + -> mlir::Type { + TODO(converter.getCurrentLocation(), + "genTypelessExprType ProcedureDesignator"); + }, + [&](const Fortran::evaluate::ProcedureRef &) -> mlir::Type { + return mlir::NoneType::get(context); + }, + [](const auto &x) -> mlir::Type { + using T = std::decay_t; + static_assert(!Fortran::common::HasMember< + T, Fortran::evaluate::TypelessExpression>, + "missing typeless expr handling in type lowering"); + llvm::report_fatal_error("not a typeless expression"); + }, + }, + expr.u); + } + mlir::Type genSymbolType(const Fortran::semantics::Symbol &symbol, bool isAlloc = false, bool isPtr = false) { mlir::Location loc = converter.genLocation(symbol.name()); @@ -443,8 +504,8 @@ mlir::Type Fortran::lower::translateDataRefToFIRType( } mlir::Type Fortran::lower::translateSomeExprToFIRType( - Fortran::lower::AbstractConverter &converter, const SomeExpr *expr) { - return TypeBuilder{converter}.gen(*expr); + Fortran::lower::AbstractConverter &converter, const SomeExpr &expr) { + return 
TypeBuilder{converter}.genExprType(expr); } mlir::Type Fortran::lower::translateSymbolToFIRType( diff --git a/flang/lib/Lower/Mangler.cpp b/flang/lib/Lower/Mangler.cpp index bc3252b018c83..e58b4d61a71e4 100644 --- a/flang/lib/Lower/Mangler.cpp +++ b/flang/lib/Lower/Mangler.cpp @@ -8,8 +8,8 @@ #include "flang/Lower/Mangler.h" #include "flang/Common/reference.h" +#include "flang/Lower/Support/Utils.h" #include "flang/Lower/Todo.h" -#include "flang/Lower/Utils.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Support/InternalNames.h" #include "flang/Semantics/tools.h" diff --git a/flang/test/Lower/assignment.f90 b/flang/test/Lower/assignment.f90 new file mode 100644 index 0000000000000..6cb2e32095cee --- /dev/null +++ b/flang/test/Lower/assignment.f90 @@ -0,0 +1,24 @@ +! RUN: bbc %s -o "-" -emit-fir | FileCheck %s + +subroutine sub1(a) + integer :: a + a = 1 +end + +! CHECK-LABEL: func @_QPsub1( +! CHECK-SAME: %[[ARG0:.*]]: !fir.ref +! CHECK: %[[C1:.*]] = arith.constant 1 : i32 +! CHECK: fir.store %[[C1]] to %[[ARG0]] : !fir.ref + +subroutine sub2(a, b) + integer(4) :: a + integer(8) :: b + a = b +end + +! CHECK-LABEL: func @_QPsub2( +! CHECK: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"} +! CHECK: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[B_VAL:.*]] = fir.load %arg1 : !fir.ref +! CHECK: %[[B_CONV:.*]] = fir.convert %[[B_VAL]] : (i64) -> i32 +! CHECK: fir.store %[[B_CONV]] to %[[A]] : !fir.ref From 2404313d8023d2a650f4cd12f8b4e334c58f5736 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 17 Feb 2022 09:21:42 -0800 Subject: [PATCH 132/748] [instsimplify] Fix a miscompile with zero sized allocas Remove some code which tried to handle the case of comparing two allocas where an object size could not be precisely computed. This code had zero coverage in tree, and at least one nasty bug. The bug comes from the fact that the code uses the size of the result pointer as a proxy for whether the alloca can be of size zero. 
Since the result of an alloca is *always* a pointer type, and a pointer type can *never* be empty, this check was a nop. As a result, we blindly consider a zero offset from two allocas to never be equal. They can in fact be equal when one or more of the allocas is zero sized. This is particularly ugly because instcombine contains the exact opposite rule. If instcombine reaches the allocas first, it combines them into one (making them equal). If instsimplify reaches the compare first, it would consider them not equal. This creates all kinds of fun scenarios for order of optimization reaching different and contradictory conclusions. --- llvm/lib/Analysis/InstructionSimplify.cpp | 8 -------- llvm/test/Transforms/InstSimplify/compare.ll | 10 ++++++++-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 23f2e06b6e777..7d5b62d9a8048 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2640,14 +2640,6 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); } - - // Repeat the above check but this time without depending on DataLayout - // or being able to compute a precise size. 
- if (!cast(LHS->getType())->isEmptyTy() && - !cast(RHS->getType())->isEmptyTy() && - LHSOffset.isNullValue() && RHSOffset.isNullValue()) - return ConstantInt::get(GetCompareTy(LHS), - !CmpInst::isTrueWhenEqual(Pred)); } // If one side of the equality comparison must come from a noalias call diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index b305296a49769..7daee2a8a8da5 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -2703,7 +2703,10 @@ define <2 x i1> @cttz_slt_bitwidth_splat(<2 x i13> %x) { ; FIXME: A zero sized alloca *can* be equal to another alloca define i1 @zero_sized_alloca1() { ; CHECK-LABEL: @zero_sized_alloca1( -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: [[A:%.*]] = alloca i32, i32 0, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, i32 0, align 4 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A]], [[B]] +; CHECK-NEXT: ret i1 [[RES]] ; %a = alloca i32, i32 0 %b = alloca i32, i32 0 @@ -2713,7 +2716,10 @@ define i1 @zero_sized_alloca1() { define i1 @zero_sized_alloca2() { ; CHECK-LABEL: @zero_sized_alloca2( -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: [[A:%.*]] = alloca i32, i32 0, align 4 +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A]], [[B]] +; CHECK-NEXT: ret i1 [[RES]] ; %a = alloca i32, i32 0 %b = alloca i32 From 9de4fc0f2d3b60542956f7e5254951d049edeb1f Mon Sep 17 00:00:00 2001 From: Shangwu Yao Date: Thu, 17 Feb 2022 09:38:06 -0800 Subject: [PATCH 133/748] [CUDA][SPIRV] Assign global address space to CUDA kernel arguments This patch converts CUDA pointer kernel arguments with default address space to CrossWorkGroup address space (__global in OpenCL). This is because Generic or Function (OpenCL's private) is not supported as storage class for kernel pointer types. 
Differential Revision: https://reviews.llvm.org/D119207 --- clang/lib/Basic/Targets/SPIR.h | 10 +++++----- clang/lib/CodeGen/TargetInfo.cpp | 6 +++--- clang/test/CodeGenCUDASPIRV/kernel-argument.cu | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 clang/test/CodeGenCUDASPIRV/kernel-argument.cu diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index a40d4b3ca27e1..08c49f018ac79 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -144,16 +144,16 @@ class LLVM_LIBRARY_VISIBILITY BaseSPIRTargetInfo : public TargetInfo { // FIXME: SYCL specification considers unannotated pointers and references // to be pointing to the generic address space. See section 5.9.3 of // SYCL 2020 specification. - // Currently, there is no way of representing SYCL's and HIP's default + // Currently, there is no way of representing SYCL's and HIP/CUDA's default // address space language semantic along with the semantics of embedded C's // default address space in the same address space map. Hence the map needs // to be reset to allow mapping to the desired value of 'Default' entry for - // SYCL and HIP. + // SYCL and HIP/CUDA. setAddressSpaceMap( /*DefaultIsGeneric=*/Opts.SYCLIsDevice || - // The address mapping from HIP language for device code is only defined - // for SPIR-V. - (getTriple().isSPIRV() && Opts.HIP && Opts.CUDAIsDevice)); + // The address mapping from HIP/CUDA language for device code is only + // defined for SPIR-V. 
+ (getTriple().isSPIRV() && Opts.CUDAIsDevice)); } void setSupportedOpenCLOpts() override { diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 3e1df744b2ad7..5a2991dfe1762 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -10320,10 +10320,10 @@ void CommonSPIRABIInfo::setCCs() { } ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const { - if (getContext().getLangOpts().HIP) { + if (getContext().getLangOpts().CUDAIsDevice) { // Coerce pointer arguments with default address space to CrossWorkGroup - // pointers for HIPSPV. When the language mode is HIP, the SPIRTargetInfo - // maps cuda_device to SPIR-V's CrossWorkGroup address space. + // pointers for HIPSPV/CUDASPV. When the language mode is HIP/CUDA, the + // SPIRTargetInfo maps cuda_device to SPIR-V's CrossWorkGroup address space. llvm::Type *LTy = CGT.ConvertType(Ty); auto DefaultAS = getContext().getTargetAddressSpace(LangAS::Default); auto GlobalAS = getContext().getTargetAddressSpace(LangAS::cuda_device); diff --git a/clang/test/CodeGenCUDASPIRV/kernel-argument.cu b/clang/test/CodeGenCUDASPIRV/kernel-argument.cu new file mode 100644 index 0000000000000..0ccacffd12a5f --- /dev/null +++ b/clang/test/CodeGenCUDASPIRV/kernel-argument.cu @@ -0,0 +1,17 @@ +// Tests CUDA kernel arguments get global address space when targetting SPIR-V. 
+ +// REQUIRES: clang-driver + +// RUN: %clang -emit-llvm --cuda-device-only --offload=spirv32 \ +// RUN: -nocudalib -nocudainc %s -o %t.bc -c 2>&1 +// RUN: llvm-dis %t.bc -o %t.ll +// RUN: FileCheck %s --input-file=%t.ll + +// RUN: %clang -emit-llvm --cuda-device-only --offload=spirv64 \ +// RUN: -nocudalib -nocudainc %s -o %t.bc -c 2>&1 +// RUN: llvm-dis %t.bc -o %t.ll +// RUN: FileCheck %s --input-file=%t.ll + +// CHECK: define spir_kernel void @_Z6kernelPi(i32 addrspace(1)* noundef %output.coerce) + +__attribute__((global)) void kernel(int* output) { *output = 1; } From cf5e88864b286e5b3433cd2d7995fe9465d57804 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 17 Feb 2022 09:50:32 -0800 Subject: [PATCH 134/748] [instsimplify] When compare allocas, consider their minimal size The code was using exact sizing only, but since what we really need is just to make sure the offsets are in bounds, a minimum bound on the object size is sufficient. To demonstrate the difference, support computing minimum sizes from obects of scalable vector type. 
--- llvm/lib/Analysis/InstructionSimplify.cpp | 1 + llvm/lib/Analysis/MemoryBuiltins.cpp | 6 +++--- llvm/test/Transforms/InstSimplify/compare.ll | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 7d5b62d9a8048..35e93143f96a1 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2631,6 +2631,7 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, (isa(RHS) || isa(RHS))) { uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; + Opts.EvalMode = ObjectSizeOpts::Mode::Min; Opts.NullIsUnknownSize = NullPointerIsDefined(cast(LHS)->getFunction()); if (getObjectSize(LHS, LHSSize, DL, TLI, Opts) && diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 5dc56654512a1..a52b1a8b19ccc 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -659,10 +659,10 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { if (!I.getAllocatedType()->isSized()) return unknown(); - if (isa(I.getAllocatedType())) + TypeSize ElemSize = DL.getTypeAllocSize(I.getAllocatedType()); + if (ElemSize.isScalable() && Options.EvalMode != ObjectSizeOpts::Mode::Min) return unknown(); - - APInt Size(IntTyBits, DL.getTypeAllocSize(I.getAllocatedType())); + APInt Size(IntTyBits, ElemSize.getKnownMinSize()); if (!I.isArrayAllocation()) return std::make_pair(align(Size, I.getAlign()), Zero); diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 7daee2a8a8da5..25a6c19d2291c 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -2727,5 +2727,15 @@ define i1 @zero_sized_alloca2() { ret i1 %res } +define i1 @scalar_vectors_are_non_empty() { +; CHECK-LABEL: @scalar_vectors_are_non_empty( +; CHECK-NEXT: ret i1 true +; + %a = 
alloca + %b = alloca + %res = icmp ne * %a, %b + ret i1 %res +} + attributes #0 = { null_pointer_is_valid } From 27f72eb25e366cf6fd79ea7495fec5d926a5b895 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 17 Feb 2022 09:34:58 -0800 Subject: [PATCH 135/748] [SLP][NFC]Add another test for swapped main/alternate cmp, NFC. --- .../SLPVectorizer/X86/cmp-as-alternate-ops.ll | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll index 7463fa08b2143..0d17fc440cd97 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll @@ -41,3 +41,50 @@ define void @test(double* %0, double %1) { store double %15, double* %4, align 8 br label %6 } + +define { <2 x float>, <2 x float> } @test1(i32 %conv.i32.i.i.i) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV_I32_I_I_I1:%.*]] = fptosi float 0.000000e+00 to i32 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I1]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> zeroinitializer, <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[RETVAL_SROA_0_0_VEC_INSERT4:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 +; CHECK-NEXT: [[RETVAL_SROA_0_4_VEC_INSERT7:%.*]] = insertelement <2 x float> [[RETVAL_SROA_0_0_VEC_INSERT4]], float [[TMP7]], i64 1 +; CHECK-NEXT: 
[[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 +; CHECK-NEXT: [[RETVAL_SROA_7_8_VEC_INSERT11:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 +; CHECK-NEXT: [[RETVAL_SROA_7_12_VEC_INSERT13:%.*]] = insertelement <2 x float> [[RETVAL_SROA_7_8_VEC_INSERT11]], float [[TMP9]], i64 1 +; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } zeroinitializer, <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT7]], 0 +; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[RETVAL_SROA_7_12_VEC_INSERT13]], 1 +; CHECK-NEXT: ret { <2 x float>, <2 x float> } zeroinitializer +; +entry: + %cmp.i.i.i.i.i = icmp slt i32 0, 0 + %cond.i.i.i.i = select i1 %cmp.i.i.i.i.i, float 0.000000e+00, float 0.000000e+00 + %conv.i32.i.i.i1 = fptosi float 0.000000e+00 to i32 + %cmp.i.i34.i.i.i = icmp slt i32 %conv.i32.i.i.i1, 0 + %cond.i35.i.i.i = select i1 %cmp.i.i34.i.i.i, float 0.000000e+00, float 0.000000e+00 + %cmp.i.i38.i.i.i = icmp sgt i32 0, 0 + %cond.i39.i.i.i = select i1 %cmp.i.i38.i.i.i, float 0.000000e+00, float 0.000000e+00 + %cmp.i.i42.i.i.i = icmp sgt i32 %conv.i32.i.i.i, 0 + %cond.i43.i.i.i = select i1 %cmp.i.i42.i.i.i, float 0.000000e+00, float 0.000000e+00 + %add.i.i = fadd float 0.000000e+00, 0.000000e+00 + %add4.i.i = fadd float 0.000000e+00, 0.000000e+00 + %add.i9.i = fadd float %cond.i43.i.i.i, %add.i.i + %retval.sroa.0.0.vec.insert4 = insertelement <2 x float> zeroinitializer, float %add.i9.i, i64 0 + %add4.i12.i = fadd float %cond.i39.i.i.i, %add4.i.i + %retval.sroa.0.4.vec.insert7 = insertelement <2 x float> %retval.sroa.0.0.vec.insert4, float %add4.i12.i, i64 1 + %add.i15.i = fadd float %cond.i35.i.i.i, %add.i.i + %retval.sroa.7.8.vec.insert11 = insertelement <2 x float> zeroinitializer, float %add.i15.i, i64 0 + %add4.i18.i = fadd float %cond.i.i.i.i, %add4.i.i + %retval.sroa.7.12.vec.insert13 
= insertelement <2 x float> %retval.sroa.7.8.vec.insert11, float %add4.i18.i, i64 1 + %.fca.0.insert = insertvalue { <2 x float>, <2 x float> } zeroinitializer, <2 x float> %retval.sroa.0.4.vec.insert7, 0 + %.fca.1.insert = insertvalue { <2 x float>, <2 x float> } %.fca.0.insert, <2 x float> %retval.sroa.7.12.vec.insert13, 1 + ret { <2 x float>, <2 x float> } zeroinitializer +} From 4a26abc0b9ec426f562b7ac6baa5db3d8636a12f Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 17 Feb 2022 10:00:26 -0800 Subject: [PATCH 136/748] [InstCombine][OpaquePtr] Check store type in DSE implementation --- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 6 ++++-- llvm/test/Transforms/InstCombine/opaque-ptr.ll | 11 +++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 756792918dba5..fffef500312f6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1395,8 +1395,10 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { if (StoreInst *PrevSI = dyn_cast(BBI)) { // Prev store isn't volatile, and stores to the same location? 
- if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), - SI.getOperand(1))) { + if (PrevSI->isUnordered() && + equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1)) && + PrevSI->getValueOperand()->getType() == + SI.getValueOperand()->getType()) { ++NumDeadStore; // Manually add back the original store to the worklist now, so it will // be processed after the operands of the removed store, as this may diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll index 87326008386d1..1826cfdf43bbb 100644 --- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll +++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll @@ -482,3 +482,14 @@ define ptr @select_of_gep_different_type(i1 %c, ptr %p) { %s = select i1 %c, ptr %gep1, ptr %gep2 ret ptr %s } + +define void @dse(ptr %p) { +; CHECK-LABEL: @dse( +; CHECK-NEXT: store i32 0, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store i8 1, ptr [[P]], align 1 +; CHECK-NEXT: ret void +; + store i32 0, ptr %p + store i8 1, ptr %p + ret void +} From e0e174845b08b36a3888f47f6b06e496f75cf847 Mon Sep 17 00:00:00 2001 From: Artem Dergachev Date: Wed, 16 Feb 2022 21:09:09 -0800 Subject: [PATCH 137/748] [analyzer] Fix a crash in NoStateChangeVisitor with body-farmed stack frames. LocationContext::getDecl() isn't useful for obtaining the "farmed" body because the (synthetic) body statement isn't actually attached to the (natural-grown) declaration in the AST. 
Differential Revision: https://reviews.llvm.org/D119509 --- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 10 ++++++- clang/test/Analysis/malloc-bodyfarms.c | 19 +++++++++++++ clang/test/Analysis/malloc-bodyfarms.cpp | 28 +++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 clang/test/Analysis/malloc-bodyfarms.c create mode 100644 clang/test/Analysis/malloc-bodyfarms.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 57080a84451ad..63194d69d6363 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -797,8 +797,16 @@ class NoOwnershipChangeVisitor final : public NoStateChangeFuncVisitor { bool doesFnIntendToHandleOwnership(const Decl *Callee, ASTContext &ACtx) { using namespace clang::ast_matchers; const FunctionDecl *FD = dyn_cast(Callee); - if (!FD) + + // Given that the stack frame was entered, the body should always be + // theoretically obtainable. In case of body farms, the synthesized body + // is not attached to declaration, thus triggering the '!FD->hasBody()' + // branch. That said, would a synthesized body ever intend to handle + // ownership? As of today they don't. And if they did, how would we + // put notes inside it, given that it doesn't match any source locations? + if (!FD || !FD->hasBody()) return false; + // TODO: Operator delete is hardly the only deallocator -- Can we reuse // isFreeingCall() or something thats already here? 
auto Deallocations = match( diff --git a/clang/test/Analysis/malloc-bodyfarms.c b/clang/test/Analysis/malloc-bodyfarms.c new file mode 100644 index 0000000000000..c613a6f52dbc9 --- /dev/null +++ b/clang/test/Analysis/malloc-bodyfarms.c @@ -0,0 +1,19 @@ +// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker core,unix -verify %s + +typedef __typeof(sizeof(int)) size_t; +void *calloc(size_t, size_t); + +typedef struct dispatch_queue_s *dispatch_queue_t; +typedef void (^dispatch_block_t)(void); +void dispatch_sync(dispatch_queue_t, dispatch_block_t); + +void test_no_state_change_in_body_farm(dispatch_queue_t queue) { + dispatch_sync(queue, ^{}); // no-crash + calloc(1, 1); +} // expected-warning{{Potential memory leak}} + +void test_no_state_change_in_body_farm_2(dispatch_queue_t queue) { + void *p = calloc(1, 1); + dispatch_sync(queue, ^{}); // no-crash + p = 0; +} // expected-warning{{Potential leak of memory pointed to by 'p'}} diff --git a/clang/test/Analysis/malloc-bodyfarms.cpp b/clang/test/Analysis/malloc-bodyfarms.cpp new file mode 100644 index 0000000000000..f09b2fef9b5a7 --- /dev/null +++ b/clang/test/Analysis/malloc-bodyfarms.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker core,unix -verify %s + +namespace std { +typedef struct once_flag_s { + int _M_once = 0; +} once_flag; + +template +void call_once(once_flag &o, Callable&& func, Args&&... 
args); +} // namespace std + +typedef __typeof(sizeof(int)) size_t; +void *malloc(size_t); + +void callee() {} + +void test_no_state_change_in_body_farm() { + std::once_flag flag; + call_once(flag, callee); // no-crash + malloc(1); +} // expected-warning{{Potential memory leak}} + +void test_no_state_change_in_body_farm_2() { + void *p = malloc(1); + std::once_flag flag; + call_once(flag, callee); // no-crash + p = 0; +} // expected-warning{{Potential leak of memory pointed to by 'p'}} From b9f4dff8ab40250aac2343e86c1289de46af5585 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Thu, 17 Feb 2022 18:20:23 +0000 Subject: [PATCH 138/748] [Driver][Fuchsia][NFC] Use GetLinkerPath to see if linker is lld Reviewed By: phosek Differential Revision: https://reviews.llvm.org/D120074 --- clang/lib/Driver/ToolChains/Fuchsia.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 9e0b259dfcae7..1b60541ee8463 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -53,9 +53,9 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-z"); CmdArgs.push_back("now"); - const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - if (llvm::sys::path::filename(Exec).equals_insensitive("ld.lld") || - llvm::sys::path::stem(Exec).equals_insensitive("ld.lld")) { + bool IsLLD; + const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath(&IsLLD)); + if (IsLLD) { CmdArgs.push_back("-z"); CmdArgs.push_back("rodynamic"); CmdArgs.push_back("-z"); From cf426100d665e8368abfc036a734b2402f94fac0 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Thu, 17 Feb 2022 02:12:06 +0100 Subject: [PATCH 139/748] [SystemZ] Improve emission of alignment hints. Handle multiple memoperands in lowerAlignmentHint(). 
Review: Ulrich Weigand --- llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 14 +++++--- llvm/test/CodeGen/SystemZ/vec-move-03.ll | 32 +++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 5c255967bc870..46538f966cd90 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -88,13 +88,19 @@ static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) { // an instruction with the corresponding hint set. static void lowerAlignmentHint(const MachineInstr *MI, MCInst &LoweredMI, unsigned Opcode) { - if (!MI->hasOneMemOperand()) + if (MI->memoperands_empty()) return; - const MachineMemOperand *MMO = *MI->memoperands_begin(); + + Align Alignment = Align(16); + for (MachineInstr::mmo_iterator MMOI = MI->memoperands_begin(), + EE = MI->memoperands_end(); MMOI != EE; ++MMOI) + if ((*MMOI)->getAlign() < Alignment) + Alignment = (*MMOI)->getAlign(); + unsigned AlignmentHint = 0; - if (MMO->getAlign() >= Align(16)) + if (Alignment >= Align(16)) AlignmentHint = 4; - else if (MMO->getAlign() >= Align(8)) + else if (Alignment >= Align(8)) AlignmentHint = 3; if (AlignmentHint == 0) return; diff --git a/llvm/test/CodeGen/SystemZ/vec-move-03.ll b/llvm/test/CodeGen/SystemZ/vec-move-03.ll index 4f5bb0c374f4a..6e5b314d75618 100644 --- a/llvm/test/CodeGen/SystemZ/vec-move-03.ll +++ b/llvm/test/CodeGen/SystemZ/vec-move-03.ll @@ -182,3 +182,35 @@ define void @f19(<16 x i8> %val, <16 x i8> *%ptr) { ret void } +; Test that the alignment hint for VST is emitted also when CFG optimizer +; replaces two VSTs with just one that then carries two memoperands. 
+define void @f20() { +; CHECK-LABEL: f20: +; CHECK: vst %v0, 0(%r1), 3 +; CHECK-NOT: vst +entry: + switch i32 undef, label %exit [ + i32 1, label %bb1 + i32 2, label %bb2 + ] + +bb1: + %C1 = call i64* @foo() + %I1 = insertelement <2 x i64*> poison, i64* %C1, i64 0 + %S1 = shufflevector <2 x i64*> %I1, <2 x i64*> poison, <2 x i32> zeroinitializer + store <2 x i64*> %S1, <2 x i64*>* undef, align 8 + br label %exit + +bb2: + %C2 = call i64* @foo() + %I2 = insertelement <2 x i64*> poison, i64* %C2, i64 0 + %S2 = shufflevector <2 x i64*> %I2, <2 x i64*> poison, <2 x i32> zeroinitializer + %U = bitcast i64** undef to <2 x i64*>* + store <2 x i64*> %S2, <2 x i64*>* %U, align 8 + br label %exit + +exit: + ret void +} + +declare i64* @foo() From 383f3a467c92499956ed804eb2bd69ad8576615b Mon Sep 17 00:00:00 2001 From: Mike Rice Date: Wed, 16 Feb 2022 13:58:45 -0800 Subject: [PATCH 140/748] [OpenMP] Diagnose bad 'omp declare variant' that references itself. When an a variant is specified that is the same as the base function the compiler will end up crashing in CodeGen. Give an error instead. 
Differential Revision: https://reviews.llvm.org/D119979 --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 ++ clang/lib/Sema/SemaOpenMP.cpp | 7 +++++++ clang/test/OpenMP/declare_variant_messages.c | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e7c204fef2a09..8af1bed7b67f1 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10830,6 +10830,8 @@ def err_omp_interop_type_not_found : Error< def err_omp_declare_variant_incompat_types : Error< "variant in '#pragma omp declare variant' with type %0 is incompatible with" " type %1%select{| with appended arguments}2">; +def err_omp_declare_variant_same_base_function : Error< + "variant in '#pragma omp declare variant' is the same as the base function">; def warn_omp_declare_variant_marked_as_declare_variant : Warning< "variant function in '#pragma omp declare variant' is itself marked as '#pragma omp declare variant'" >, InGroup; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 79823fcf148b7..64647f59fcb5f 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -7171,6 +7171,13 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, return None; } + if (FD->getCanonicalDecl() == NewFD->getCanonicalDecl()) { + Diag(VariantRef->getExprLoc(), + diag::err_omp_declare_variant_same_base_function) + << VariantRef->getSourceRange(); + return None; + } + // Check if function types are compatible in C. 
if (!LangOpts.CPlusPlus) { QualType NewType = diff --git a/clang/test/OpenMP/declare_variant_messages.c b/clang/test/OpenMP/declare_variant_messages.c index 66ead8909ad8f..a049285cdb01c 100644 --- a/clang/test/OpenMP/declare_variant_messages.c +++ b/clang/test/OpenMP/declare_variant_messages.c @@ -113,6 +113,15 @@ int bar(void) { return after_use(); } +// expected-error@+1 {{variant in '#pragma omp declare variant' is the same as the base function}} +#pragma omp declare variant (self) \ + match(construct={dispatch}, device={arch(arm)}) +void self(int n); + +void self_test(int n, int d_no) { + #pragma omp dispatch device(d_no) nowait + self(n); +} #pragma omp declare variant(after_use_variant) match(xxx={}) // expected-warning {{'xxx' is not a valid context set in a `declare variant`; set ignored}} expected-warning {{'#pragma omp declare variant' cannot be applied for function after first usage; the original function might be used}} expected-note {{context set options are: 'construct' 'device' 'implementation' 'user'}} expected-note {{the ignored set spans until here}} int after_use(void); From 5364b36868210364b2ccf8e9f9169ed1fd545ae0 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Thu, 17 Feb 2022 18:41:49 +0000 Subject: [PATCH 141/748] Revert "[Driver][Fuchsia][NFC] Use GetLinkerPath to see if linker is lld" This reverts commit b9f4dff8ab40250aac2343e86c1289de46af5585. 
--- clang/lib/Driver/ToolChains/Fuchsia.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 1b60541ee8463..9e0b259dfcae7 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -53,9 +53,9 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-z"); CmdArgs.push_back("now"); - bool IsLLD; - const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath(&IsLLD)); - if (IsLLD) { + const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); + if (llvm::sys::path::filename(Exec).equals_insensitive("ld.lld") || + llvm::sys::path::stem(Exec).equals_insensitive("ld.lld")) { CmdArgs.push_back("-z"); CmdArgs.push_back("rodynamic"); CmdArgs.push_back("-z"); From 7c3e2b92cf66fd7cf84e59e10fb911d0887c5788 Mon Sep 17 00:00:00 2001 From: Daniil Suchkov Date: Wed, 16 Feb 2022 23:21:15 +0000 Subject: [PATCH 142/748] [RewriteStatepointsForGC] Fix an incorrect assertion The assertion verifying that a newly computed value matches what is already cached used stripPointerCasts() to strip bitcasts, however the values can be not only pointers, but also vectors of pointers. That is problematic because stripPointerCasts() doesn't handle vectors of pointers. This patch introduces an ad-hoc utility function to strip all bitcasts regardless of the value type. 
Reviewed By: skatkov, reames Differential Revision: https://reviews.llvm.org/D119994 --- llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 10 +++++++++- .../RewriteStatepointsForGC/phi-vector-bitcast.ll | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 6be0c7dc3adac..9d7ea43629d8b 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1164,13 +1164,21 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) { #ifndef NDEBUG Value *OldBase = BlockToValue[InBB]; Value *Base = getBaseForInput(InVal, nullptr); + + // We can't use `stripPointerCasts` instead of this function because + // `stripPointerCasts` doesn't handle vectors of pointers. + auto StripBitCasts = [](Value *V) -> Value * { + while (auto *BC = dyn_cast(V)) + V = BC->getOperand(0); + return V; + }; // In essence this assert states: the only way two values // incoming from the same basic block may be different is by // being different bitcasts of the same value. A cleanup // that remains TODO is changing findBaseOrBDV to return an // llvm::Value of the correct type (and still remain pure). // This will remove the need to add bitcasts. 
- assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() && + assert(StripBitCasts(Base) == StripBitCasts(OldBase) && "findBaseOrBDV should be pure!"); #endif } diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll b/llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll index 6f69234cc993e..f87b84ef3480b 100644 --- a/llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll +++ b/llvm/test/Transforms/RewriteStatepointsForGC/phi-vector-bitcast.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; REQUIRES: asserts ; RUN: opt < %s -disable-output -passes=rewrite-statepoints-for-gc From 21e5a5f0cfab0268e0e089170f4f8f61d89597cf Mon Sep 17 00:00:00 2001 From: Kuba Mracek Date: Wed, 16 Feb 2022 21:20:04 -0800 Subject: [PATCH 143/748] [GlobalDCE] [VFE] Add a test for incorrect VFE behavior in presence of null/invalid vtable entries Add a test for VFE where there's several vtables, and one of them contains an invalid entry (from VFE's perspective), and which causes VFE to incorrectly skip scanning subsequent vtables and drop their dependencies. 
--- .../GlobalDCE/virtual-functions-null.ll | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll new file mode 100644 index 0000000000000..33be6451fa3f7 --- /dev/null +++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -globaldce -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare { i8*, i1 } @llvm.type.checked.load(i8*, i32, metadata) + +@vtableA = internal unnamed_addr constant { [2 x i8*] } { [2 x i8*] [ + i8* null, + i8* bitcast (void ()* @vfunc2 to i8*) +]}, align 8, !type !{i64 0, !"vfunc1.type"}, !type !{i64 8, !"vfunc2.type"}, !vcall_visibility !{i64 2} + +; CHECK: @vtableA = internal unnamed_addr constant { [2 x i8*] } { [2 x i8*] [ +; CHECK-SAME: i8* null, +; CHECK-SAME: i8* bitcast (void ()* @vfunc2 to i8*) +; CHECK-SAME: ] }, align 8 + +@vtableB = internal unnamed_addr constant { [2 x i8*] } { [2 x i8*] [ + i8* bitcast (void ()* @vfunc1 to i8*), + i8* bitcast (void ()* @vfunc2 to i8*) +]}, align 8, !type !{i64 0, !"vfunc1.type"}, !type !{i64 8, !"vfunc2.type"}, !vcall_visibility !{i64 2} + +; CHECK: @vtableB = internal unnamed_addr constant { [2 x i8*] } { [2 x i8*] [ +; CHECK-SAME: i8* null, +; CHECK-SAME: i8* bitcast (void ()* @vfunc2 to i8*) +; CHECK-SAME: ] }, align 8 + +define internal void @vfunc1() { + ret void +} + +define internal void @vfunc2() { + ret void +} + +define void @main() { + %1 = ptrtoint { [2 x i8*] }* @vtableA to i64 ; to keep @vtableA alive + %2 = ptrtoint { [2 x i8*] }* @vtableB to i64 ; to keep @vtableB alive + %3 = tail call { i8*, i1 } @llvm.type.checked.load(i8* null, i32 0, metadata !"vfunc1.type") + %4 = tail call { i8*, i1 } @llvm.type.checked.load(i8* null, i32 0, metadata !"vfunc2.type") + ret void +} + +!999 = !{i32 1, !"Virtual 
Function Elim", i32 1} +!llvm.module.flags = !{!999} From 254d6da02067cab500231b0ddb4db67819645cb0 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Thu, 17 Feb 2022 12:48:32 -0600 Subject: [PATCH 144/748] [Attributor][FIX] Ensure stable iteration order With https://github.com/llvm/llvm-project/commit/668c5c688be7ab0af37739bbbe2d653be82d5c6f we introduced an ordering issue revealed by the reverse iteration buildbot. Depending on the order of the map that tracks the AAIsDead AAs we ended up with slightly different attributes. This is not totally unexpected and can happen. We should however be deterministic in our orderings to avoid such issues. --- llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 0ad64dbb45953..c94f38687b219 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" @@ -270,7 +271,7 @@ static bool genericValueTraversal( const AAIsDead *LivenessAA = nullptr; bool AnyDead = false; }; - DenseMap LivenessAAs; + SmallMapVector LivenessAAs; auto GetLivenessInfo = [&](const Function &F) -> LivenessInfo & { LivenessInfo &LI = LivenessAAs[&F]; if (!LI.LivenessAA) From 5824d2bb0f036e631419ae0993fd03d633398266 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 17 Feb 2022 13:52:07 -0500 Subject: [PATCH 145/748] Fix the declaration printer to properly handle prototypes in C Previously, we would take a declaration like void f(void) and print it as void f(). That's correct in C++ as far as it goes, but is incorrect in C because that converts the function from having a prototype to one which does not. 
This turns out to matter for some of our tests that use the pretty printer where we'd like to get rid of the K&R prototypes from the test but can't because the test is checking the pretty printed function signature, as done with the ARCMT tests. --- clang/lib/AST/DeclPrinter.cpp | 4 ++++ clang/test/Analysis/cfg.c | 2 +- clang/test/Analysis/designated-initializer.c | 2 +- .../Analysis/std-c-library-functions-vs-stream-checker.c | 2 +- clang/test/Analysis/std-c-library-functions.c | 2 +- clang/test/OpenMP/declare_mapper_ast_print.c | 2 +- clang/test/OpenMP/declare_reduction_ast_print.c | 2 +- clang/test/OpenMP/declare_variant_ast_print.c | 4 ++-- clang/test/OpenMP/metadirective_ast_print.c | 4 ++-- clang/test/PCH/chain-decls.c | 4 ++-- clang/test/PCH/chain-macro.c | 4 ++-- clang/test/Sema/attr-print.c | 5 ++--- clang/test/SemaObjC/static-ivar-ref-1.m | 2 +- 13 files changed, 21 insertions(+), 18 deletions(-) diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp index c3f1d1544f79a..faafe307f03cf 100644 --- a/clang/lib/AST/DeclPrinter.cpp +++ b/clang/lib/AST/DeclPrinter.cpp @@ -680,6 +680,10 @@ void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) { if (FT->isVariadic()) { if (D->getNumParams()) POut << ", "; POut << "..."; + } else if (!D->getNumParams() && !Context.getLangOpts().CPlusPlus) { + // The function has a prototype, so it needs to retain the prototype + // in C. 
+ POut << "void"; } } else if (D->doesThisDeclarationHaveABody() && !D->hasPrototype()) { for (unsigned i = 0, e = D->getNumParams(); i != e; ++i) { diff --git a/clang/test/Analysis/cfg.c b/clang/test/Analysis/cfg.c index 4bd84e689f251..fc2523859e49b 100644 --- a/clang/test/Analysis/cfg.c +++ b/clang/test/Analysis/cfg.c @@ -40,7 +40,7 @@ void checkWrap(int i) { } } -// CHECK-LABEL: void checkGCCAsmRValueOutput() +// CHECK-LABEL: void checkGCCAsmRValueOutput(void) // CHECK: [B2 (ENTRY)] // CHECK-NEXT: Succs (1): B1 // CHECK: [B1] diff --git a/clang/test/Analysis/designated-initializer.c b/clang/test/Analysis/designated-initializer.c index aba037a3f49bb..6274ed12911a4 100644 --- a/clang/test/Analysis/designated-initializer.c +++ b/clang/test/Analysis/designated-initializer.c @@ -16,7 +16,7 @@ void test(void) { }; } -// CHECK: void test() +// CHECK: void test(void) // CHECK: [B1] // CHECK: 1: getUQ // CHECK: 2: [B1.1] (ImplicitCastExpr, FunctionToPointerDecay, union UQ (*)(void)) diff --git a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c index 61106f1f8d6bc..e895b54158de3 100644 --- a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c +++ b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c @@ -28,7 +28,7 @@ // Verify that the summaries are loaded when the StdLibraryFunctionsChecker is // enabled. 
-// CHECK: Loaded summary for: int getchar() +// CHECK: Loaded summary for: int getchar(void) // CHECK-NEXT: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: unsigned long fwrite(const void *restrict, size_t, size_t, FILE *restrict) diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index 50b36dc84e68b..a4032298734d0 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -52,7 +52,7 @@ // CHECK-NEXT: Loaded summary for: int isxdigit(int) // CHECK-NEXT: Loaded summary for: int getc(FILE *) // CHECK-NEXT: Loaded summary for: int fgetc(FILE *) -// CHECK-NEXT: Loaded summary for: int getchar() +// CHECK-NEXT: Loaded summary for: int getchar(void) // CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: ssize_t read(int, void *, size_t) diff --git a/clang/test/OpenMP/declare_mapper_ast_print.c b/clang/test/OpenMP/declare_mapper_ast_print.c index 55ebd8334c587..6b9686f0a15b8 100644 --- a/clang/test/OpenMP/declare_mapper_ast_print.c +++ b/clang/test/OpenMP/declare_mapper_ast_print.c @@ -41,7 +41,7 @@ struct dat { #pragma omp declare mapper(struct dat d) map(to: d.d) // CHECK: #pragma omp declare mapper (default : struct dat d) map(to: d.d){{$}} -// CHECK: int main() { +// CHECK: int main(void) { int main(void) { #pragma omp declare mapper(id: struct vec v) map(v.len) // CHECK: #pragma omp declare mapper (id : struct vec v) map(tofrom: v.len) diff --git a/clang/test/OpenMP/declare_reduction_ast_print.c b/clang/test/OpenMP/declare_reduction_ast_print.c index cdc1685b14ef2..74c28b3219f51 100644 --- a/clang/test/OpenMP/declare_reduction_ast_print.c +++ b/clang/test/OpenMP/declare_reduction_ast_print.c @@ 
-31,7 +31,7 @@ void init(struct SSS *priv, struct SSS orig); #pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig)) // CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig)) -// CHECK: int main() { +// CHECK: int main(void) { int main(void) { #pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig)) // CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig)) diff --git a/clang/test/OpenMP/declare_variant_ast_print.c b/clang/test/OpenMP/declare_variant_ast_print.c index d5c2d440f25ae..0df10263cde5e 100644 --- a/clang/test/OpenMP/declare_variant_ast_print.c +++ b/clang/test/OpenMP/declare_variant_ast_print.c @@ -25,7 +25,7 @@ int foo(void); #pragma omp declare variant(foo) match(implementation={extension(match_none)}) int bar(void); -// CHECK: int foo(); +// CHECK: int foo(void); // CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={extension(match_none)}) // CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={extension(match_any)}) // CHECK-NEXT: #pragma omp declare variant(foo) match(implementation={extension(match_all)}) @@ -41,4 +41,4 @@ int bar(void); // CHECK-NEXT: #pragma omp declare variant(foo) match(construct={parallel}) // CHECK-NEXT: #pragma omp declare variant(foo) match(construct={teams}) // CHECK-NEXT: #pragma omp declare variant(foo) match(construct={target}) -// CHECK-NEXT: int bar(); +// CHECK-NEXT: int bar(void); diff --git a/clang/test/OpenMP/metadirective_ast_print.c b/clang/test/OpenMP/metadirective_ast_print.c index fbd7e2291330f..6c75cb0592d6c 100644 --- a/clang/test/OpenMP/metadirective_ast_print.c +++ b/clang/test/OpenMP/metadirective_ast_print.c @@ -59,8 +59,8 @@ void foo(void) { } } -// CHECK: void bar(); -// CHECK: void foo() +// CHECK: void bar(void); +// CHECK: void foo(void) // CHECK-NEXT: 
#pragma omp parallel // CHECK-NEXT: bar() // CHECK-NEXT: #pragma omp parallel diff --git a/clang/test/PCH/chain-decls.c b/clang/test/PCH/chain-decls.c index 9011f22cdd59c..b745318c091ef 100644 --- a/clang/test/PCH/chain-decls.c +++ b/clang/test/PCH/chain-decls.c @@ -9,8 +9,8 @@ // expected-no-diagnostics -// CHECK: void f(); -// CHECK: void g(); +// CHECK: void f(void); +// CHECK: void g(void); int h(void) { f(); diff --git a/clang/test/PCH/chain-macro.c b/clang/test/PCH/chain-macro.c index b0fd63de46d5a..2aa69661e6827 100644 --- a/clang/test/PCH/chain-macro.c +++ b/clang/test/PCH/chain-macro.c @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -ast-print -include-pch %t2 %s | FileCheck %s // expected-no-diagnostics -// CHECK: void f(); +// CHECK: void f(void); FOOBAR -// CHECK: void g(); +// CHECK: void g(void); BARFOO diff --git a/clang/test/Sema/attr-print.c b/clang/test/Sema/attr-print.c index e2364b60dc1d7..cf6b2463c7bbb 100644 --- a/clang/test/Sema/attr-print.c +++ b/clang/test/Sema/attr-print.c @@ -10,11 +10,10 @@ __declspec(align(4)) int y; // CHECK: short arr[3] __attribute__((aligned)); short arr[3] __attribute__((aligned)); -// FIXME: -ast-print is printing this function signature with a K&R C style. 
-// CHECK: void foo() __attribute__((const)); +// CHECK: void foo(void) __attribute__((const)); void foo(void) __attribute__((const)); -// CHECK: void bar() __attribute__((__const)); +// CHECK: void bar(void) __attribute__((__const)); void bar(void) __attribute__((__const)); // CHECK: int * __ptr32 p32; diff --git a/clang/test/SemaObjC/static-ivar-ref-1.m b/clang/test/SemaObjC/static-ivar-ref-1.m index ff48d93508179..cc240091a3f66 100644 --- a/clang/test/SemaObjC/static-ivar-ref-1.m +++ b/clang/test/SemaObjC/static-ivar-ref-1.m @@ -24,7 +24,7 @@ int foo(void) // CHECK: } // CHECK: @end // CHECK: current *pc; -// CHECK: int foo() { +// CHECK: int foo(void) { // CHECK: return pc->ivar2 + (*pc).ivar + pc->ivar1; // CHECK: } From 027c16bef4b727095eea00bbef9266f1f4a78c27 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 17 Feb 2022 10:43:12 -0800 Subject: [PATCH 146/748] [X86ISelLowering] permit BlockAddressSDNode "i" constraints for PIC When building 32b x86 code as PIC, the existing handling of "i" constraints is conservative since generally we have to go through the GOT to find references to functions. But generally, BlockAddresses from C code refer to the Function in the current TU. Permit BlockAddresses to be used with the "i" constraint for those cases. 
I regressed this in commit 4edb9983cb8c ("[SelectionDAG] treat X constrained labels as i for asm") Fixes: https://github.com/llvm/llvm-project/issues/53868 Reviewed By: efriedma, MaskRay Differential Revision: https://reviews.llvm.org/D119905 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 ++-- llvm/test/CodeGen/X86/inline-asm-pic.ll | 38 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4c622568f8d0d..f2509dc9e7a83 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -54789,8 +54789,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't - // be used as immediates. - if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) + // be used as immediates. BlockAddresses are fine though. + if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) && + !isa(Op)) return; // If we are in non-pic codegen mode, we allow the address of a global (with diff --git a/llvm/test/CodeGen/X86/inline-asm-pic.ll b/llvm/test/CodeGen/X86/inline-asm-pic.ll index 7aeb1bfbdf416..503f8db91a29b 100644 --- a/llvm/test/CodeGen/X86/inline-asm-pic.ll +++ b/llvm/test/CodeGen/X86/inline-asm-pic.ll @@ -18,3 +18,41 @@ entry: tail call void asm "mov $1,%gs:$0", "=*m,ri,~{dirflag},~{fpsr},~{flags}"(i8** elementtype(i8*) inttoptr (i32 152 to i8**), i8* bitcast (i8** @main_q to i8*)) nounwind ret void } + +; The intent of this test is to ensure that we handle blockaddress' correctly +; with "i" constraints for -m32 -fPIC. 
+ +define void @x() { +; CHECK-LABEL: x: +; CHECK: ## %bb.0: +; CHECK-NEXT: ## InlineAsm Start +; CHECK-NEXT: ## Ltmp0 +; CHECK-EMPTY: +; CHECK-NEXT: ## InlineAsm End +; CHECK-NEXT: ## %bb.2: ## %return +; CHECK-NEXT: retl +; CHECK-NEXT: Ltmp0: ## Block address taken +; CHECK-NEXT: LBB1_1: ## %overflow +; CHECK-NEXT: retl + callbr void asm "# ${0:l}\0A", "i"(i8* blockaddress(@x, %overflow)) + to label %return [label %overflow] + +overflow: + br label %return + +return: + ret void +} + +; Test unusual case of blockaddress from @x in @y's asm. +define void @y() { +; CHECK-LABEL: y: +; CHECK: ## %bb.0: +; CHECK-NEXT: ## InlineAsm Start +; CHECK-NEXT: ## Ltmp0 +; CHECK-EMPTY: +; CHECK-NEXT: ## InlineAsm End +; CHECK-NEXT: retl + call void asm "# ${0:l}\0A", "i"(i8* blockaddress(@x, %overflow)) + ret void +} From 2c91754a13f333d7fe9f9d3d40fb618e40c48cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Thu, 17 Feb 2022 19:58:12 +0100 Subject: [PATCH 147/748] [Clang] Add attributes alloc_size and alloc_align to mm_malloc LLVM optimizes source codes with mm_malloc better, especially due to alignment info. 
alloc align https://clang.llvm.org/docs/AttributeReference.html#alloc-align alloc size https://clang.llvm.org/docs/AttributeReference.html#alloc-size Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D117091 --- clang/lib/Headers/mm_malloc.h | 6 +++--- clang/test/Headers/Inputs/include/malloc.h | 7 +++++++ clang/test/Headers/mm_malloc.c | 12 ++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 clang/test/Headers/Inputs/include/malloc.h create mode 100644 clang/test/Headers/mm_malloc.c diff --git a/clang/lib/Headers/mm_malloc.h b/clang/lib/Headers/mm_malloc.h index 933dbaacade59..d32fe59416277 100644 --- a/clang/lib/Headers/mm_malloc.h +++ b/clang/lib/Headers/mm_malloc.h @@ -28,9 +28,9 @@ extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size #if !(defined(_WIN32) && defined(_mm_malloc)) static __inline__ void *__attribute__((__always_inline__, __nodebug__, - __malloc__)) -_mm_malloc(size_t __size, size_t __align) -{ + __malloc__, __alloc_size__(1), + __alloc_align__(2))) +_mm_malloc(size_t __size, size_t __align) { if (__align == 1) { return malloc(__size); } diff --git a/clang/test/Headers/Inputs/include/malloc.h b/clang/test/Headers/Inputs/include/malloc.h new file mode 100644 index 0000000000000..590263bb010a3 --- /dev/null +++ b/clang/test/Headers/Inputs/include/malloc.h @@ -0,0 +1,7 @@ +#if defined(__MINGW32__) +void *__mingw_aligned_malloc(size_t, size_t); +void __mingw_aligned_free(void *); +#elif defined(_WIN32) +void *_aligned_malloc(size_t, size_t); +void _aligned_free(void *); +#endif diff --git a/clang/test/Headers/mm_malloc.c b/clang/test/Headers/mm_malloc.c new file mode 100644 index 0000000000000..a436ff3013bf6 --- /dev/null +++ b/clang/test/Headers/mm_malloc.c @@ -0,0 +1,12 @@ + +// RUN: %clang_cc1 -internal-isystem %S/Inputs/include %s -emit-llvm -O1 -triple x86_64-linux-gnu -o - | FileCheck %s +#include + +_Bool align_test(void) { +// CHECK-LABEL: @align_test( +// 
CHECK: ret i1 true + void *p = _mm_malloc(1024, 16); + _Bool ret = ((__UINTPTR_TYPE__)p % 16) == 0; + _mm_free(p); + return ret; +} From 4dfa68e483137cc13eb9027c0dd834ede19f2fd4 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Thu, 17 Feb 2022 08:19:40 -0800 Subject: [PATCH 148/748] [NFC] Fix debug-info-hotpatch.cpp failure due to downstream regex issue. In our downstream, we discovered that the that the .* wildcard in debug-info-hotpatch.cpp (added https://reviews.llvm.org/D116511) ended up matching the entire line on our Windows configurations, causing the -function-padmin check to already be consumed. After digging into it we weren't able to find any sort of reason why the platform would matter here, however we suspect there must be some difference in the regex matcher between systems. This NFC patch replaces the regex with a more conservative regex that prevents this from happening by replacing the . match with an 'everything but double-quote match, [^"]. https://reviews.llvm.org/D120066 --- clang/test/CodeGenCXX/debug-info-hotpatch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CodeGenCXX/debug-info-hotpatch.cpp b/clang/test/CodeGenCXX/debug-info-hotpatch.cpp index fde1a6ad085ea..e005c9c5ee489 100644 --- a/clang/test/CodeGenCXX/debug-info-hotpatch.cpp +++ b/clang/test/CodeGenCXX/debug-info-hotpatch.cpp @@ -12,7 +12,7 @@ // RUN: %clang_cl --target=x86_64-windows-msvc /hotpatch -### -- %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=FUNCTIONPADMIN // FUNCTIONPADMIN: clang{{.*}} -// FUNCTIONPADMIN: {{link.*"}} +// FUNCTIONPADMIN: {{link[^"]*"}} // FUNCTIONPADMIN: -functionpadmin int main() { From 1af15de6b77278fec12e72ca8be9f6408fd4761b Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 17 Feb 2022 19:56:38 +0100 Subject: [PATCH 149/748] [mlir] Switch {collapse,expand}_shape ops to the declarative assembly format Same functionality, a lot less code. 
--- .../mlir/Dialect/MemRef/IR/MemRefOps.td | 5 +- .../mlir/Dialect/Tensor/IR/TensorOps.td | 5 +- .../mlir/Dialect/Utils/ReshapeOpsUtils.h | 25 --------- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 15 ----- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 11 ---- mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp | 56 ------------------- 6 files changed, 8 insertions(+), 109 deletions(-) diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 2445280b01573..9102da3db7877 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -1240,9 +1240,12 @@ class MemRef_ReassociativeReshapeOp traits = []> : Value getViewSource() { return src(); } }]; + let assemblyFormat = [{ + $src $reassociation attr-dict `:` type($src) `into` type($result) + }]; + let hasFolder = 1; let hasCanonicalizer = 1; - let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index 3a2ec73791d3c..a53f3e7e5ca35 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -732,9 +732,12 @@ class Tensor_ReassociativeReshapeOp traits = []> : } }]; + let assemblyFormat = [{ + $src $reassociation attr-dict `:` type($src) `into` type($result) + }]; + let hasFolder = 1; let hasCanonicalizer = 1; - let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h index b2d4cf1e4bffc..dfeac25fd6c99 100644 --- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h @@ -74,31 +74,6 @@ getReassociationIndicesForReshape(ShapedType sourceType, ShapedType targetType); bool isReassociationValid(ArrayRef reassociation, int *invalidIndex = nullptr); -/// Parse a reshape-like op, i.e. 
linalg::(Tensor)ExpandShapeOp, -/// linalg::(Tensor)CollapseShapeOp. -ParseResult parseReshapeLikeOp(OpAsmParser &parser, OperationState &result); - -/// Print a reshape-like op, i.e. linalg::(Tensor)ExpandShapeOp, -/// linalg::(Tensor)CollapseShapeOp. -template -void printReshapeOp(OpAsmPrinter &p, ReshapeLikeOp op) { - p << ' ' << op.src() << " ["; - - llvm::interleaveComma(op.reassociation(), p, [&](const Attribute &attr) { - p << '['; - auto arrayAttr = attr.template cast(); - llvm::interleaveComma(arrayAttr, p, [&](const Attribute &attr) { - p << attr.cast().getInt(); - }); - p << ']'; - }); - - p << "] "; - p.printOptionalAttrDict(op->getAttrs(), - /*elidedAttrs=*/{getReassociationAttrName()}); - p << ": " << op.src().getType() << " into " << op.getType(); -} - template static OpFoldResult foldReshapeOp(ReshapeOpTy reshapeOp, ArrayRef operands) { diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index b64fb00ce4cc1..541da53cb2a49 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -1370,21 +1370,6 @@ SmallVector ExpandShapeOp::getReassociationExprs() { getReassociationIndices()); } -ParseResult ExpandShapeOp::parse(OpAsmParser &parser, OperationState &result) { - return parseReshapeLikeOp(parser, result); -} -void ExpandShapeOp::print(OpAsmPrinter &p) { - ::mlir::printReshapeOp(p, *this); -} - -ParseResult CollapseShapeOp::parse(OpAsmParser &parser, - OperationState &result) { - return parseReshapeLikeOp(parser, result); -} -void CollapseShapeOp::print(OpAsmPrinter &p) { - ::mlir::printReshapeOp(p, *this); -} - /// Detect whether memref dims [dim, dim + extent) can be reshaped without /// copies. 
static bool isReshapableDimBand(unsigned dim, unsigned extent, diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index a13a274c28e2a..5edb620d5cc32 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -733,17 +733,6 @@ SmallVector ExpandShapeOp::getReassociationExprs() { getReassociationIndices()); } -ParseResult ExpandShapeOp::parse(OpAsmParser &parser, OperationState &result) { - return parseReshapeLikeOp(parser, result); -} -void ExpandShapeOp::print(OpAsmPrinter &p) { printReshapeOp(p, *this); } - -ParseResult CollapseShapeOp::parse(OpAsmParser &parser, - OperationState &result) { - return parseReshapeLikeOp(parser, result); -} -void CollapseShapeOp::print(OpAsmPrinter &p) { printReshapeOp(p, *this); } - /// Compute the RankedTensorType obtained by applying `reassociation` to `type`. static RankedTensorType computeTensorReshapeCollapsedType(RankedTensorType type, diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index 0048abee4194a..fd509621015d2 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -91,62 +91,6 @@ mlir::getReassociationIndicesForReshape(ShapedType sourceType, return reassociationMap; } -ParseResult mlir::parseReshapeLikeOp(OpAsmParser &parser, - OperationState &result) { - // Parse the operand. - OpAsmParser::OperandType src; - if (parser.parseOperand(src)) - return failure(); - - // Parse reassociation indices. 
- Builder &b = parser.getBuilder(); - SmallVector reassociation; - if (parser.parseLSquare()) - return failure(); - - while (true) { - if (succeeded(parser.parseOptionalRSquare())) - break; - if (parser.parseLSquare()) - return failure(); - SmallVector indices; - while (true) { - int64_t index; - if (parser.parseInteger(index)) - return failure(); - indices.push_back(index); - - if (succeeded(parser.parseOptionalComma())) - continue; - if (failed(parser.parseRSquare())) - return failure(); - break; - } - reassociation.push_back(b.getI64ArrayAttr(indices)); - if (succeeded(parser.parseOptionalComma())) - continue; - if (failed(parser.parseRSquare())) - return failure(); - break; - } - - result.addAttribute(getReassociationAttrName(), - b.getArrayAttr(reassociation)); - - // Parse optional attributes. - parser.parseOptionalAttrDict(result.attributes); - - // Parse types. - Type srcType; - Type resultType; - if (parser.parseColon() || parser.parseType(srcType) || - parser.resolveOperand(src, srcType, result.operands) || - parser.parseKeyword("into") || parser.parseType(resultType)) - return failure(); - result.addTypes(resultType); - return success(); -} - Optional> mlir::composeReassociationIndices( ArrayRef producerReassociations, ArrayRef consumerReassociations, From b0aa1946dfe1d204e49b8238c4960f64a68f31d5 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 11 Feb 2022 12:00:05 -0800 Subject: [PATCH 150/748] [AMDGPU] Promote recursive loads from kernel argument to constant Not clobbered pointer load chains are promoted to global now. That is possible to promote these loads itself into constant address space. Loaded pointers still need to point to global because we need to be able to store into that pointer and because an actual load from it may occur after a clobber. 
Differential Revision: https://reviews.llvm.org/D119886 --- .../amdgpu-kernel-arg-pointer-type.cu | 12 +- .../AMDGPU/AMDGPUPromoteKernelArguments.cpp | 69 +++-- .../AMDGPU/promote-kernel-arguments.ll | 250 +++++++++++++++--- 3 files changed, 273 insertions(+), 58 deletions(-) diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu index d483803005074..01e0d3db46127 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -18,7 +18,7 @@ // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce) // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]* // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]* -// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4 +// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4 // OPT: ret void @@ -30,7 +30,7 @@ __global__ void kernel1(int *x) { // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce) // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]* // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]* -// OPT: [[VAL:%.*]] = load i32, i32 addrspace(1)* %x.coerce, align 4 +// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4 // OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1 // OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4 // OPT: ret void @@ -68,7 +68,8 @@ struct S { // OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to 
float addrspace(1)* -// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[G0]], align 4 +// OPT: [[G2:%.*]] ={{.*}} addrspacecast i32* [[P0]] to i32 addrspace(4)* +// OPT: [[V0:%.*]] = load i32, i32 addrspace(4)* [[G2]], align 4 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1 // OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4 @@ -103,7 +104,8 @@ struct T { // OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8 // OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)* -// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[G0]], align 4 +// OPT: [[G2:%.*]] ={{.*}} addrspacecast float* [[P0]] to float addrspace(4)* +// OPT: [[V0:%.*]] = load float, float addrspace(4)* [[G2]], align 4 // OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00 // OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4 // OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4 @@ -130,7 +132,7 @@ struct SS { // COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(float addrspace(1)*{{.*}} %a.coerce) // CHECK: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]* // CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]* -// OPT: [[VAL:%.*]] = load float, float addrspace(1)* %a.coerce, align 4 +// OPT: [[VAL:%.*]] = load float, float addrspace(4)* %a.coerce.const, align 4 // OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00 // OPT: store float [[INC]], float addrspace(1)* %a.coerce, align 4 // OPT: ret void diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp index b9b48290dd277..65ad8b2aeacd3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp @@ 
-42,6 +42,8 @@ class AMDGPUPromoteKernelArguments : public FunctionPass { bool promotePointer(Value *Ptr); + bool promoteLoad(LoadInst *LI); + public: static char ID; @@ -73,16 +75,10 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) { break; case Instruction::Load: { LoadInst *LD = cast(U); - PointerType *PT = dyn_cast(LD->getType()); - if (!PT || - (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS && - PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS && - PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) || - LD->getPointerOperand()->stripInBoundsOffsets() != Ptr) - break; - // TODO: This load poprobably can be promoted to constant address space. - if (!AMDGPU::isClobberedInFunction(LD, MSSA, AA)) + if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr && + !AMDGPU::isClobberedInFunction(LD, MSSA, AA)) Ptrs.push_back(LD); + break; } case Instruction::GetElementPtr: @@ -96,15 +92,26 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) { } bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { - enqueueUsers(Ptr); + bool Changed = false; + + LoadInst *LI = dyn_cast(Ptr); + if (LI) + Changed |= promoteLoad(LI); + + PointerType *PT = dyn_cast(Ptr->getType()); + if (!PT) + return Changed; + + if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + enqueueUsers(Ptr); - PointerType *PT = cast(Ptr->getType()); if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) - return false; + return Changed; - bool IsArg = isa(Ptr); - IRBuilder<> B(IsArg ? ArgCastInsertPt - : &*std::next(cast(Ptr)->getIterator())); + IRBuilder<> B(LI ? &*std::next(cast(Ptr)->getIterator()) + : ArgCastInsertPt); // Cast pointer to global address space and back to flat and let // Infer Address Spaces pass to do all necessary rewriting. 
@@ -120,6 +127,38 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { return true; } +bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) { + if (!LI->isSimple()) + return false; + + Value *Ptr = LI->getPointerOperand(); + + // Strip casts we have created earlier. + Value *OrigPtr = Ptr; + PointerType *PT; + for ( ; ; ) { + PT = cast(OrigPtr->getType()); + if (PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + return false; + auto *P = dyn_cast(OrigPtr); + if (!P) + break; + auto *NewPtr = P->getPointerOperand(); + if (!cast(NewPtr->getType())->hasSameElementTypeAs(PT)) + break; + OrigPtr = NewPtr; + } + + IRBuilder<> B(LI); + + PointerType *NewPT = + PointerType::getWithSamePointeeType(PT, AMDGPUAS::CONSTANT_ADDRESS); + Value *Cast = B.CreateAddrSpaceCast(OrigPtr, NewPT, + Twine(OrigPtr->getName(), ".const")); + LI->replaceUsesOfWith(Ptr, Cast); + return true; +} + // skip allocas static BasicBlock::iterator getInsertPt(BasicBlock &BB) { BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll index 82ca6a8b3f644..5cc37b45e0cc4 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll @@ -11,11 +11,15 @@ define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly % ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]] -; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8 -; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)* -; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8 -; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* -; CHECK-NEXT: store float 0.000000e+00, float 
addrspace(1)* [[P3_GLOBAL]], align 4 +; CHECK-NEXT: [[P1_CONST:%.*]] = addrspacecast float** addrspace(1)* [[P1]] to float** addrspace(4)* +; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(4)* [[P1_CONST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)* +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float** +; CHECK-NEXT: [[P2_FLAT:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float** +; CHECK-NEXT: [[P2_CONST:%.*]] = addrspacecast float** [[TMP1]] to float* addrspace(4)* +; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2_CONST]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP2]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -37,9 +41,11 @@ define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) { ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]] ; CHECK-NEXT: [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)* -; CHECK-NEXT: [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8 -; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)* -; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast i32* addrspace(1)* [[P1_CAST]] to i32** +; CHECK-NEXT: [[P1_CAST_CONST:%.*]] = addrspacecast i32** [[TMP0]] to i32* addrspace(4)* +; CHECK-NEXT: [[P2:%.*]] = load i32*, i32* addrspace(4)* [[P1_CAST_CONST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)* +; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -60,10 +66,11 @@ define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture reado ; CHECK-LABEL: @ptr_in_struct( ; 
CHECK-NEXT: entry: ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8 -; CHECK-NEXT: [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)* +; CHECK-NEXT: [[P_CONST:%.*]] = addrspacecast float* addrspace(1)* [[P]] to float* addrspace(4)* +; CHECK-NEXT: [[P1:%.*]] = load float*, float* addrspace(4)* [[P_CONST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)* ; CHECK-NEXT: [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i32 [[ID]] ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4 ; CHECK-NEXT: ret void ; @@ -80,7 +87,14 @@ entry: ; GCN-LABEL: flat_ptr_arg: ; GCN-COUNT-2: global_load_dwordx2 -; GCN: global_load_dwordx4 + +; FIXME: First load is in the constant address space and second is in global +; because it is clobbered by store. GPU load store vectorizer cannot +; combine them. Note, this does not happen with -O3 because loads are +; vectorized in pairs earlier and stay in the global address space. 
+ +; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}} +; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4 ; GCN: global_store_dword define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) { ; CHECK-LABEL: @flat_ptr_arg( @@ -90,22 +104,26 @@ define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]] -; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 -; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* -; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[ARRAYIDX10]] to float** +; CHECK-NEXT: [[ARRAYIDX10_CONST:%.*]] = addrspacecast float** [[TMP0]] to float* addrspace(4)* +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX10_CONST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float addrspace(1)* [[TMP1]] to float* +; CHECK-NEXT: [[I1_CONST:%.*]] = addrspacecast float* [[TMP2]] to float addrspace(4)* +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(4)* [[I1_CONST]], align 4 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]] ; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP1]], i64 1 ; CHECK-NEXT: [[I3:%.*]] = load float, float 
addrspace(1)* [[ARRAYIDX3_1]], align 4 ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1 ; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]] ; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4 -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP1]], i64 2 ; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4 ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2 ; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]] ; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4 -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP1]], i64 3 ; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4 ; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3 ; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]] @@ -114,10 +132,12 @@ define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] ; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]] -; CHECK-NEXT: [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8 -; CHECK-NEXT: [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)* +; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast float* addrspace(1)* 
[[ARRAYIDX11]] to float** +; CHECK-NEXT: [[ARRAYIDX11_CONST:%.*]] = addrspacecast float** [[TMP3]] to float* addrspace(4)* +; CHECK-NEXT: [[I7:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX11_CONST]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)* ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP4]], i64 [[IDXPROM8]] ; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: ret void ; @@ -157,7 +177,8 @@ entry: ; GCN-LABEL: global_ptr_arg: ; GCN: global_load_dwordx2 -; GCN: global_load_dwordx4 +; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}} +; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4 ; GCN: global_store_dword define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { ; CHECK-LABEL: @global_ptr_arg( @@ -165,22 +186,25 @@ define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonl ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 -; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* -; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[ARRAYIDX10_CONST:%.*]] = addrspacecast float* addrspace(1)* [[ARRAYIDX10]] to float* addrspace(4)* +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX10_CONST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[TMP1:%.*]] 
= addrspacecast float addrspace(1)* [[TMP0]] to float* +; CHECK-NEXT: [[I1_CONST:%.*]] = addrspacecast float* [[TMP1]] to float addrspace(4)* +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(4)* [[I1_CONST]], align 4 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]] ; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 1 ; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4 ; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1 ; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]] ; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4 -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 2 ; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4 ; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2 ; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]] ; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4 -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 3 ; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4 ; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3 ; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 
[[ADD_3]] @@ -189,7 +213,7 @@ define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonl ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] ; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 [[IDXPROM8]] ; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: ret void ; @@ -280,18 +304,19 @@ define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace( ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 -; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[ARRAYIDX10_CONST:%.*]] = addrspacecast float* addrspace(1)* [[ARRAYIDX10]] to float* addrspace(4)* +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(4)* [[ARRAYIDX10_CONST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]] ; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4 -; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]] ; CHECK-NEXT: 
store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 ; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 ; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] ; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 ; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[TMP0]], i64 [[IDXPROM8]] ; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 ; CHECK-NEXT: ret void ; @@ -323,11 +348,15 @@ define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture re ; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]] ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8 -; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)* -; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8 -; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* -; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4 +; CHECK-NEXT: [[P1_CONST:%.*]] = addrspacecast float** addrspace(1)* [[P1]] to float** addrspace(4)* +; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(4)* [[P1_CONST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)* +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float** +; CHECK-NEXT: [[P2_FLAT:%.*]] = addrspacecast float* addrspace(1)* [[TMP0]] to float** +; CHECK-NEXT: [[P2_CONST:%.*]] = addrspacecast float** [[TMP1]] to float* addrspace(4)* +; 
CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2_CONST]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP2]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -340,5 +369,150 @@ entry: ret void } +; GCN-LABEL: flat_ptr_nest_2: +; GCN: s_lshl_b64 +; GCN: s_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @flat_ptr_nest_2(float** nocapture readonly %Arg, i32 %i) { +; CHECK-LABEL: @flat_ptr_nest_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[P1]] to float** +; CHECK-NEXT: [[P1_CONST:%.*]] = addrspacecast float** [[TMP0]] to float* addrspace(4)* +; CHECK-NEXT: [[P2:%.*]] = load float*, float* addrspace(4)* [[P1_CONST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds float*, float** %Arg, i32 %i + %p2 = load float*, float** %p1, align 8 + store float 0.000000e+00, float* %p2, align 4 + ret void +} + +; GCN-LABEL: const_ptr_nest_3: +; GCN: s_lshl_b64 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) { +; CHECK-LABEL: @const_ptr_nest_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]] +; CHECK-NEXT: [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8 +; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8 +; 
CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i + %p2 = load float* addrspace(4)*, float * addrspace(4)* addrspace(4)* %p1, align 8 + %p3 = load float*, float* addrspace(4)* %p2, align 8 + store float 0.000000e+00, float* %p3, align 4 + ret void +} + +; GCN-LABEL: cast_from_const_const_ptr_nest_3: +; GCN: s_lshl_b64 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) { +; CHECK-LABEL: @cast_from_const_const_ptr_nest_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]] +; CHECK-NEXT: [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8 +; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8 +; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i + %a1 = addrspacecast float* addrspace(4)* addrspace(4)* %p1 to float* addrspace(4)** + %p2 = load float* addrspace(4)*, float* addrspace(4)** %a1, align 8 + %a2 = addrspacecast float* addrspace(4)* %p2 to float** + %p3 = load float*, float** %a2, align 8 + store float 0.000000e+00, float* %p3, align 4 + ret void +} + +; GCN-LABEL: flat_ptr_volatile_load: +; GCN: s_lshl_b64 +; GCN: flat_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @flat_ptr_volatile_load(float** nocapture readonly %Arg, i32 %i) { +; 
CHECK-LABEL: @flat_ptr_volatile_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[P1]] to float** +; CHECK-NEXT: [[P2:%.*]] = load volatile float*, float** [[TMP0]], align 8 +; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds float*, float** %Arg, i32 %i + %p2 = load volatile float*, float** %p1, align 8 + store float 0.000000e+00, float* %p2, align 4 + ret void +} + +; GCN-LABEL: flat_ptr_atomic_load: +; GCN: s_lshl_b64 +; GCN: global_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @flat_ptr_atomic_load(float** nocapture readonly %Arg, i32 %i) { +; CHECK-LABEL: @flat_ptr_atomic_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]] +; CHECK-NEXT: [[P2:%.*]] = load atomic float*, float* addrspace(1)* [[P1]] monotonic, align 8 +; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds float*, float** %Arg, i32 %i + %p2 = load atomic float*, float** %p1 monotonic, align 8 + store float 0.000000e+00, float* %p2, align 4 + ret void +} + +; GCN-LABEL: cast_changing_pointee_type: +; GCN: s_lshl_b64 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @cast_changing_pointee_type(float* addrspace(1)* addrspace(1)* nocapture readonly 
%Arg, i32 %i) { +; CHECK-LABEL: @cast_changing_pointee_type( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* [[ARG:%.*]], i32 [[I:%.*]] +; CHECK-NEXT: [[A1:%.*]] = bitcast float* addrspace(1)* addrspace(1)* [[P1]] to i32* addrspace(1)* addrspace(1)* +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast float* addrspace(1)* addrspace(1)* [[P1]] to i32* addrspace(1)** +; CHECK-NEXT: [[A1_CONST:%.*]] = addrspacecast i32* addrspace(1)** [[TMP0]] to i32* addrspace(1)* addrspace(4)* +; CHECK-NEXT: [[P2:%.*]] = load i32* addrspace(1)*, i32* addrspace(1)* addrspace(4)* [[A1_CONST]], align 8 +; CHECK-NEXT: [[A2:%.*]] = bitcast i32* addrspace(1)* [[P2]] to float* addrspace(1)* +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast i32* addrspace(1)* [[P2]] to float** +; CHECK-NEXT: [[A2_CONST:%.*]] = addrspacecast float** [[TMP1]] to float* addrspace(4)* +; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(4)* [[A2_CONST]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p1 = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* %Arg, i32 %i + %a1 = addrspacecast float* addrspace(1)* addrspace(1)* %p1 to i32* addrspace(1)** + %p2 = load i32* addrspace(1)*, i32* addrspace(1)** %a1, align 8 + %a2 = addrspacecast i32* addrspace(1)* %p2 to float** + %p3 = load float*, float** %a2, align 8 + store float 0.000000e+00, float* %p3, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() declare void @llvm.amdgcn.s.barrier() From bd8db271e730f9a4f4f89176e1627b064d91e484 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 17 Feb 2022 19:11:45 +0000 Subject: [PATCH 151/748] [AArch64] Add extra widening mul tests. NFC Also regenerate arm64-neon-2velem-high.ll. 
--- llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll | 164 ++++++++++ .../CodeGen/AArch64/arm64-neon-2velem-high.ll | 297 ++++++++++-------- 2 files changed, 338 insertions(+), 123 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll new file mode 100644 index 0000000000000..b591438b7ceef --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-unknown-linux-gnu | FileCheck %s + +; Tests for wider-than-legal extensions into mul/mla. + +define <16 x i16> @mul_i16(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: mul_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: ret +entry: + %ea = zext <16 x i8> %a to <16 x i16> + %eb = zext <16 x i8> %b to <16 x i16> + %m = mul <16 x i16> %ea, %eb + ret <16 x i16> %m +} + +define <16 x i32> @mul_i32(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: mul_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v0.16b, #0 +; CHECK-NEXT: ushll2 v5.8h, v1.16b, #0 +; CHECK-NEXT: ushll v0.8h, v1.8b, #0 +; CHECK-NEXT: umull2 v3.4s, v4.8h, v5.8h +; CHECK-NEXT: umull2 v1.4s, v2.8h, v0.8h +; CHECK-NEXT: umull v0.4s, v2.4h, v0.4h +; CHECK-NEXT: umull v2.4s, v4.4h, v5.4h +; CHECK-NEXT: ret +entry: + %ea = zext <16 x i8> %a to <16 x i32> + %eb = zext <16 x i8> %b to <16 x i32> + %m = mul <16 x i32> %ea, %eb + ret <16 x i32> %m +} + +define <16 x i64> @mul_i64(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: mul_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v16.4s, 
v0.8h, #0 +; CHECK-NEXT: ushll v0.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 +; CHECK-NEXT: ushll v17.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v1.8h, #0 +; CHECK-NEXT: ushll2 v19.4s, v0.8h, #0 +; CHECK-NEXT: umull2 v7.2d, v16.4s, v18.4s +; CHECK-NEXT: umull2 v3.2d, v2.4s, v19.4s +; CHECK-NEXT: umull2 v1.2d, v4.4s, v5.4s +; CHECK-NEXT: umull v0.2d, v4.2s, v5.2s +; CHECK-NEXT: umull2 v5.2d, v6.4s, v17.4s +; CHECK-NEXT: umull v2.2d, v2.2s, v19.2s +; CHECK-NEXT: umull v4.2d, v6.2s, v17.2s +; CHECK-NEXT: umull v6.2d, v16.2s, v18.2s +; CHECK-NEXT: ret +entry: + %ea = zext <16 x i8> %a to <16 x i64> + %eb = zext <16 x i8> %b to <16 x i64> + %m = mul <16 x i64> %ea, %eb + ret <16 x i64> %m +} + + +define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) { +; CHECK-LABEL: mla_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b +; CHECK-NEXT: umlal v3.8h, v4.8b, v5.8b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: ret +entry: + %ea = zext <16 x i8> %a to <16 x i16> + %eb = zext <16 x i8> %b to <16 x i16> + %m = mul <16 x i16> %ea, %eb + %d = add <16 x i16> %m, %c + ret <16 x i16> %d +} + +define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) { +; CHECK-LABEL: mla_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v6.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-NEXT: ushll v7.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 +; CHECK-NEXT: ext v17.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8 +; CHECK-NEXT: ext v19.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: umlal v4.4s, v0.4h, v1.4h +; CHECK-NEXT: umlal v2.4s, v6.4h, v7.4h +; CHECK-NEXT: umlal v3.4s, v16.4h, v18.4h +; CHECK-NEXT: umlal v5.4s, v17.4h, v19.4h +; 
CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: mov v2.16b, v4.16b +; CHECK-NEXT: mov v3.16b, v5.16b +; CHECK-NEXT: ret +entry: + %ea = zext <16 x i8> %a to <16 x i32> + %eb = zext <16 x i8> %b to <16 x i32> + %m = mul <16 x i32> %ea, %eb + %d = add <16 x i32> %m, %c + ret <16 x i32> %d +} + +define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) { +; CHECK-LABEL: mla_i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll v18.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-NEXT: ushll v25.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NEXT: ushll v19.4s, v18.4h, #0 +; CHECK-NEXT: ushll v20.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0 +; CHECK-NEXT: ushll v26.4s, v25.4h, #0 +; CHECK-NEXT: ushll v27.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0 +; CHECK-NEXT: mov v16.16b, v7.16b +; CHECK-NEXT: mov v17.16b, v6.16b +; CHECK-NEXT: ldp q6, q7, [sp] +; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-NEXT: ext v21.16b, v19.16b, v19.16b, #8 +; CHECK-NEXT: ext v22.16b, v20.16b, v20.16b, #8 +; CHECK-NEXT: ext v23.16b, v18.16b, v18.16b, #8 +; CHECK-NEXT: ext v28.16b, v26.16b, v26.16b, #8 +; CHECK-NEXT: ext v29.16b, v27.16b, v27.16b, #8 +; CHECK-NEXT: ext v30.16b, v25.16b, v25.16b, #8 +; CHECK-NEXT: ext v24.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v31.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: umlal v4.2d, v18.2s, v25.2s +; CHECK-NEXT: umlal v17.2d, v20.2s, v27.2s +; CHECK-NEXT: umlal v2.2d, v19.2s, v26.2s +; CHECK-NEXT: umlal v3.2d, v21.2s, v28.2s +; CHECK-NEXT: umlal v5.2d, v23.2s, v30.2s +; CHECK-NEXT: umlal v16.2d, v22.2s, v29.2s +; CHECK-NEXT: umlal v6.2d, v0.2s, v1.2s +; CHECK-NEXT: umlal v7.2d, v24.2s, v31.2s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: mov v2.16b, v4.16b +; CHECK-NEXT: mov v3.16b, v5.16b +; CHECK-NEXT: mov v4.16b, v17.16b +; CHECK-NEXT: mov v5.16b, v16.16b +; CHECK-NEXT: ret +entry: + 
%ea = zext <16 x i8> %a to <16 x i64> + %eb = zext <16 x i8> %b to <16 x i64> + %m = mul <16 x i64> %ea, %eb + %d = add <16 x i64> %m, %c + ret <16 x i64> %d +} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll index 575acf723753b..f1678ca19f47d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll @@ -1,11 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast \ -; RUN: < %s -verify-machineinstrs -asm-verbose=false | FileCheck %s +; RUN: < %s -verify-machineinstrs | FileCheck %s define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { ; CHECK-LABEL: test_vmull_high_n_s16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.8h, w0 +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 @@ -18,9 +20,10 @@ entry: define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 { ; CHECK-LABEL: test_vmull_high_n_s16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29 -; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.8h, #29 +; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -29,9 +32,10 @@ entry: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { ; CHECK-LABEL: test_vmull_high_n_s32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: smull2 
{{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.4s, w0 +; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 @@ -42,9 +46,10 @@ entry: define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 { ; CHECK-LABEL: test_vmull_high_n_s32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #1, msl #8 -; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.4s, #1, msl #8 +; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -53,9 +58,10 @@ entry: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 { ; CHECK-LABEL: test_vmull_high_n_u16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.8h, w0 +; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 @@ -68,9 +74,10 @@ entry: define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 { ; CHECK-LABEL: test_vmull_high_n_u16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #17, lsl #8 -; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.8h, #17, lsl #8 +; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> 
%shuffle.i.i, <4 x i16> ) @@ -79,9 +86,10 @@ entry: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 { ; CHECK-LABEL: test_vmull_high_n_u32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.4s, w0 +; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 @@ -92,9 +100,10 @@ entry: define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 { ; CHECK-LABEL: test_vmull_high_n_u32_imm: -; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].4s, #1, msl #8 -; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mvni v1.4s, #1, msl #8 +; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -103,9 +112,10 @@ entry: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 { ; CHECK-LABEL: test_vqdmull_high_n_s16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.8h, w0 +; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 @@ -118,9 +128,10 @@ entry: define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 { ; CHECK-LABEL: test_vqdmull_high_n_s16_imm: -; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].8h, #17, lsl #8 -; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: mvni v1.8h, #17, lsl #8 +; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -129,9 +140,10 @@ entry: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 { ; CHECK-LABEL: test_vqdmull_high_n_s32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.4s, w0 +; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 @@ -142,9 +154,10 @@ entry: define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 { ; CHECK-LABEL: test_vqdmull_high_n_s32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29 -; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.4s, #29 +; CHECK-NEXT: sqdmull2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -153,9 +166,10 @@ entry: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; CHECK-LABEL: test_vmlal_high_n_s16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.8h, w0 +; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -169,9 +183,10 @@ entry: define <4 x i32> 
@test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: test_vmlal_high_n_s16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29 -; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.8h, #29 +; CHECK-NEXT: smlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -181,9 +196,10 @@ entry: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; CHECK-LABEL: test_vmlal_high_n_s32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.4s, w0 +; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 @@ -195,9 +211,10 @@ entry: define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: test_vmlal_high_n_s32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29 -; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.4s, #29 +; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -207,9 +224,10 @@ entry: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; CHECK-LABEL: test_vmlal_high_n_u16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup 
v2.8h, w0 +; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -223,9 +241,10 @@ entry: define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: test_vmlal_high_n_u16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29 -; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.8h, #29 +; CHECK-NEXT: umlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -235,9 +254,10 @@ entry: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; CHECK-LABEL: test_vmlal_high_n_u32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.4s, w0 +; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 @@ -249,9 +269,10 @@ entry: define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: test_vmlal_high_n_u32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29 -; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.4s, #29 +; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -261,9 +282,10 @@ entry: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> 
%a, <8 x i16> %b, i16 %c) #0 { ; CHECK-LABEL: test_vqdmlal_high_n_s16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.8h, w0 +; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -277,9 +299,10 @@ entry: define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: test_vqdmlal_high_n_s16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29 -; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.8h, #29 +; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -289,9 +312,10 @@ entry: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; CHECK-LABEL: test_vqdmlal_high_n_s32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.4s, w0 +; CHECK-NEXT: sqdmlal2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 @@ -303,9 +327,10 @@ entry: define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: test_vqdmlal_high_n_s32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29 -; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.4s, #29 +; CHECK-NEXT: sqdmlal2 v0.2d, 
v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -315,9 +340,10 @@ entry: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; CHECK-LABEL: test_vmlsl_high_n_s16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.8h, w0 +; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -331,9 +357,10 @@ entry: define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: test_vmlsl_high_n_s16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29 -; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.8h, #29 +; CHECK-NEXT: smlsl2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -343,9 +370,10 @@ entry: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; CHECK-LABEL: test_vmlsl_high_n_s32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.4s, w0 +; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 @@ -357,9 +385,10 @@ entry: define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: 
test_vmlsl_high_n_s32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29 -; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.4s, #29 +; CHECK-NEXT: smlsl2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -369,9 +398,10 @@ entry: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; CHECK-LABEL: test_vmlsl_high_n_u16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.8h, w0 +; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -385,9 +415,10 @@ entry: define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: test_vmlsl_high_n_u16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29 -; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.8h, #29 +; CHECK-NEXT: umlsl2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -397,9 +428,10 @@ entry: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; CHECK-LABEL: test_vmlsl_high_n_u32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.4s, w0 +; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret 
entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 @@ -411,9 +443,10 @@ entry: define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: test_vmlsl_high_n_u32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29 -; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.4s, #29 +; CHECK-NEXT: umlsl2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -423,9 +456,10 @@ entry: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 { ; CHECK-LABEL: test_vqdmlsl_high_n_s16: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0 -; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.8h, w0 +; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0 @@ -439,9 +473,10 @@ entry: define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 { ; CHECK-LABEL: test_vqdmlsl_high_n_s16_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29 -; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.8h, #29 +; CHECK-NEXT: sqdmlsl2 v0.4s, v1.8h, v2.8h +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> ) @@ -451,9 +486,10 @@ entry: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 { ; CHECK-LABEL: 
test_vqdmlsl_high_n_s32: -; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0 -; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.4s, w0 +; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0 @@ -465,9 +501,10 @@ entry: define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: test_vqdmlsl_high_n_s32_imm: -; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29 -; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.4s, #29 +; CHECK-NEXT: sqdmlsl2 v0.2d, v1.4s, v2.4s +; CHECK-NEXT: ret entry: %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> ) @@ -477,8 +514,10 @@ entry: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 { ; CHECK-LABEL: test_vmul_n_f32: -; CHECK-NEXT: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %vecinit.i = insertelement <2 x float> undef, float %b, i32 0 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1 @@ -488,8 +527,10 @@ entry: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 { ; CHECK-LABEL: test_vmulq_n_f32: -; CHECK-NEXT: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: fmul v0.4s, v0.4s, v1.s[0] +; CHECK-NEXT: ret entry: %vecinit.i = insertelement <4 x float> undef, float %b, i32 0 %vecinit1.i = insertelement <4 x float> %vecinit.i, float 
%b, i32 1 @@ -501,8 +542,10 @@ entry: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 { ; CHECK-LABEL: test_vmulq_n_f64: -; CHECK-NEXT: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmul v0.2d, v0.2d, v1.d[0] +; CHECK-NEXT: ret entry: %vecinit.i = insertelement <2 x double> undef, double %b, i32 0 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1 @@ -512,8 +555,10 @@ entry: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { ; CHECK-LABEL: test_vfma_n_f32: -; CHECK-NEXT: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %vecinit.i = insertelement <2 x float> undef, float %n, i32 0 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1 @@ -523,8 +568,10 @@ entry: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { ; CHECK-LABEL: test_vfmaq_n_f32: -; CHECK-NEXT: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: fmla v0.4s, v1.4s, v2.s[0] +; CHECK-NEXT: ret entry: %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 @@ -536,8 +583,10 @@ entry: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 { ; CHECK-LABEL: test_vfms_n_f32: -; CHECK-NEXT: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: fmls v0.2s, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %vecinit.i = insertelement <2 x float> undef, float %n, 
i32 0 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1 @@ -548,8 +597,10 @@ entry: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 { ; CHECK-LABEL: test_vfmsq_n_f32: -; CHECK-NEXT: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: fmls v0.4s, v1.4s, v2.s[0] +; CHECK-NEXT: ret entry: %vecinit.i = insertelement <4 x float> undef, float %n, i32 0 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1 From f56cb520d8554ca42a215e82ecfa58d0b6c178e4 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Thu, 17 Feb 2022 11:13:46 -0800 Subject: [PATCH 152/748] [DEBUGINFO] [LLDB] Add support for generating debug-info for structured bindings of structs and arrays Currently we are not emitting debug-info for all cases of structured bindings a C++17 feature which allows us to bind names to subobjects in an initializer. A structured binding is represented by a DecompositionDecl AST node and the binding are represented by a BindingDecl. It looks the original implementation only covered the tuple like case which be represented by a DeclRefExpr which contains a VarDecl. If the binding is to a subobject of the struct the binding will contain a MemberExpr and in the case of arrays it will contain an ArraySubscriptExpr. This PR adds support emitting debug-info for the MemberExpr and ArraySubscriptExpr cases as well as llvm and lldb tests for these cases as well as the tuple case. 
Differential Revision: https://reviews.llvm.org/D119178 --- clang/lib/CodeGen/CGDebugInfo.cpp | 92 +++++++++++++++++++ clang/lib/CodeGen/CGDebugInfo.h | 8 ++ .../debug-info-structured-binding.cpp | 19 ++++ .../API/lang/cpp/structured-binding/Makefile | 4 + .../TestStructuredBinding.py | 84 +++++++++++++++++ .../API/lang/cpp/structured-binding/main.cpp | 69 ++++++++++++++ 6 files changed, 276 insertions(+) create mode 100644 clang/test/CodeGenCXX/debug-info-structured-binding.cpp create mode 100644 lldb/test/API/lang/cpp/structured-binding/Makefile create mode 100644 lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py create mode 100644 lldb/test/API/lang/cpp/structured-binding/main.cpp diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index d75b5a1a9d125..2203f0aec5c7c 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -4635,11 +4635,103 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const VarDecl *VD, return D; } +llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const BindingDecl *BD, + llvm::Value *Storage, + llvm::Optional ArgNo, + CGBuilderTy &Builder, + const bool UsePointerValue) { + assert(CGM.getCodeGenOpts().hasReducedDebugInfo()); + assert(!LexicalBlockStack.empty() && "Region stack mismatch, stack empty!"); + if (BD->hasAttr()) + return nullptr; + + // Skip the tuple like case, we don't handle that here + if (isa(BD->getBinding())) + return nullptr; + + llvm::DIFile *Unit = getOrCreateFile(BD->getLocation()); + llvm::DIType *Ty = getOrCreateType(BD->getType(), Unit); + + // If there is no debug info for this type then do not emit debug info + // for this variable. 
+ if (!Ty) + return nullptr; + + auto Align = getDeclAlignIfRequired(BD, CGM.getContext()); + unsigned AddressSpace = CGM.getContext().getTargetAddressSpace(BD->getType()); + + SmallVector Expr; + AppendAddressSpaceXDeref(AddressSpace, Expr); + + // Clang stores the sret pointer provided by the caller in a static alloca. + // Use DW_OP_deref to tell the debugger to load the pointer and treat it as + // the address of the variable. + if (UsePointerValue) { + assert(!llvm::is_contained(Expr, llvm::dwarf::DW_OP_deref) && + "Debug info already contains DW_OP_deref."); + Expr.push_back(llvm::dwarf::DW_OP_deref); + } + + unsigned Line = getLineNumber(BD->getLocation()); + unsigned Column = getColumnNumber(BD->getLocation()); + StringRef Name = BD->getName(); + auto *Scope = cast(LexicalBlockStack.back()); + // Create the descriptor for the variable. + llvm::DILocalVariable *D = DBuilder.createAutoVariable( + Scope, Name, Unit, Line, Ty, CGM.getLangOpts().Optimize, + llvm::DINode::FlagZero, Align); + + if (const MemberExpr *ME = dyn_cast(BD->getBinding())) { + if (const FieldDecl *FD = dyn_cast(ME->getMemberDecl())) { + const unsigned fieldIndex = FD->getFieldIndex(); + const clang::CXXRecordDecl *parent = + (const CXXRecordDecl *)FD->getParent(); + const ASTRecordLayout &layout = + CGM.getContext().getASTRecordLayout(parent); + const uint64_t fieldOffset = layout.getFieldOffset(fieldIndex); + + if (fieldOffset != 0) { + Expr.push_back(llvm::dwarf::DW_OP_plus_uconst); + Expr.push_back( + CGM.getContext().toCharUnitsFromBits(fieldOffset).getQuantity()); + } + } + } else if (const ArraySubscriptExpr *ASE = + dyn_cast(BD->getBinding())) { + if (const IntegerLiteral *IL = dyn_cast(ASE->getIdx())) { + const uint64_t value = IL->getValue().getZExtValue(); + const uint64_t typeSize = CGM.getContext().getTypeSize(BD->getType()); + + if (value != 0) { + Expr.push_back(llvm::dwarf::DW_OP_plus_uconst); + Expr.push_back(CGM.getContext() + .toCharUnitsFromBits(value * typeSize) + 
.getQuantity()); + } + } + } + + // Insert an llvm.dbg.declare into the current block. + DBuilder.insertDeclare(Storage, D, DBuilder.createExpression(Expr), + llvm::DILocation::get(CGM.getLLVMContext(), Line, + Column, Scope, CurInlinedAt), + Builder.GetInsertBlock()); + + return D; +} + llvm::DILocalVariable * CGDebugInfo::EmitDeclareOfAutoVariable(const VarDecl *VD, llvm::Value *Storage, CGBuilderTy &Builder, const bool UsePointerValue) { assert(CGM.getCodeGenOpts().hasReducedDebugInfo()); + + if (auto *DD = dyn_cast(VD)) + for (auto *B : DD->bindings()) { + EmitDeclare(B, Storage, llvm::None, Builder, + VD->getType()->isReferenceType()); + } + return EmitDeclare(VD, Storage, llvm::None, Builder, UsePointerValue); } diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index a76426e585c8e..165ece4224c99 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -583,6 +583,14 @@ class CGDebugInfo { CGBuilderTy &Builder, const bool UsePointerValue = false); + /// Emit call to llvm.dbg.declare for a binding declaration. + /// Returns a pointer to the DILocalVariable associated with the + /// llvm.dbg.declare, or nullptr otherwise. + llvm::DILocalVariable *EmitDeclare(const BindingDecl *decl, llvm::Value *AI, + llvm::Optional ArgNo, + CGBuilderTy &Builder, + const bool UsePointerValue = false); + struct BlockByRefType { /// The wrapper struct used inside the __block_literal struct. 
llvm::DIType *BlockByRefWrapper; diff --git a/clang/test/CodeGenCXX/debug-info-structured-binding.cpp b/clang/test/CodeGenCXX/debug-info-structured-binding.cpp new file mode 100644 index 0000000000000..27b918bffa6f2 --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-structured-binding.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone -triple %itanium_abi_triple %s -o - | FileCheck %s + +// CHECK: call void @llvm.dbg.declare(metadata %struct.A* %{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !DIExpression()) +// CHECK: call void @llvm.dbg.declare(metadata %struct.A* %{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_plus_uconst, {{[0-9]+}})) +// CHECK: call void @llvm.dbg.declare(metadata %struct.A* %{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !DIExpression()) +// CHECK: call void @llvm.dbg.declare(metadata %struct.A** %{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_deref)) +// CHECK: call void @llvm.dbg.declare(metadata %struct.A** %{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !DIExpression(DW_OP_deref, DW_OP_plus_uconst, {{[0-9]+}})) +// CHECK: call void @llvm.dbg.declare(metadata %struct.A** %{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !DIExpression()) +struct A { + int x; + int y; +}; + +int f() { + A a{10, 20}; + auto [x1, y1] = a; + auto &[x2, y2] = a; + return x1 + y1 + x2 + y2; +} diff --git a/lldb/test/API/lang/cpp/structured-binding/Makefile b/lldb/test/API/lang/cpp/structured-binding/Makefile new file mode 100644 index 0000000000000..d5f5fec8441b5 --- /dev/null +++ b/lldb/test/API/lang/cpp/structured-binding/Makefile @@ -0,0 +1,4 @@ +CXX_SOURCES := main.cpp +CXXFLAGS_EXTRAS := -std=c++17 + +include Makefile.rules diff --git a/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py new file mode 100644 index 0000000000000..694377341a03c --- /dev/null +++ b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py @@ 
-0,0 +1,84 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class TestStructuredBinding(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @skipIf(compiler="clang", compiler_version=['<', '14.0']) + def test(self): + self.build() + lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.cpp")) + + self.expect_expr("a1", result_type="A", + result_children=[ValueCheck(name="x", type="int"), + ValueCheck(name="y", type="int")]) + self.expect_expr("b1", result_type="char", result_value="'a'") + self.expect_expr("c1", result_type="char", result_value="'b'") + self.expect_expr("d1", result_type="short", result_value="50") + self.expect_expr("e1", result_type="int", result_value="60") + self.expect_expr("f1", result_type="char", result_value="'c'") + + self.expect_expr("a2", result_type="A", + result_children=[ValueCheck(name="x", type="int"), + ValueCheck(name="y", type="int")]) + self.expect_expr("b2", result_type="char", result_value="'a'") + self.expect_expr("c2", result_type="char", result_value="'b'") + self.expect_expr("d2", result_type="short", result_value="50") + self.expect_expr("e2", result_type="int", result_value="60") + self.expect_expr("f2", result_type="char", result_value="'c'") + + self.expect_expr("a3", result_type="A", + result_children=[ValueCheck(name="x", type="int"), + ValueCheck(name="y", type="int")]) + self.expect_expr("b3", result_type="char", result_value="'a'") + self.expect_expr("c3", result_type="char", result_value="'b'") + self.expect_expr("d3", result_type="short", result_value="50") + self.expect_expr("e3", result_type="int", result_value="60") + self.expect_expr("f3", result_type="char", result_value="'c'") + + self.expect_expr("carr_ref1", result_type="char", result_value="'a'") + self.expect_expr("carr_ref2", result_type="char", result_value="'b'") + self.expect_expr("carr_ref3", result_type="char", 
result_value="'c'") + + self.expect_expr("sarr_ref1", result_type="short", result_value="11") + self.expect_expr("sarr_ref2", result_type="short", result_value="12") + self.expect_expr("sarr_ref3", result_type="short", result_value="13") + + self.expect_expr("iarr_ref1", result_type="int", result_value="22") + self.expect_expr("iarr_ref2", result_type="int", result_value="33") + self.expect_expr("iarr_ref3", result_type="int", result_value="44") + + self.expect_expr("carr_rref1", result_type="char", result_value="'a'") + self.expect_expr("carr_rref2", result_type="char", result_value="'b'") + self.expect_expr("carr_rref3", result_type="char", result_value="'c'") + + self.expect_expr("sarr_rref1", result_type="short", result_value="11") + self.expect_expr("sarr_rref2", result_type="short", result_value="12") + self.expect_expr("sarr_rref3", result_type="short", result_value="13") + + self.expect_expr("iarr_rref1", result_type="int", result_value="22") + self.expect_expr("iarr_rref2", result_type="int", result_value="33") + self.expect_expr("iarr_rref3", result_type="int", result_value="44") + + self.expect_expr("carr_copy1", result_type="char", result_value="'a'") + self.expect_expr("carr_copy2", result_type="char", result_value="'b'") + self.expect_expr("carr_copy3", result_type="char", result_value="'c'") + + self.expect_expr("sarr_copy1", result_type="short", result_value="11") + self.expect_expr("sarr_copy2", result_type="short", result_value="12") + self.expect_expr("sarr_copy3", result_type="short", result_value="13") + + self.expect_expr("iarr_copy1", result_type="int", result_value="22") + self.expect_expr("iarr_copy2", result_type="int", result_value="33") + self.expect_expr("iarr_copy3", result_type="int", result_value="44") + + self.expect_expr("tx1", result_type="float", result_value="4") + self.expect_expr("ty1", result_type="char", result_value="'z'") + self.expect_expr("tz1", result_type="int", result_value="10") + + self.expect_expr("tx2", 
result_type="float", result_value="4") + self.expect_expr("ty2", result_type="char", result_value="'z'") + self.expect_expr("tz2", result_type="int", result_value="10") diff --git a/lldb/test/API/lang/cpp/structured-binding/main.cpp b/lldb/test/API/lang/cpp/structured-binding/main.cpp new file mode 100644 index 0000000000000..3fbfb18dbeff0 --- /dev/null +++ b/lldb/test/API/lang/cpp/structured-binding/main.cpp @@ -0,0 +1,69 @@ +// Structured binding in C++ can bind identifiers to subobjects of an object. +// +// There are three cases we need to test: +// 1) arrays +// 2) tuples like objects +// 3) non-static data members +// +// They can also bind by copy, reference or rvalue reference. + +#include + +struct A { + int x; + int y; +}; + +// We want to cover a mix of types and also different sizes to make sure we +// hande the offsets correctly. +struct MixedTypesAndSizesStruct { + A a; + char b1; + char b2; + short b3; + int b4; + char b5; +}; + +int main() { + MixedTypesAndSizesStruct b{{20, 30}, 'a', 'b', 50, 60, 'c'}; + + auto [a1, b1, c1, d1, e1, f1] = b; + auto &[a2, b2, c2, d2, e2, f2] = b; + auto &&[a3, b3, c3, d3, e3, f3] = + MixedTypesAndSizesStruct{{20, 30}, 'a', 'b', 50, 60, 'c'}; + + // Array with different sized types + char carr[]{'a', 'b', 'c'}; + short sarr[]{11, 12, 13}; + int iarr[]{22, 33, 44}; + + auto [carr_copy1, carr_copy2, carr_copy3] = carr; + auto [sarr_copy1, sarr_copy2, sarr_copy3] = sarr; + auto [iarr_copy1, iarr_copy2, iarr_copy3] = iarr; + + auto &[carr_ref1, carr_ref2, carr_ref3] = carr; + auto &[sarr_ref1, sarr_ref2, sarr_ref3] = sarr; + auto &[iarr_ref1, iarr_ref2, iarr_ref3] = iarr; + + auto &&[carr_rref1, carr_rref2, carr_rref3] = carr; + auto &&[sarr_rref1, sarr_rref2, sarr_rref3] = sarr; + auto &&[iarr_rref1, iarr_rref2, iarr_rref3] = iarr; + + float x{4.0}; + char y{'z'}; + int z{10}; + + std::tuple tpl(x, y, z); + auto [tx1, ty1, tz1] = tpl; + auto &[tx2, ty2, tz2] = tpl; + + return a1.x + b1 + c1 + d1 + e1 + f1 + a2.y + b2 + 
c2 + d2 + e2 + f2 + a3.x + + b3 + c3 + d3 + e3 + f3 + carr_copy1 + carr_copy2 + carr_copy3 + + sarr_copy1 + sarr_copy2 + sarr_copy3 + iarr_copy1 + iarr_copy2 + + iarr_copy3 + carr_ref1 + carr_ref2 + carr_ref3 + sarr_ref1 + + sarr_ref2 + sarr_ref3 + iarr_ref1 + iarr_ref2 + iarr_ref3 + + carr_rref1 + carr_rref2 + carr_rref3 + sarr_rref1 + sarr_rref2 + + sarr_rref3 + iarr_rref1 + iarr_rref2 + iarr_rref3 + tx1 + ty1 + tz1 + + tx2 + ty2 + tz2; // break here +} From a52b9102d1f75ca0229e5e395d317fb9ecd51590 Mon Sep 17 00:00:00 2001 From: Leonard Grey Date: Fri, 14 Jan 2022 14:37:00 -0500 Subject: [PATCH 153/748] [lld-macho] Allow order files and call graph sorting to be used together If both an order file and a call graph profile are present, the edges of the call graph which use symbols present in the order file are not used. All of the symbols in the order file will appear at the beginning of the section just as they do currently. In other words, the highest priority derived from the call graph will be below the lowest priority derived from the order file. Practically, this change renames CallGraphSort.{h,cpp} to SectionPriorities.{h,cpp}, and most order file and call graph profile related code is moved into the new file to reduce duplication. 
Differential Revision: https://reviews.llvm.org/D117354 --- lld/MachO/Driver.cpp | 18 +--------- lld/MachO/SectionPriorities.cpp | 52 +++++++++++++++++----------- lld/MachO/SectionPriorities.h | 3 +- lld/test/MachO/cgprofile-orderfile.s | 50 ++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 39 deletions(-) create mode 100644 lld/test/MachO/cgprofile-orderfile.s diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 790f31a44a3b9..aca794ffa41b6 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -460,20 +460,6 @@ static void addFileList(StringRef path, bool isLazy) { addFile(rerootPath(path), ForceLoad::Default, isLazy); } -// An order file has one entry per line, in the following format: -// -// :: -// -// and are optional. If not specified, then that entry -// matches any symbol of that name. Parsing this format is not quite -// straightforward because the symbol name itself can contain colons, so when -// encountering a colon, we consider the preceding characters to decide if it -// can be a valid CPU type or file path. -// -// If a symbol is matched by multiple entries, then it takes the lowest-ordered -// entry (the one nearest to the front of the list.) -// -// The file can also have line comments that start with '#'. // We expect sub-library names of the form "libfoo", which will match a dylib // with a path of .*/libfoo.{dylib, tbd}. 
// XXX ld64 seems to ignore the extension entirely when matching sub-libraries; @@ -1461,10 +1447,8 @@ bool macho::link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, replaceCommonSymbols(); StringRef orderFile = args.getLastArgValue(OPT_order_file); - if (!orderFile.empty()) { + if (!orderFile.empty()) parseOrderFile(orderFile); - config->callGraphProfileSort = false; - } referenceStubBinder(); diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp index 35510d7338e89..405cb23c42c53 100644 --- a/lld/MachO/SectionPriorities.cpp +++ b/lld/MachO/SectionPriorities.cpp @@ -16,6 +16,7 @@ #include "InputFiles.h" #include "Symbols.h" #include "Target.h" + #include "lld/Common/Args.h" #include "lld/Common/CommonLinkerContext.h" #include "lld/Common/ErrorHandler.h" @@ -34,6 +35,9 @@ using namespace lld; using namespace lld::macho; namespace { + +size_t lowestPriority = std::numeric_limits::max(); + struct Edge { int from; uint64_t weight; @@ -208,7 +212,7 @@ DenseMap CallGraphSort::run() { // priority 0 and be placed at the end of sections. // NB: This is opposite from COFF/ELF to be compatible with the existing // order-file code. - int curOrder = clusters.size(); + int curOrder = lowestPriority; for (int leader : sorted) { for (int i = leader;;) { orderMap[sections[i]] = curOrder--; @@ -247,8 +251,17 @@ DenseMap CallGraphSort::run() { return orderMap; } -static size_t getSymbolPriority(const SymbolPriorityEntry &entry, - const InputFile *f) { +static Optional getSymbolPriority(const Defined *sym) { + if (sym->isAbsolute()) + return None; + + auto it = config->priorities.find(sym->getName()); + if (it == config->priorities.end()) + return None; + const SymbolPriorityEntry &entry = it->second; + const InputFile *f = sym->isec->getFile(); + if (!f) + return entry.anyObjectFile; // We don't use toString(InputFile *) here because it returns the full path // for object files, and we only want the basename. 
StringRef filename; @@ -262,6 +275,7 @@ static size_t getSymbolPriority(const SymbolPriorityEntry &entry, void macho::extractCallGraphProfile() { TimeTraceScope timeScope("Extract call graph profile"); + bool hasOrderFile = !config->priorities.empty(); for (const InputFile *file : inputFiles) { auto *obj = dyn_cast_or_null(file); if (!obj) @@ -271,8 +285,9 @@ void macho::extractCallGraphProfile() { entry.toIndex < obj->symbols.size()); auto *fromSym = dyn_cast_or_null(obj->symbols[entry.fromIndex]); auto *toSym = dyn_cast_or_null(obj->symbols[entry.toIndex]); - - if (!fromSym || !toSym) + if (!fromSym || !toSym || + (hasOrderFile && + (getSymbolPriority(fromSym) || getSymbolPriority(toSym)))) continue; config->callGraphProfile[{fromSym->isec, toSym->isec}] += entry.count; } @@ -280,6 +295,8 @@ void macho::extractCallGraphProfile() { } void macho::parseOrderFile(StringRef path) { + assert(config->callGraphProfile.empty() && + "Order file must be parsed before call graph profile is processed"); Optional buffer = readFile(path); if (!buffer) { error("Could not read order file at " + path); @@ -331,6 +348,7 @@ void macho::parseOrderFile(StringRef path) { --priority; } + lowestPriority = priority; } // Sort sections by the profile data provided by __LLVM,__cg_profile sections. @@ -343,28 +361,20 @@ static DenseMap computeCallGraphProfileOrder() { return CallGraphSort().run(); } -// Each section gets assigned the priority of the highest-priority symbol it -// contains. 
DenseMap macho::buildInputSectionPriorities() { - if (config->callGraphProfileSort) - return computeCallGraphProfileOrder(); DenseMap sectionPriorities; + if (config->callGraphProfileSort) + sectionPriorities = computeCallGraphProfileOrder(); if (config->priorities.empty()) return sectionPriorities; - auto addSym = [&](Defined &sym) { - if (sym.isAbsolute()) - return; - - auto it = config->priorities.find(sym.getName()); - if (it == config->priorities.end()) + auto addSym = [&](const Defined *sym) { + Optional symbolPriority = getSymbolPriority(sym); + if (!symbolPriority.hasValue()) return; - - SymbolPriorityEntry &entry = it->second; - size_t &priority = sectionPriorities[sym.isec]; - priority = - std::max(priority, getSymbolPriority(entry, sym.isec->getFile())); + size_t &priority = sectionPriorities[sym->isec]; + priority = std::max(priority, symbolPriority.getValue()); }; // TODO: Make sure this handles weak symbols correctly. @@ -372,7 +382,7 @@ DenseMap macho::buildInputSectionPriorities() { if (isa(file)) for (Symbol *sym : file->symbols) if (auto *d = dyn_cast_or_null(sym)) - addSym(*d); + addSym(d); } return sectionPriorities; diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h index 9cc4eff958cd7..1557cc4747d93 100644 --- a/lld/MachO/SectionPriorities.h +++ b/lld/MachO/SectionPriorities.h @@ -44,7 +44,8 @@ void parseOrderFile(StringRef path); // // If either an order file or a call graph profile are present, this is used // as the source of priorities. If both are present, the order file takes -// precedence. If neither is present, an empty map is returned. +// precedence, but the call graph profile is still used for symbols that don't +// appear in the order file. If neither is present, an empty map is returned. // // Each section gets assigned the priority of the highest-priority symbol it // contains. 
diff --git a/lld/test/MachO/cgprofile-orderfile.s b/lld/test/MachO/cgprofile-orderfile.s new file mode 100644 index 0000000000000..eb3a30b27e1bd --- /dev/null +++ b/lld/test/MachO/cgprofile-orderfile.s @@ -0,0 +1,50 @@ +# REQUIRES: x86 + +# RUN: rm -rf %t; split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o + +# RUN: %lld -e A %t/test.o -order_file %t/order_file -o %t/test +# RUN: llvm-nm --numeric-sort %t/test | FileCheck %s +# RUN: %lld -e A %t/test.o -o %t/test +# RUN: llvm-nm --numeric-sort %t/test | FileCheck %s --check-prefix NO-ORDER + + +#--- order_file +B +A + +#--- test.s + +.text + .globl D +D: + retq + + .globl C +C: + retq + + .globl B +B: + retq + + .globl A +A: + retq + +.cg_profile A, B, 100 +.cg_profile A, C, 40 +.cg_profile C, D, 61 + +.subsections_via_symbols + +# CHECK: T B +# CHECK-NEXT: T A +# CHECK-NEXT: T C +# CHECK-NEXT: T D + +# NO-ORDER: T A +# NO-ORDER-NEXT: T B +# NO-ORDER-NEXT: T C +# NO-ORDER-NEXT: T D + From df2812d8de2ac99d46c8786c00922c09a5b65db0 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Thu, 17 Feb 2022 19:24:53 +0000 Subject: [PATCH 154/748] [ifs] Add --strip-needed flag Reviewed By: haowei, mcgrathr Differential Revision: https://reviews.llvm.org/D119907 --- llvm/test/tools/llvm-ifs/strip-needed.test | 26 ++++++++++++++++++++++ llvm/tools/llvm-ifs/llvm-ifs.cpp | 6 +++++ 2 files changed, 32 insertions(+) create mode 100644 llvm/test/tools/llvm-ifs/strip-needed.test diff --git a/llvm/test/tools/llvm-ifs/strip-needed.test b/llvm/test/tools/llvm-ifs/strip-needed.test new file mode 100644 index 0000000000000..1ef579f8aba26 --- /dev/null +++ b/llvm/test/tools/llvm-ifs/strip-needed.test @@ -0,0 +1,26 @@ +## Test --strip-needed flag + +# RUN: llvm-ifs --input-format=IFS --strip-needed --output-ifs=- %s | FileCheck %s + +# RUN: llvm-ifs --input-format=IFS --output-elf=- %s | \ +# RUN: obj2yaml | FileCheck %s --check-prefix=FROM-ELF + +# RUN: llvm-ifs --input-format=IFS --strip-needed 
--output-elf=- %s | \ +# RUN: obj2yaml | FileCheck %s --check-prefix=FROM-ELF-STRIP-NEEDED + +--- !ifs-v1 +IfsVersion: 3.0 +Target: { ObjectFormat: ELF, Arch: AArch64, Endianness: little, BitWidth: 64 } +NeededLibs: + - '' +Symbols: [] +... + +# CHECK: --- !ifs-v1 +# CHECK-NEXT: IfsVersion: {{[1-9]\d*\.(0|([1-9]\d*))}} +# CHECK-NEXT: Target: { ObjectFormat: ELF, Arch: AArch64, Endianness: little, BitWidth: 64 } +# CHECK-NEXT: Symbols: [] +# CHECK-NEXT: ... + +# FROM-ELF: DT_NEEDED +# FROM-ELF-STRIP-NEEDED-NOT: DT_NEEDED diff --git a/llvm/tools/llvm-ifs/llvm-ifs.cpp b/llvm/tools/llvm-ifs/llvm-ifs.cpp index 2dcd0c5ca9e28..ef8864e08fdbf 100644 --- a/llvm/tools/llvm-ifs/llvm-ifs.cpp +++ b/llvm/tools/llvm-ifs/llvm-ifs.cpp @@ -103,6 +103,9 @@ cl::opt StripUndefined("strip-undefined", cl::desc("Strip undefined symbols from IFS output"), cl::cat(IfsCategory)); +cl::opt StripNeededLibs("strip-needed", + cl::desc("Strip needed libs from output"), + cl::cat(IfsCategory)); cl::opt SoName("soname", @@ -417,6 +420,9 @@ int main(int argc, char *argv[]) { if (OverrideError) fatalError(std::move(OverrideError)); + if (StripNeededLibs) + Stub.NeededLibs.clear(); + if (OutputELFFilePath.getNumOccurrences() == 0 && OutputIFSFilePath.getNumOccurrences() == 0 && OutputTBDFilePath.getNumOccurrences() == 0) { From f6d390193cbcc3b9a0e900202023bd80bd5ac4a4 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Thu, 17 Feb 2022 11:29:05 -0800 Subject: [PATCH 155/748] [LLDB] Fix TestStructuredBinding.py for libstdc++ For the tuple case for the TestStructuredBinding.py the result type is different between libc++ and libstdc++. 
--- .../cpp/structured-binding/TestStructuredBinding.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py index 694377341a03c..9f57d45dd9fc1 100644 --- a/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py +++ b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py @@ -75,10 +75,10 @@ def test(self): self.expect_expr("iarr_copy2", result_type="int", result_value="33") self.expect_expr("iarr_copy3", result_type="int", result_value="44") - self.expect_expr("tx1", result_type="float", result_value="4") - self.expect_expr("ty1", result_type="char", result_value="'z'") - self.expect_expr("tz1", result_type="int", result_value="10") + self.expect_expr("tx1", result_value="4") + self.expect_expr("ty1", result_value="'z'") + self.expect_expr("tz1", result_value="10") - self.expect_expr("tx2", result_type="float", result_value="4") - self.expect_expr("ty2", result_type="char", result_value="'z'") - self.expect_expr("tz2", result_type="int", result_value="10") + self.expect_expr("tx2", result_value="4") + self.expect_expr("ty2", result_value="'z'") + self.expect_expr("tz2", result_value="10") From 74cacf212bb31f8ba837b7eb2434258dd79eaccb Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 17 Feb 2022 13:20:51 -0500 Subject: [PATCH 156/748] [OpenMP] Add RTL function to externalization RAII This patch adds the '_kmpc_get_hardware_num_threads_in_block' OpenMP RTL function to the externalization RAII struct. This was getting optimized out and then being replaced with an undefined value once added back in, causing bugs for complex reductions. Fixes #53909. 
Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D120076 --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 2 ++ .../OpenMP/get_hardware_num_threads_in_block_fold.ll | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index f577a6b0f1743..392b919c5a120 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2119,6 +2119,8 @@ struct OpenMPOpt { OMPRTL___kmpc_barrier_simple_generic); ExternalizationRAII ThreadId(OMPInfoCache, OMPRTL___kmpc_get_hardware_thread_id_in_block); + ExternalizationRAII NumThreads( + OMPInfoCache, OMPRTL___kmpc_get_hardware_num_threads_in_block); ExternalizationRAII WarpSize(OMPInfoCache, OMPRTL___kmpc_get_warp_size); registerAAs(IsModulePass); diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll index b72031a9b68c0..57eaebc7e141c 100644 --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -178,7 +178,16 @@ entry: ret void } -declare i32 @__kmpc_get_hardware_num_threads_in_block() +define internal i32 @__kmpc_get_hardware_num_threads_in_block() { +; CHECK-LABEL: define {{[^@]+}}@__kmpc_get_hardware_num_threads_in_block +; CHECK-SAME: () #[[ATTR1]] { +; CHECK-NEXT: [[RET:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block_dummy() +; CHECK-NEXT: ret i32 [[RET]] +; + %ret = call i32 @__kmpc_get_hardware_num_threads_in_block_dummy() + ret i32 %ret +} +declare i32 @__kmpc_get_hardware_num_threads_in_block_dummy() declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1 zeroext, i1 zeroext) #1 declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i8, i1 zeroext) #1 declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) From 
18ead23385a4e0e6421d658591b1ee6a1c592b53 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 17 Feb 2022 11:23:33 -0800 Subject: [PATCH 157/748] AST: Make getEffectiveDeclContext() a member function of ItaniumMangleContextImpl. NFCI. In an upcoming change we are going to need to access mangler state from the getEffectiveDeclContext() function. Therefore, make it a member function of ItaniumMangleContextImpl. Any callers that are not currently members of ItaniumMangleContextImpl or CXXNameMangler are made members of one or the other depending on where they are called from. Differential Revision: https://reviews.llvm.org/D116773 --- clang/lib/AST/ItaniumMangle.cpp | 172 +++++++++++++++++--------------- 1 file changed, 94 insertions(+), 78 deletions(-) diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 63e40a0f30721..4277a0166f202 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -40,65 +40,10 @@ using namespace clang; namespace { -/// Retrieve the declaration context that should be used when mangling the given -/// declaration. -static const DeclContext *getEffectiveDeclContext(const Decl *D) { - // The ABI assumes that lambda closure types that occur within - // default arguments live in the context of the function. However, due to - // the way in which Clang parses and creates function declarations, this is - // not the case: the lambda closure type ends up living in the context - // where the function itself resides, because the function declaration itself - // had not yet been created. Fix the context here. - if (const CXXRecordDecl *RD = dyn_cast(D)) { - if (RD->isLambda()) - if (ParmVarDecl *ContextParam - = dyn_cast_or_null(RD->getLambdaContextDecl())) - return ContextParam->getDeclContext(); - } - - // Perform the same check for block literals. 
- if (const BlockDecl *BD = dyn_cast(D)) { - if (ParmVarDecl *ContextParam - = dyn_cast_or_null(BD->getBlockManglingContextDecl())) - return ContextParam->getDeclContext(); - } - - const DeclContext *DC = D->getDeclContext(); - if (isa(DC) || isa(DC) || - isa(DC)) { - return getEffectiveDeclContext(cast(DC)); - } - - if (const auto *VD = dyn_cast(D)) - if (VD->isExternC()) - return VD->getASTContext().getTranslationUnitDecl(); - - if (const auto *FD = dyn_cast(D)) - if (FD->isExternC()) - return FD->getASTContext().getTranslationUnitDecl(); - - return DC->getRedeclContext(); -} - -static const DeclContext *getEffectiveParentContext(const DeclContext *DC) { - return getEffectiveDeclContext(cast(DC)); -} - static bool isLocalContainerContext(const DeclContext *DC) { return isa(DC) || isa(DC) || isa(DC); } -static const RecordDecl *GetLocalClassDecl(const Decl *D) { - const DeclContext *DC = getEffectiveDeclContext(D); - while (!DC->isNamespace() && !DC->isTranslationUnit()) { - if (isLocalContainerContext(DC)) - return dyn_cast(D); - D = cast(DC); - DC = getEffectiveDeclContext(D); - } - return nullptr; -} - static const FunctionDecl *getStructor(const FunctionDecl *fn) { if (const FunctionTemplateDecl *ftd = fn->getPrimaryTemplate()) return ftd->getTemplatedDecl(); @@ -249,6 +194,14 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext { return DiscriminatorOverride; } + const DeclContext *getEffectiveDeclContext(const Decl *D); + const DeclContext *getEffectiveParentContext(const DeclContext *DC) { + return getEffectiveDeclContext(cast(DC)); + } + + bool isInternalLinkageDecl(const NamedDecl *ND); + const DeclContext *IgnoreLinkageSpecDecls(const DeclContext *DC); + /// @} }; @@ -427,6 +380,15 @@ class CXXNameMangler { ASTContext &getASTContext() const { return Context.getASTContext(); } + bool isStd(const NamespaceDecl *NS); + bool isStdNamespace(const DeclContext *DC); + + const RecordDecl *GetLocalClassDecl(const Decl *D); + const DeclContext 
*IgnoreLinkageSpecDecls(const DeclContext *DC); + bool isSpecializedAs(QualType S, llvm::StringRef Name, QualType A); + bool isStdCharSpecialization(const ClassTemplateSpecializationDecl *SD, + llvm::StringRef Name, bool HasAllocator); + public: CXXNameMangler(ItaniumMangleContextImpl &C, raw_ostream &Out_, const NamedDecl *D = nullptr, bool NullOut_ = false) @@ -628,7 +590,48 @@ class CXXNameMangler { } -static bool isInternalLinkageDecl(const NamedDecl *ND) { +/// Retrieve the declaration context that should be used when mangling the given +/// declaration. +const DeclContext * +ItaniumMangleContextImpl::getEffectiveDeclContext(const Decl *D) { + // The ABI assumes that lambda closure types that occur within + // default arguments live in the context of the function. However, due to + // the way in which Clang parses and creates function declarations, this is + // not the case: the lambda closure type ends up living in the context + // where the function itself resides, because the function declaration itself + // had not yet been created. Fix the context here. + if (const CXXRecordDecl *RD = dyn_cast(D)) { + if (RD->isLambda()) + if (ParmVarDecl *ContextParam = + dyn_cast_or_null(RD->getLambdaContextDecl())) + return ContextParam->getDeclContext(); + } + + // Perform the same check for block literals. 
+ if (const BlockDecl *BD = dyn_cast(D)) { + if (ParmVarDecl *ContextParam = + dyn_cast_or_null(BD->getBlockManglingContextDecl())) + return ContextParam->getDeclContext(); + } + + const DeclContext *DC = D->getDeclContext(); + if (isa(DC) || isa(DC) || + isa(DC)) { + return getEffectiveDeclContext(cast(DC)); + } + + if (const auto *VD = dyn_cast(D)) + if (VD->isExternC()) + return getASTContext().getTranslationUnitDecl(); + + if (const auto *FD = dyn_cast(D)) + if (FD->isExternC()) + return getASTContext().getTranslationUnitDecl(); + + return DC->getRedeclContext(); +} + +bool ItaniumMangleContextImpl::isInternalLinkageDecl(const NamedDecl *ND) { if (ND && ND->getFormalLinkage() == InternalLinkage && !ND->isExternallyVisible() && getEffectiveDeclContext(ND)->isFileContext() && @@ -863,8 +866,8 @@ void CXXNameMangler::mangleFunctionEncodingBareType(const FunctionDecl *FD) { } /// Return whether a given namespace is the 'std' namespace. -static bool isStd(const NamespaceDecl *NS) { - if (!getEffectiveParentContext(NS)->isTranslationUnit()) +bool CXXNameMangler::isStd(const NamespaceDecl *NS) { + if (!Context.getEffectiveParentContext(NS)->isTranslationUnit()) return false; const IdentifierInfo *II = NS->getOriginalNamespace()->getIdentifier(); @@ -873,7 +876,7 @@ static bool isStd(const NamespaceDecl *NS) { // isStdNamespace - Return whether a given decl context is a toplevel 'std' // namespace. 
-static bool isStdNamespace(const DeclContext *DC) { +bool CXXNameMangler::isStdNamespace(const DeclContext *DC) { if (!DC->isNamespace()) return false; @@ -947,6 +950,17 @@ void CXXNameMangler::mangleName(GlobalDecl GD) { } } +const RecordDecl *CXXNameMangler::GetLocalClassDecl(const Decl *D) { + const DeclContext *DC = Context.getEffectiveDeclContext(D); + while (!DC->isNamespace() && !DC->isTranslationUnit()) { + if (isLocalContainerContext(DC)) + return dyn_cast(D); + D = cast(DC); + DC = Context.getEffectiveDeclContext(D); + } + return nullptr; +} + void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, const AbiTagList *AdditionalAbiTags) { const NamedDecl *ND = cast(GD.getDecl()); @@ -955,7 +969,7 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, // ::= [] // ::= // - const DeclContext *DC = getEffectiveDeclContext(ND); + const DeclContext *DC = Context.getEffectiveDeclContext(ND); // If this is an extern variable declared locally, the relevant DeclContext // is that of the containing namespace, or the translation unit. @@ -963,7 +977,7 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, // a proper semantic declaration context! 
if (isLocalContainerContext(DC) && ND->hasLinkage() && !isLambda(ND)) while (!DC->isNamespace() && !DC->isTranslationUnit()) - DC = getEffectiveParentContext(DC); + DC = Context.getEffectiveParentContext(DC); else if (GetLocalClassDecl(ND)) { mangleLocalName(GD, AdditionalAbiTags); return; @@ -1045,7 +1059,7 @@ void CXXNameMangler::mangleModuleNamePrefix(StringRef Name) { void CXXNameMangler::mangleTemplateName(const TemplateDecl *TD, const TemplateArgument *TemplateArgs, unsigned NumTemplateArgs) { - const DeclContext *DC = getEffectiveDeclContext(TD); + const DeclContext *DC = Context.getEffectiveDeclContext(TD); if (DC->isTranslationUnit() || isStdNamespace(DC)) { mangleUnscopedTemplateName(TD, nullptr); @@ -1061,7 +1075,7 @@ void CXXNameMangler::mangleUnscopedName(GlobalDecl GD, // ::= // ::= St # ::std:: - if (isStdNamespace(getEffectiveDeclContext(ND))) + if (isStdNamespace(Context.getEffectiveDeclContext(ND))) Out << "St"; mangleUnqualifiedName(GD, AdditionalAbiTags); @@ -1421,7 +1435,7 @@ void CXXNameMangler::mangleUnqualifiedName(GlobalDecl GD, // 12_GLOBAL__N_1 mangling is quite sufficient there, and this better // matches GCC anyway, because GCC does not treat anonymous namespaces as // implying internal linkage. - if (isInternalLinkageDecl(ND)) + if (Context.isInternalLinkageDecl(ND)) Out << 'L'; auto *FD = dyn_cast(ND); @@ -1736,7 +1750,7 @@ void CXXNameMangler::mangleLocalName(GlobalDecl GD, // := _ assert(isa(D) || isa(D)); const RecordDecl *RD = GetLocalClassDecl(D); - const DeclContext *DC = getEffectiveDeclContext(RD ? RD : D); + const DeclContext *DC = Context.getEffectiveDeclContext(RD ? 
RD : D); Out << 'Z'; @@ -1789,13 +1803,13 @@ void CXXNameMangler::mangleLocalName(GlobalDecl GD, if (const NamedDecl *PrefixND = getClosurePrefix(BD)) mangleClosurePrefix(PrefixND, true /*NoFunction*/); else - manglePrefix(getEffectiveDeclContext(BD), true /*NoFunction*/); + manglePrefix(Context.getEffectiveDeclContext(BD), true /*NoFunction*/); assert(!AdditionalAbiTags && "Block cannot have additional abi tags"); mangleUnqualifiedBlock(BD); } else { const NamedDecl *ND = cast(D); - mangleNestedName(GD, getEffectiveDeclContext(ND), AdditionalAbiTags, - true /*NoFunction*/); + mangleNestedName(GD, Context.getEffectiveDeclContext(ND), + AdditionalAbiTags, true /*NoFunction*/); } } else if (const BlockDecl *BD = dyn_cast(D)) { // Mangle a block in a default parameter; see above explanation for @@ -1834,7 +1848,7 @@ void CXXNameMangler::mangleBlockForPrefix(const BlockDecl *Block) { mangleLocalName(Block, /* AdditionalAbiTags */ nullptr); return; } - const DeclContext *DC = getEffectiveDeclContext(Block); + const DeclContext *DC = Context.getEffectiveDeclContext(Block); if (isLocalContainerContext(DC)) { mangleLocalName(Block, /* AdditionalAbiTags */ nullptr); return; @@ -2044,7 +2058,7 @@ void CXXNameMangler::manglePrefix(const DeclContext *DC, bool NoFunction) { mangleClosurePrefix(PrefixND, NoFunction); mangleUnqualifiedName(ND, nullptr); } else { - manglePrefix(getEffectiveDeclContext(ND), NoFunction); + manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction); mangleUnqualifiedName(ND, nullptr); } @@ -2098,7 +2112,7 @@ void CXXNameMangler::mangleTemplatePrefix(GlobalDecl GD, if (const auto *TTP = dyn_cast(ND)) { mangleTemplateParameter(TTP->getDepth(), TTP->getIndex()); } else { - manglePrefix(getEffectiveDeclContext(ND), NoFunction); + manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction); if (isa(ND) || isa(ND)) mangleUnqualifiedName(GD, nullptr); else @@ -2143,7 +2157,7 @@ void CXXNameMangler::mangleClosurePrefix(const NamedDecl *ND, bool 
NoFunction) { mangleTemplatePrefix(TD, NoFunction); mangleTemplateArgs(asTemplateName(TD), *TemplateArgs); } else { - manglePrefix(getEffectiveDeclContext(ND), NoFunction); + manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction); mangleUnqualifiedName(ND, nullptr); } @@ -5962,7 +5976,8 @@ bool CXXNameMangler::mangleSubstitution(uintptr_t Ptr) { /// Returns whether S is a template specialization of std::Name with a single /// argument of type A. -static bool isSpecializedAs(QualType S, llvm::StringRef Name, QualType A) { +bool CXXNameMangler::isSpecializedAs(QualType S, llvm::StringRef Name, + QualType A) { if (S.isNull()) return false; @@ -5975,7 +5990,7 @@ static bool isSpecializedAs(QualType S, llvm::StringRef Name, QualType A) { if (!SD || !SD->getIdentifier()->isStr(Name)) return false; - if (!isStdNamespace(getEffectiveDeclContext(SD))) + if (!isStdNamespace(Context.getEffectiveDeclContext(SD))) return false; const TemplateArgumentList &TemplateArgs = SD->getTemplateArgs(); @@ -5991,8 +6006,9 @@ static bool isSpecializedAs(QualType S, llvm::StringRef Name, QualType A) { /// Returns whether SD is a template specialization std::Name [, std::allocator]> /// HasAllocator controls whether the 3rd template argument is needed. 
-static bool isStdCharSpecialization(const ClassTemplateSpecializationDecl *SD, - llvm::StringRef Name, bool HasAllocator) { +bool CXXNameMangler::isStdCharSpecialization( + const ClassTemplateSpecializationDecl *SD, llvm::StringRef Name, + bool HasAllocator) { if (!SD->getIdentifier()->isStr(Name)) return false; @@ -6029,7 +6045,7 @@ bool CXXNameMangler::mangleStandardSubstitution(const NamedDecl *ND) { } if (const ClassTemplateDecl *TD = dyn_cast(ND)) { - if (!isStdNamespace(getEffectiveDeclContext(TD))) + if (!isStdNamespace(Context.getEffectiveDeclContext(TD))) return false; // ::= Sa # ::std::allocator @@ -6048,7 +6064,7 @@ bool CXXNameMangler::mangleStandardSubstitution(const NamedDecl *ND) { if (const ClassTemplateSpecializationDecl *SD = dyn_cast(ND)) { - if (!isStdNamespace(getEffectiveDeclContext(SD))) + if (!isStdNamespace(Context.getEffectiveDeclContext(SD))) return false; // ::= Ss # ::std::basic_string Date: Thu, 6 Jan 2022 13:37:21 -0800 Subject: [PATCH 158/748] AST: Move __va_list tag back to std conditionally on AArch64. In post-commit feedback on D104830 Jessica Clarke pointed out that unconditionally adding __va_list to the std namespace caused namespace debug info to be emitted in C, which is not only inappropriate but turned out to confuse the dtrace tool. Therefore, move __va_list back to std only in C++ so that the correct debug info is generated. We also considered moving __va_list to the top level unconditionally but this would contradict the specification and be visible to AST matchers and such, so make it conditional on the language mode. To avoid breaking name mangling for __va_list, teach the Itanium name mangler to always mangle it as if it were in the std namespace when targeting ARM architectures. This logic is not needed for the Microsoft name mangler because Microsoft platforms define va_list as a typedef of char *. 
Depends on D116773 Differential Revision: https://reviews.llvm.org/D116774 --- clang/lib/AST/ASTContext.cpp | 25 +-- clang/lib/AST/ItaniumMangle.cpp | 26 +++ clang/test/CodeGen/aarch64-varargs.c | 244 +++++++++++------------ clang/test/CodeGen/arm64-be-hfa-vararg.c | 4 +- clang/test/Headers/stdarg.cpp | 2 +- 5 files changed, 162 insertions(+), 139 deletions(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 527c8b56159e0..f29e90c05713c 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -8548,21 +8548,18 @@ static TypedefDecl *CreateVoidPtrBuiltinVaListDecl(const ASTContext *Context) { static TypedefDecl * CreateAArch64ABIBuiltinVaListDecl(const ASTContext *Context) { + // struct __va_list RecordDecl *VaListTagDecl = Context->buildImplicitRecord("__va_list"); - // namespace std { struct __va_list { - // Note that we create the namespace even in C. This is intentional so that - // the type is consistent between C and C++, which is important in cases where - // the types need to match between translation units (e.g. with - // -fsanitize=cfi-icall). Ideally we wouldn't have created this namespace at - // all, but it's now part of the ABI (e.g. in mangled names), so we can't - // change it. 
- auto *NS = NamespaceDecl::Create( - const_cast(*Context), Context->getTranslationUnitDecl(), - /*Inline*/ false, SourceLocation(), SourceLocation(), - &Context->Idents.get("std"), - /*PrevDecl*/ nullptr); - NS->setImplicit(); - VaListTagDecl->setDeclContext(NS); + if (Context->getLangOpts().CPlusPlus) { + // namespace std { struct __va_list { + auto *NS = NamespaceDecl::Create( + const_cast(*Context), Context->getTranslationUnitDecl(), + /*Inline*/ false, SourceLocation(), SourceLocation(), + &Context->Idents.get("std"), + /*PrevDecl*/ nullptr); + NS->setImplicit(); + VaListTagDecl->setDeclContext(NS); + } VaListTagDecl->startDefinition(); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 4277a0166f202..d1d7a7c40ceb9 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -71,6 +71,7 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext { llvm::DenseMap Discriminator; llvm::DenseMap Uniquifier; const DiscriminatorOverrideTy DiscriminatorOverride = nullptr; + NamespaceDecl *StdNamespace = nullptr; bool NeedsUniqueInternalLinkageNames = false; @@ -194,6 +195,8 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext { return DiscriminatorOverride; } + NamespaceDecl *getStdNamespace(); + const DeclContext *getEffectiveDeclContext(const Decl *D); const DeclContext *getEffectiveParentContext(const DeclContext *DC) { return getEffectiveDeclContext(cast(DC)); @@ -590,6 +593,18 @@ class CXXNameMangler { } +NamespaceDecl *ItaniumMangleContextImpl::getStdNamespace() { + if (!StdNamespace) { + StdNamespace = NamespaceDecl::Create( + getASTContext(), getASTContext().getTranslationUnitDecl(), + /*Inline*/ false, SourceLocation(), SourceLocation(), + &getASTContext().Idents.get("std"), + /*PrevDecl*/ nullptr); + StdNamespace->setImplicit(); + } + return StdNamespace; +} + /// Retrieve the declaration context that should be used when mangling the given /// declaration. 
const DeclContext * @@ -614,6 +629,17 @@ ItaniumMangleContextImpl::getEffectiveDeclContext(const Decl *D) { return ContextParam->getDeclContext(); } + // On ARM and AArch64, the va_list tag is always mangled as if in the std + // namespace. We do not represent va_list as actually being in the std + // namespace in C because this would result in incorrect debug info in C, + // among other things. It is important for both languages to have the same + // mangling in order for -fsanitize=cfi-icall to work. + if (D == getASTContext().getVaListTagDecl()) { + const llvm::Triple &T = getASTContext().getTargetInfo().getTriple(); + if (T.isARM() || T.isThumb() || T.isAArch64()) + return getStdNamespace(); + } + const DeclContext *DC = D->getDeclContext(); if (isa(DC) || isa(DC) || isa(DC)) { diff --git a/clang/test/CodeGen/aarch64-varargs.c b/clang/test/CodeGen/aarch64-varargs.c index c43a5796f93da..8fe51a98653a4 100644 --- a/clang/test/CodeGen/aarch64-varargs.c +++ b/clang/test/CodeGen/aarch64-varargs.c @@ -11,18 +11,18 @@ va_list the_list; int simple_int(void) { // CHECK-LABEL: define{{.*}} i32 @simple_int return va_arg(the_list, int); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // 
CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK-BE: [[REG_ADDR_ALIGNED:%[0-9]+]] = getelementptr inbounds i8, i8* [[REG_ADDR]], i64 4 // CHECK-BE: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR_ALIGNED]] to i32* @@ -30,9 +30,9 @@ int simple_int(void) { // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK-BE: [[STACK_ALIGNED:%[a-z_0-9]*]] = getelementptr inbounds i8, i8* [[STACK]], i64 4 // CHECK-BE: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK_ALIGNED]] to i32* // CHECK-LE: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to i32* @@ -47,7 +47,7 @@ int simple_int(void) { __int128 aligned_int(void) { // CHECK-LABEL: define{{.*}} i128 @aligned_int return va_arg(the_list, __int128); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds 
(%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] @@ -55,24 +55,24 @@ __int128 aligned_int(void) { // CHECK: [[ALIGN_REGOFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 15 // CHECK: [[ALIGNED_REGOFFS:%[a-z_0-9]+]] = and i32 [[ALIGN_REGOFFS]], -16 // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[ALIGNED_REGOFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[ALIGNED_REGOFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to i128* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[STACKINT:%[a-z_0-9]+]] = 
ptrtoint i8* [[STACK]] to i64 // CHECK: [[ALIGN_STACK:%[a-z_0-9]+]] = add i64 [[STACKINT]], 15 // CHECK: [[ALIGNED_STACK_INT:%[a-z_0-9]+]] = and i64 [[ALIGN_STACK]], -16 // CHECK: [[ALIGNED_STACK_PTR:%[a-z_0-9]+]] = inttoptr i64 [[ALIGNED_STACK_INT]] to i8* // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[ALIGNED_STACK_PTR]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[ALIGNED_STACK_PTR]] to i128* // CHECK: br label %[[VAARG_END]] @@ -89,28 +89,28 @@ struct bigstruct { struct bigstruct simple_indirect(void) { // CHECK-LABEL: define{{.*}} void @simple_indirect return va_arg(the_list, struct bigstruct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK-NOT: and i32 // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: 
[[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.bigstruct** // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK-NOT: and i64 // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.bigstruct** // CHECK: br label %[[VAARG_END]] @@ -127,26 +127,26 @@ struct aligned_bigstruct { struct aligned_bigstruct simple_aligned_indirect(void) { // CHECK-LABEL: define{{.*}} void @simple_aligned_indirect return va_arg(the_list, struct aligned_bigstruct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label 
%[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.aligned_bigstruct** // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.aligned_bigstruct** // CHECK: br label %[[VAARG_END]] @@ -158,18 +158,18 @@ struct aligned_bigstruct simple_aligned_indirect(void) 
{ double simple_double(void) { // CHECK-LABEL: define{{.*}} double @simple_double return va_arg(the_list, double); -// CHECK: [[VR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 4) +// CHECK: [[VR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 4) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[VR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[VR_OFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 4) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 4) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 2) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 2) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[VR_OFFS]] // CHECK-BE: [[REG_ADDR_ALIGNED:%[a-z_0-9]*]] = getelementptr inbounds i8, i8* [[REG_ADDR]], i64 8 // CHECK-BE: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR_ALIGNED]] to double* @@ -177,9 +177,9 @@ double simple_double(void) { // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: 
[[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to double* // CHECK: br label %[[VAARG_END]] @@ -196,18 +196,18 @@ struct hfa { struct hfa simple_hfa(void) { // CHECK-LABEL: define{{.*}} %struct.hfa @simple_hfa return va_arg(the_list, struct hfa); -// CHECK: [[VR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 4) +// CHECK: [[VR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 4) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[VR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[VR_OFFS]], 32 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 4) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 4) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 2) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr 
inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 2) // CHECK: [[FIRST_REG:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[VR_OFFS]] // CHECK-LE: [[EL_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[FIRST_REG]], i64 0 // CHECK-BE: [[EL_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[FIRST_REG]], i64 12 @@ -225,9 +225,9 @@ struct hfa simple_hfa(void) { // CHECK: br label %[[VAARG_END:[a-z_.0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.hfa* // CHECK: br label %[[VAARG_END]] @@ -243,18 +243,18 @@ typedef int underaligned_int __attribute__((packed,aligned(2))); underaligned_int underaligned_int_test(void) { // CHECK-LABEL: define{{.*}} i32 @underaligned_int_test() return va_arg(the_list, underaligned_int); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: 
[[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK-BE: [[REG_ADDR_ALIGNED:%[0-9]+]] = getelementptr inbounds i8, i8* [[REG_ADDR]], i64 4 // CHECK-BE: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR_ALIGNED]] to i32* @@ -262,9 +262,9 @@ underaligned_int underaligned_int_test(void) { // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK-BE: [[STACK_ALIGNED:%[a-z_0-9]*]] = getelementptr inbounds i8, i8* [[STACK]], i64 4 // CHECK-BE: 
[[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK_ALIGNED]] to i32* // CHECK-LE: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to i32* @@ -280,18 +280,18 @@ typedef int overaligned_int __attribute__((aligned(32))); overaligned_int overaligned_int_test(void) { // CHECK-LABEL: define{{.*}} i32 @overaligned_int_test() return va_arg(the_list, overaligned_int); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK-BE: [[REG_ADDR_ALIGNED:%[0-9]+]] = getelementptr inbounds i8, i8* [[REG_ADDR]], i64 4 // CHECK-BE: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR_ALIGNED]] to i32* @@ -299,9 +299,9 @@ 
overaligned_int overaligned_int_test(void) { // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK-BE: [[STACK_ALIGNED:%[a-z_0-9]*]] = getelementptr inbounds i8, i8* [[STACK]], i64 4 // CHECK-BE: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK_ALIGNED]] to i32* // CHECK-LE: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to i32* @@ -317,26 +317,26 @@ typedef long long underaligned_long_long __attribute__((packed,aligned(2))); underaligned_long_long underaligned_long_long_test(void) { // CHECK-LABEL: define{{.*}} i64 @underaligned_long_long_test() return va_arg(the_list, underaligned_long_long); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 
3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to i64* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to i64* // CHECK: br label %[[VAARG_END]] @@ -350,26 +350,26 @@ typedef long long overaligned_long_long __attribute__((aligned(32))); overaligned_long_long overaligned_long_long_test(void) { // CHECK-LABEL: define{{.*}} i64 @overaligned_long_long_test() return va_arg(the_list, overaligned_long_long); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* 
@the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to i64* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** 
getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to i64* // CHECK: br label %[[VAARG_END]] @@ -383,7 +383,7 @@ typedef __int128 underaligned_int128 __attribute__((packed,aligned(2))); underaligned_int128 underaligned_int128_test(void) { // CHECK-LABEL: define{{.*}} i128 @underaligned_int128_test() return va_arg(the_list, underaligned_int128); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] @@ -391,24 +391,24 @@ underaligned_int128 underaligned_int128_test(void) { // CHECK: [[ALIGN_REGOFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 15 // CHECK: [[ALIGNED_REGOFFS:%[a-z_0-9]+]] = and i32 [[ALIGN_REGOFFS]], -16 // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[ALIGNED_REGOFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] 
= getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[ALIGNED_REGOFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to i128* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[STACKINT:%[a-z_0-9]+]] = ptrtoint i8* [[STACK]] to i64 // CHECK: [[ALIGN_STACK:%[a-z_0-9]+]] = add i64 [[STACKINT]], 15 // CHECK: [[ALIGNED_STACK_INT:%[a-z_0-9]+]] = and i64 [[ALIGN_STACK]], -16 // CHECK: [[ALIGNED_STACK_PTR:%[a-z_0-9]+]] = inttoptr i64 [[ALIGNED_STACK_INT]] to i8* // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[ALIGNED_STACK_PTR]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[ALIGNED_STACK_PTR]] to i128* // CHECK: br label %[[VAARG_END]] @@ -422,7 +422,7 @@ typedef __int128 overaligned_int128 __attribute__((aligned(32))); overaligned_int128 overaligned_int128_test(void) { // CHECK-LABEL: define{{.*}} i128 @overaligned_int128_test() return va_arg(the_list, overaligned_int128); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label 
%[[VAARG_MAYBE_REG:[a-z_.0-9]+]] @@ -430,24 +430,24 @@ overaligned_int128 overaligned_int128_test(void) { // CHECK: [[ALIGN_REGOFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 15 // CHECK: [[ALIGNED_REGOFFS:%[a-z_0-9]+]] = and i32 [[ALIGN_REGOFFS]], -16 // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[ALIGNED_REGOFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[ALIGNED_REGOFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to i128* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[STACKINT:%[a-z_0-9]+]] = ptrtoint i8* [[STACK]] to i64 // CHECK: [[ALIGN_STACK:%[a-z_0-9]+]] = add i64 [[STACKINT]], 15 // CHECK: [[ALIGNED_STACK_INT:%[a-z_0-9]+]] = and i64 [[ALIGN_STACK]], -16 // CHECK: [[ALIGNED_STACK_PTR:%[a-z_0-9]+]] = inttoptr i64 [[ALIGNED_STACK_INT]] to i8* // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* 
[[ALIGNED_STACK_PTR]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[ALIGNED_STACK_PTR]] to i128* // CHECK: br label %[[VAARG_END]] @@ -476,26 +476,26 @@ underaligned_int_struct underaligned_int_struct_test(void) { // CHECK-LE-LABEL: define{{.*}} i32 @underaligned_int_struct_test() // CHECK-BE-LABEL: define{{.*}} i64 @underaligned_int_struct_test() return va_arg(the_list, underaligned_int_struct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 
0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.underaligned_int_struct* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.underaligned_int_struct* // CHECK: br label %[[VAARG_END]] @@ -509,26 +509,26 @@ typedef struct __attribute__((aligned(16))) { overaligned_int_struct overaligned_int_struct_test(void) { // CHECK-LABEL: define{{.*}} i128 @overaligned_int_struct_test() return va_arg(the_list, overaligned_int_struct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* 
@the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.overaligned_int_struct* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.overaligned_int_struct* // CHECK: br label %[[VAARG_END]] @@ -542,26 +542,26 @@ typedef struct __attribute__((packed,aligned(2))) { underaligned_long_long_struct underaligned_long_long_struct_test(void) { // CHECK-LABEL: define{{.*}} i64 @underaligned_long_long_struct_test() return va_arg(the_list, underaligned_long_long_struct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load 
i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.underaligned_long_long_struct* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds 
(%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.underaligned_long_long_struct* // CHECK: br label %[[VAARG_END]] @@ -575,26 +575,26 @@ typedef struct __attribute__((aligned(16))) { overaligned_long_long_struct overaligned_long_long_struct_test(void) { // CHECK-LABEL: define{{.*}} i128 @overaligned_long_long_struct_test() return va_arg(the_list, overaligned_long_long_struct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, 
i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.overaligned_long_long_struct* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.overaligned_long_long_struct* // CHECK: br label %[[VAARG_END]] @@ -608,26 +608,26 @@ typedef struct __attribute__((packed,aligned(2))) { underaligned_int128_struct underaligned_int128_struct_test(void) { // CHECK-LABEL: define{{.*}} [2 x i64] @underaligned_int128_struct_test() return va_arg(the_list, underaligned_int128_struct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store 
i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.underaligned_int128_struct* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.underaligned_int128_struct* // CHECK: br label %[[VAARG_END]] @@ -642,26 +642,26 @@ typedef struct __attribute__((aligned(32))) { overaligned_int128_struct overaligned_int128_struct_test(void) { // CHECK-LABEL: define{{.*}} void @overaligned_int128_struct_test(%struct.overaligned_int128_struct* noalias sret(%struct.overaligned_int128_struct) align 32 %agg.result) return va_arg(the_list, 
overaligned_int128_struct); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.overaligned_int128_struct** // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 
-// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.overaligned_int128_struct** // CHECK: br label %[[VAARG_END]] @@ -679,26 +679,26 @@ underaligned_int_struct_member underaligned_int_struct_member_test(void) { // CHECK-LE-LABEL: define{{.*}} i32 @underaligned_int_struct_member_test() // CHECK-BE-LABEL: define{{.*}} i64 @underaligned_int_struct_member_test() return va_arg(the_list, underaligned_int_struct_member); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, 
%struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.underaligned_int_struct_member* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.underaligned_int_struct_member* // CHECK: br label %[[VAARG_END]] @@ -712,7 +712,7 @@ typedef struct { overaligned_int_struct_member overaligned_int_struct_member_test(void) { // CHECK-LABEL: define{{.*}} i128 @overaligned_int_struct_member_test() return va_arg(the_list, overaligned_int_struct_member); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] @@ -720,24 +720,24 @@ overaligned_int_struct_member overaligned_int_struct_member_test(void) { // CHECK: [[ALIGN_REGOFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 15 // CHECK: 
[[ALIGNED_REGOFFS:%[a-z_0-9]+]] = and i32 [[ALIGN_REGOFFS]], -16 // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[ALIGNED_REGOFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[ALIGNED_REGOFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.overaligned_int_struct_member* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[STACKINT:%[a-z_0-9]+]] = ptrtoint i8* [[STACK]] to i64 // CHECK: [[ALIGN_STACK:%[a-z_0-9]+]] = add i64 [[STACKINT]], 15 // CHECK: [[ALIGNED_STACK_INT:%[a-z_0-9]+]] = and i64 [[ALIGN_STACK]], -16 // CHECK: [[ALIGNED_STACK_PTR:%[a-z_0-9]+]] = inttoptr i64 [[ALIGNED_STACK_INT]] to i8* // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[ALIGNED_STACK_PTR]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 
0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[ALIGNED_STACK_PTR]] to %struct.overaligned_int_struct_member* // CHECK: br label %[[VAARG_END]] @@ -751,26 +751,26 @@ typedef struct { underaligned_long_long_struct_member underaligned_long_long_struct_member_test(void) { // CHECK-LABEL: define{{.*}} i64 @underaligned_long_long_struct_member_test() return va_arg(the_list, underaligned_long_long_struct_member); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: 
[[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.underaligned_long_long_struct_member* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.underaligned_long_long_struct_member* // CHECK: br label %[[VAARG_END]] @@ -784,7 +784,7 @@ typedef struct { overaligned_long_long_struct_member overaligned_long_long_struct_member_test(void) { // CHECK-LABEL: define{{.*}} i128 @overaligned_long_long_struct_member_test() return va_arg(the_list, overaligned_long_long_struct_member); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] @@ -792,24 +792,24 @@ overaligned_long_long_struct_member overaligned_long_long_struct_member_test(voi // CHECK: [[ALIGN_REGOFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 15 // CHECK: [[ALIGNED_REGOFFS:%[a-z_0-9]+]] = and i32 [[ALIGN_REGOFFS]], -16 // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add 
i32 [[ALIGNED_REGOFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[ALIGNED_REGOFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.overaligned_long_long_struct_member* // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[STACKINT:%[a-z_0-9]+]] = ptrtoint i8* [[STACK]] to i64 // CHECK: [[ALIGN_STACK:%[a-z_0-9]+]] = add i64 [[STACKINT]], 15 // CHECK: [[ALIGNED_STACK_INT:%[a-z_0-9]+]] = and i64 [[ALIGN_STACK]], -16 // CHECK: [[ALIGNED_STACK_PTR:%[a-z_0-9]+]] = inttoptr i64 [[ALIGNED_STACK_INT]] to i8* // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[ALIGNED_STACK_PTR]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, 
%struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[ALIGNED_STACK_PTR]] to %struct.overaligned_long_long_struct_member* // CHECK: br label %[[VAARG_END]] @@ -823,26 +823,26 @@ typedef struct { underaligned_int128_struct_member underaligned_int128_struct_member_test(void) { // CHECK-LABEL: define{{.*}} [2 x i64] @underaligned_int128_struct_member_test() return va_arg(the_list, underaligned_int128_struct_member); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 16 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.underaligned_int128_struct_member* // 
CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 16 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.underaligned_int128_struct_member* // CHECK: br label %[[VAARG_END]] @@ -857,26 +857,26 @@ typedef struct { overaligned_int128_struct_member overaligned_int128_struct_member_test(void) { // CHECK-LABEL: define{{.*}} void @overaligned_int128_struct_member_test(%struct.overaligned_int128_struct_member* noalias sret(%struct.overaligned_int128_struct_member) align 32 %agg.result) return va_arg(the_list, overaligned_int128_struct_member); -// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 // CHECK: br i1 [[EARLY_ONSTACK]], label %[[VAARG_ON_STACK:[a-z_.0-9]+]], label %[[VAARG_MAYBE_REG:[a-z_.0-9]+]] // CHECK: [[VAARG_MAYBE_REG]] // CHECK: [[NEW_REG_OFFS:%[a-z_0-9]+]] = add i32 [[GR_OFFS]], 8 -// CHECK: store i32 [[NEW_REG_OFFS]], i32* getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 3) +// CHECK: store i32 [[NEW_REG_OFFS]], i32* 
getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[INREG:%[a-z_0-9]+]] = icmp sle i32 [[NEW_REG_OFFS]], 0 // CHECK: br i1 [[INREG]], label %[[VAARG_IN_REG:[a-z_.0-9]+]], label %[[VAARG_ON_STACK]] // CHECK: [[VAARG_IN_REG]] -// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 1) +// CHECK: [[REG_TOP:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 1) // CHECK: [[REG_ADDR:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[REG_TOP]], i32 [[GR_OFFS]] // CHECK: [[FROMREG_ADDR:%[a-z_0-9]+]] = bitcast i8* [[REG_ADDR]] to %struct.overaligned_int128_struct_member** // CHECK: br label %[[VAARG_END:[a-z._0-9]+]] // CHECK: [[VAARG_ON_STACK]] -// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: [[STACK:%[a-z_0-9]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[NEW_STACK:%[a-z_0-9]+]] = getelementptr inbounds i8, i8* [[STACK]], i64 8 -// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%"struct.std::__va_list", %"struct.std::__va_list"* @the_list, i32 0, i32 0) +// CHECK: store i8* [[NEW_STACK]], i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0) // CHECK: [[FROMSTACK_ADDR:%[a-z_0-9]+]] = bitcast i8* [[STACK]] to %struct.overaligned_int128_struct_member** // CHECK: br label %[[VAARG_END]] @@ -889,8 +889,8 @@ void check_start(int n, ...) 
{ va_list the_list; va_start(the_list, n); -// CHECK: [[THE_LIST:%[a-z_0-9]+]] = alloca %"struct.std::__va_list" -// CHECK: [[VOIDP_THE_LIST:%[a-z_0-9]+]] = bitcast %"struct.std::__va_list"* [[THE_LIST]] to i8* +// CHECK: [[THE_LIST:%[a-z_0-9]+]] = alloca %struct.__va_list +// CHECK: [[VOIDP_THE_LIST:%[a-z_0-9]+]] = bitcast %struct.__va_list* [[THE_LIST]] to i8* // CHECK: call void @llvm.va_start(i8* [[VOIDP_THE_LIST]]) } diff --git a/clang/test/CodeGen/arm64-be-hfa-vararg.c b/clang/test/CodeGen/arm64-be-hfa-vararg.c index 2309de3dbc714..c22572459bab5 100644 --- a/clang/test/CodeGen/arm64-be-hfa-vararg.c +++ b/clang/test/CodeGen/arm64-be-hfa-vararg.c @@ -4,12 +4,12 @@ // A single member HFA must be aligned just like a non-HFA register argument. double callee(int a, ...) { -// CHECK: [[REGPP:%.*]] = getelementptr inbounds %"struct.std::__va_list", %"struct.std::__va_list"* [[VA:%.*]], i32 0, i32 2 +// CHECK: [[REGPP:%.*]] = getelementptr inbounds %struct.__va_list, %struct.__va_list* [[VA:%.*]], i32 0, i32 2 // CHECK: [[REGP:%.*]] = load i8*, i8** [[REGPP]], align 8 // CHECK: [[OFFSET0:%.*]] = getelementptr inbounds i8, i8* [[REGP]], i32 {{.*}} // CHECK: [[OFFSET1:%.*]] = getelementptr inbounds i8, i8* [[OFFSET0]], i64 8 -// CHECK: [[MEMPP:%.*]] = getelementptr inbounds %"struct.std::__va_list", %"struct.std::__va_list"* [[VA:%.*]], i32 0, i32 0 +// CHECK: [[MEMPP:%.*]] = getelementptr inbounds %struct.__va_list, %struct.__va_list* [[VA:%.*]], i32 0, i32 0 // CHECK: [[MEMP:%.*]] = load i8*, i8** [[MEMPP]], align 8 // CHECK: [[NEXTP:%.*]] = getelementptr inbounds i8, i8* [[MEMP]], i64 8 // CHECK: store i8* [[NEXTP]], i8** [[MEMPP]], align 8 diff --git a/clang/test/Headers/stdarg.cpp b/clang/test/Headers/stdarg.cpp index 9f65f2e50626b..e5c0cb3177adc 100644 --- a/clang/test/Headers/stdarg.cpp +++ b/clang/test/Headers/stdarg.cpp @@ -15,7 +15,7 @@ #include -// AARCH64-C: define {{.*}} @f(i32 noundef %n, %"struct.std::__va_list"* noundef %list) +// AARCH64-C: define 
{{.*}} @f(i32 noundef %n, %struct.__va_list* noundef %list) // AARCH64-CXX: define {{.*}} @_Z1fiSt9__va_list(i32 noundef %n, %"struct.std::__va_list"* noundef %list) // X86_64-C: define {{.*}} @f(i32 noundef %n, %struct.__va_list_tag* noundef %list) // X86_64-CXX: define {{.*}} @_Z1fiP13__va_list_tag(i32 noundef %n, %struct.__va_list_tag* noundef %list) From a3beb34015fcc5b61e804736247781a80554443a Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Mon, 14 Feb 2022 11:52:40 -0800 Subject: [PATCH 159/748] Reland "[InstrProf] Make the IndexedInstrProf header backwards compatible." This reverts commit 9fd2cb21fb3f763fc784eab198bf1297a24596fa. Fixes an issue on big endian systems where the format version was not converted to little endian prior to passing to GET_VERSION. Differential Revision: https://reviews.llvm.org/D118390 --- llvm/include/llvm/ProfileData/InstrProf.h | 14 ++++++ llvm/lib/ProfileData/InstrProf.cpp | 60 +++++++++++++++++++++++ llvm/lib/ProfileData/InstrProfReader.cpp | 28 ++++------- 3 files changed, 84 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index a416eb28906e7..c015e8e4b43d0 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -1028,6 +1028,20 @@ struct Header { uint64_t Unused; // Becomes unused since version 4 uint64_t HashType; uint64_t HashOffset; + // New fields should only be added at the end to ensure that the size + // computation is correct. The methods below need to be updated to ensure that + // the new field is read correctly. + + // Reads a header struct from the buffer. + static Expected
readFromBuffer(const unsigned char *Buffer); + + // Returns the size of the header in bytes for all valid fields based on the + // version. I.e a older version header will return a smaller size. + size_t size() const; + + // Returns the format version in little endian. The header retains the version + // in native endian of the compiler runtime. + uint64_t formatVersion() const; }; // Profile summary data recorded in the profile data file in indexed diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 07d467305ae5a..6e53b0a276998 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1311,4 +1312,63 @@ void OverlapStats::dump(raw_fd_ostream &OS) const { } } +namespace IndexedInstrProf { +// A C++14 compatible version of the offsetof macro. +template +inline size_t constexpr offsetOf(T1 T2::*Member) { + constexpr T2 Object{}; + return size_t(&(Object.*Member)) - size_t(&Object); +} + +static inline uint64_t read(const unsigned char *Buffer, size_t Offset) { + return *reinterpret_cast(Buffer + Offset); +} + +uint64_t Header::formatVersion() const { + using namespace support; + return endian::byte_swap(Version); +} + +Expected
Header::readFromBuffer(const unsigned char *Buffer) { + using namespace support; + static_assert(std::is_standard_layout
::value, + "The header should be standard layout type since we use offset " + "of fields to read."); + Header H; + + H.Magic = read(Buffer, offsetOf(&Header::Magic)); + // Check the magic number. + uint64_t Magic = endian::byte_swap(H.Magic); + if (Magic != IndexedInstrProf::Magic) + return make_error(instrprof_error::bad_magic); + + // Read the version. + H.Version = read(Buffer, offsetOf(&Header::Version)); + if (GET_VERSION(H.formatVersion()) > + IndexedInstrProf::ProfVersion::CurrentVersion) + return make_error(instrprof_error::unsupported_version); + + switch (GET_VERSION(H.formatVersion())) { + // When a new field is added in the header add a case statement here to + // populate it. + default: // Version7 (when the backwards compatible header was introduced). + H.HashType = read(Buffer, offsetOf(&Header::HashType)); + H.HashOffset = read(Buffer, offsetOf(&Header::HashOffset)); + } + + return H; +} + +size_t Header::size() const { + switch (GET_VERSION(formatVersion())) { + // When a new field is added to the header add a case statement here to + // compute the size as offset of the new field + size of the new field. This + // relies on the field being added to the end of the list. + default: // Version7 (when the backwards compatible header was introduced). 
+ return offsetOf(&Header::HashOffset) + sizeof(Header::HashOffset); + } +} + +} // namespace IndexedInstrProf + } // end namespace llvm diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index bc990755e0e47..d1e3438a6f412 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -934,24 +934,17 @@ Error IndexedInstrProfReader::readHeader() { if ((const unsigned char *)DataBuffer->getBufferEnd() - Cur < 24) return error(instrprof_error::truncated); - auto *Header = reinterpret_cast(Cur); - Cur += sizeof(IndexedInstrProf::Header); + auto HeaderOr = IndexedInstrProf::Header::readFromBuffer(Start); + if (!HeaderOr) + return HeaderOr.takeError(); - // Check the magic number. - uint64_t Magic = endian::byte_swap(Header->Magic); - if (Magic != IndexedInstrProf::Magic) - return error(instrprof_error::bad_magic); - - // Read the version. - uint64_t FormatVersion = endian::byte_swap(Header->Version); - if (GET_VERSION(FormatVersion) > - IndexedInstrProf::ProfVersion::CurrentVersion) - return error(instrprof_error::unsupported_version); + const IndexedInstrProf::Header *Header = &HeaderOr.get(); + Cur += Header->size(); - Cur = readSummary((IndexedInstrProf::ProfVersion)FormatVersion, Cur, + Cur = readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur, /* UseCS */ false); - if (FormatVersion & VARIANT_MASK_CSIR_PROF) - Cur = readSummary((IndexedInstrProf::ProfVersion)FormatVersion, Cur, + if (Header->formatVersion() & VARIANT_MASK_CSIR_PROF) + Cur = readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur, /* UseCS */ true); // Read the hash type and start offset. @@ -963,9 +956,8 @@ Error IndexedInstrProfReader::readHeader() { uint64_t HashOffset = endian::byte_swap(Header->HashOffset); // The rest of the file is an on disk hash table. 
- auto IndexPtr = - std::make_unique>( - Start + HashOffset, Cur, Start, HashType, FormatVersion); + auto IndexPtr = std::make_unique>( + Start + HashOffset, Cur, Start, HashType, Header->formatVersion()); // Load the remapping table now if requested. if (RemappingBuffer) { From 941f06282a3d304a96c1ea71b335be5fc91d8f7c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 17 Feb 2022 11:54:57 -0800 Subject: [PATCH 160/748] [lld] Make error handling functions opaque The inline `lld::error` expands to two function calls `errorHandler` and `error` where the latter is opaque. Move the functions to .cpp files to decrease code size. My x86-64 lld executable is 9KiB smaller. Reviewed By: #lld-macho, thakis Differential Revision: https://reviews.llvm.org/D120002 --- lld/Common/ErrorHandler.cpp | 12 ++++++++++++ lld/include/lld/Common/ErrorHandler.h | 18 +++++++----------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/lld/Common/ErrorHandler.cpp b/lld/Common/ErrorHandler.cpp index e557e533dedce..4cacd82c9f354 100644 --- a/lld/Common/ErrorHandler.cpp +++ b/lld/Common/ErrorHandler.cpp @@ -53,6 +53,18 @@ void ErrorHandler::flushStreams() { ErrorHandler &lld::errorHandler() { return context().e; } +void lld::error(const Twine &msg) { errorHandler().error(msg); } +void lld::error(const Twine &msg, ErrorTag tag, ArrayRef args) { + errorHandler().error(msg, tag, args); +} +void lld::fatal(const Twine &msg) { errorHandler().fatal(msg); } +void lld::log(const Twine &msg) { errorHandler().log(msg); } +void lld::message(const Twine &msg, llvm::raw_ostream &s) { + errorHandler().message(msg, s); +} +void lld::warn(const Twine &msg) { errorHandler().warn(msg); } +uint64_t lld::errorCount() { return errorHandler().errorCount; } + raw_ostream &lld::outs() { ErrorHandler &e = errorHandler(); return e.outs(); diff --git a/lld/include/lld/Common/ErrorHandler.h b/lld/include/lld/Common/ErrorHandler.h index ce077290d60b3..0ba4787e5888e 100644 --- 
a/lld/include/lld/Common/ErrorHandler.h +++ b/lld/include/lld/Common/ErrorHandler.h @@ -143,17 +143,13 @@ class ErrorHandler { /// Returns the default error handler. ErrorHandler &errorHandler(); -inline void error(const Twine &msg) { errorHandler().error(msg); } -inline void error(const Twine &msg, ErrorTag tag, ArrayRef args) { - errorHandler().error(msg, tag, args); -} -[[noreturn]] inline void fatal(const Twine &msg) { errorHandler().fatal(msg); } -inline void log(const Twine &msg) { errorHandler().log(msg); } -inline void message(const Twine &msg, llvm::raw_ostream &s = outs()) { - errorHandler().message(msg, s); -} -inline void warn(const Twine &msg) { errorHandler().warn(msg); } -inline uint64_t errorCount() { return errorHandler().errorCount; } +void error(const Twine &msg); +void error(const Twine &msg, ErrorTag tag, ArrayRef args); +[[noreturn]] void fatal(const Twine &msg); +void log(const Twine &msg); +void message(const Twine &msg, llvm::raw_ostream &s = outs()); +void warn(const Twine &msg); +uint64_t errorCount(); [[noreturn]] void exitLld(int val); From af6b9939aac065ce19bdaaf9fc54a8368367f33f Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 17 Feb 2022 11:18:26 -0800 Subject: [PATCH 161/748] [EarlyCSE][OpaquePtr] Check access type when performing DSE This will bail out on target specific intrinsics. If those are deemed important enough for EarlyCSE to handle, we can augment MemIntrinsicInfo with an access type for TargetTransformInfo::getTgtMemIntrinsic() to handle. 
Reviewed By: #opaque-pointers, nikic Differential Revision: https://reviews.llvm.org/D120077 --- llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 18 ++++++++++++++++++ llvm/test/Transforms/EarlyCSE/opaque-ptr.ll | 11 +++++++++++ 2 files changed, 29 insertions(+) diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 59b934c16c8a0..091b7a51ccfc6 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -781,6 +781,21 @@ class EarlyCSE { return getLoadStorePointerOperand(Inst); } + Type *getValueType() const { + // TODO: handle target-specific intrinsics. + if (IntrinsicInst *II = dyn_cast(Inst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::masked_load: + return II->getType(); + case Intrinsic::masked_store: + return II->getArgOperand(0)->getType(); + default: + return nullptr; + } + } + return getLoadStoreType(Inst); + } + bool mayReadFromMemory() const { if (IntrID != 0) return Info.ReadMem; @@ -1162,6 +1177,9 @@ bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier, "Violated invariant"); if (Earlier.getPointerOperand() != Later.getPointerOperand()) return false; + if (!Earlier.getValueType() || !Later.getValueType() || + Earlier.getValueType() != Later.getValueType()) + return false; if (Earlier.getMatchingId() != Later.getMatchingId()) return false; // At the moment, we don't remove ordered stores, but do remove diff --git a/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll b/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll index bc278feb4ae88..b05fc802ca250 100644 --- a/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll +++ b/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll @@ -30,3 +30,14 @@ define i32 @different_types_store(ptr %p, i32 %a) { %sub = sub i32 %a, %v2.c ret i32 %sub } + +define void @dse(ptr %p, i32 %i1, i8 %i2) { +; CHECK-LABEL: @dse( +; CHECK-NEXT: store i32 [[I1:%.*]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: store i8 [[I2:%.*]], ptr [[P]], align 1 +; CHECK-NEXT: ret 
void +; + store i32 %i1, ptr %p + store i8 %i2, ptr %p + ret void +} From 9f7075de5c6200f4efda736220ca7716738a5e0e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 17 Feb 2022 12:00:44 -0800 Subject: [PATCH 162/748] {instsimplify] Precommit some tests for provable inequal pointers derived from allocas --- .../InstSimplify/cmp-alloca-offsets.ll | 202 ++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll diff --git a/llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll b/llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll new file mode 100644 index 0000000000000..b59fee40d3e13 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll @@ -0,0 +1,202 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instsimplify -S | FileCheck %s +target datalayout = "p:32:32-p1:64:64" + +; This is a collection of tests checking whether we can prove pointers +; derived from two allocas as inequal *via offset checks*. Note that +; instcombine has alternate approaches (one cmp rule, and compare +; bases of common offset) that also handles these, but with different +; logic. 
+ +; %a follows %b, derived equal +define i1 @adjacent_alloca() { +; CHECK-LABEL: @adjacent_alloca( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 4 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 0 + %b.off = getelementptr i8, i8* %b, i64 4 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; %b follows %a, derived equal +define i1 @adjacent_alloca2() { +; CHECK-LABEL: @adjacent_alloca2( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, i8* [[A]], i64 4 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 4 + %b.off = getelementptr i8, i8* %b, i64 0 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; FIXME: Can't be equal +define i1 @positive_non_equal_end() { +; CHECK-LABEL: @positive_non_equal_end( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, i8* [[A]], i64 4 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 4 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 4 + %b.off = getelementptr i8, i8* %b, i64 4 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; %b follows %a, derived equal +define i1 @positive_equal_past_end() { +; CHECK-LABEL: @positive_equal_past_end( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, 
i8* [[A]], i64 8 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 12 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 8 + %b.off = getelementptr i8, i8* %b, i64 12 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +define i1 @positive_non_equal() { +; CHECK-LABEL: @positive_non_equal( +; CHECK-NEXT: ret i1 true +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 3 + %b.off = getelementptr i8, i8* %b, i64 3 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; %a follows %b, derived equal +define i1 @one_neg_equal1() { +; CHECK-LABEL: @one_neg_equal1( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, i8* [[A]], i64 -1 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 3 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 -1 + %b.off = getelementptr i8, i8* %b, i64 3 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; %b follows %a, derived equal +define i1 @one_neg_equal2() { +; CHECK-LABEL: @one_neg_equal2( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, i8* [[A]], i64 3 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 -1 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 3 + %b.off = getelementptr i8, i8* %b, i64 -1 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; %b follows %a, derived equal +define i1 @both_neg_equal() { +; CHECK-LABEL: @both_neg_equal( +; 
CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, i8* [[A]], i64 -4 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 -8 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 -4 + %b.off = getelementptr i8, i8* %b, i64 -8 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; FIXME: Can't be equal +define i1 @mixed_offsets1() { +; CHECK-LABEL: @mixed_offsets1( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, i8* [[A]], i64 -1 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 2 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 -1 + %b.off = getelementptr i8, i8* %b, i64 2 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; FIXME: Can't be equal +define i1 @mixed_offsets2() { +; CHECK-LABEL: @mixed_offsets2( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, i8* [[A]], i64 1 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 -2 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 1 + %b.off = getelementptr i8, i8* %b, i64 -2 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + +; FIXME: Can't be equal +define i1 @negative_in_other() { +; CHECK-LABEL: @negative_in_other( +; CHECK-NEXT: [[A:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[B:%.*]] = alloca i8, i32 4, align 1 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i8, 
i8* [[A]], i64 -3 +; CHECK-NEXT: [[B_OFF:%.*]] = getelementptr i8, i8* [[B]], i64 -2 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8* [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a = alloca i8, i32 4 + %b = alloca i8, i32 4 + %a.off = getelementptr i8, i8* %a, i64 -3 + %b.off = getelementptr i8, i8* %b, i64 -2 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + + +attributes #0 = { null_pointer_is_valid } From 66f8ac8d3604d67599734c3fd272032e9448aca2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 17 Feb 2022 12:10:58 -0800 Subject: [PATCH 163/748] [ELF] Support (TYPE=) to customize the output section type The current output section type allows to set the ELF section type to SHT_PROGBITS or SHT_NOLOAD. This patch allows an arbitrary section value to be specified. Some common SHT_* literal names are supported as well. ``` SECTIONS { note (TYPE=SHT_NOTE) : { BYTE(8) *(note) } init_array ( TYPE=14 ) : { QUAD(14) } fini_array (TYPE = SHT_FINI_ARRAY) : { QUAD(15) } } ``` When `sh_type` is specified, it is an error if an input section has a different type. Our syntax is compatible with GNU ld 2.39 (https://sourceware.org/bugzilla/show_bug.cgi?id=28841). Reviewed By: peter.smith Differential Revision: https://reviews.llvm.org/D118840 --- lld/ELF/OutputSections.cpp | 42 ++++----- lld/ELF/OutputSections.h | 2 +- lld/ELF/ScriptParser.cpp | 40 +++++++-- lld/docs/ELF/linker_script.rst | 6 ++ .../ELF/linkerscript/custom-section-type.s | 89 +++++++++++++++++++ lld/test/ELF/linkerscript/noload.s | 13 ++- 6 files changed, 164 insertions(+), 28 deletions(-) create mode 100644 lld/test/ELF/linkerscript/custom-section-type.s diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 85c343b148c3a..2b5deecdcec75 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -108,32 +108,34 @@ void OutputSection::recordSection(InputSectionBase *isec) { // isec. 
Also check whether the InputSection flags and type are consistent with // other InputSections. void OutputSection::commitSection(InputSection *isec) { + if (LLVM_UNLIKELY(type != isec->type)) { + if (hasInputSections || typeIsSet) { + if (typeIsSet || !canMergeToProgbits(type) || + !canMergeToProgbits(isec->type)) { + errorOrWarn("section type mismatch for " + isec->name + "\n>>> " + + toString(isec) + ": " + + getELFSectionTypeName(config->emachine, isec->type) + + "\n>>> output section " + name + ": " + + getELFSectionTypeName(config->emachine, type)); + } + type = SHT_PROGBITS; + } else { + type = isec->type; + } + } if (!hasInputSections) { // If IS is the first section to be added to this section, // initialize type, entsize and flags from isec. hasInputSections = true; - type = isec->type; entsize = isec->entsize; flags = isec->flags; } else { // Otherwise, check if new type or flags are compatible with existing ones. if ((flags ^ isec->flags) & SHF_TLS) - error("incompatible section flags for " + name + "\n>>> " + toString(isec) + - ": 0x" + utohexstr(isec->flags) + "\n>>> output section " + name + - ": 0x" + utohexstr(flags)); - - if (type != isec->type) { - if (!canMergeToProgbits(type) || !canMergeToProgbits(isec->type)) - error("section type mismatch for " + isec->name + "\n>>> " + - toString(isec) + ": " + - getELFSectionTypeName(config->emachine, isec->type) + - "\n>>> output section " + name + ": " + - getELFSectionTypeName(config->emachine, type)); - type = SHT_PROGBITS; - } + error("incompatible section flags for " + name + "\n>>> " + + toString(isec) + ": 0x" + utohexstr(isec->flags) + + "\n>>> output section " + name + ": 0x" + utohexstr(flags)); } - if (noload) - type = SHT_NOBITS; isec->parent = this; uint64_t andMask = @@ -448,14 +450,14 @@ template void OutputSection::writeTo(uint8_t *buf) { writeInt(buf + data->offset, data->expression().getValue(), data->size); } -static void finalizeShtGroup(OutputSection *os, - InputSection *section) { - 
assert(config->relocatable); - +static void finalizeShtGroup(OutputSection *os, InputSection *section) { // sh_link field for SHT_GROUP sections should contain the section index of // the symbol table. os->link = in.symTab->getParent()->sectionIndex; + if (!section) + return; + // sh_info then contain index of an entry in symbol table section which // provides signature of the section group. ArrayRef symbols = section->file->getSymbols(); diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h index 60a3629e2f256..005537151e530 100644 --- a/lld/ELF/OutputSections.h +++ b/lld/ELF/OutputSections.h @@ -92,7 +92,7 @@ class OutputSection final : public SectionCommand, public SectionBase { std::string memoryRegionName; std::string lmaRegionName; bool nonAlloc = false; - bool noload = false; + bool typeIsSet = false; bool expressionsUseSymbols = false; bool usedInExpression = false; bool inOverlay = false; diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 9d9eb123eecf2..595050a83a386 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -786,19 +786,45 @@ Expr ScriptParser::readAssert() { }; } +#define ECase(X) \ + { #X, X } +constexpr std::pair typeMap[] = { + ECase(SHT_PROGBITS), ECase(SHT_NOTE), ECase(SHT_NOBITS), + ECase(SHT_INIT_ARRAY), ECase(SHT_FINI_ARRAY), ECase(SHT_PREINIT_ARRAY), +}; +#undef ECase + // Tries to read the special directive for an output section definition which -// can be one of following: "(NOLOAD)", "(COPY)", "(INFO)" or "(OVERLAY)". -// Tok1 and Tok2 are next 2 tokens peeked. See comment for readSectionAddressType below. +// can be one of following: "(NOLOAD)", "(COPY)", "(INFO)", "(OVERLAY)", and +// "(TYPE=)". +// Tok1 and Tok2 are next 2 tokens peeked. See comment for +// readSectionAddressType below. 
bool ScriptParser::readSectionDirective(OutputSection *cmd, StringRef tok1, StringRef tok2) { if (tok1 != "(") return false; - if (tok2 != "NOLOAD" && tok2 != "COPY" && tok2 != "INFO" && tok2 != "OVERLAY") + if (tok2 != "NOLOAD" && tok2 != "COPY" && tok2 != "INFO" && + tok2 != "OVERLAY" && tok2 != "TYPE") return false; expect("("); if (consume("NOLOAD")) { - cmd->noload = true; cmd->type = SHT_NOBITS; + cmd->typeIsSet = true; + } else if (consume("TYPE")) { + expect("="); + StringRef value = peek(); + auto it = llvm::find_if(typeMap, [=](auto e) { return e.first == value; }); + if (it != std::end(typeMap)) { + // The value is a recognized literal SHT_*. + cmd->type = it->second; + skip(); + } else if (value.startswith("SHT_")) { + setError("unknown section type " + value); + } else { + // Otherwise, read an expression. + cmd->type = readExpr()().getValue(); + } + cmd->typeIsSet = true; } else { skip(); // This is "COPY", "INFO" or "OVERLAY". cmd->nonAlloc = true; @@ -819,7 +845,11 @@ bool ScriptParser::readSectionDirective(OutputSection *cmd, StringRef tok1, Stri // https://sourceware.org/binutils/docs/ld/Output-Section-Address.html // https://sourceware.org/binutils/docs/ld/Output-Section-Type.html void ScriptParser::readSectionAddressType(OutputSection *cmd) { - if (readSectionDirective(cmd, peek(), peek2())) + // Temporarily set inExpr to support TYPE= without spaces. + bool saved = std::exchange(inExpr, true); + bool isDirective = readSectionDirective(cmd, peek(), peek2()); + inExpr = saved; + if (isDirective) return; cmd->addrExpr = readExpr(); diff --git a/lld/docs/ELF/linker_script.rst b/lld/docs/ELF/linker_script.rst index b94017b17158d..085b3aa77186e 100644 --- a/lld/docs/ELF/linker_script.rst +++ b/lld/docs/ELF/linker_script.rst @@ -102,6 +102,12 @@ When an *OutputSection* *S* has ``(type)``, LLD will set ``sh_type`` or - ``NOLOAD``: set ``sh_type`` to ``SHT_NOBITS``. - ``COPY``, ``INFO``, ``OVERLAY``: clear the ``SHF_ALLOC`` bit in ``sh_flags``. 
+- ``TYPE=``: set ``sh_type`` to the specified value. ```` must be + an integer or one of ``SHT_PROGBITS, SHT_NOTE, SHT_NOBITS, SHT_INIT_ARRAY, + SHT_FINI_ARRAY, SHT_PREINIT_ARRAY``. + +When ``sh_type`` is specified, it is an error if an input section in *S* has a +different type. Output section alignment ------------------------ diff --git a/lld/test/ELF/linkerscript/custom-section-type.s b/lld/test/ELF/linkerscript/custom-section-type.s new file mode 100644 index 0000000000000..21ef9922e25ec --- /dev/null +++ b/lld/test/ELF/linkerscript/custom-section-type.s @@ -0,0 +1,89 @@ +# REQUIRES: x86 +## TYPE= customizes the output section type. + +# RUN: rm -rf %t && split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t/a.s -o %t/a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t/mismatch.s -o %t/mismatch.o +# RUN: ld.lld -T %t/a.lds %t/a.o -o %t/a +# RUN: llvm-readelf -S %t/a | FileCheck %s + +# RUN: ld.lld -r -T %t/a.lds %t/a.o -o %t/a.ro +# RUN: llvm-readelf -S %t/a.ro | FileCheck %s + +# CHECK: [Nr] Name Type Address Off Size ES Flg Lk Inf Al +# CHECK-NEXT: [ 0] NULL [[#%x,]] [[#%x,]] 000000 00 0 0 0 +# CHECK-NEXT: [ 1] progbits PROGBITS [[#%x,]] [[#%x,]] 000001 00 A 0 0 1 +# CHECK-NEXT: [ 2] note NOTE [[#%x,]] [[#%x,]] 000002 00 A 0 0 1 +# CHECK-NEXT: [ 3] nobits NOBITS [[#%x,]] [[#%x,]] 000001 00 A 0 0 1 +# CHECK-NEXT: [ 4] init_array INIT_ARRAY [[#%x,]] [[#%x,]] 000008 00 A 0 0 1 +# CHECK-NEXT: [ 5] fini_array FINI_ARRAY [[#%x,]] [[#%x,]] 000008 00 A 0 0 1 +# CHECK-NEXT: [ 6] preinit_array PREINIT_ARRAY [[#%x,]] [[#%x,]] 000008 00 A 0 0 1 +# CHECK-NEXT: [ 7] group GROUP [[#%x,]] [[#%x,]] 000004 00 A [[#SYMTAB:]] 0 1 +# CHECK-NEXT: [ 8] expr 0x42: [[#%x,]] [[#%x,]] 000001 00 A 0 0 1 +# CHECK: [[[#SYMTAB]]] .symtab SYMTAB + +# RUN: not ld.lld -T %t/a.lds %t/a.o %t/mismatch.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR1 + +# ERR1: error: section type mismatch for progbits +# ERR1-NEXT: >>> {{.*}}.o:(progbits): SHT_NOTE 
+# ERR1-NEXT: >>> output section progbits: SHT_PROGBITS +# ERR1: error: section type mismatch for expr +# ERR1-NEXT: >>> {{.*}}.o:(expr): Unknown +# ERR1-NEXT: >>> output section expr: Unknown + +# RUN: ld.lld -T %t/a.lds %t/a.o %t/mismatch.o -o %t/mismatch --noinhibit-exec +# RUN: llvm-readelf -S %t/mismatch | FileCheck %s --check-prefix=MISMATCH + +## Mismatched progbits and expr are changed to SHT_PROGBITS. +# MISMATCH: progbits PROGBITS +# MISMATCH: note NOTE +# MISMATCH: expr PROGBITS + +# RUN: not ld.lld -T %t/unknown1.lds %t/a.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN1 +# UNKNOWN1: error: {{.*}}.lds:1: symbol not found: foo + +## For a symbol named SHT_*, give a better diagnostic. +# RUN: not ld.lld -T %t/unknown2.lds %t/a.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN2 +# UNKNOWN2: error: {{.*}}.lds:1: unknown section type SHT_DYNAMIC + +# RUN: not ld.lld -T %t/parseerr1.lds %t/a.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=PARSEERR1 +# PARSEERR1: error: {{.*}}.lds:1: = expected, but got ) + +#--- a.s +.globl _start, myinit +_start: + ret +myinit: + ret + +## Compatible with TYPE = SHT_NOTE below. 
+.section note,"a",@note +.byte 0 + +#--- a.lds +SECTIONS { + progbits (TYPE=SHT_PROGBITS) : { BYTE(1) } + note (TYPE = SHT_NOTE) : { BYTE(7) *(note) } + nobits ( TYPE=SHT_NOBITS) : { BYTE(8) } + init_array (TYPE=SHT_INIT_ARRAY ) : { QUAD(myinit) } + fini_array (TYPE=SHT_FINI_ARRAY) : { QUAD(15) } + preinit_array (TYPE=SHT_PREINIT_ARRAY) : { QUAD(16) } + group (TYPE=17) : { LONG(17) } + expr (TYPE=0x41+1) : { BYTE(0x42) *(expr) } +} + +#--- mismatch.s +.section progbits,"a",@note +.byte 0 + +.section expr,"a",@12345 +.byte 0 + +#--- unknown1.lds +SECTIONS { err (TYPE=foo) : {} } + +#--- unknown2.lds +SECTIONS { err (TYPE=SHT_DYNAMIC) : {} } + +#--- parseerr1.lds +SECTIONS { err (TYPE) : {} } diff --git a/lld/test/ELF/linkerscript/noload.s b/lld/test/ELF/linkerscript/noload.s index 76007e911b7da..92afadc9b263f 100644 --- a/lld/test/ELF/linkerscript/noload.s +++ b/lld/test/ELF/linkerscript/noload.s @@ -1,6 +1,7 @@ # REQUIRES: x86 # RUN: split-file %s %t # RUN: llvm-mc -filetype=obj -triple=x86_64 %t/asm -o %t.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/mismatch.s -o %t/mismatch.o # RUN: ld.lld --script %t/lds %t.o -o %t/out # RUN: llvm-readelf -S -l %t/out | FileCheck %s @@ -16,16 +17,24 @@ # CHECK: 00 .data_noload_a .data_noload_b .no_input_sec_noload {{$}} # CHECK: 01 .text {{$}} +# RUN: not ld.lld --script %t/lds %t.o %t/mismatch.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR + +# ERR: error: section type mismatch for .data_noload_a + #--- asm .section .text,"ax",@progbits nop -.section .data_noload_a,"aw",@progbits +.section .data_noload_a,"aw",@nobits .zero 4096 -.section .data_noload_b,"aw",@progbits +.section .data_noload_b,"aw",@nobits .zero 4096 +#--- mismatch.s +.section .data_noload_a,"aw",@progbits +.byte 1 + #--- lds SECTIONS { .data_noload_a (NOLOAD) : { *(.data_noload_a) } From 0aa3072649f20756bc06e8747d1a39cb1bbaeab1 Mon Sep 17 00:00:00 2001 From: Mogball Date: Thu, 17 Feb 2022 20:29:25 +0000 Subject: [PATCH 164/748] [mlir] NFC 
NamedAttrList append with StringAttr NamedAttrList.append(StringAttr, StringAttr) fails to compile because it is matched to the IteratorT append. Fixes the method to only match if the type is an iterator. --- mlir/include/mlir/IR/OperationSupport.h | 5 ++++- mlir/unittests/IR/OperationSupportTest.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h index 7a055bf055c7d..5ae07287b88c0 100644 --- a/mlir/include/mlir/IR/OperationSupport.h +++ b/mlir/include/mlir/IR/OperationSupport.h @@ -466,7 +466,10 @@ class NamedAttrList { } /// Add a range of named attributes. - template + template ::iterator_category, + std::input_iterator_tag>::value>> void append(IteratorT in_start, IteratorT in_end) { // TODO: expand to handle case where values appended are in order & after // end of current list. diff --git a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp index 6484d7c50d19b..2511a5d3b6bfd 100644 --- a/mlir/unittests/IR/OperationSupportTest.cpp +++ b/mlir/unittests/IR/OperationSupportTest.cpp @@ -232,7 +232,7 @@ TEST(NamedAttrListTest, TestAppendAssign) { NamedAttrList attrs; Builder b(&ctx); - attrs.append("foo", b.getStringAttr("bar")); + attrs.append(b.getStringAttr("foo"), b.getStringAttr("bar")); attrs.append("baz", b.getStringAttr("boo")); { From f5b85f15510db409277d8492524b2fc040e776d8 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 17 Feb 2022 15:32:15 -0500 Subject: [PATCH 165/748] Use functions with prototypes when appropriate; NFC Now that the AST printer properly handles functions with no parameters in C code, all of the tests relying on AST printing can be updated to use prototypes where appropriate. 
--- clang/test/ARCMT/GC-check-warn-nsalloc.m | 2 +- clang/test/ARCMT/autoreleases.m | 2 +- clang/test/ARCMT/autoreleases.m.result | 2 +- clang/test/ARCMT/checking.m | 2 +- clang/test/ARCMT/nonobjc-to-objc-cast-2.m | 2 +- clang/test/ARCMT/objcmt-arc-cf-annotations.m | 136 +++++++++--------- .../ARCMT/objcmt-arc-cf-annotations.m.result | 136 +++++++++--------- clang/test/ARCMT/objcmt-instancetype.m | 2 +- clang/test/ARCMT/objcmt-instancetype.m.result | 2 +- clang/test/ARCMT/objcmt-property-dot-syntax.m | 2 +- .../ARCMT/objcmt-property-dot-syntax.m.result | 2 +- .../test/ARCMT/objcmt-subscripting-literals.m | 2 +- .../objcmt-subscripting-literals.m.result | 2 +- clang/test/ARCMT/objcmt-with-pch.m | 2 +- clang/test/ARCMT/objcmt-with-pch.m.result | 2 +- clang/test/ARCMT/releases-driver.m | 2 +- clang/test/ARCMT/releases-driver.m.result | 2 +- clang/test/ARCMT/releases.m | 6 +- clang/test/ARCMT/releases.m.result | 6 +- clang/test/ARCMT/retains.m | 6 +- clang/test/ARCMT/retains.m.result | 6 +- clang/test/ARCMT/rewrite-block-var.m | 2 +- clang/test/ARCMT/rewrite-block-var.m.result | 2 +- clang/test/Sema/ast-print.c | 6 +- 24 files changed, 168 insertions(+), 168 deletions(-) diff --git a/clang/test/ARCMT/GC-check-warn-nsalloc.m b/clang/test/ARCMT/GC-check-warn-nsalloc.m index af17c1d69c5f3..26ead5f6a0907 100644 --- a/clang/test/ARCMT/GC-check-warn-nsalloc.m +++ b/clang/test/ARCMT/GC-check-warn-nsalloc.m @@ -6,6 +6,6 @@ typedef unsigned NSUInteger; void *__strong NSAllocateCollectable(NSUInteger size, NSUInteger options); -void test1() { +void test1(void) { NSAllocateCollectable(100, 0); } diff --git a/clang/test/ARCMT/autoreleases.m b/clang/test/ARCMT/autoreleases.m index 91413e51ca66f..4c268c09a715c 100644 --- a/clang/test/ARCMT/autoreleases.m +++ b/clang/test/ARCMT/autoreleases.m @@ -69,7 +69,7 @@ id test2(A* val) { return val; } -id test3() { +id test3(void) { id a = [[A alloc] init]; [a autorelease]; } diff --git a/clang/test/ARCMT/autoreleases.m.result 
b/clang/test/ARCMT/autoreleases.m.result index 32c7ad3c39710..b3aad804a45be 100644 --- a/clang/test/ARCMT/autoreleases.m.result +++ b/clang/test/ARCMT/autoreleases.m.result @@ -64,6 +64,6 @@ id test2(A* val) { return val; } -id test3() { +id test3(void) { id a = [[A alloc] init]; } diff --git a/clang/test/ARCMT/checking.m b/clang/test/ARCMT/checking.m index bf08ceaa6fe2a..0c69a7ffaad60 100644 --- a/clang/test/ARCMT/checking.m +++ b/clang/test/ARCMT/checking.m @@ -124,7 +124,7 @@ -(id)alloc; - (id)initWithInt: (int) i; @end -void rdar8861761() { +void rdar8861761(void) { B *o1 = [[B alloc] initWithInt:0]; B *o2 = [B alloc]; [o2 initWithInt:0]; diff --git a/clang/test/ARCMT/nonobjc-to-objc-cast-2.m b/clang/test/ARCMT/nonobjc-to-objc-cast-2.m index 2e308a87ba838..b8f562f8a42dd 100644 --- a/clang/test/ARCMT/nonobjc-to-objc-cast-2.m +++ b/clang/test/ARCMT/nonobjc-to-objc-cast-2.m @@ -49,7 +49,7 @@ void f2(NSString *s) { // expected-note{{use CFBridgingRetain call to make an ARC object available as a +1 'CFStringRef' (aka 'const struct __CFString *')}} } -CFStringRef f3() { +CFStringRef f3(void) { return (CFStringRef)[[[NSString alloc] init] autorelease]; // expected-error {{it is not safe to cast to 'CFStringRef' the result of 'autorelease' message; a __bridge cast may result in a pointer to a destroyed object and a __bridge_retained may leak the object}} \ // expected-note {{remove the cast and change return type of function to 'NSString *' to have the object automatically autoreleased}} } diff --git a/clang/test/ARCMT/objcmt-arc-cf-annotations.m b/clang/test/ARCMT/objcmt-arc-cf-annotations.m index e33b8801cd4dc..221a8a9e7d4d0 100644 --- a/clang/test/ARCMT/objcmt-arc-cf-annotations.m +++ b/clang/test/ARCMT/objcmt-arc-cf-annotations.m @@ -335,7 +335,7 @@ + (id)array; // Test cases. 
//===----------------------------------------------------------------------===// -CFAbsoluteTime f1() { +CFAbsoluteTime f1(void) { CFAbsoluteTime t = CFAbsoluteTimeGetCurrent(); CFDateRef date = CFDateCreate(0, t); CFRetain(date); @@ -346,7 +346,7 @@ CFAbsoluteTime f1() { return t; } -CFAbsoluteTime f2() { +CFAbsoluteTime f2(void) { CFAbsoluteTime t = CFAbsoluteTimeGetCurrent(); CFDateRef date = CFDateCreate(0, t); [((NSDate*) date) retain]; @@ -363,7 +363,7 @@ CFAbsoluteTime f2() { // Test to see if we suppress an error when we store the pointer // to a global. -CFAbsoluteTime f3() { +CFAbsoluteTime f3(void) { CFAbsoluteTime t = CFAbsoluteTimeGetCurrent(); CFDateRef date = CFDateCreate(0, t); [((NSDate*) date) retain]; @@ -402,7 +402,7 @@ CFDateRef f6(int x) { // Test a leak involving an overwrite. -CFDateRef f7() { +CFDateRef f7(void) { CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); //expected-warning{{leak}} CFRetain(date); date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); // expected-warning {{leak}} @@ -411,15 +411,15 @@ CFDateRef f7() { // Generalization of Create rule. MyDateCreate returns a CFXXXTypeRef, and // has the word create. -CFDateRef MyDateCreate(); +CFDateRef MyDateCreate(void); -CFDateRef f8() { +CFDateRef f8(void) { CFDateRef date = MyDateCreate(); // expected-warning{{leak}} CFRetain(date); return date; } -__attribute__((cf_returns_retained)) CFDateRef f9() { +__attribute__((cf_returns_retained)) CFDateRef f9(void) { CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); // no-warning int *p = 0; // When allocations fail, CFDateCreate can return null. @@ -453,7 +453,7 @@ void f10(io_service_t media, DADiskRef d, CFStringRef s) { } // Test retain/release checker with CFString and CFMutableArray. -void f11() { +void f11(void) { // Create the array. CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); @@ -478,32 +478,32 @@ void f11() { } // PR 3337: Handle functions declared using typedefs. 
-typedef CFTypeRef CREATEFUN(); -CFTypeRef MyCreateFun(); +typedef CFTypeRef CREATEFUN(void); +CFTypeRef MyCreateFun(void); -void f12() { +void f12(void) { CFTypeRef o = MyCreateFun(); // expected-warning {{leak}} } -void f13_autorelease() { +void f13_autorelease(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning [(id) A autorelease]; // no-warning } -void f13_autorelease_b() { +void f13_autorelease_b(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); [(id) A autorelease]; [(id) A autorelease]; } // expected-warning{{Object autoreleased too many times}} -CFMutableArrayRef f13_autorelease_c() { +CFMutableArrayRef f13_autorelease_c(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); [(id) A autorelease]; [(id) A autorelease]; return A; // expected-warning{{Object autoreleased too many times}} } -CFMutableArrayRef f13_autorelease_d() { +CFMutableArrayRef f13_autorelease_d(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); [(id) A autorelease]; [(id) A autorelease]; @@ -514,13 +514,13 @@ CFMutableArrayRef f13_autorelease_d() { // This case exercises the logic where the leak site is the same as the allocation site. -void f14_leakimmediately() { +void f14_leakimmediately(void) { CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning{{leak}} } // Test that we track an allocated object beyond the point where the *name* // of the variable storing the reference is no longer live. -void f15() { +void f15(void) { // Create the array. 
CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); CFMutableArrayRef *B = &A; @@ -751,7 +751,7 @@ - (void)radar10102244 { // clang checker fails to catch use-after-release //===----------------------------------------------------------------------===// -int rdar_6257780_Case1() { +int rdar_6257780_Case1(void) { NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init]; NSArray *array = [NSArray array]; [array release]; // expected-warning{{Incorrect decrement of the reference count of an object that is not owned at this point by the caller}} @@ -763,7 +763,7 @@ int rdar_6257780_Case1() { // Analyzer is confused about NSAutoreleasePool -allocWithZone:. //===----------------------------------------------------------------------===// -void rdar_10640253_autorelease_allocWithZone() { +void rdar_10640253_autorelease_allocWithZone(void) { NSAutoreleasePool *pool = [[NSAutoreleasePool allocWithZone:(NSZone*)0] init]; (void) pool; } @@ -772,7 +772,7 @@ void rdar_10640253_autorelease_allocWithZone() { // Checker should understand new/setObject:/release constructs //===----------------------------------------------------------------------===// -void rdar_6866843() { +void rdar_6866843(void) { NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init]; NSMutableDictionary* dictionary = [[NSMutableDictionary alloc] init]; NSArray* array = [[NSArray alloc] init]; @@ -844,7 +844,7 @@ - (id)initReturningNewClassBad2 { @implementation RDar6320065Subclass @end -int RDar6320065_test() { +int RDar6320065_test(void) { RDar6320065 *test = [[RDar6320065 alloc] init]; // no-warning [test release]; return 0; @@ -976,7 +976,7 @@ void IOServiceNameMatching_wrapper(const char * name) { IOServiceNameMatching(name); // expected-warning{{leak}} } -CF_RETURNS_RETAINED CFDictionaryRef CreateDict(); +CF_RETURNS_RETAINED CFDictionaryRef CreateDict(void); void IOServiceAddNotification_wrapper(mach_port_t mainPort, const io_name_t notificationType, mach_port_t wakePort, 
uintptr_t reference, io_iterator_t * notification ) { @@ -1367,25 +1367,25 @@ void test_attr1c(TestOwnershipAttr *X) { NSString *str4 = [[X newString_auto] retain]; // expected-warning {{leak}} } -void testattr2_a() { +void testattr2_a(void) { TestOwnershipAttr *x = [TestOwnershipAttr alloc]; // expected-warning{{leak}} } -void testattr2_b() { +void testattr2_b(void) { TestOwnershipAttr *x = [[TestOwnershipAttr alloc] pseudoInit]; // expected-warning{{leak}} } -void testattr2_b_11358224_self_assign_looses_the_leak() { +void testattr2_b_11358224_self_assign_looses_the_leak(void) { TestOwnershipAttr *x = [[TestOwnershipAttr alloc] pseudoInit];// expected-warning{{leak}} x = x; } -void testattr2_c() { +void testattr2_c(void) { TestOwnershipAttr *x = [[TestOwnershipAttr alloc] pseudoInit]; // no-warning [x release]; } -void testattr3() { +void testattr3(void) { TestOwnershipAttr *x = [TestOwnershipAttr alloc]; // no-warning [TestOwnershipAttr consume:x]; TestOwnershipAttr *y = [TestOwnershipAttr alloc]; // no-warning @@ -1395,7 +1395,7 @@ void testattr3() { void consume_ns(id NS_CONSUMED x); void consume_cf(id CF_CONSUMED x); -void testattr4() { +void testattr4(void) { TestOwnershipAttr *x = [TestOwnershipAttr alloc]; // no-warning consume_ns(x); TestOwnershipAttr *y = [TestOwnershipAttr alloc]; // no-warning @@ -1423,7 +1423,7 @@ - (NSDate*) returnsNSRetained NS_RETURNS_RETAINED; @end CF_RETURNS_RETAINED -CFDateRef returnsRetainedCFDate() { +CFDateRef returnsRetainedCFDate(void) { return CFDateCreate(0, CFAbsoluteTimeGetCurrent()); } @@ -1465,15 +1465,15 @@ - (NSDate*) returnsNSRetained { // to a noreturn or panic function //===----------------------------------------------------------------------===// -void panic() __attribute__((noreturn)); -void panic_not_in_hardcoded_list() __attribute__((noreturn)); +void panic(void) __attribute__((noreturn)); +void panic_not_in_hardcoded_list(void) __attribute__((noreturn)); -void test_panic_negative() { +void 
test_panic_negative(void) { signed z = 1; CFNumberRef value = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &z); // expected-warning{{leak}} } -void test_panic_positive() { +void test_panic_positive(void) { signed z = 1; CFNumberRef value = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &z); // no-warning panic(); @@ -1545,8 +1545,8 @@ - (id)retain { // detector. @protocol Prot_R8272168 @end -Class GetAClassThatImplementsProt_R8272168(); -void r8272168() { +Class GetAClassThatImplementsProt_R8272168(void); +void r8272168(void) { GetAClassThatImplementsProt_R8272168(); } @@ -1596,7 +1596,7 @@ static void rdar_8724287(CFErrorRef error) // correctly in argument positions besides the first. extern void *CFStringCreate(void); extern void rdar_9234108_helper(void *key, void * CF_CONSUMED value); -void rdar_9234108() { +void rdar_9234108(void) { rdar_9234108_helper(0, CFStringCreate()); } @@ -1619,7 +1619,7 @@ - (id)_prefix_initWithTwoDoubles:(TwoDoubles)twoDoubles } @end -void rdar9726279() { +void rdar9726279(void) { TwoDoubles twoDoubles = { 0.0, 0.0 }; NSValue *value = [[NSValue alloc] _prefix_initWithTwoDoubles:twoDoubles]; [value release]; @@ -1628,52 +1628,52 @@ void rdar9726279() { // // Test camelcase support for CF conventions. While Core Foundation APIs // don't use camel casing, other code is allowed to use it. 
-CFArrayRef camelcase_create_1() { +CFArrayRef camelcase_create_1(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camelcase_createno() { +CFArrayRef camelcase_createno(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } -CFArrayRef camelcase_copy() { +CFArrayRef camelcase_copy(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camelcase_copying() { +CFArrayRef camelcase_copying(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } -CFArrayRef copyCamelCase() { +CFArrayRef copyCamelCase(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef __copyCamelCase() { +CFArrayRef __copyCamelCase(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef __createCamelCase() { +CFArrayRef __createCamelCase(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_create() { +CFArrayRef camel_create(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_creat() { +CFArrayRef camel_creat(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } -CFArrayRef camel_copy() { +CFArrayRef camel_copy(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_copyMachine() { +CFArrayRef camel_copyMachine(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_copymachine() { +CFArrayRef camel_copymachine(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } @@ -1708,7 +1708,7 @@ void rdar6582778_2(void) { // - Test that objects passed to containers // are marked "escaped". 
-void rdar10232019() { +void rdar10232019(void) { NSMutableArray *array = [NSMutableArray array]; NSString *string = [[NSString alloc] initWithUTF8String:"foo"]; @@ -1719,7 +1719,7 @@ void rdar10232019() { NSLog(@"%@", otherString); } -void rdar10232019_positive() { +void rdar10232019_positive(void) { NSMutableArray *array = [NSMutableArray array]; NSString *string = [[NSString alloc] initWithUTF8String:"foo"]; @@ -1735,7 +1735,7 @@ void rdar10232019_positive() { xpc_object_t _CFXPCCreateXPCObjectFromCFObject(CFTypeRef cf); void xpc_release(xpc_object_t object); -void rdar9658496() { +void rdar9658496(void) { CFStringRef cf; xpc_object_t xpc; cf = CFStringCreateWithCString( ((CFAllocatorRef)0), "test", kCFStringEncodingUTF8 ); // no-warning @@ -1756,7 +1756,7 @@ - (id)initWithObj:(id)obj { } @end -void rdar_10824732() { +void rdar_10824732(void) { @autoreleasepool { NSString *obj = @"test"; RDar10824732 *foo = [[RDar10824732 alloc] initWithObj:obj]; // no-warning @@ -1832,14 +1832,14 @@ - (id)copyAutoreleaseRadar13081402 { //===----------------------------------------------------------------------===// void *malloc(size_t); struct rdar11104566 { CFStringRef myStr; }; -struct rdar11104566 test_rdar11104566() { +struct rdar11104566 test_rdar11104566(void) { CFStringRef cf = CFStringCreateWithCString( ((CFAllocatorRef)0), "test", kCFStringEncodingUTF8 ); // no-warning struct rdar11104566 V; V.myStr = cf; return V; // no-warning } -struct rdar11104566 *test_2_rdar11104566() { +struct rdar11104566 *test_2_rdar11104566(void) { CFStringRef cf = CFStringCreateWithCString( ((CFAllocatorRef)0), "test", kCFStringEncodingUTF8 ); // no-warning struct rdar11104566 *V = (struct rdar11104566 *) malloc(sizeof(*V)); V->myStr = cf; @@ -1850,7 +1850,7 @@ struct rdar11104566 test_rdar11104566() { // ObjC literals support. 
//===----------------------------------------------------------------------===// -void test_objc_arrays() { +void test_objc_arrays(void) { { // CASE ONE -- OBJECT IN ARRAY CREATED DIRECTLY NSObject *o = [[NSObject alloc] init]; NSArray *a = [[NSArray alloc] initWithObjects:o, (void*)0]; // expected-warning {{leak}} @@ -1895,7 +1895,7 @@ void test_objc_arrays() { } } -void test_objc_integer_literals() { +void test_objc_integer_literals(void) { id value = [@1 retain]; // expected-warning {{leak}} [value description]; } @@ -1923,8 +1923,8 @@ void rdar11400885(int y) } } -id makeCollectableNonLeak() { - extern CFTypeRef CFCreateSomething(); +id makeCollectableNonLeak(void) { + extern CFTypeRef CFCreateSomething(void); CFTypeRef object = CFCreateSomething(); // +1 CFRetain(object); // +2 @@ -1937,7 +1937,7 @@ id makeCollectableNonLeak() { void consumeAndStopTracking(id NS_CONSUMED obj, void (^callback)(void)); void CFConsumeAndStopTracking(CFTypeRef CF_CONSUMED obj, void (^callback)(void)); -void testConsumeAndStopTracking() { +void testConsumeAndStopTracking(void) { id retained = [@[] retain]; // +1 consumeAndStopTracking(retained, ^{}); // no-warning @@ -1950,7 +1950,7 @@ void testConsumeAndStopTracking() { consumeAndStopTracking(unretained, ^{}); // expected-warning {{Incorrect decrement of the reference count of an object that is not owned at this point by the caller}} } -void testCFConsumeAndStopTracking() { +void testCFConsumeAndStopTracking(void) { id retained = [@[] retain]; // +1 CFConsumeAndStopTracking((CFTypeRef)retained, ^{}); // no-warning @@ -1968,10 +1968,10 @@ void testCFConsumeAndStopTracking() { typedef void *MyCFType; #pragma clang arc_cf_code_audited begin -MyCFType CreateMyCFType(); +MyCFType CreateMyCFType(void); #pragma clang arc_cf_code_audited end -void test_custom_cf() { +void test_custom_cf(void) { MyCFType x = CreateMyCFType(); // expected-warning {{leak of an object stored into 'x'}} } @@ -1992,7 +1992,7 @@ @interface PR14927 : NSObject - 
(void)drain; @end -void test_drain() { +void test_drain(void) { PR14927 *obj = [[PR14927 alloc] init]; [obj drain]; [obj release]; // no-warning @@ -2003,14 +2003,14 @@ void test_drain() { // value as tracked, even if the object isn't a known CF type. //===----------------------------------------------------------------------===// -MyCFType getCustom() __attribute__((cf_returns_not_retained)); -MyCFType makeCustom() __attribute__((cf_returns_retained)); +MyCFType getCustom(void) __attribute__((cf_returns_not_retained)); +MyCFType makeCustom(void) __attribute__((cf_returns_retained)); -void testCustomReturnsRetained() { +void testCustomReturnsRetained(void) { MyCFType obj = makeCustom(); // expected-warning {{leak of an object stored into 'obj'}} } -void testCustomReturnsNotRetained() { +void testCustomReturnsNotRetained(void) { CFRelease(getCustom()); // expected-warning {{Incorrect decrement of the reference count of an object that is not owned at this point by the caller}} } diff --git a/clang/test/ARCMT/objcmt-arc-cf-annotations.m.result b/clang/test/ARCMT/objcmt-arc-cf-annotations.m.result index d3e56a14eca5d..6621b10ef5d50 100644 --- a/clang/test/ARCMT/objcmt-arc-cf-annotations.m.result +++ b/clang/test/ARCMT/objcmt-arc-cf-annotations.m.result @@ -365,7 +365,7 @@ void *CFPlugInInstanceCreate(CFAllocatorRef allocator, CFUUIDRef factoryUUID, CF // Test cases. //===----------------------------------------------------------------------===// -CFAbsoluteTime f1() { +CFAbsoluteTime f1(void) { CFAbsoluteTime t = CFAbsoluteTimeGetCurrent(); CFDateRef date = CFDateCreate(0, t); CFRetain(date); @@ -376,7 +376,7 @@ CFAbsoluteTime f1() { return t; } -CFAbsoluteTime f2() { +CFAbsoluteTime f2(void) { CFAbsoluteTime t = CFAbsoluteTimeGetCurrent(); CFDateRef date = CFDateCreate(0, t); [((NSDate*) date) retain]; @@ -393,7 +393,7 @@ NSDate* global_x; // Test to see if we suppress an error when we store the pointer // to a global. 
-CFAbsoluteTime f3() { +CFAbsoluteTime f3(void) { CFAbsoluteTime t = CFAbsoluteTimeGetCurrent(); CFDateRef date = CFDateCreate(0, t); [((NSDate*) date) retain]; @@ -432,7 +432,7 @@ CFDateRef f6(int x) { // Test a leak involving an overwrite. -CFDateRef f7() { +CFDateRef f7(void) { CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); //expected-warning{{leak}} CFRetain(date); date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); // expected-warning {{leak}} @@ -444,18 +444,18 @@ CFDateRef f7() { CF_IMPLICIT_BRIDGING_ENABLED -CFDateRef MyDateCreate(); +CFDateRef MyDateCreate(void); CF_IMPLICIT_BRIDGING_DISABLED -CFDateRef f8() { +CFDateRef f8(void) { CFDateRef date = MyDateCreate(); // expected-warning{{leak}} CFRetain(date); return date; } -__attribute__((cf_returns_retained)) CFDateRef f9() { +__attribute__((cf_returns_retained)) CFDateRef f9(void) { CFDateRef date = CFDateCreate(0, CFAbsoluteTimeGetCurrent()); // no-warning int *p = 0; // When allocations fail, CFDateCreate can return null. @@ -489,7 +489,7 @@ void f10(io_service_t media, DADiskRef d, CFStringRef s) { } // Test retain/release checker with CFString and CFMutableArray. -void f11() { +void f11(void) { // Create the array. CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); @@ -514,38 +514,38 @@ void f11() { } // PR 3337: Handle functions declared using typedefs. 
-typedef CFTypeRef CREATEFUN(); +typedef CFTypeRef CREATEFUN(void); CF_IMPLICIT_BRIDGING_ENABLED -CFTypeRef MyCreateFun(); +CFTypeRef MyCreateFun(void); CF_IMPLICIT_BRIDGING_DISABLED -void f12() { +void f12(void) { CFTypeRef o = MyCreateFun(); // expected-warning {{leak}} } -void f13_autorelease() { +void f13_autorelease(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning [(id) A autorelease]; // no-warning } -void f13_autorelease_b() { +void f13_autorelease_b(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); [(id) A autorelease]; [(id) A autorelease]; } // expected-warning{{Object autoreleased too many times}} -CFMutableArrayRef f13_autorelease_c() { +CFMutableArrayRef f13_autorelease_c(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); [(id) A autorelease]; [(id) A autorelease]; return A; // expected-warning{{Object autoreleased too many times}} } -CFMutableArrayRef f13_autorelease_d() { +CFMutableArrayRef f13_autorelease_d(void) { CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); [(id) A autorelease]; [(id) A autorelease]; @@ -556,13 +556,13 @@ CFMutableArrayRef f13_autorelease_d() { // This case exercises the logic where the leak site is the same as the allocation site. -void f14_leakimmediately() { +void f14_leakimmediately(void) { CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning{{leak}} } // Test that we track an allocated object beyond the point where the *name* // of the variable storing the reference is no longer live. -void f15() { +void f15(void) { // Create the array. 
CFMutableArrayRef A = CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); CFMutableArrayRef *B = &A; @@ -793,7 +793,7 @@ void rdar6704930(unsigned char *s, unsigned int length) { // clang checker fails to catch use-after-release //===----------------------------------------------------------------------===// -int rdar_6257780_Case1() { +int rdar_6257780_Case1(void) { NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init]; NSArray *array = [NSArray array]; [array release]; // expected-warning{{Incorrect decrement of the reference count of an object that is not owned at this point by the caller}} @@ -805,7 +805,7 @@ int rdar_6257780_Case1() { // Analyzer is confused about NSAutoreleasePool -allocWithZone:. //===----------------------------------------------------------------------===// -void rdar_10640253_autorelease_allocWithZone() { +void rdar_10640253_autorelease_allocWithZone(void) { NSAutoreleasePool *pool = [[NSAutoreleasePool allocWithZone:(NSZone*)0] init]; (void) pool; } @@ -814,7 +814,7 @@ void rdar_10640253_autorelease_allocWithZone() { // Checker should understand new/setObject:/release constructs //===----------------------------------------------------------------------===// -void rdar_6866843() { +void rdar_6866843(void) { NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init]; NSMutableDictionary* dictionary = [[NSMutableDictionary alloc] init]; NSArray* array = [[NSArray alloc] init]; @@ -886,7 +886,7 @@ typedef CFTypeRef OtherRef; @implementation RDar6320065Subclass @end -int RDar6320065_test() { +int RDar6320065_test(void) { RDar6320065 *test = [[RDar6320065 alloc] init]; // no-warning [test release]; return 0; @@ -1018,7 +1018,7 @@ void IOServiceNameMatching_wrapper(const char * name) { IOServiceNameMatching(name); // expected-warning{{leak}} } -CF_RETURNS_RETAINED CFDictionaryRef CreateDict(); +CF_RETURNS_RETAINED CFDictionaryRef CreateDict(void); void IOServiceAddNotification_wrapper(mach_port_t mainPort, const io_name_t 
notificationType, mach_port_t wakePort, uintptr_t reference, io_iterator_t * notification ) { @@ -1409,25 +1409,25 @@ void test_attr1c(TestOwnershipAttr *X) { NSString *str4 = [[X newString_auto] retain]; // expected-warning {{leak}} } -void testattr2_a() { +void testattr2_a(void) { TestOwnershipAttr *x = [TestOwnershipAttr alloc]; // expected-warning{{leak}} } -void testattr2_b() { +void testattr2_b(void) { TestOwnershipAttr *x = [[TestOwnershipAttr alloc] pseudoInit]; // expected-warning{{leak}} } -void testattr2_b_11358224_self_assign_looses_the_leak() { +void testattr2_b_11358224_self_assign_looses_the_leak(void) { TestOwnershipAttr *x = [[TestOwnershipAttr alloc] pseudoInit];// expected-warning{{leak}} x = x; } -void testattr2_c() { +void testattr2_c(void) { TestOwnershipAttr *x = [[TestOwnershipAttr alloc] pseudoInit]; // no-warning [x release]; } -void testattr3() { +void testattr3(void) { TestOwnershipAttr *x = [TestOwnershipAttr alloc]; // no-warning [TestOwnershipAttr consume:x]; TestOwnershipAttr *y = [TestOwnershipAttr alloc]; // no-warning @@ -1437,7 +1437,7 @@ void testattr3() { void consume_ns(id NS_CONSUMED x); void consume_cf(id CF_CONSUMED x); -void testattr4() { +void testattr4(void) { TestOwnershipAttr *x = [TestOwnershipAttr alloc]; // no-warning consume_ns(x); TestOwnershipAttr *y = [TestOwnershipAttr alloc]; // no-warning @@ -1465,7 +1465,7 @@ void testattr4() { @end CF_RETURNS_RETAINED -CFDateRef returnsRetainedCFDate() { +CFDateRef returnsRetainedCFDate(void) { return CFDateCreate(0, CFAbsoluteTimeGetCurrent()); } @@ -1507,15 +1507,15 @@ CFDateRef returnsRetainedCFDate() { // to a noreturn or panic function //===----------------------------------------------------------------------===// -void panic() __attribute__((noreturn)); -void panic_not_in_hardcoded_list() __attribute__((noreturn)); +void panic(void) __attribute__((noreturn)); +void panic_not_in_hardcoded_list(void) __attribute__((noreturn)); -void test_panic_negative() { +void 
test_panic_negative(void) { signed z = 1; CFNumberRef value = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &z); // expected-warning{{leak}} } -void test_panic_positive() { +void test_panic_positive(void) { signed z = 1; CFNumberRef value = CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &z); // no-warning panic(); @@ -1587,8 +1587,8 @@ void test_blocks_1_indirect_retain_via_call(void) { // detector. @protocol Prot_R8272168 @end -Class GetAClassThatImplementsProt_R8272168(); -void r8272168() { +Class GetAClassThatImplementsProt_R8272168(void); +void r8272168(void) { GetAClassThatImplementsProt_R8272168(); } @@ -1644,7 +1644,7 @@ extern void *CFStringCreate(void); CF_IMPLICIT_BRIDGING_DISABLED extern void rdar_9234108_helper(void *key, void * CF_CONSUMED value); -void rdar_9234108() { +void rdar_9234108(void) { rdar_9234108_helper(0, CFStringCreate()); } @@ -1667,7 +1667,7 @@ typedef struct TwoDoubles TwoDoubles; } @end -void rdar9726279() { +void rdar9726279(void) { TwoDoubles twoDoubles = { 0.0, 0.0 }; NSValue *value = [[NSValue alloc] _prefix_initWithTwoDoubles:twoDoubles]; [value release]; @@ -1676,52 +1676,52 @@ void rdar9726279() { // // Test camelcase support for CF conventions. While Core Foundation APIs // don't use camel casing, other code is allowed to use it. 
-CFArrayRef camelcase_create_1() { +CFArrayRef camelcase_create_1(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camelcase_createno() { +CFArrayRef camelcase_createno(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } -CFArrayRef camelcase_copy() { +CFArrayRef camelcase_copy(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camelcase_copying() { +CFArrayRef camelcase_copying(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } -CFArrayRef copyCamelCase() { +CFArrayRef copyCamelCase(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef __copyCamelCase() { +CFArrayRef __copyCamelCase(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef __createCamelCase() { +CFArrayRef __createCamelCase(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_create() { +CFArrayRef camel_create(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_creat() { +CFArrayRef camel_creat(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } -CFArrayRef camel_copy() { +CFArrayRef camel_copy(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_copyMachine() { +CFArrayRef camel_copyMachine(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // no-warning } -CFArrayRef camel_copymachine() { +CFArrayRef camel_copymachine(void) { return CFArrayCreateMutable(0, 10, &kCFTypeArrayCallBacks); // expected-warning {{leak}} } @@ -1756,7 +1756,7 @@ void rdar6582778_2(void) { // - Test that objects passed to containers // are marked "escaped". 
-void rdar10232019() { +void rdar10232019(void) { NSMutableArray *array = [NSMutableArray array]; NSString *string = [[NSString alloc] initWithUTF8String:"foo"]; @@ -1767,7 +1767,7 @@ void rdar10232019() { NSLog(@"%@", otherString); } -void rdar10232019_positive() { +void rdar10232019_positive(void) { NSMutableArray *array = [NSMutableArray array]; NSString *string = [[NSString alloc] initWithUTF8String:"foo"]; @@ -1783,7 +1783,7 @@ typedef void * xpc_object_t; xpc_object_t _CFXPCCreateXPCObjectFromCFObject(CFTypeRef cf); void xpc_release(xpc_object_t object); -void rdar9658496() { +void rdar9658496(void) { CFStringRef cf; xpc_object_t xpc; cf = CFStringCreateWithCString( ((CFAllocatorRef)0), "test", kCFStringEncodingUTF8 ); // no-warning @@ -1804,7 +1804,7 @@ void rdar9658496() { } @end -void rdar_10824732() { +void rdar_10824732(void) { @autoreleasepool { NSString *obj = @"test"; RDar10824732 *foo = [[RDar10824732 alloc] initWithObj:obj]; // no-warning @@ -1880,14 +1880,14 @@ extern id NSApp; //===----------------------------------------------------------------------===// void *malloc(size_t); struct rdar11104566 { CFStringRef myStr; }; -struct rdar11104566 test_rdar11104566() { +struct rdar11104566 test_rdar11104566(void) { CFStringRef cf = CFStringCreateWithCString( ((CFAllocatorRef)0), "test", kCFStringEncodingUTF8 ); // no-warning struct rdar11104566 V; V.myStr = cf; return V; // no-warning } -struct rdar11104566 *test_2_rdar11104566() { +struct rdar11104566 *test_2_rdar11104566(void) { CFStringRef cf = CFStringCreateWithCString( ((CFAllocatorRef)0), "test", kCFStringEncodingUTF8 ); // no-warning struct rdar11104566 *V = (struct rdar11104566 *) malloc(sizeof(*V)); V->myStr = cf; @@ -1898,7 +1898,7 @@ struct rdar11104566 *test_2_rdar11104566() { // ObjC literals support. 
//===----------------------------------------------------------------------===// -void test_objc_arrays() { +void test_objc_arrays(void) { { // CASE ONE -- OBJECT IN ARRAY CREATED DIRECTLY NSObject *o = [[NSObject alloc] init]; NSArray *a = [[NSArray alloc] initWithObjects:o, (void*)0]; // expected-warning {{leak}} @@ -1943,7 +1943,7 @@ void test_objc_arrays() { } } -void test_objc_integer_literals() { +void test_objc_integer_literals(void) { id value = [@1 retain]; // expected-warning {{leak}} [value description]; } @@ -1971,8 +1971,8 @@ void rdar11400885(int y) } } -id makeCollectableNonLeak() { - extern CFTypeRef CFCreateSomething(); +id makeCollectableNonLeak(void) { + extern CFTypeRef CFCreateSomething(void); CFTypeRef object = CFCreateSomething(); // +1 CFRetain(object); // +2 @@ -1985,7 +1985,7 @@ id makeCollectableNonLeak() { void consumeAndStopTracking(id NS_CONSUMED obj, void (^callback)(void)); void CFConsumeAndStopTracking(CFTypeRef CF_CONSUMED obj, void (^callback)(void)); -void testConsumeAndStopTracking() { +void testConsumeAndStopTracking(void) { id retained = [@[] retain]; // +1 consumeAndStopTracking(retained, ^{}); // no-warning @@ -1998,7 +1998,7 @@ void testConsumeAndStopTracking() { consumeAndStopTracking(unretained, ^{}); // expected-warning {{Incorrect decrement of the reference count of an object that is not owned at this point by the caller}} } -void testCFConsumeAndStopTracking() { +void testCFConsumeAndStopTracking(void) { id retained = [@[] retain]; // +1 CFConsumeAndStopTracking((CFTypeRef)retained, ^{}); // no-warning @@ -2016,10 +2016,10 @@ void testCFConsumeAndStopTracking() { typedef void *MyCFType; #pragma clang arc_cf_code_audited begin -MyCFType CreateMyCFType(); +MyCFType CreateMyCFType(void); #pragma clang arc_cf_code_audited end -void test_custom_cf() { +void test_custom_cf(void) { MyCFType x = CreateMyCFType(); // expected-warning {{leak of an object stored into 'x'}} } @@ -2040,7 +2040,7 @@ void 
test_CFPlugInInstanceCreate(CFUUIDRef factoryUUID, CFUUIDRef typeUUID) { - (void)drain; @end -void test_drain() { +void test_drain(void) { PR14927 *obj = [[PR14927 alloc] init]; [obj drain]; [obj release]; // no-warning @@ -2051,14 +2051,14 @@ void test_drain() { // value as tracked, even if the object isn't a known CF type. //===----------------------------------------------------------------------===// -MyCFType getCustom() __attribute__((cf_returns_not_retained)); -MyCFType makeCustom() __attribute__((cf_returns_retained)); +MyCFType getCustom(void) __attribute__((cf_returns_not_retained)); +MyCFType makeCustom(void) __attribute__((cf_returns_retained)); -void testCustomReturnsRetained() { +void testCustomReturnsRetained(void) { MyCFType obj = makeCustom(); // expected-warning {{leak of an object stored into 'obj'}} } -void testCustomReturnsNotRetained() { +void testCustomReturnsNotRetained(void) { CFRelease(getCustom()); // expected-warning {{Incorrect decrement of the reference count of an object that is not owned at this point by the caller}} } diff --git a/clang/test/ARCMT/objcmt-instancetype.m b/clang/test/ARCMT/objcmt-instancetype.m index 47dbd7aeed5a1..17dd5a46b1e78 100644 --- a/clang/test/ARCMT/objcmt-instancetype.m +++ b/clang/test/ARCMT/objcmt-instancetype.m @@ -103,7 +103,7 @@ + (NSNumber *)numberWithInt:(int)value { return 0; } #define PAIR(x) @#x, [NSNumber numberWithInt:(x)] #define TWO(x) ((x), (x)) -void foo() { +void foo(void) { NSString *str = M([NSString stringWithString:@"foo"]); // expected-warning {{redundant}} str = [[NSString alloc] initWithString:@"foo"]; // expected-warning {{redundant}} NSArray *arr = [NSArray arrayWithArray:@[str]]; // expected-warning {{redundant}} diff --git a/clang/test/ARCMT/objcmt-instancetype.m.result b/clang/test/ARCMT/objcmt-instancetype.m.result index ce51678708dd8..5203368aad644 100644 --- a/clang/test/ARCMT/objcmt-instancetype.m.result +++ b/clang/test/ARCMT/objcmt-instancetype.m.result @@ -103,7 +103,7 @@ 
typedef signed char BOOL; #define PAIR(x) @#x, [NSNumber numberWithInt:(x)] #define TWO(x) ((x), (x)) -void foo() { +void foo(void) { NSString *str = M([NSString stringWithString:@"foo"]); // expected-warning {{redundant}} str = [[NSString alloc] initWithString:@"foo"]; // expected-warning {{redundant}} NSArray *arr = [NSArray arrayWithArray:@[str]]; // expected-warning {{redundant}} diff --git a/clang/test/ARCMT/objcmt-property-dot-syntax.m b/clang/test/ARCMT/objcmt-property-dot-syntax.m index 5c71186b274fa..00426dcbdc33f 100644 --- a/clang/test/ARCMT/objcmt-property-dot-syntax.m +++ b/clang/test/ARCMT/objcmt-property-dot-syntax.m @@ -23,7 +23,7 @@ @interface P : NSObject - (P*) MethodReturnsPObj; @end -P* fun(); +P* fun(void); @implementation P - (int) Meth : (P*)array { diff --git a/clang/test/ARCMT/objcmt-property-dot-syntax.m.result b/clang/test/ARCMT/objcmt-property-dot-syntax.m.result index 09b93f6ff8f2d..43d86a821d7c2 100644 --- a/clang/test/ARCMT/objcmt-property-dot-syntax.m.result +++ b/clang/test/ARCMT/objcmt-property-dot-syntax.m.result @@ -23,7 +23,7 @@ - (P*) MethodReturnsPObj; @end -P* fun(); +P* fun(void); @implementation P - (int) Meth : (P*)array { diff --git a/clang/test/ARCMT/objcmt-subscripting-literals.m b/clang/test/ARCMT/objcmt-subscripting-literals.m index 0974c3b8bb0f5..e2b03e2d7b587 100644 --- a/clang/test/ARCMT/objcmt-subscripting-literals.m +++ b/clang/test/ARCMT/objcmt-subscripting-literals.m @@ -209,7 +209,7 @@ @interface MutableCustomUnavail (Extended) - (void)setObject:(id)obj atIndexedSubscript:(unsigned)idx __attribute__((unavailable)); @end -void test2() { +void test2(void) { MutableCustom *mutc; id o = [mutc objectAtIndex:4]; [mutc replaceObjectAtIndex:2 withObject:@"val"]; diff --git a/clang/test/ARCMT/objcmt-subscripting-literals.m.result b/clang/test/ARCMT/objcmt-subscripting-literals.m.result index ed7879bb139ee..e0b385741f01e 100644 --- a/clang/test/ARCMT/objcmt-subscripting-literals.m.result +++ 
b/clang/test/ARCMT/objcmt-subscripting-literals.m.result @@ -209,7 +209,7 @@ void test1(NSString *str) { - (void)setObject:(id)obj atIndexedSubscript:(unsigned)idx __attribute__((unavailable)); @end -void test2() { +void test2(void) { MutableCustom *mutc; id o = mutc[4]; mutc[2] = @"val"; diff --git a/clang/test/ARCMT/objcmt-with-pch.m b/clang/test/ARCMT/objcmt-with-pch.m index 61c87a2b007d8..0925442d45eb9 100644 --- a/clang/test/ARCMT/objcmt-with-pch.m +++ b/clang/test/ARCMT/objcmt-with-pch.m @@ -12,6 +12,6 @@ @interface NSNumber (NSNumberCreation) + (NSNumber *)numberWithInt:(int)value; @end -void foo() { +void foo(void) { NSNumber *n = [NSNumber numberWithInt:1]; } diff --git a/clang/test/ARCMT/objcmt-with-pch.m.result b/clang/test/ARCMT/objcmt-with-pch.m.result index 7e2570cfc456c..6d37d11fe480d 100644 --- a/clang/test/ARCMT/objcmt-with-pch.m.result +++ b/clang/test/ARCMT/objcmt-with-pch.m.result @@ -12,6 +12,6 @@ + (NSNumber *)numberWithInt:(int)value; @end -void foo() { +void foo(void) { NSNumber *n = @1; } diff --git a/clang/test/ARCMT/releases-driver.m b/clang/test/ARCMT/releases-driver.m index f2f5b0848e8b4..96f96c1d3b482 100644 --- a/clang/test/ARCMT/releases-driver.m +++ b/clang/test/ARCMT/releases-driver.m @@ -6,7 +6,7 @@ typedef int BOOL; -id IhaveSideEffect(); +id IhaveSideEffect(void); @protocol NSObject - (BOOL)isEqual:(id)object; diff --git a/clang/test/ARCMT/releases-driver.m.result b/clang/test/ARCMT/releases-driver.m.result index e4427e6796e02..e7da9a04fc62a 100644 --- a/clang/test/ARCMT/releases-driver.m.result +++ b/clang/test/ARCMT/releases-driver.m.result @@ -6,7 +6,7 @@ typedef int BOOL; -id IhaveSideEffect(); +id IhaveSideEffect(void); @protocol NSObject - (BOOL)isEqual:(id)object; diff --git a/clang/test/ARCMT/releases.m b/clang/test/ARCMT/releases.m index 55008959efc4e..8636a8a5acea8 100644 --- a/clang/test/ARCMT/releases.m +++ b/clang/test/ARCMT/releases.m @@ -6,7 +6,7 @@ typedef int BOOL; -id IhaveSideEffect(); +id 
IhaveSideEffect(void); @protocol NSObject - (BOOL)isEqual:(id)object; @@ -64,9 +64,9 @@ - (void) dealloc { @end void block_test(Foo *p) { - id (^B)() = ^() { + id (^B)(void) = ^(void) { if (p) { - id (^IB)() = ^() { + id (^IB)(void) = ^(void) { id bar = [p retain]; [p release]; return bar; diff --git a/clang/test/ARCMT/releases.m.result b/clang/test/ARCMT/releases.m.result index 473750e4e899b..261175362b9bd 100644 --- a/clang/test/ARCMT/releases.m.result +++ b/clang/test/ARCMT/releases.m.result @@ -6,7 +6,7 @@ typedef int BOOL; -id IhaveSideEffect(); +id IhaveSideEffect(void); @protocol NSObject - (BOOL)isEqual:(id)object; @@ -57,9 +57,9 @@ void func(Foo *p) { @end void block_test(Foo *p) { - id (^B)() = ^() { + id (^B)(void) = ^(void) { if (p) { - id (^IB)() = ^() { + id (^IB)(void) = ^(void) { id bar = p; return bar; }; diff --git a/clang/test/ARCMT/retains.m b/clang/test/ARCMT/retains.m index 60283a695ff43..43a94fc16cecf 100644 --- a/clang/test/ARCMT/retains.m +++ b/clang/test/ARCMT/retains.m @@ -4,7 +4,7 @@ #include "Common.h" -id IhaveSideEffect(); +id IhaveSideEffect(void); @interface Foo : NSObject { id bar; @@ -58,9 +58,9 @@ id foo (Foo *p) { } void block_tests(Foo *p) { - id (^B)() = ^() { + id (^B)(void) = ^(void) { if (p) { - id (^IB)() = ^() { + id (^IB)(void) = ^(void) { id bar = [p retain]; return bar; }; diff --git a/clang/test/ARCMT/retains.m.result b/clang/test/ARCMT/retains.m.result index 2011e506360c5..4e720d6bb4c11 100644 --- a/clang/test/ARCMT/retains.m.result +++ b/clang/test/ARCMT/retains.m.result @@ -4,7 +4,7 @@ #include "Common.h" -id IhaveSideEffect(); +id IhaveSideEffect(void); @interface Foo : NSObject { id bar; @@ -52,9 +52,9 @@ id foo (Foo *p) { } void block_tests(Foo *p) { - id (^B)() = ^() { + id (^B)(void) = ^(void) { if (p) { - id (^IB)() = ^() { + id (^IB)(void) = ^(void) { id bar = p; return bar; }; diff --git a/clang/test/ARCMT/rewrite-block-var.m b/clang/test/ARCMT/rewrite-block-var.m index 538f16c255749..eb3c5b6535971 100644 
--- a/clang/test/ARCMT/rewrite-block-var.m +++ b/clang/test/ARCMT/rewrite-block-var.m @@ -8,7 +8,7 @@ @interface Foo : NSObject -(Foo *)something; @end -void bar(void (^block)()); +void bar(void (^block)(void)); void test1(Foo *p) { __block Foo *x = p; // __block used just to break cycle. diff --git a/clang/test/ARCMT/rewrite-block-var.m.result b/clang/test/ARCMT/rewrite-block-var.m.result index a9d0b0f7fad80..cf5718fbd7f5d 100644 --- a/clang/test/ARCMT/rewrite-block-var.m.result +++ b/clang/test/ARCMT/rewrite-block-var.m.result @@ -8,7 +8,7 @@ -(Foo *)something; @end -void bar(void (^block)()); +void bar(void (^block)(void)); void test1(Foo *p) { __weak Foo *x = p; // __block used just to break cycle. diff --git a/clang/test/Sema/ast-print.c b/clang/test/Sema/ast-print.c index 7f675206e58b3..2ba5ca34b5134 100644 --- a/clang/test/Sema/ast-print.c +++ b/clang/test/Sema/ast-print.c @@ -10,7 +10,7 @@ // RUN: echo >> %t.c "// expected""-note@* {{'EnumWithAttributes3' has been explicitly marked deprecated here}}" // RUN: %clang_cc1 -fsyntax-only %t.c -verify -typedef void func_typedef(); +typedef void func_typedef(void); func_typedef xxx; typedef void func_t(int x); @@ -69,7 +69,7 @@ struct pair_t { // CHECK: struct pair_t p = {a: 3, .b = 4}; struct pair_t p = {a: 3, .b = 4}; // expected-warning {{use of GNU old-style field designator extension}} -void initializers() { +void initializers(void) { // CHECK: int *x = ((void *)0), *y = ((void *)0); int *x = ((void *)0), *y = ((void *)0); struct Z{}; @@ -94,7 +94,7 @@ enum EnumWithAttributes { // expected-warning {{'EnumWithAttributes' is deprecat enum __attribute__((deprecated)) EnumWithAttributes2 *EnumWithAttributes2Ptr; // CHECK-LABEL: EnumWithAttributes3Fn -void EnumWithAttributes3Fn() { +void EnumWithAttributes3Fn(void) { // CHECK-NEXT: enum __attribute__((deprecated(""))) EnumWithAttributes3 *EnumWithAttributes3Ptr; // expected-warning@+2 {{'EnumWithAttributes3' is deprecated}} // expected-note@+1 
{{'EnumWithAttributes3' has been explicitly marked deprecated here}} From 3773d04a1316a79d11ff5c6ed261e57e87816428 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 17 Feb 2022 21:16:51 +0100 Subject: [PATCH 166/748] [mlir][memref] Switch ViewOp to the declarative assembly format --- .../mlir/Dialect/MemRef/IR/MemRefOps.td | 6 +++- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 33 ------------------- mlir/test/Dialect/MemRef/invalid.mlir | 2 +- 3 files changed, 6 insertions(+), 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 9102da3db7877..1ee0b866a00a2 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -1771,8 +1771,12 @@ def MemRef_ViewOp : MemRef_Op<"view", [ } }]; + let assemblyFormat = [{ + $source `[` $byte_shift `]` `` `[` $sizes `]` attr-dict + `:` type($source) `to` type(results) + }]; + let hasCanonicalizer = 1; - let hasCustomAssemblyFormat = 1; let hasVerifier = 1; } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 541da53cb2a49..4af1f5a25ba1d 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -2278,39 +2278,6 @@ OpFoldResult TransposeOp::fold(ArrayRef) { // ViewOp //===----------------------------------------------------------------------===// -ParseResult ViewOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::OperandType srcInfo; - SmallVector offsetInfo; - SmallVector sizesInfo; - auto indexType = parser.getBuilder().getIndexType(); - Type srcType, dstType; - SMLoc offsetLoc; - if (parser.parseOperand(srcInfo) || parser.getCurrentLocation(&offsetLoc) || - parser.parseOperandList(offsetInfo, OpAsmParser::Delimiter::Square)) - return failure(); - - if (offsetInfo.size() != 1) - return parser.emitError(offsetLoc) << "expects 1 offset operand"; - - return failure( - 
parser.parseOperandList(sizesInfo, OpAsmParser::Delimiter::Square) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(srcType) || - parser.resolveOperand(srcInfo, srcType, result.operands) || - parser.resolveOperands(offsetInfo, indexType, result.operands) || - parser.resolveOperands(sizesInfo, indexType, result.operands) || - parser.parseKeywordType("to", dstType) || - parser.addTypeToList(dstType, result.types)); -} - -void ViewOp::print(OpAsmPrinter &p) { - p << ' ' << getOperand(0) << '['; - p.printOperand(byte_shift()); - p << "][" << sizes() << ']'; - p.printOptionalAttrDict((*this)->getAttrs()); - p << " : " << getOperand(0).getType() << " to " << getType(); -} - LogicalResult ViewOp::verify() { auto baseType = getOperand(0).getType().cast(); auto viewType = getType(); diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir index 54e405b3f2ad1..d65986f90441e 100644 --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -516,7 +516,7 @@ func @collapse_shape_illegal_mixed_memref_2(%arg0 : memref) func @invalid_view(%arg0 : index, %arg1 : index, %arg2 : index) { %0 = memref.alloc() : memref<2048xi8> - // expected-error@+1 {{expects 1 offset operand}} + // expected-error@+1 {{expected SSA operand}} %1 = memref.view %0[][%arg0, %arg1] : memref<2048xi8> to memref return From c79c13cae61564d3a03831013ea24ff483ee3e82 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Thu, 17 Feb 2022 17:18:43 +0100 Subject: [PATCH 167/748] [clang][SemaTemplate] Fix a stack use after scope Differential Revision: https://reviews.llvm.org/D120065 --- clang/include/clang/AST/DeclTemplate.h | 6 +++--- clang/lib/AST/DeclTemplate.cpp | 9 ++++++++- clang/test/SemaTemplate/friend-template.cpp | 9 +++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index d216b359816e8..319e605a8a1c5 100644 --- 
a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -2461,10 +2461,10 @@ class FriendTemplateDecl : public Decl { SourceLocation FriendLoc; FriendTemplateDecl(DeclContext *DC, SourceLocation Loc, - MutableArrayRef Params, + TemplateParameterList **Params, unsigned NumParams, FriendUnion Friend, SourceLocation FriendLoc) - : Decl(Decl::FriendTemplate, DC, Loc), NumParams(Params.size()), - Params(Params.data()), Friend(Friend), FriendLoc(FriendLoc) {} + : Decl(Decl::FriendTemplate, DC, Loc), NumParams(NumParams), + Params(Params), Friend(Friend), FriendLoc(FriendLoc) {} FriendTemplateDecl(EmptyShell Empty) : Decl(Decl::FriendTemplate, Empty) {} diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 223f06b9db1c9..d9ff3517a589c 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" @@ -1098,7 +1099,13 @@ FriendTemplateDecl::Create(ASTContext &Context, DeclContext *DC, SourceLocation L, MutableArrayRef Params, FriendUnion Friend, SourceLocation FLoc) { - return new (Context, DC) FriendTemplateDecl(DC, L, Params, Friend, FLoc); + TemplateParameterList **TPL = nullptr; + if (!Params.empty()) { + TPL = new (Context) TemplateParameterList *[Params.size()]; + llvm::copy(Params, TPL); + } + return new (Context, DC) + FriendTemplateDecl(DC, L, TPL, Params.size(), Friend, FLoc); } FriendTemplateDecl *FriendTemplateDecl::CreateDeserialized(ASTContext &C, diff --git a/clang/test/SemaTemplate/friend-template.cpp b/clang/test/SemaTemplate/friend-template.cpp index e9b2b9b8e64e5..2dcee6c76da7d 100644 --- a/clang/test/SemaTemplate/friend-template.cpp +++ b/clang/test/SemaTemplate/friend-template.cpp @@ -329,3 +329,12 @@ namespace rdar12350696 { 
foo(b); // expected-note {{in instantiation}} } } + +namespace StackUseAfterScope { +template class Bar {}; +class Foo { + // Make sure this doesn't crash. + template <> friend class Bar; // expected-error {{template specialization declaration cannot be a friend}} + bool aux; +}; +} From a7b9af7872d2d00f4e9e7b03adc841c2daf81900 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Thu, 17 Feb 2022 12:55:51 -0800 Subject: [PATCH 168/748] [libcxx][test] Silence signed/unsigned comparison warnings --- .../containers/associative/set/iterator.pass.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libcxx/test/std/containers/associative/set/iterator.pass.cpp b/libcxx/test/std/containers/associative/set/iterator.pass.cpp index b4344c9d3888f..4ac1187dddc71 100644 --- a/libcxx/test/std/containers/associative/set/iterator.pass.cpp +++ b/libcxx/test/std/containers/associative/set/iterator.pass.cpp @@ -70,10 +70,10 @@ int main(int, char**) i = m.begin(); std::set::const_iterator k = i; assert(i == k); - for (int j = 1; static_cast(j) <= m.size(); ++j, ++i) + for (int j = 1; j <= static_cast(m.size()); ++j, ++i) assert(*i == j); assert(i == m.end()); - for (int j = m.size(); j >= 1; --j) { + for (int j = static_cast(m.size()); j >= 1; --j) { --i; assert(*i == j); } @@ -115,10 +115,10 @@ int main(int, char**) assert(static_cast(std::distance(m.crbegin(), m.crend())) == m.size()); std::set::const_iterator i; i = m.begin(); - for (int j = 1; static_cast(j) <= m.size(); ++j, ++i) + for (int j = 1; j <= static_cast(m.size()); ++j, ++i) assert(*i == j); assert(i == m.end()); - for (int j = m.size(); j >= 1; --j) { + for (int j = static_cast(m.size()); j >= 1; --j) { --i; assert(*i == j); } @@ -161,10 +161,10 @@ int main(int, char**) i = m.begin(); std::set, min_allocator>::const_iterator k = i; assert(i == k); - for (int j = 1; static_cast(j) <= m.size(); ++j, ++i) + for (int j = 1; j <= static_cast(m.size()); ++j, ++i) assert(*i == j); assert(i == 
m.end()); - for (int j = m.size(); j >= 1; --j) { + for (int j = static_cast(m.size()); j >= 1; --j) { --i; assert(*i == j); } @@ -206,10 +206,10 @@ int main(int, char**) assert(static_cast(std::distance(m.crbegin(), m.crend())) == m.size()); std::set, min_allocator>::const_iterator i; i = m.begin(); - for (int j = 1; static_cast(j) <= m.size(); ++j, ++i) + for (int j = 1; j <= static_cast(m.size()); ++j, ++i) assert(*i == j); assert(i == m.end()); - for (int j = m.size(); j >= 1; --j) { + for (int j = static_cast(m.size()); j >= 1; --j) { --i; assert(*i == j); } From 1cfa4857693b405d21272414442b635d9678916e Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 11 Feb 2022 19:24:31 +0100 Subject: [PATCH 169/748] [libc++] Implement P1165R1 (Make stateful allocator propagation more consistent) Reviewed By: Quuxplusone, ldionne, #libc Spies: libcxx-commits Differential Revision: https://reviews.llvm.org/D119112 --- libcxx/docs/ReleaseNotes.rst | 1 + libcxx/docs/Status/Cxx20Papers.csv | 2 +- libcxx/include/string | 31 +-- .../string_op+/allocator_propagation.pass.cpp | 200 ++++++++++++++++++ 4 files changed, 220 insertions(+), 14 deletions(-) create mode 100644 libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/allocator_propagation.pass.cpp diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst index c57a7ded2e4f9..1e24ad6b3c550 100644 --- a/libcxx/docs/ReleaseNotes.rst +++ b/libcxx/docs/ReleaseNotes.rst @@ -39,6 +39,7 @@ New Features ------------ - Implemented P0627R6 (Function to mark unreachable code) + - Implemented P1165R1 (Make stateful allocator propagation more consistent for operator+(basic_string)) API Changes ----------- diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index cf983a4b4829f..438e1f07b896c 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -72,7 +72,7 @@ "`P1085R2 `__","LWG","Should Span be Regular?","San Diego","|Complete|","8.0" 
"`P1123R0 `__","LWG","Editorial Guidance for merging P0019r8 and P0528r3","San Diego","* *","" "`P1148R0 `__","LWG","Cleaning up Clause 20","San Diego","* *","" -"`P1165R1 `__","LWG","Make stateful allocator propagation more consistent for ``operator+(basic_string)``\ ","San Diego","* *","" +"`P1165R1 `__","LWG","Make stateful allocator propagation more consistent for ``operator+(basic_string)``\ ","San Diego","|Complete|","15.0" "`P1209R0 `__","LWG","Adopt Consistent Container Erasure from Library Fundamentals 2 for C++20","San Diego","|Complete|","8.0" "`P1210R0 `__","LWG","Completing the Rebase of Library Fundamentals, Version 3, Working Draft","San Diego","* *","" "`P1236R1 `__","CWG","Alternative Wording for P0907R4 Signed Integers are Two's Complement","San Diego","* *","" diff --git a/libcxx/include/string b/libcxx/include/string index 892df770756e2..d33f6edf97e9a 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -4161,9 +4161,10 @@ basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - basic_string<_CharT, _Traits, _Allocator> __r(__lhs.get_allocator()); - typename basic_string<_CharT, _Traits, _Allocator>::size_type __lhs_sz = __lhs.size(); - typename basic_string<_CharT, _Traits, _Allocator>::size_type __rhs_sz = __rhs.size(); + using _String = basic_string<_CharT, _Traits, _Allocator>; + _String __r(_String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); + typename _String::size_type __lhs_sz = __lhs.size(); + typename _String::size_type __rhs_sz = __rhs.size(); __r.__init(__lhs.data(), __lhs_sz, __lhs_sz + __rhs_sz); __r.append(__rhs.data(), __rhs_sz); return __r; @@ -4173,9 +4174,10 @@ template basic_string<_CharT, _Traits, _Allocator> operator+(const _CharT* __lhs , const basic_string<_CharT,_Traits,_Allocator>& __rhs) { - basic_string<_CharT, _Traits, _Allocator> __r(__rhs.get_allocator()); - 
typename basic_string<_CharT, _Traits, _Allocator>::size_type __lhs_sz = _Traits::length(__lhs); - typename basic_string<_CharT, _Traits, _Allocator>::size_type __rhs_sz = __rhs.size(); + using _String = basic_string<_CharT, _Traits, _Allocator>; + _String __r(_String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); + typename _String::size_type __lhs_sz = _Traits::length(__lhs); + typename _String::size_type __rhs_sz = __rhs.size(); __r.__init(__lhs, __lhs_sz, __lhs_sz + __rhs_sz); __r.append(__rhs.data(), __rhs_sz); return __r; @@ -4185,8 +4187,9 @@ template basic_string<_CharT, _Traits, _Allocator> operator+(_CharT __lhs, const basic_string<_CharT,_Traits,_Allocator>& __rhs) { - basic_string<_CharT, _Traits, _Allocator> __r(__rhs.get_allocator()); - typename basic_string<_CharT, _Traits, _Allocator>::size_type __rhs_sz = __rhs.size(); + using _String = basic_string<_CharT, _Traits, _Allocator>; + _String __r(_String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); + typename _String::size_type __rhs_sz = __rhs.size(); __r.__init(&__lhs, 1, 1 + __rhs_sz); __r.append(__rhs.data(), __rhs_sz); return __r; @@ -4197,9 +4200,10 @@ inline basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) { - basic_string<_CharT, _Traits, _Allocator> __r(__lhs.get_allocator()); - typename basic_string<_CharT, _Traits, _Allocator>::size_type __lhs_sz = __lhs.size(); - typename basic_string<_CharT, _Traits, _Allocator>::size_type __rhs_sz = _Traits::length(__rhs); + using _String = basic_string<_CharT, _Traits, _Allocator>; + _String __r(_String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); + typename _String::size_type __lhs_sz = __lhs.size(); + typename _String::size_type __rhs_sz = _Traits::length(__rhs); __r.__init(__lhs.data(), __lhs_sz, __lhs_sz + __rhs_sz); __r.append(__rhs, __rhs_sz); return __r; @@ -4209,8 
+4213,9 @@ template basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, _CharT __rhs) { - basic_string<_CharT, _Traits, _Allocator> __r(__lhs.get_allocator()); - typename basic_string<_CharT, _Traits, _Allocator>::size_type __lhs_sz = __lhs.size(); + using _String = basic_string<_CharT, _Traits, _Allocator>; + _String __r(_String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); + typename _String::size_type __lhs_sz = __lhs.size(); __r.__init(__lhs.data(), __lhs_sz, __lhs_sz + 1); __r.push_back(__rhs); return __r; diff --git a/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/allocator_propagation.pass.cpp b/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/allocator_propagation.pass.cpp new file mode 100644 index 0000000000000..e25bff5fd7ad7 --- /dev/null +++ b/libcxx/test/std/strings/basic.string/string.nonmembers/string_op+/allocator_propagation.pass.cpp @@ -0,0 +1,200 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// This test ensures that we properly propagate allocators, per https://wg21.link/p1165r1 + +#include +#include + +#include "test_macros.h" + +template +class soccc_allocator { + int* soccc_count; + int self_soccc_count; + +public: + using value_type = T; + + constexpr explicit soccc_allocator(int* soccc_count_, int self_coccc_count_ = 0) + : soccc_count(soccc_count_), self_soccc_count(self_coccc_count_) {} + + template + constexpr soccc_allocator(const soccc_allocator& a) : soccc_count(a.soccc_count) {} + + constexpr T* allocate(std::size_t n) { return std::allocator().allocate(n); } + constexpr void deallocate(T* p, std::size_t s) { std::allocator().deallocate(p, s); } + + constexpr soccc_allocator select_on_container_copy_construction() const { + *soccc_count += 1; + return soccc_allocator(soccc_count, self_soccc_count + 1); + } + + constexpr auto get_soccc() { return soccc_count; } + constexpr auto get_self_soccc() { return self_soccc_count; } + + typedef std::true_type propagate_on_container_copy_assignment; + typedef std::true_type propagate_on_container_move_assignment; + typedef std::true_type propagate_on_container_swap; +}; + +template +bool test() { + using S = std::basic_string, soccc_allocator>; + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs + rhs; + assert(r.get_allocator().get_soccc() == &soccc_lhs); + assert(r.get_allocator().get_self_soccc() == 1); + assert(soccc_lhs == 1); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs + std::move(rhs); + assert(r.get_allocator().get_soccc() == &soccc_rhs); + assert(r.get_allocator().get_self_soccc() == 0); + 
assert(soccc_lhs == 0); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = std::move(lhs) + rhs; + assert(r.get_allocator().get_soccc() == &soccc_lhs); + assert(r.get_allocator().get_self_soccc() == 0); + assert(soccc_lhs == 0); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = std::move(lhs) + std::move(rhs); + assert(r.get_allocator().get_soccc() == &soccc_lhs); + assert(r.get_allocator().get_self_soccc() == 0); + assert(soccc_lhs == 0); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs + rhs.data(); + assert(r.get_allocator().get_soccc() == &soccc_lhs); + assert(r.get_allocator().get_self_soccc() == 1); + assert(soccc_lhs == 1); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs + rhs[0]; + assert(r.get_allocator().get_soccc() == &soccc_lhs); + assert(r.get_allocator().get_self_soccc() == 1); + assert(soccc_lhs == 1); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = std::move(lhs) + rhs.data(); + assert(r.get_allocator().get_soccc() == &soccc_lhs); + assert(r.get_allocator().get_self_soccc() == 0); + assert(soccc_lhs == 0); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = std::move(lhs) + rhs[0]; + assert(r.get_allocator().get_soccc() == &soccc_lhs); + assert(r.get_allocator().get_self_soccc() == 0); + assert(soccc_lhs == 0); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int 
soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs.data() + rhs; + assert(r.get_allocator().get_soccc() == &soccc_rhs); + assert(r.get_allocator().get_self_soccc() == 1); + assert(soccc_lhs == 0); + assert(soccc_rhs == 1); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs[0] + rhs; + assert(r.get_allocator().get_soccc() == &soccc_rhs); + assert(r.get_allocator().get_self_soccc() == 1); + assert(soccc_lhs == 0); + assert(soccc_rhs == 1); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs.data() + std::move(rhs); + assert(r.get_allocator().get_soccc() == &soccc_rhs); + assert(r.get_allocator().get_self_soccc() == 0); + assert(soccc_lhs == 0); + assert(soccc_rhs == 0); + } + { + int soccc_lhs = 0; + int soccc_rhs = 0; + S lhs(soccc_allocator{&soccc_lhs}); + S rhs(soccc_allocator{&soccc_rhs}); + auto r = lhs[0] + std::move(rhs); + assert(r.get_allocator().get_soccc() == &soccc_rhs); + assert(r.get_allocator().get_self_soccc() == 0); + assert(soccc_lhs == 0); + assert(soccc_rhs == 0); + } + + return true; +} + +int main(int, char**) { + test(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif +#if TEST_STD_VER > 17 + // static_assert(test()); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + // static_assert(test()); +#endif +#endif + + return 0; +} From 6c80e385540b6e33b43262f0c5de4fafc03abfbd Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 17 Feb 2022 10:00:29 -0500 Subject: [PATCH 170/748] [libc++][CI] Upload ABI lists for all jobs Some jobs might not produce those, but it makes the blocks easier to copy-paste and makes sure that if a job does produce an ABI list, it will be uploaded in the artifacts.
Differential Revision: https://reviews.llvm.org/D120056 --- libcxx/utils/ci/buildkite-pipeline.yml | 43 +++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index 31d2825f0ef86..4067430c13fff 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -117,6 +117,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-modules" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -130,6 +131,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-gcc" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -191,6 +193,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-gcc-cxx11" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -204,6 +207,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-clang-12" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -217,6 +221,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-clang-13" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -233,6 +238,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-asan" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -246,6 +252,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-tsan" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -259,6 +266,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-ubsan" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -273,6 +281,7 @@ steps: command: "libcxx/utils/ci/run-buildbot bootstrapping-build" artifact_paths: - 
"**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -288,6 +297,7 @@ steps: command: "libcxx/utils/ci/run-buildbot legacy-test-config" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -301,6 +311,7 @@ steps: command: "libcxx/utils/ci/run-buildbot legacy-project-build" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -316,6 +327,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-static" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -357,6 +369,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-with_llvm_unwinder" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -372,6 +385,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-singlethreaded" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -385,6 +399,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-no-debug" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -398,6 +413,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-no-filesystem" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -411,6 +427,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-no-random_device" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -424,6 +441,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-no-localization" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -437,6 +455,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-no-unicode" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: 
"linux" @@ -450,6 +469,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-no-wide-characters" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -463,6 +483,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-no-experimental" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -476,6 +497,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-noexceptions" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -489,6 +511,7 @@ steps: command: "libcxx/utils/ci/run-buildbot generic-abi-unstable" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -503,6 +526,7 @@ steps: command: "libcxx/utils/ci/run-buildbot benchmarks" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "linux" @@ -519,6 +543,7 @@ steps: command: "bash libcxx/utils/ci/run-buildbot clang-cl-dll" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "windows" retry: @@ -531,6 +556,7 @@ steps: command: "bash libcxx/utils/ci/run-buildbot clang-cl-static" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "windows" retry: @@ -543,6 +569,7 @@ steps: command: "bash libcxx/utils/ci/run-buildbot mingw-dll" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "windows" retry: @@ -555,6 +582,7 @@ steps: command: "bash libcxx/utils/ci/run-buildbot mingw-static" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "windows" retry: @@ -600,6 +628,7 @@ steps: command: "libcxx/utils/ci/run-buildbot apple-system" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "macos" @@ -615,6 +644,7 @@ steps: command: "libcxx/utils/ci/run-buildbot apple-system-backdeployment-10.9" artifact_paths: - "**/test-results.xml" + - 
"**/*.abilist" agents: queue: "libcxx-builders" os: "macos" @@ -629,6 +659,7 @@ steps: command: "libcxx/utils/ci/run-buildbot apple-system-backdeployment-10.15" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders" os: "macos" @@ -645,6 +676,7 @@ steps: command: "libcxx/utils/ci/run-buildbot aarch64" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders-linaro-arm" arch: "aarch64" @@ -658,6 +690,7 @@ steps: command: "libcxx/utils/ci/run-buildbot aarch64-noexceptions" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders-linaro-arm" arch: "aarch64" @@ -671,6 +704,7 @@ steps: command: "libcxx/utils/ci/run-buildbot armv8" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders-linaro-arm" arch: "armv8l" @@ -684,6 +718,7 @@ steps: command: "libcxx/utils/ci/run-buildbot armv8-noexceptions" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders-linaro-arm" arch: "armv8l" @@ -697,6 +732,7 @@ steps: command: "libcxx/utils/ci/run-buildbot armv7" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders-linaro-arm" arch: "armv8l" # Compiling for v7, running on v8 hardware @@ -710,6 +746,7 @@ steps: command: "libcxx/utils/ci/run-buildbot armv7-noexceptions" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" agents: queue: "libcxx-builders-linaro-arm" arch: "armv8l" # Compiling for v7, running on v8 hardware @@ -725,10 +762,7 @@ steps: command: "libcxx/utils/ci/run-buildbot aix" artifact_paths: - "**/test-results.xml" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 2 + - "**/*.abilist" env: OBJECT_MODE: "32" agents: @@ -744,6 +778,7 @@ steps: command: "libcxx/utils/ci/run-buildbot aix" artifact_paths: - "**/test-results.xml" + - "**/*.abilist" env: OBJECT_MODE: "64" agents: From 331e8e4e27be5dd673898a89a7cf00e76903216a Mon Sep 17 
00:00:00 2001 From: Marek Kurdej Date: Thu, 17 Feb 2022 11:10:41 +0100 Subject: [PATCH 171/748] [clang-format] Do not add space after return-like keywords in macros. Fixes https://github.com/llvm/llvm-project/issues/33336. Reviewed By: HazardyKnusperkeks, owenpan Differential Revision: https://reviews.llvm.org/D120028 --- clang/lib/Format/TokenAnnotator.cpp | 5 +++-- clang/unittests/Format/FormatTest.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 206fa4541217c..9a020eb6ca7dc 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2989,7 +2989,8 @@ bool TokenAnnotator::spaceRequiredBeforeParens(const FormatToken &Right) const { bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, const FormatToken &Left, const FormatToken &Right) { - if (Left.is(tok::kw_return) && Right.isNot(tok::semi)) + if (Left.is(tok::kw_return) && + !Right.isOneOf(tok::semi, tok::r_paren, tok::hashhash)) return true; if (Style.isJson() && Left.is(tok::string_literal) && Right.is(tok::colon)) return false; @@ -3026,7 +3027,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return false; // co_await (x), co_yield (x), co_return (x) if (Left.isOneOf(tok::kw_co_await, tok::kw_co_yield, tok::kw_co_return) && - Right.isNot(tok::semi)) + !Right.isOneOf(tok::semi, tok::r_paren)) return true; if (Left.is(tok::l_paren) || Right.is(tok::r_paren)) diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 73503696741a7..f71f8dc5de456 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -1861,6 +1861,16 @@ TEST_F(FormatTest, UnderstandsMacros) { "#define BBB }\n", Style); // verifyFormat("#define AAA N { //\n", Style); + + verifyFormat("MACRO(return)"); + verifyFormat("MACRO(co_await)"); + verifyFormat("MACRO(co_return)"); + 
verifyFormat("MACRO(co_yield)"); + verifyFormat("MACRO(return, something)"); + verifyFormat("MACRO(co_return, something)"); + verifyFormat("MACRO(something##something)"); + verifyFormat("MACRO(return##something)"); + verifyFormat("MACRO(co_return##something)"); } TEST_F(FormatTest, ShortBlocksInMacrosDontMergeWithCodeAfterMacro) { From 807ba7aace188ada83ddb4477265728e97346af1 Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Mon, 14 Feb 2022 11:52:42 -0800 Subject: [PATCH 172/748] Reland "[memprof] Extend the index prof format to include memory profiles." This reverts commit 85355a560a33897453df2ef959e255ee725eebce. This patch adds support for optional memory profile information to be included with an indexed profile. The indexed profile header adds a new field which points to the offset of the memory profile section (if present) in the indexed profile. For users who do not utilize this feature the only overhead is a 64-bit offset in the header. The memory profile section contains (1) profile metadata describing the information recorded for each entry (2) an on-disk hashtable containing the profile records indexed via llvm::md5(function_name). We chose to introduce a separate hash table instead of the existing one since the indexing for the instrumented fdo hash table is based on a CFG hash which itself is perturbed by memprof instrumentation.
Differential Revision: https://reviews.llvm.org/D118653 --- compiler-rt/include/profile/InstrProfData.inc | 4 +- llvm/include/llvm/ProfileData/InstrProf.h | 8 +- .../llvm/ProfileData/InstrProfData.inc | 4 +- .../llvm/ProfileData/InstrProfReader.h | 14 ++ .../llvm/ProfileData/InstrProfWriter.h | 11 ++ llvm/include/llvm/ProfileData/MemProf.h | 185 +++++++++++++++++- llvm/include/llvm/ProfileData/MemProfData.inc | 4 +- .../llvm/ProfileData/RawMemProfReader.h | 3 + llvm/lib/ProfileData/CMakeLists.txt | 1 + llvm/lib/ProfileData/InstrProf.cpp | 23 ++- llvm/lib/ProfileData/InstrProfReader.cpp | 43 +++- llvm/lib/ProfileData/InstrProfWriter.cpp | 90 ++++++++- llvm/lib/ProfileData/MemProf.cpp | 73 +++++++ llvm/lib/ProfileData/RawMemProfReader.cpp | 7 +- .../tools/llvm-profdata/Inputs/basic.profraw | Bin 0 -> 152 bytes .../tools/llvm-profdata/memprof-merge.test | 47 +++++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 48 ++++- llvm/unittests/ProfileData/InstrProfTest.cpp | 62 ++++++ llvm/unittests/ProfileData/MemProfTest.cpp | 54 ++++- 19 files changed, 649 insertions(+), 32 deletions(-) create mode 100644 llvm/lib/ProfileData/MemProf.cpp create mode 100644 llvm/test/tools/llvm-profdata/Inputs/basic.profraw create mode 100644 llvm/test/tools/llvm-profdata/memprof-merge.test diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 62054a6a3df51..282620d8b5dc0 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -650,7 +650,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 8 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 7 +#define INSTR_PROF_INDEX_VERSION 8 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 5 @@ -662,6 +662,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * The 59th bit indicates whether to use debug info to correlate profiles. * The 60th bit indicates single byte coverage instrumentation. * The 61st bit indicates function entry instrumentation only. + * The 62nd bit indicates whether memory profile information is present. */ #define VARIANT_MASKS_ALL 0xff00000000000000ULL #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL) @@ -671,6 +672,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define VARIANT_MASK_DBG_CORRELATE (0x1ULL << 59) #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60) #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61) +#define VARIANT_MASK_MEMPROF (0x1ULL << 62) #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index c015e8e4b43d0..e14d3e206e9f2 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -287,7 +287,8 @@ enum class InstrProfKind { CS = 0x8, // A context sensitive IR-level profile. SingleByteCoverage = 0x10, // Use single byte probes for coverage. FunctionEntryOnly = 0x20, // Only instrument the function entry basic block. - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionEntryOnly) + MemProf = 0x40, // A memory profile collected using -fmemory-profile. + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/MemProf) }; const std::error_category &instrprof_category(); @@ -1011,7 +1012,9 @@ enum ProfVersion { Version6 = 6, // An additional counter is added around logical operators. Version7 = 7, - // The current version is 7. + // An additional (optional) memory profile type is added. + Version8 = 8, + // The current version is 8. 
CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1028,6 +1031,7 @@ struct Header { uint64_t Unused; // Becomes unused since version 4 uint64_t HashType; uint64_t HashOffset; + uint64_t MemProfOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 62054a6a3df51..282620d8b5dc0 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -650,7 +650,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 8 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 7 +#define INSTR_PROF_INDEX_VERSION 8 /* Coverage mapping format version (start from 0). */ #define INSTR_PROF_COVMAP_VERSION 5 @@ -662,6 +662,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * The 59th bit indicates whether to use debug info to correlate profiles. * The 60th bit indicates single byte coverage instrumentation. * The 61st bit indicates function entry instrumentation only. + * The 62nd bit indicates whether memory profile information is present. 
*/ #define VARIANT_MASKS_ALL 0xff00000000000000ULL #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL) @@ -671,6 +672,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define VARIANT_MASK_DBG_CORRELATE (0x1ULL << 59) #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60) #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61) +#define VARIANT_MASK_MEMPROF (0x1ULL << 62) #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index 548affbf65fa5..7a18d5a6a11af 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -19,6 +19,7 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfCorrelator.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/LineIterator.h" @@ -471,6 +472,9 @@ struct InstrProfReaderIndexBase { using OnDiskHashTableImplV3 = OnDiskIterableChainedHashTable; +using MemProfHashTable = + OnDiskIterableChainedHashTable; + template class InstrProfReaderItaniumRemapper; @@ -556,6 +560,11 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr Summary; /// Context sensitive profile summary data. std::unique_ptr CS_Summary; + /// MemProf profile schema (if available). + memprof::MemProfSchema Schema; + /// MemProf profile data on-disk indexed via llvm::md5(FunctionName). + std::unique_ptr MemProfTable; + // Index to the current record in the record array. 
unsigned RecordIndex; @@ -609,6 +618,11 @@ class IndexedInstrProfReader : public InstrProfReader { Expected getInstrProfRecord(StringRef FuncName, uint64_t FuncHash); + /// Return the memprof records for the function identified by + /// llvm::md5(Name). + Expected> + getMemProfRecord(uint64_t FuncNameHash); + /// Fill Counts with the profile data for the given function name. Error getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts); diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index af1e46cf4fc24..bb180ac42c212 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" @@ -37,6 +38,11 @@ class InstrProfWriter { private: bool Sparse; StringMap FunctionData; + + // A map to hold memprof data per function. The lower 64 bits obtained from + // the md5 hash of the function name is used to index into the map. + memprof::FunctionMemProfMap MemProfData; + // An enum describing the attributes of the profile. InstrProfKind ProfileKind = InstrProfKind::Unknown; // Use raw pointer here for the incomplete type object. @@ -57,6 +63,9 @@ class InstrProfWriter { addRecord(std::move(I), 1, Warn); } + void addRecord(const ::llvm::memprof::MemProfRecord &MR, + function_ref Warn); + /// Merge existing function counts from the given writer. void mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn); @@ -112,6 +121,8 @@ class InstrProfWriter { return Error::success(); } + InstrProfKind getProfileKind() const { return ProfileKind; } + // Internal interface for testing purpose only. 
void setValueProfDataEndianness(support::endianness Endianness); void setOutputSparse(bool Sparse); diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 2fa577a626bbe..dcc9b69386e8a 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -5,6 +5,7 @@ #include #include +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ProfileData/MemProfData.inc" #include "llvm/ProfileData/ProfileCommon.h" @@ -134,18 +135,52 @@ struct PortableMemInfoBlock { }; struct MemProfRecord { - struct Frame { - std::string Function; + // Describes a call frame for a dynamic allocation context. The contents of + // the frame are populated by symbolizing the stack depot call frame from the + // compiler runtime. + PACKED(struct Frame { + // A uuid (uint64_t) identifying the function. It is obtained by + // llvm::md5(FunctionName) which returns the lower 64 bits. + GlobalValue::GUID Function; + // The source line offset of the call from the beginning of parent function. uint32_t LineOffset; + // The source column number of the call to help distinguish multiple calls + // on the same line. uint32_t Column; + // Whether the current frame is inlined. bool IsInlineFrame; - Frame(std::string Str, uint32_t Off, uint32_t Col, bool Inline) - : Function(std::move(Str)), LineOffset(Off), Column(Col), - IsInlineFrame(Inline) {} - }; + Frame(uint64_t Hash, uint32_t Off, uint32_t Col, bool Inline) + : Function(Hash), LineOffset(Off), Column(Col), IsInlineFrame(Inline) {} + bool operator==(const Frame &Other) const { + return Other.Function == Function && Other.LineOffset == LineOffset && + Other.Column == Column && Other.IsInlineFrame == IsInlineFrame; + } + + bool operator!=(const Frame &Other) const { return !operator==(Other); } + + // Write the contents of the frame to the ostream \p OS. 
+ void write(raw_ostream & OS) const { + using namespace support; + + endian::Writer LE(OS, little); + + // If the type of the GlobalValue::GUID changes, then we need to update + // the reader and the writer. + static_assert(std::is_same::value, + "Expect GUID to be uint64_t."); + LE.write(Function); + + LE.write(LineOffset); + LE.write(Column); + LE.write(IsInlineFrame); + } + }); + + // The dynamic calling context for the allocation. std::vector CallStack; + // The statistics obtained from the runtime for the allocation. PortableMemInfoBlock Info; void clear() { @@ -153,6 +188,12 @@ struct MemProfRecord { Info.clear(); } + size_t serializedSize() const { + return sizeof(uint64_t) + // The number of frames to serialize. + sizeof(Frame) * CallStack.size() + // The contents of the frames. + PortableMemInfoBlock::serializedSize(); // The size of the payload. + } + // Prints out the contents of the memprof record in YAML. void print(llvm::raw_ostream &OS) const { OS << " Callstack:\n"; @@ -168,6 +209,138 @@ struct MemProfRecord { Info.printYAML(OS); } + + bool operator==(const MemProfRecord &Other) const { + if (Other.Info != Info) + return false; + + if (Other.CallStack.size() != CallStack.size()) + return false; + + for (size_t I = 0; I < Other.CallStack.size(); I++) { + if (Other.CallStack[I] != CallStack[I]) + return false; + } + return true; + } +}; + +// Serializes the memprof records in \p Records to the ostream \p OS based on +// the schema provided in \p Schema. +void serializeRecords(const ArrayRef Records, + const MemProfSchema &Schema, raw_ostream &OS); + +// Deserializes memprof records from the Buffer +SmallVector deserializeRecords(const MemProfSchema &Schema, + const unsigned char *Buffer); + +// Reads a memprof schema from a buffer. All entries in the buffer are +// interpreted as uint64_t. The first entry in the buffer denotes the number of +// ids in the schema. Subsequent entries are integers which map to memprof::Meta +// enum class entries. 
After successfully reading the schema, the pointer is one +// byte past the schema contents. +Expected readMemProfSchema(const unsigned char *&Buffer); + +using FunctionMemProfMap = + DenseMap>; + +/// Trait for lookups into the on-disk hash table for memprof format in the +/// indexed profile. +class MemProfRecordLookupTrait { +public: + using data_type = ArrayRef; + using internal_key_type = uint64_t; + using external_key_type = uint64_t; + using hash_value_type = uint64_t; + using offset_type = uint64_t; + + MemProfRecordLookupTrait() = delete; + MemProfRecordLookupTrait(const MemProfSchema &S) : Schema(S) {} + + static bool EqualKey(uint64_t A, uint64_t B) { return A == B; } + static uint64_t GetInternalKey(uint64_t K) { return K; } + static uint64_t GetExternalKey(uint64_t K) { return K; } + + hash_value_type ComputeHash(uint64_t K) { return K; } + + static std::pair + ReadKeyDataLength(const unsigned char *&D) { + using namespace support; + + offset_type KeyLen = endian::readNext(D); + offset_type DataLen = endian::readNext(D); + return std::make_pair(KeyLen, DataLen); + } + + uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { + using namespace support; + return endian::readNext(D); + } + + data_type ReadData(uint64_t K, const unsigned char *D, + offset_type /*Unused*/) { + Records = deserializeRecords(Schema, D); + return Records; + } + +private: + // Holds the memprof schema used to deserialize records. + MemProfSchema Schema; + // Holds the records from one function deserialized from the indexed format. + llvm::SmallVector Records; +}; + +class MemProfRecordWriterTrait { +public: + using key_type = uint64_t; + using key_type_ref = uint64_t; + + using data_type = ArrayRef; + using data_type_ref = ArrayRef; + + using hash_value_type = uint64_t; + using offset_type = uint64_t; + + // Pointer to the memprof schema to use for the generator. 
Unlike the reader + // we must use a default constructor with no params for the writer trait so we + // have a public member which must be initialized by the user. + MemProfSchema *Schema = nullptr; + + MemProfRecordWriterTrait() = default; + + static hash_value_type ComputeHash(key_type_ref K) { return K; } + + static std::pair + EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { + using namespace support; + + endian::Writer LE(Out, little); + + offset_type N = sizeof(K); + LE.write(N); + + offset_type M = 0; + + M += sizeof(uint64_t); + for (const auto &Record : V) { + M += Record.serializedSize(); + } + + LE.write(M); + return std::make_pair(N, M); + } + + void EmitKey(raw_ostream &Out, key_type_ref K, offset_type /*Unused*/) { + using namespace support; + endian::Writer LE(Out, little); + LE.write(K); + } + + void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V, + offset_type /*Unused*/) { + assert(Schema != nullptr && "MemProf schema is not initialized!"); + serializeRecords(V, *Schema, Out); + } }; } // namespace memprof diff --git a/llvm/include/llvm/ProfileData/MemProfData.inc b/llvm/include/llvm/ProfileData/MemProfData.inc index 8135a664b0466..38698be9ea0ec 100644 --- a/llvm/include/llvm/ProfileData/MemProfData.inc +++ b/llvm/include/llvm/ProfileData/MemProfData.inc @@ -1,5 +1,5 @@ -#ifndef LLVM_PROFILEDATA_MEMPROFDATA_INC -#define LLVM_PROFILEDATA_MEMPROFDATA_INC +#ifndef MEMPROF_DATA_INC +#define MEMPROF_DATA_INC /*===-- MemProfData.inc - MemProf profiling runtime structures -*- C++ -*-=== *\ |* |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
diff --git a/llvm/include/llvm/ProfileData/RawMemProfReader.h b/llvm/include/llvm/ProfileData/RawMemProfReader.h index 55ba31d2a6492..bda33d336468a 100644 --- a/llvm/include/llvm/ProfileData/RawMemProfReader.h +++ b/llvm/include/llvm/ProfileData/RawMemProfReader.h @@ -66,6 +66,9 @@ class RawMemProfReader { return Iterator(this); } + // The RawMemProfReader only holds memory profile information. + InstrProfKind getProfileKind() const { return InstrProfKind::MemProf; } + // Constructor for unittests only. RawMemProfReader(std::unique_ptr Sym, llvm::SmallVectorImpl &Seg, diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt index 2749119f72d90..486c45d0dff5c 100644 --- a/llvm/lib/ProfileData/CMakeLists.txt +++ b/llvm/lib/ProfileData/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_component_library(LLVMProfileData InstrProfCorrelator.cpp InstrProfReader.cpp InstrProfWriter.cpp + MemProf.cpp ProfileSummaryBuilder.cpp SampleProf.cpp SampleProfReader.cpp diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 6e53b0a276998..0a0ce7604a290 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1349,8 +1349,15 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { return make_error(instrprof_error::unsupported_version); switch (GET_VERSION(H.formatVersion())) { - // When a new field is added in the header add a case statement here to - // populate it. + // When a new field is added in the header add a case statement here to + // populate it. + static_assert( + IndexedInstrProf::ProfVersion::CurrentVersion == Version8, + "Please update the reading code below if a new field has been added, " + "if not add a case statement to fall through to the latest version."); + case 8ull: + H.MemProfOffset = read(Buffer, offsetOf(&Header::MemProfOffset)); + LLVM_FALLTHROUGH; default: // Version7 (when the backwards compatible header was introduced). H.HashType = read(Buffer, offsetOf(&Header::HashType)); H.HashOffset = read(Buffer, offsetOf(&Header::HashOffset)); @@ -1361,9 +1368,15 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { size_t Header::size() const { switch (GET_VERSION(formatVersion())) { - // When a new field is added to the header add a case statement here to - // compute the size as offset of the new field + size of the new field. This - // relies on the field being added to the end of the list. + // When a new field is added to the header add a case statement here to + // compute the size as offset of the new field + size of the new field. This + // relies on the field being added to the end of the list. + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version8, + "Please update the size computation below if a new field has " + "been added to the header, if not add a case statement to " + "fall through to the latest version."); + case 8ull: + return offsetOf(&Header::MemProfOffset) + sizeof(Header::MemProfOffset); default: // Version7 (when the backwards compatible header was introduced). return offsetOf(&Header::HashOffset) + sizeof(Header::HashOffset); } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index d1e3438a6f412..c84b942ce8b10 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -19,7 +19,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/RawMemProfReader.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" @@ -57,6 +59,9 @@ static InstrProfKind getProfileKindFromVersion(uint64_t Version) { if (Version & VARIANT_MASK_FUNCTION_ENTRY_ONLY) { ProfileKind |= InstrProfKind::FunctionEntryOnly; } + if (Version & VARIANT_MASK_MEMPROF) { + ProfileKind |= InstrProfKind::MemProf; + } return ProfileKind; } @@ -955,10 +960,35 @@ Error IndexedInstrProfReader::readHeader() { uint64_t HashOffset = 
endian::byte_swap(Header->HashOffset); - // The rest of the file is an on disk hash table. + // The hash table with profile counts comes next. auto IndexPtr = std::make_unique>( Start + HashOffset, Cur, Start, HashType, Header->formatVersion()); + // The MemProfOffset field in the header is only valid when the format version + // is higher than 8 (when it was introduced). + if (GET_VERSION(Header->Version) >= 8 && + Header->Version & VARIANT_MASK_MEMPROF) { + uint64_t MemProfOffset = + endian::byte_swap(Header->MemProfOffset); + + const unsigned char *Ptr = Start + MemProfOffset; + // The value returned from Generator.Emit. + const uint64_t TableOffset = + support::endian::readNext(Ptr); + + // Read the schema. + auto SchemaOr = memprof::readMemProfSchema(Ptr); + if (!SchemaOr) + return SchemaOr.takeError(); + Schema = SchemaOr.get(); + + // Now initialize the table reader with a pointer into data buffer. + MemProfTable.reset(MemProfHashTable::Create( + /*Buckets=*/Start + TableOffset, + /*Payload=*/Ptr, + /*Base=*/Start, memprof::MemProfRecordLookupTrait(Schema))); + } + // Load the remapping table now if requested. if (RemappingBuffer) { Remapper = std::make_unique< @@ -1003,6 +1033,17 @@ IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, return error(instrprof_error::hash_mismatch); } +Expected> +IndexedInstrProfReader::getMemProfRecord(uint64_t FuncNameHash) { + auto Iter = MemProfTable->find(FuncNameHash); + if (Iter == MemProfTable->end()) + // TODO: Add memprof specific errors. 
+ return make_error(instrprof_error::hash_mismatch, + "memprof record not found for hash " + + Twine(FuncNameHash)); + return *Iter; +} + Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts) { diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index ebf89317d585a..4c974f402d2b3 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" @@ -63,11 +64,16 @@ class ProfOStream { if (IsFDOStream) { raw_fd_ostream &FDOStream = static_cast(OS); + const uint64_t LastPos = FDOStream.tell(); for (int K = 0; K < NItems; K++) { FDOStream.seek(P[K].Pos); for (int I = 0; I < P[K].N; I++) write(P[K].D[I]); } + // Reset the stream to the last position after patching so that users + // don't accidentally overwrite data. This makes it consistent with + // the string stream below which replaces the data directly. + FDOStream.seek(LastPos); } else { raw_string_ostream &SOStream = static_cast(OS); std::string &Data = SOStream.str(); // with flush @@ -248,11 +254,39 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash, Dest.sortValueData(); } +void InstrProfWriter::addRecord(const memprof::MemProfRecord &MR, + function_ref Warn) { + // Use 0 as a sentinel value since its highly unlikely that the lower 64-bits + // of a 128 bit md5 hash will be all zeros. + // TODO: Move this Key frame detection to the contructor to avoid having to + // scan all the callstacks again when adding a new record. 
+ uint64_t Key = 0; + for (auto Iter = MR.CallStack.rbegin(), End = MR.CallStack.rend(); + Iter != End; Iter++) { + if (!Iter->IsInlineFrame) { + Key = Iter->Function; + break; + } + } + + if (Key == 0) { + Warn(make_error( + instrprof_error::invalid_prof, + "could not determine leaf function for memprof record.")); + } + + MemProfData[Key].push_back(MR); +} + void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn) { for (auto &I : IPW.FunctionData) for (auto &Func : I.getValue()) addRecord(I.getKey(), Func.first, std::move(Func.second), 1, Warn); + + for (auto &I : IPW.MemProfData) + for (const auto &MR : I.second) + addRecord(MR, Warn); } bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) { @@ -297,6 +331,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { for (const auto &I : FunctionData) if (shouldEncodeData(I.getValue())) Generator.insert(I.getKey(), &I.getValue()); + // Write the header. IndexedInstrProf::Header Header; Header.Magic = IndexedInstrProf::Magic; @@ -311,16 +346,18 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.Version |= VARIANT_MASK_BYTE_COVERAGE; if (static_cast(ProfileKind & InstrProfKind::FunctionEntryOnly)) Header.Version |= VARIANT_MASK_FUNCTION_ENTRY_ONLY; + if (static_cast(ProfileKind & InstrProfKind::MemProf)) + Header.Version |= VARIANT_MASK_MEMPROF; Header.Unused = 0; Header.HashType = static_cast(IndexedInstrProf::HashType); Header.HashOffset = 0; + Header.MemProfOffset = 0; int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out all the fields except 'HashOffset'. We need - // to remember the offset of that field to allow back patching - // later. - for (int I = 0; I < N - 1; I++) + // Only write out all the fields except 'HashOffset' and 'MemProfOffset'. We + // need to remember the offset of these fields to allow back patching later. 
+ for (int I = 0; I < N - 2; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -328,6 +365,13 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Reserve the space for HashOffset field. OS.write(0); + // Save the location of MemProf profile data. This is stored in two parts as + // the schema and as a separate on-disk chained hashtable. + uint64_t MemProfSectionOffset = OS.tell(); + // Reserve space for the MemProf table field to be patched later if this + // profile contains memory profile information. + OS.write(0); + // Reserve space to write profile summary data. uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -347,6 +391,42 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Write the hash table. uint64_t HashTableStart = Generator.Emit(OS.OS, *InfoObj); + // Write the MemProf profile data if we have it. This includes a simple schema + // with the format described below followed by the hashtable: + // uint64_t Offset = MemProfGenerator.Emit + // uint64_t Num schema entries + // uint64_t Schema entry 0 + // uint64_t Schema entry 1 + // .... + // uint64_t Schema entry N - 1 + // OnDiskChainedHashTable MemProfFunctionData + uint64_t MemProfSectionStart = 0; + if (static_cast(ProfileKind & InstrProfKind::MemProf)) { + MemProfSectionStart = OS.tell(); + OS.write(0ULL); // Reserve space for the offset. + + auto Schema = memprof::PortableMemInfoBlock::getSchema(); + OS.write(static_cast(Schema.size())); + for (const auto Id : Schema) { + OS.write(static_cast(Id)); + } + + auto MemProfWriter = std::make_unique(); + MemProfWriter->Schema = &Schema; + OnDiskChainedHashTableGenerator + MemProfGenerator; + for (const auto &I : MemProfData) { + // Insert the key (func hash) and value (vector of memprof records). 
+ MemProfGenerator.insert(I.first, I.second); + } + + uint64_t TableOffset = MemProfGenerator.Emit(OS.OS, *MemProfWriter); + PatchItem PatchItems[] = { + {MemProfSectionStart, &TableOffset, 1}, + }; + OS.patch(PatchItems, 1); + } + // Allocate space for data to be serialized out. std::unique_ptr TheSummary = IndexedInstrProf::allocSummary(SummarySize); @@ -369,6 +449,8 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { PatchItem PatchItems[] = { // Patch the Header.HashOffset field. {HashTableStartFieldOffset, &HashTableStart, 1}, + // Patch the Header.MemProfOffset (=0 for profiles without MemProf data). + {MemProfSectionOffset, &MemProfSectionStart, 1}, // Patch the summary data. {SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp new file mode 100644 index 0000000000000..6a9b69ff6cff0 --- /dev/null +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -0,0 +1,73 @@ +#include "llvm/ProfileData/MemProf.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/EndianStream.h" + +namespace llvm { +namespace memprof { + +void serializeRecords(const ArrayRef Records, + const MemProfSchema &Schema, raw_ostream &OS) { + using namespace support; + + endian::Writer LE(OS, little); + + LE.write(Records.size()); + for (const MemProfRecord &MR : Records) { + LE.write(MR.CallStack.size()); + for (const MemProfRecord::Frame &F : MR.CallStack) { + F.write(OS); + } + MR.Info.serialize(Schema, OS); + } +} + +SmallVector deserializeRecords(const MemProfSchema &Schema, + const unsigned char *Ptr) { + using namespace support; + + SmallVector Records; + const uint64_t NumRecords = + endian::readNext(Ptr); + for (uint64_t I = 0; I < NumRecords; I++) { + MemProfRecord MR; + const uint64_t NumFrames = + endian::readNext(Ptr); + for (uint64_t J = 0; J < NumFrames; J++) { + const auto F = *reinterpret_cast(Ptr); + Ptr += sizeof(MemProfRecord::Frame); + 
MR.CallStack.push_back(F); + } + MR.Info.deserialize(Schema, Ptr); + Ptr += PortableMemInfoBlock::serializedSize(); + Records.push_back(MR); + } + return Records; +} + +Expected readMemProfSchema(const unsigned char *&Buffer) { + using namespace support; + + const unsigned char *Ptr = Buffer; + const uint64_t NumSchemaIds = + endian::readNext(Ptr); + if (NumSchemaIds > static_cast(Meta::Size)) { + return make_error(instrprof_error::malformed, + "memprof schema invalid"); + } + + MemProfSchema Result; + for (size_t I = 0; I < NumSchemaIds; I++) { + const uint64_t Tag = endian::readNext(Ptr); + if (Tag >= static_cast(Meta::Size)) { + return make_error(instrprof_error::malformed, + "memprof schema invalid"); + } + Result.push_back(static_cast(Tag)); + } + // Advace the buffer to one past the schema if we succeeded. + Buffer = Ptr; + return Result; +} + +} // namespace memprof +} // namespace llvm diff --git a/llvm/lib/ProfileData/RawMemProfReader.cpp b/llvm/lib/ProfileData/RawMemProfReader.cpp index 43ef7c947366a..9bcba2a2b04ea 100644 --- a/llvm/lib/ProfileData/RawMemProfReader.cpp +++ b/llvm/lib/ProfileData/RawMemProfReader.cpp @@ -362,7 +362,12 @@ Error RawMemProfReader::fillRecord(const uint64_t Id, const MemInfoBlock &MIB, for (size_t I = 0; I < DI.getNumberOfFrames(); I++) { const auto &Frame = DI.getFrame(I); Record.CallStack.emplace_back( - std::to_string(llvm::MD5Hash(trimSuffix(Frame.FunctionName))), + // We use the function guid which we expect to be a uint64_t. At this + // time, it is the lower 64 bits of the md5 of the function name. Any + // suffix with .llvm. is trimmed since these are added by thinLTO + // global promotion. At the time the profile is consumed, these + // suffixes will not be present. + Function::getGUID(trimSuffix(Frame.FunctionName)), Frame.Line - Frame.StartLine, Frame.Column, // Only the first entry is not an inlined location. 
I != 0); diff --git a/llvm/test/tools/llvm-profdata/Inputs/basic.profraw b/llvm/test/tools/llvm-profdata/Inputs/basic.profraw new file mode 100644 index 0000000000000000000000000000000000000000..ad88759398c6020f4ab8a5606258e69d98e36687 GIT binary patch literal 152 zcmZoHO3N=Q$obE~00xW@ih%*nfC`}V*`VS-{zJgi8V9flOx>@mz0b{3rrrk1zQ4@n a%LP-nKp3J9svT|*OdktFZenI00|NkRuOhYp literal 0 HcmV?d00001 diff --git a/llvm/test/tools/llvm-profdata/memprof-merge.test b/llvm/test/tools/llvm-profdata/memprof-merge.test new file mode 100644 index 0000000000000..b11459f237ca5 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/memprof-merge.test @@ -0,0 +1,47 @@ +REQUIRES: x86_64-linux + +The input memprof and instrumented raw profiles were generated from the following source code: + +``` +#include +#include +int main(int argc, char **argv) { + char *x = (char *)malloc(10); + memset(x, 0, 10); + free(x); + x = (char *)malloc(10); + memset(x, 0, 10); + free(x); + return 0; +} +``` + +Steps to collect the memprof raw profile and the instrprof raw profile: + +``` +# Collect instrprof profile with name compression disabled since some buildbots +# do not have zlib. +clang -mllvm -enable-name-compression=false -fprofile-generate source.c -o instr.out +./instr.out +mv *.profraw basic.profraw + +# Collect memprof profile. +clang -fuse-ld=lld -Wl,--no-rosegment -gmlt -fdebug-info-for-profiling \ + -fmemory-profile -mno-omit-leaf-frame-pointer -fno-omit-frame-pointer \ + -fno-optimize-sibling-calls -m64 -Wl,-build-id source.c -o basic.memprofexe + +env MEMPROF_OPTIONS=log_path=stdout ./rawprofile.out > basic.memprofraw +``` + +RUN: llvm-profdata merge %p/Inputs/basic.profraw %p/Inputs/basic.memprofraw --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof +RUN: llvm-profdata show %t.prof | FileCheck %s + +For now we only check the validity of the instrumented profile since we don't +have a way to display the contents of the memprof indexed format yet. 
+ +CHECK: Instrumentation level: IR entry_first = 0 +CHECK: Total functions: 1 +CHECK: Maximum function count: 1 +CHECK: Maximum internal block count: 0 + + diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index e00582851d47f..ba2f1b6038c48 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -239,7 +239,7 @@ static void overlapInput(const std::string &BaseFilename, /// Load an input into a writer context. static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, const InstrProfCorrelator *Correlator, - WriterContext *WC) { + const StringRef ProfiledBinary, WriterContext *WC) { std::unique_lock CtxGuard{WC->Lock}; // Copy the filename, because llvm::ThreadPool copied the input "const @@ -247,6 +247,35 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, // invalid outside of this packaged task. std::string Filename = Input.Filename; + using ::llvm::memprof::RawMemProfReader; + if (RawMemProfReader::hasFormat(Input.Filename)) { + auto ReaderOrErr = RawMemProfReader::create(Input.Filename, ProfiledBinary); + if (!ReaderOrErr) { + exitWithError(ReaderOrErr.takeError(), Input.Filename); + } + std::unique_ptr Reader = std::move(ReaderOrErr.get()); + // Check if the profile types can be merged, e.g. clang frontend profiles + // should not be merged with memprof profiles. + if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) { + consumeError(std::move(E)); + WC->Errors.emplace_back( + make_error( + "Cannot merge MemProf profile with Clang generated profile.", + std::error_code()), + Filename); + return; + } + + // Add the records into the writer context. 
+ for (const memprof::MemProfRecord &MR : *Reader) { + WC->Writer.addRecord(MR, [&](Error E) { + instrprof_error IPE = InstrProfError::take(std::move(E)); + WC->Errors.emplace_back(make_error(IPE), Filename); + }); + } + return; + } + auto ReaderOrErr = InstrProfReader::create(Input.Filename, Correlator); if (Error E = ReaderOrErr.takeError()) { // Skip the empty profiles by returning sliently. @@ -332,7 +361,8 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, StringRef OutputFilename, ProfileFormat OutputFormat, bool OutputSparse, - unsigned NumThreads, FailureMode FailMode) { + unsigned NumThreads, FailureMode FailMode, + const StringRef ProfiledBinary) { if (OutputFormat != PF_Binary && OutputFormat != PF_Compact_Binary && OutputFormat != PF_Ext_Binary && OutputFormat != PF_Text) exitWithError("unknown format is specified"); @@ -365,14 +395,15 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, if (NumThreads == 1) { for (const auto &Input : Inputs) - loadInput(Input, Remapper, Correlator.get(), Contexts[0].get()); + loadInput(Input, Remapper, Correlator.get(), ProfiledBinary, + Contexts[0].get()); } else { ThreadPool Pool(hardware_concurrency(NumThreads)); // Load the inputs in parallel (N/NumThreads serial steps). 
unsigned Ctx = 0; for (const auto &Input : Inputs) { - Pool.async(loadInput, Input, Remapper, Correlator.get(), + Pool.async(loadInput, Input, Remapper, Correlator.get(), ProfiledBinary, Contexts[Ctx].get()); Ctx = (Ctx + 1) % NumThreads; } @@ -589,7 +620,7 @@ static void supplementInstrProfile( SmallSet WriterErrorCodes; auto WC = std::make_unique(OutputSparse, ErrorLock, WriterErrorCodes); - loadInput(Inputs[0], nullptr, nullptr, WC.get()); + loadInput(Inputs[0], nullptr, nullptr, /*ProfiledBinary=*/"", WC.get()); if (WC->Errors.size() > 0) exitWithError(std::move(WC->Errors[0].first), InstrFilename); @@ -969,6 +1000,9 @@ static int merge_main(int argc, const char *argv[]) { cl::opt DebugInfoFilename( "debug-info", cl::init(""), cl::desc("Use the provided debug info to correlate the raw profile.")); + cl::opt ProfiledBinary( + "profiled-binary", cl::init(""), + cl::desc("Path to binary from which the profile was collected.")); cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); @@ -1011,7 +1045,7 @@ static int merge_main(int argc, const char *argv[]) { if (ProfileKind == instr) mergeInstrProfile(WeightedInputs, DebugInfoFilename, Remapper.get(), OutputFilename, OutputFormat, OutputSparse, NumThreads, - FailureMode); + FailureMode, ProfiledBinary); else mergeSampleProfile(WeightedInputs, Remapper.get(), OutputFilename, OutputFormat, ProfileSymbolListFile, CompressAllSections, @@ -1042,7 +1076,7 @@ static void overlapInstrProfile(const std::string &BaseFilename, OS << "Sum of edge counts for profile " << TestFilename << " is 0.\n"; exit(0); } - loadInput(WeightedInput, nullptr, nullptr, &Context); + loadInput(WeightedInput, nullptr, nullptr, /*ProfiledBinary=*/"", &Context); overlapInput(BaseFilename, TestFilename, &Context, Overlap, FuncFilter, OS, IsCS); Overlap.dump(OS); diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index 7bdd6c2992859..434e6aaee8b02 100644 --- 
a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -12,6 +12,7 @@ #include "llvm/IR/Module.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Compression.h" #include "llvm/Testing/Support/Error.h" #include "llvm/Testing/Support/SupportHelpers.h" @@ -221,6 +222,67 @@ TEST_F(InstrProfTest, test_writer_merge) { ASSERT_EQ(0U, R->Counts[1]); } +TEST_F(InstrProfTest, test_memprof) { + ASSERT_THAT_ERROR(Writer.mergeProfileKind(InstrProfKind::MemProf), + Succeeded()); + llvm::memprof::MemProfRecord MR; + MR.CallStack.push_back({0x123, 1, 2, false}); + MR.CallStack.push_back({0x345, 3, 4, true}); + Writer.addRecord(MR, Err); + + auto Profile = Writer.writeBuffer(); + readProfile(std::move(Profile)); + + auto RecordsOr = Reader->getMemProfRecord(0x123); + ASSERT_THAT_ERROR(RecordsOr.takeError(), Succeeded()); + const auto Records = RecordsOr.get(); + ASSERT_EQ(Records.size(), 1U); + EXPECT_EQ(Records[0], MR); +} + +TEST_F(InstrProfTest, test_memprof_merge) { + Writer.addRecord({"func1", 0x1234, {42}}, Err); + + InstrProfWriter Writer2; + ASSERT_THAT_ERROR(Writer2.mergeProfileKind(InstrProfKind::MemProf), + Succeeded()); + + llvm::memprof::MemProfRecord MR; + MR.CallStack.push_back({0x123, 1, 2, false}); + MR.CallStack.push_back({0x345, 3, 4, true}); + Writer2.addRecord(MR, Err); + + ASSERT_THAT_ERROR(Writer.mergeProfileKind(Writer2.getProfileKind()), + Succeeded()); + Writer.mergeRecordsFromWriter(std::move(Writer2), Err); + + auto Profile = Writer.writeBuffer(); + readProfile(std::move(Profile)); + + Expected R = Reader->getInstrProfRecord("func1", 0x1234); + EXPECT_THAT_ERROR(R.takeError(), Succeeded()); + ASSERT_EQ(1U, R->Counts.size()); + ASSERT_EQ(42U, R->Counts[0]); + + auto RecordsOr = Reader->getMemProfRecord(0x123); + ASSERT_THAT_ERROR(RecordsOr.takeError(), Succeeded()); + const auto Records = RecordsOr.get(); + 
ASSERT_EQ(Records.size(), 1U); + EXPECT_EQ(Records[0], MR); +} + +TEST_F(InstrProfTest, test_memprof_invalid_add_record) { + llvm::memprof::MemProfRecord MR; + // At least one of the frames should be a non-inline frame. + MR.CallStack.push_back({0x123, 1, 2, true}); + MR.CallStack.push_back({0x345, 3, 4, true}); + + auto CheckErr = [](Error &&E) { + EXPECT_TRUE(ErrorEquals(instrprof_error::invalid_prof, std::move(E))); + }; + Writer.addRecord(MR, CheckErr); +} + static const char callee1[] = "callee1"; static const char callee2[] = "callee2"; static const char callee3[] = "callee3"; diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index f744b85d784c0..dc793178bd209 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -89,8 +89,8 @@ const DILineInfoSpecifier specifier() { DILineInfoSpecifier::FunctionNameKind::LinkageName); } -MATCHER_P4(FrameContains, Function, LineOffset, Column, Inline, "") { - const std::string ExpectedHash = std::to_string(llvm::MD5Hash(Function)); +MATCHER_P4(FrameContains, FunctionName, LineOffset, Column, Inline, "") { + const uint64_t ExpectedHash = llvm::Function::getGUID(FunctionName); if (arg.Function != ExpectedHash) { *result_listener << "Hash mismatch"; return false; @@ -103,6 +103,22 @@ MATCHER_P4(FrameContains, Function, LineOffset, Column, Inline, "") { return false; } +MATCHER_P(EqualsRecord, Want, "") { + if (arg == Want) + return true; + + std::string Explanation; + llvm::raw_string_ostream OS(Explanation); + OS << "\n Want: \n"; + Want.print(OS); + OS << "\n Got: \n"; + arg.print(OS); + OS.flush(); + + *result_listener << Explanation; + return false; +} + MemProfSchema getFullSchema() { MemProfSchema Schema; #define MIBEntryDef(NameTag, Name, Type) Schema.push_back(Meta::Name); @@ -184,4 +200,38 @@ TEST(MemProf, PortableWrapper) { EXPECT_EQ(3UL, ReadBlock.getAllocCpuId()); } +TEST(MemProf, RecordSerializationRoundTrip) { + 
const MemProfSchema Schema = getFullSchema(); + + llvm::SmallVector Records; + MemProfRecord MR; + + MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000, + /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3, + /*dealloc_cpu=*/4); + + MR.Info = PortableMemInfoBlock(Info); + MR.CallStack.push_back({0x123, 1, 2, false}); + MR.CallStack.push_back({0x345, 3, 4, false}); + Records.push_back(MR); + + MR.clear(); + MR.Info = PortableMemInfoBlock(Info); + MR.CallStack.push_back({0x567, 5, 6, false}); + MR.CallStack.push_back({0x789, 7, 8, false}); + Records.push_back(MR); + + std::string Buffer; + llvm::raw_string_ostream OS(Buffer); + serializeRecords(Records, Schema, OS); + OS.flush(); + + const llvm::SmallVector GotRecords = deserializeRecords( + Schema, reinterpret_cast(Buffer.data())); + + ASSERT_TRUE(!GotRecords.empty()); + EXPECT_EQ(GotRecords.size(), Records.size()); + EXPECT_THAT(GotRecords[0], EqualsRecord(Records[0])); + EXPECT_THAT(GotRecords[1], EqualsRecord(Records[1])); +} } // namespace From 9febd1e573fb8b3d1de5844b7bfd33eb998f0106 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Mon, 14 Feb 2022 17:05:03 -0800 Subject: [PATCH 173/748] Fix race condition when launching and attaching. We discovered that when using "launchCommands" or "attachCommands" that there was an issue where these commands were not being run synchronously. There were further problems in this case where we would get thread events for the process that was just launched or attached before the IDE was ready, which is after "configurationDone" was sent to lldb-vscode. This fix introduces the ability to wait for the process to stop after the run or attach to ensure that we have a stopped process at the entry point that is ready for the debug session to proceed. This also allows us to run the normal launch or attach without needing to play with the async flag the debugger. 
We spin up the thread that listens for process events before we start the launch or attach, but we stop the first eStateStopped (with stop ID of zero) event from being delivered through the DAP protocol because the "configurationDone" request handler will deliver it manually as the IDE expects a stop after configuration done. The request_configurationDone will also only deliver the stop packet if the "stopOnEntry" is False in the launch configuration. Also added a new "timeout" to the launch and attach launch configuration arguments that can be set and defaults to 30 seconds. Since we now poll to detect when the process is stopped, we need a timeout that can be changed in case certain workflows take longer than 30 seconds to attach. If the process is not stopped by the timeout, an error will be returned for the launch or attach. Added a flag to the vscode.py protocol classes that detects and ensures that no "stopped" events are sent before "configurationDone" has been sent and will raise an error if it does happen. This should make our launching and attaching more reliable and avoid some deadlocks that were being seen (https://reviews.llvm.org/D119548). Differential Revision: https://reviews.llvm.org/D119797 --- .../test/tools/lldb-vscode/vscode.py | 3 ++ .../lldb-vscode/launch/TestVSCode_launch.py | 2 +- lldb/tools/lldb-vscode/VSCode.cpp | 52 +++++++++++++++++++ lldb/tools/lldb-vscode/VSCode.h | 13 +++++ lldb/tools/lldb-vscode/lldb-vscode.cpp | 34 +++++++----- lldb/tools/lldb-vscode/package.json | 10 +++- 6 files changed, 99 insertions(+), 15 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py index 603b1545cd714..ae919fc2ed0c6 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py @@ -228,6 +228,9 @@ def handle_recv_packet(self, packet): # 'stopped' event.
We need to remember the thread stop # reasons since the 'threads' command doesn't return # that information. + if not self.configuration_done_sent: + raise ValueError("'stopped' event received before " + "configuationDone packet was sent") + self._process_stopped() tid = body['threadId'] self.thread_stop_reasons[tid] = body diff --git a/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py b/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py index ff798364c9573..8c0000bdb1546 100644 --- a/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py +++ b/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py @@ -374,7 +374,7 @@ def test_commands(self): @skipIfRemote def test_extra_launch_commands(self): ''' - Tests the "luanchCommands" with extra launching settings + Tests the "launchCommands" with extra launching settings ''' self.build_and_create_debug_adaptor() program = self.getBuildArtifact("a.out") diff --git a/lldb/tools/lldb-vscode/VSCode.cpp b/lldb/tools/lldb-vscode/VSCode.cpp index 3209eea4a897f..a6fe7f840a566 100644 --- a/lldb/tools/lldb-vscode/VSCode.cpp +++ b/lldb/tools/lldb-vscode/VSCode.cpp @@ -528,6 +528,58 @@ void VSCode::RegisterRequestCallback(std::string request, request_handlers[request] = callback; } +lldb::SBError VSCode::WaitForProcessToStop(uint32_t seconds) { + // Wait for the process to hit a stopped state. When running a launch (with or + // without "launchCommands") or attach (with or without "attachCommands"), + // the calls might take some time to stop at the entry point since the command + // is asynchronous. So we need to sync up with the process and make sure it is + // stopped before we proceed to do anything else as we will soon be asked to + // set breakpoints and other things that require the process to be stopped. + // We must use polling because attach doesn't send a process state change + // event for the first stop, while launching does.
Since both "attachCommands" + // and "launchCommands" could end up using any combination of LLDB commands, + // we must ensure we can also catch when the process stops, so we must poll + // the process to make sure we handle all cases. + + lldb::SBError error; + lldb::SBProcess process = target.GetProcess(); + if (!process.IsValid()) { + error.SetErrorString("invalid process"); + return error; + } + auto timeout_time = + std::chrono::high_resolution_clock::now() + std::chrono::seconds(seconds); + while (std::chrono::high_resolution_clock::now() < timeout_time) { + const auto state = process.GetState(); + switch (state) { + case lldb::eStateAttaching: + case lldb::eStateConnected: + case lldb::eStateInvalid: + case lldb::eStateLaunching: + case lldb::eStateRunning: + case lldb::eStateStepping: + case lldb::eStateSuspended: + break; + case lldb::eStateDetached: + error.SetErrorString("process detached during launch or attach"); + return error; + case lldb::eStateExited: + error.SetErrorString("process exited during launch or attach"); + return error; + case lldb::eStateUnloaded: + error.SetErrorString("process unloaded during launch or attach"); + return error; + case lldb::eStateCrashed: + case lldb::eStateStopped: + return lldb::SBError(); // Success! + } + std::this_thread::sleep_for(std::chrono::microseconds(250)); + } + error.SetErrorStringWithFormat("process failed to stop within %u seconds", + seconds); + return error; +} + void Variables::Clear() { locals.Clear(); globals.Clear(); diff --git a/lldb/tools/lldb-vscode/VSCode.h b/lldb/tools/lldb-vscode/VSCode.h index 602cf758a9a17..bc868760eb830 100644 --- a/lldb/tools/lldb-vscode/VSCode.h +++ b/lldb/tools/lldb-vscode/VSCode.h @@ -243,6 +243,19 @@ struct VSCode { /// Debuggee will continue from stopped state. void WillContinue() { variables.Clear(); } + /// Poll the process to wait for it to reach the eStateStopped state. 
+ /// + /// We need to ensure the process is stopped and ready to resume before we + /// continue with the launch or attach. This is needed since we no longer play + /// with the synchronous mode in the debugger for launching (with or without + /// "launchCommands") or attaching (with or without "attachCommands"). + /// + /// \param[in] seconds + /// The number of seconds to poll the process to wait until it is stopped. + /// + /// \return Error if waiting for the process fails, no error if succeeds. + lldb::SBError WaitForProcessToStop(uint32_t seconds); + private: // Send the JSON in "json_str" to the "out" stream. Correctly send the // "Content-Length:" field followed by the length, followed by the raw diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp index 97ec4b578cf7c..734b23afc9b28 100644 --- a/lldb/tools/lldb-vscode/lldb-vscode.cpp +++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp @@ -449,10 +449,18 @@ void EventThreadFunction() { case lldb::eStateSuspended: break; case lldb::eStateStopped: - // Only report a stopped event if the process was not restarted. - if (!lldb::SBProcess::GetRestartedFromEvent(event)) { - SendStdOutStdErr(process); - SendThreadStoppedEvent(); + // Now that we don't mess with the async setting in the debugger + // when launching or attaching we will get the first process stop + // event which we do not want to send an event for. This is because + // we either manually deliver the event by calling the + // SendThreadStoppedEvent() from request_configurationDone() if we + // want to stop on entry, or we resume from that function. + if (process.GetStopID() > 1) { + // Only report a stopped event if the process was not restarted.
+ if (!lldb::SBProcess::GetRestartedFromEvent(event)) { + SendStdOutStdErr(process); + SendThreadStoppedEvent(); + } } break; case lldb::eStateRunning: @@ -600,6 +608,7 @@ void request_attach(const llvm::json::Object &request) { g_vsc.terminate_commands = GetStrings(arguments, "terminateCommands"); auto attachCommands = GetStrings(arguments, "attachCommands"); llvm::StringRef core_file = GetString(arguments, "coreFile"); + const uint64_t timeout_seconds = GetUnsigned(arguments, "timeout", 30); g_vsc.stop_at_entry = core_file.empty() ? GetBoolean(arguments, "stopOnEntry", false) : true; std::vector postRunCommands = @@ -640,15 +649,10 @@ void request_attach(const llvm::json::Object &request) { } if (attachCommands.empty()) { // No "attachCommands", just attach normally. - // Disable async events so the attach will be successful when we return from - // the launch call and the launch will happen synchronously - g_vsc.debugger.SetAsync(false); if (core_file.empty()) g_vsc.target.Attach(attach_info, error); else g_vsc.target.LoadCore(core_file.data(), error); - // Reenable async events - g_vsc.debugger.SetAsync(true); } else { // We have "attachCommands" that are a set of commands that are expected // to execute the commands after which a process should be created. If there @@ -658,6 +662,9 @@ void request_attach(const llvm::json::Object &request) { // selected target after these commands are run. g_vsc.target = g_vsc.debugger.GetSelectedTarget(); } + // Make sure the process is attached and stopped before proceeding. 
+ if (error.Success()) + error = g_vsc.WaitForProcessToStop(timeout_seconds); if (error.Success() && core_file.empty()) { auto attached_pid = g_vsc.target.GetProcess().GetProcessID(); @@ -1652,6 +1659,7 @@ void request_launch(const llvm::json::Object &request) { GetStrings(arguments, "postRunCommands"); g_vsc.stop_at_entry = GetBoolean(arguments, "stopOnEntry", false); const llvm::StringRef debuggerRoot = GetString(arguments, "debuggerRoot"); + const uint64_t timeout_seconds = GetUnsigned(arguments, "timeout", 30); // This is a hack for loading DWARF in .o files on Mac where the .o files // in the debug map of the main executable have relative paths which require @@ -1716,17 +1724,17 @@ void request_launch(const llvm::json::Object &request) { if (llvm::Error err = request_runInTerminal(request)) error.SetErrorString(llvm::toString(std::move(err)).c_str()); } else if (launchCommands.empty()) { - // Disable async events so the launch will be successful when we return from - // the launch call and the launch will happen synchronously - g_vsc.debugger.SetAsync(false); g_vsc.target.Launch(launch_info, error); - g_vsc.debugger.SetAsync(true); } else { g_vsc.RunLLDBCommands("Running launchCommands:", launchCommands); // The custom commands might have created a new target so we should use the // selected target after these commands are run. g_vsc.target = g_vsc.debugger.GetSelectedTarget(); } + // Make sure the process is launched and stopped at the entry point before + // proceeding. 
+ if (error.Success()) + error = g_vsc.WaitForProcessToStop(timeout_seconds); if (error.Fail()) { response["success"] = llvm::json::Value(false); diff --git a/lldb/tools/lldb-vscode/package.json b/lldb/tools/lldb-vscode/package.json index a5c79911f6e9f..bedc8f16ea26e 100644 --- a/lldb/tools/lldb-vscode/package.json +++ b/lldb/tools/lldb-vscode/package.json @@ -215,7 +215,7 @@ }, "launchCommands": { "type": "array", - "description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail.", + "description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail. Launch the process with \"process launch -s\" to make the process stop at the entry point since lldb-vscode will auto resume if necessary.", "default": [] }, "stopCommands": { @@ -232,6 +232,10 @@ "type": "boolean", "description": "Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs", "default": false + }, + "timeout": { + "type": "string", + "description": "The time in seconds to wait for a program to stop at entry point when launching. Defaults to 30 seconds." } } }, @@ -307,6 +311,10 @@ "coreFile": { "type": "string", "description": "Path to the core file to debug." + }, + "timeout": { + "type": "string", + "description": "The time in seconds to wait for a program to stop when attaching. Defaults to 30 seconds."
} } } From 84718d37db577f57514df6ac544e3db88aa75684 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Tue, 15 Feb 2022 20:23:44 +0000 Subject: [PATCH 174/748] [MLIR][GPU] Add gpu.set_default_device op This op is added to allow MLIR code running on multi-GPU systems to select the GPU they want to execute operations on when no GPU is otherwise specified. Reviewed By: mehdi_amini Differential Revision: https://reviews.llvm.org/D119883 --- mlir/include/mlir/Dialect/GPU/GPUOps.td | 14 ++++++++- .../GPUCommon/GPUToLLVMConversion.cpp | 29 +++++++++++++++++++ .../ExecutionEngine/CudaRuntimeWrappers.cpp | 17 ++++++++--- .../ExecutionEngine/RocmRuntimeWrappers.cpp | 11 +++++-- mlir/test/Dialect/GPU/ops.mlir | 7 +++++ 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index d8236eafa9cf1..5d25892175b90 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -273,7 +273,7 @@ def GPU_GPUFuncOp : GPU_Op<"func", [ /// Returns the type of this function. /// FIXME: We should drive this via the ODS `type` param. - FunctionType getType() { + FunctionType getType() { return getTypeAttr().getValue().cast(); } @@ -1006,6 +1006,18 @@ def GPU_MemsetOp : GPU_Op<"memset", let hasFolder = 1; } +def GPU_SetDefaultDeviceOp : GPU_Op<"set_default_device", + [MemoryEffects<[MemWrite]>]>, + Arguments<(ins I32:$devIndex)> { + let summary = "Set default GPU for operations after this by index"; + let description = [{ + Operation that sets the current default GPU, using a zero-based index + into the set of GPUs on the system. The default GPU setting may be + thread-local. 
+ }]; + let assemblyFormat = "attr-dict $devIndex"; +} + def GPU_SubgroupMmaLoadMatrixOp : GPU_Op<"subgroup_mma_load_matrix", [MemoryEffects<[MemRead]>]>{ diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index a30bfaf5cce4d..1aa12500c5716 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -185,6 +185,10 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { {llvmPointerType /* void *dst */, llvmInt32Type /* unsigned int value */, llvmIntPtrType /* intptr_t sizeBytes */, llvmPointerType /* void *stream */}}; + FunctionCallBuilder setDefaultDeviceCallBuilder = { + "mgpuSetDefaultDevice", + llvmVoidType, + {llvmInt32Type /* uint32_t devIndex */}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -342,6 +346,21 @@ class ConvertMemsetOpToGpuRuntimeCallPattern matchAndRewrite(gpu::MemsetOp memsetOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; }; + +/// A rewrite pattern to convert gpu.set_default_device to a GPU runtime call. 
+/// Currently supports CUDA and ROCm (HIP) +class ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} + + LogicalResult + matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; } // namespace void GpuToLLVMConversionPass::runOnOperation() { @@ -844,6 +863,15 @@ LogicalResult ConvertMemsetOpToGpuRuntimeCallPattern::matchAndRewrite( return success(); } +LogicalResult ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::SetDefaultDeviceOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + Location loc = op.getLoc(); + setDefaultDeviceCallBuilder.create(loc, rewriter, {adaptor.devIndex()}); + rewriter.replaceOp(op, {}); + return success(); +} + std::unique_ptr> mlir::createGpuToLLVMConversionPass() { return std::make_unique(); @@ -861,6 +889,7 @@ void mlir::populateGpuToLLVMConversionPatterns( ConvertHostRegisterOpToGpuRuntimeCallPattern, ConvertMemcpyOpToGpuRuntimeCallPattern, ConvertMemsetOpToGpuRuntimeCallPattern, + ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern, ConvertWaitAsyncOpToGpuRuntimeCallPattern, ConvertWaitOpToGpuRuntimeCallPattern, ConvertAsyncYieldToGpuRuntimeCallPattern>(converter); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 9fb3c100feaed..dd66056289cec 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -35,16 +35,20 @@ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ }(expr) -// Make the primary context of device 0 current for the duration of the instance -// and restore the previous context on destruction. 
+thread_local static int32_t defaultDevice = 0; + +// Make the primary context of the current default device current for the +// duration +// of the instance and restore the previous context on destruction. class ScopedContext { public: ScopedContext() { - // Static reference to CUDA primary context for device ordinal 0. + // Static reference to CUDA primary context for device ordinal + // defaultDevice. static CUcontext context = [] { CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); CUdevice device; - CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/0)); + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); CUcontext ctx; // Note: this does not affect the current context. CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device)); @@ -187,3 +191,8 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType *descriptor, auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes; mgpuMemHostRegister(ptr, sizeBytes); } + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { + defaultDevice = device; + CUDA_REPORT_IF_ERROR(cudaSetDevice(device)); +} diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index 92358ed38d9cb..34363ccc61416 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -30,16 +30,18 @@ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ }(expr) +thread_local static int32_t defaultDevice = 0; + // Sets the `Context` for the duration of the instance and restores the previous // context on destruction. class ScopedContext { public: ScopedContext() { - // Static reference to HIP primary context for device ordinal 0. + // Static reference to HIP primary context for device ordinal defaultDevice. 
static hipCtx_t context = [] { HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0)); hipDevice_t device; - HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0)); + HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/defaultDevice)); hipCtx_t ctx; HIP_REPORT_IF_ERROR(hipDevicePrimaryCtxRetain(&ctx, device)); return ctx; @@ -199,3 +201,8 @@ mgpuMemGetDeviceMemRef1dInt32(int32_t *allocated, int32_t *aligned, mgpuMemGetDevicePointer(aligned, &devicePtr); return {devicePtr, devicePtr, offset, {size}, {stride}}; } + +extern "C" void mgpuSetDefaultDevice(int32_t device) { + defaultDevice = device; + HIP_REPORT_IF_ERROR(hipSetDevice(device)); +} diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index c1c5ff5570832..c317dbc930480 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -252,4 +252,11 @@ module attributes {gpu.container_module} { gpu.device_async_wait %token {numGroups = 1 : i32} return } + + // CHECK-LABEL: func @set_default_device + func @set_default_device(%arg0: i32) { + // CHECK: gpu.set_default_device + gpu.set_default_device %arg0 + return + } } From 1689b1092ebb2c630f8ef1d3880a9fb4808d16fa Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 17 Feb 2022 16:33:24 -0500 Subject: [PATCH 175/748] unbreak Modules/cxx20-export-import.cpp with LLVM_APPEND_VC_REV after 32b73bc6ab82 See revision b8b7a9dcdcbc for prior art. --- clang/include/clang/Serialization/ASTBitCodes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index f98e173b158c1..c94274ff34b8f 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -41,7 +41,7 @@ namespace serialization { /// Version 4 of AST files also requires that the version control branch and /// revision match exactly, since there is no backward compatibility of /// AST files at this time. 
-const unsigned VERSION_MAJOR = 15; +const unsigned VERSION_MAJOR = 16; /// AST file minor version number supported by this version of /// Clang. From 62914bad46cf0b010e3277197dc3114fdf0d8b79 Mon Sep 17 00:00:00 2001 From: Mitch Phillips <31459023+hctim@users.noreply.github.com> Date: Thu, 17 Feb 2022 13:27:13 -0800 Subject: [PATCH 176/748] [ASan] Fix TLS teardown. TLS teardown is currently broken, as we unpoison the shadow a little bit and to the right of the TLS section, rather than the full TLS section itself. This currently breaks at -O0, and breaks with some upcoming globals code that I have. Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D120080 --- compiler-rt/lib/asan/asan_thread.cpp | 4 +--- compiler-rt/test/asan/TestCases/Linux/unpoison_tls.cpp | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/asan/asan_thread.cpp b/compiler-rt/lib/asan/asan_thread.cpp index 2b06c3c4e7c04..c15963e141832 100644 --- a/compiler-rt/lib/asan/asan_thread.cpp +++ b/compiler-rt/lib/asan/asan_thread.cpp @@ -323,9 +323,7 @@ void AsanThread::ClearShadowForThreadStackAndTLS() { if (tls_begin_ != tls_end_) { uptr tls_begin_aligned = RoundDownTo(tls_begin_, ASAN_SHADOW_GRANULARITY); uptr tls_end_aligned = RoundUpTo(tls_end_, ASAN_SHADOW_GRANULARITY); - FastPoisonShadowPartialRightRedzone(tls_begin_aligned, - tls_end_ - tls_begin_aligned, - tls_end_aligned - tls_end_, 0); + FastPoisonShadow(tls_begin_aligned, tls_end_aligned - tls_begin_aligned, 0); } } diff --git a/compiler-rt/test/asan/TestCases/Linux/unpoison_tls.cpp b/compiler-rt/test/asan/TestCases/Linux/unpoison_tls.cpp index e22345342f3f5..8b405ac5f8651 100644 --- a/compiler-rt/test/asan/TestCases/Linux/unpoison_tls.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/unpoison_tls.cpp @@ -1,6 +1,7 @@ // Test that TLS is unpoisoned on thread death. 
// REQUIRES: x86-target-arch && !android +// RUN: %clangxx_asan -O0 %s -pthread -o %t && %run %t 2>&1 // RUN: %clangxx_asan -O1 %s -pthread -o %t && %run %t 2>&1 #include From c5803ee4faa7305f7dcb58170acdb3247f405e04 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Thu, 17 Feb 2022 21:37:15 +0000 Subject: [PATCH 177/748] [MLIR][GPU] Remove call to cudaSetDevice(), which no longer exists Differential Revision: https://reviews.llvm.org/D120085 --- mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index dd66056289cec..44ed5b0cd2057 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -194,5 +194,4 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType *descriptor, extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { defaultDevice = device; - CUDA_REPORT_IF_ERROR(cudaSetDevice(device)); } From 7debcad0d0853cca6d038a0b0632e9061a046540 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 17 Feb 2022 21:41:09 +0000 Subject: [PATCH 178/748] [gn build] Port 807ba7aace18 --- llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn index ee186bded1111..39fe42efd481a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn @@ -13,6 +13,7 @@ static_library("ProfileData") { "InstrProfCorrelator.cpp", "InstrProfReader.cpp", "InstrProfWriter.cpp", + "MemProf.cpp", "ProfileSummaryBuilder.cpp", "RawMemProfReader.cpp", "SampleProf.cpp", From 383ed82dd1f8c4ea7e88a4cf92dd918dda794854 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 11 Feb 2022 22:13:28 -0500 Subject: [PATCH 179/748] [clang] Pass more 
flags to ld64.lld * ld64.lld now completely supports -export_dynamic (D119372), so map -rdynamic to -export_dynamic like already done for ld64 * ld64.lld has been supporting -object_path_lto for well over a year (D92537), so pass it like already done for ld64 Differential Revision: https://reviews.llvm.org/D119612 --- clang/lib/Driver/ToolChains/Darwin.cpp | 16 ++++++++++------ clang/test/Driver/darwin-ld-lto-lld.c | 19 +++++++++++++++++++ clang/test/Driver/darwin-ld-lto.c | 8 ++++---- clang/test/Driver/darwin-ld.c | 3 +++ 4 files changed, 36 insertions(+), 10 deletions(-) create mode 100644 clang/test/Driver/darwin-ld-lto-lld.c diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index dc75b2b4621bb..005236c4476f4 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -219,9 +219,8 @@ void darwin::Linker::AddLinkArgs(Compilation &C, const ArgList &Args, !Args.hasArg(options::OPT_Z_Xlinker__no_demangle)) CmdArgs.push_back("-demangle"); - // FIXME: Pass most of the flags below that check Version if LinkerIsLLD too. 
- - if (Args.hasArg(options::OPT_rdynamic) && Version >= VersionTuple(137)) + if (Args.hasArg(options::OPT_rdynamic) && + (Version >= VersionTuple(137) || LinkerIsLLD)) CmdArgs.push_back("-export_dynamic"); // If we are using App Extension restrictions, pass a flag to the linker @@ -230,7 +229,8 @@ void darwin::Linker::AddLinkArgs(Compilation &C, const ArgList &Args, options::OPT_fno_application_extension, false)) CmdArgs.push_back("-application_extension"); - if (D.isUsingLTO() && Version >= VersionTuple(116) && NeedsTempPath(Inputs)) { + if (D.isUsingLTO() && (Version >= VersionTuple(116) || LinkerIsLLD) && + NeedsTempPath(Inputs)) { std::string TmpPathName; if (D.getLTOMode() == LTOK_Full) { // If we are using full LTO, then automatically create a temporary file @@ -269,8 +269,11 @@ void darwin::Linker::AddLinkArgs(Compilation &C, const ArgList &Args, CmdArgs.push_back(C.getArgs().MakeArgString(LibLTOPath)); } - // ld64 version 262 and above run the deduplicate pass by default. - if (Version >= VersionTuple(262) && shouldLinkerNotDedup(C.getJobs().empty(), Args)) + // ld64 version 262 and above runs the deduplicate pass by default. + // FIXME: lld doesn't dedup by default. Should we pass `--icf=safe` + // if `!shouldLinkerNotDedup()` if LinkerIsLLD here? + if (Version >= VersionTuple(262) && + shouldLinkerNotDedup(C.getJobs().empty(), Args)) CmdArgs.push_back("-no_deduplicate"); // Derived from the "link" spec. @@ -368,6 +371,7 @@ void darwin::Linker::AddLinkArgs(Compilation &C, const ArgList &Args, // Check if the toolchain supports bitcode build flow. if (MachOTC.SupportsEmbeddedBitcode()) { CmdArgs.push_back("-bitcode_bundle"); + // FIXME: Pass this if LinkerIsLLD too, once it implements this flag. 
if (C.getDriver().embedBitcodeMarkerOnly() && Version >= VersionTuple(278)) { CmdArgs.push_back("-bitcode_process_mode"); diff --git a/clang/test/Driver/darwin-ld-lto-lld.c b/clang/test/Driver/darwin-ld-lto-lld.c new file mode 100644 index 0000000000000..2d1ed86ebcda8 --- /dev/null +++ b/clang/test/Driver/darwin-ld-lto-lld.c @@ -0,0 +1,19 @@ +// REQUIRES: shell + +// Check that lld gets "-lto_library". +// (Separate test file since darwin-ld-lto requires system-darwin but this +// test doesn't require that.) + +// Check that -object_lto_path is passed correctly to ld64 +// RUN: %clang -fuse-ld=lld -B%S/Inputs/lld -target x86_64-apple-darwin10 \ +// RUN: %s -flto=full -### 2>&1 \ +// RUN: | FileCheck -check-prefix=FULL_LTO_OBJECT_PATH %s +// FULL_LTO_OBJECT_PATH: {{ld(.exe)?"}} +// FULL_LTO_OBJECT_PATH-SAME: "-object_path_lto" +// FULL_LTO_OBJECT_PATH-SAME: {{cc\-[a-zA-Z0-9_]+.o}}" +// RUN: %clang -fuse-ld=lld -B%S/Inputs/lld -target x86_64-apple-darwin10 \ +// RUN: %s -flto=thin -### 2>&1 \ +// RUN: | FileCheck -check-prefix=THIN_LTO_OBJECT_PATH %s +// THIN_LTO_OBJECT_PATH: {{ld(.exe)?"}} +// THIN_LTO_OBJECT_PATH-SAME: "-object_path_lto" +// THIN_LTO_OBJECT_PATH-SAME: {{thinlto\-[a-zA-Z0-9_]+}} diff --git a/clang/test/Driver/darwin-ld-lto.c b/clang/test/Driver/darwin-ld-lto.c index 252ca148c5200..441a07d0a7e56 100644 --- a/clang/test/Driver/darwin-ld-lto.c +++ b/clang/test/Driver/darwin-ld-lto.c @@ -20,13 +20,13 @@ // Check that -object_lto_path is passed correctly to ld64 -// RUN: %clang -target x86_64-apple-darwin10 %s -flto=full -### 2>&1 | \ -// RUN: FileCheck -check-prefix=FULL_LTO_OBJECT_PATH %s +// RUN: %clang -fuse-ld= -target x86_64-apple-darwin10 %s -flto=full -### 2>&1 \ +// RUN: | FileCheck -check-prefix=FULL_LTO_OBJECT_PATH %s // FULL_LTO_OBJECT_PATH: {{ld(.exe)?"}} // FULL_LTO_OBJECT_PATH-SAME: "-object_path_lto" // FULL_LTO_OBJECT_PATH-SAME: {{cc\-[a-zA-Z0-9_]+.o}}" -// RUN: %clang -target x86_64-apple-darwin10 %s -flto=thin -### 2>&1 | \ -// RUN: 
FileCheck -check-prefix=THIN_LTO_OBJECT_PATH %s +// RUN: %clang -fuse-ld= -target x86_64-apple-darwin10 %s -flto=thin -### 2>&1 \ +// RUN: | FileCheck -check-prefix=THIN_LTO_OBJECT_PATH %s // THIN_LTO_OBJECT_PATH: {{ld(.exe)?"}} // THIN_LTO_OBJECT_PATH-SAME: "-object_path_lto" // THIN_LTO_OBJECT_PATH-SAME: {{thinlto\-[a-zA-Z0-9_]+}} diff --git a/clang/test/Driver/darwin-ld.c b/clang/test/Driver/darwin-ld.c index daf0fb8cda7cf..a678bba7a8003 100644 --- a/clang/test/Driver/darwin-ld.c +++ b/clang/test/Driver/darwin-ld.c @@ -236,6 +236,9 @@ // RUN: %clang -target x86_64-apple-darwin12 -rdynamic -### %t.o \ // RUN: -fuse-ld= -mlinker-version=137 2> %t.log // RUN: FileCheck -check-prefix=LINK_EXPORT_DYNAMIC %s < %t.log +// RUN: %clang -target x86_64-apple-darwin12 -rdynamic -### %t.o \ +// RUN: -fuse-ld=lld -B%S/Inputs/lld -mlinker-version=100 2> %t.log +// RUN: FileCheck -check-prefix=LINK_EXPORT_DYNAMIC %s < %t.log // LINK_EXPORT_DYNAMIC: {{ld(.exe)?"}} // LINK_EXPORT_DYNAMIC: "-export_dynamic" From 86e26f09a4039837bc9954d963fdc5745ff413be Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 16 Feb 2022 16:32:09 -0800 Subject: [PATCH 180/748] [MTE] Instrument use-after-scope for optnone functions. We always need the Dominator and PostDominatorTree for use-after-scope. 
Reviewed By: eugenis Differential Revision: https://reviews.llvm.org/D119993 --- llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 589074e16d9b0..4cb1db3ad75a7 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -508,8 +508,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (auto *P = getAnalysisIfAvailable()) DT = &P->getDomTree(); - if (DT == nullptr && (SInfo.AllocasToInstrument.size() > 1 || - !F->hasFnAttribute(Attribute::OptimizeNone))) { + if (DT == nullptr) { DeleteDT = std::make_unique(*F); DT = DeleteDT.get(); } @@ -519,7 +518,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { if (auto *P = getAnalysisIfAvailable()) PDT = &P->getPostDomTree(); - if (PDT == nullptr && !F->hasFnAttribute(Attribute::OptimizeNone)) { + if (PDT == nullptr) { DeletePDT = std::make_unique(*F); PDT = DeletePDT.get(); } From f755806813224ca6e8c61313644ffcf48b0799b9 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 17 Feb 2022 22:53:20 +0100 Subject: [PATCH 181/748] Remove __uncvref; use __uncvref_t instead Reviewed By: Quuxplusone, #libc Spies: libcxx-commits Differential Revision: https://reviews.llvm.org/D119958 --- libcxx/include/__functional/bind.h | 8 ++-- libcxx/include/__hash_table | 2 +- libcxx/include/__memory/allocator_arg_t.h | 2 +- libcxx/include/__tree | 2 +- libcxx/include/__tuple | 5 +-- libcxx/include/experimental/functional | 16 +++----- libcxx/include/future | 36 ++-------------- libcxx/include/thread | 6 +-- libcxx/include/tuple | 9 ++-- libcxx/include/type_traits | 18 +++----- .../ctor2.compile.pass.cpp | 41 +++++++++++++++++++ .../futures.task.members/ctor2.fail.cpp | 35 ---------------- 12 files changed, 71 insertions(+), 109 deletions(-) create mode 100644 
libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp delete mode 100644 libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.fail.cpp diff --git a/libcxx/include/__functional/bind.h b/libcxx/include/__functional/bind.h index f584fc2e8d5e7..c352406f85613 100644 --- a/libcxx/include/__functional/bind.h +++ b/libcxx/include/__functional/bind.h @@ -25,9 +25,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct is_bind_expression : _If< - _IsSame<_Tp, typename __uncvref<_Tp>::type>::value, + _IsSame<_Tp, __uncvref_t<_Tp> >::value, false_type, - is_bind_expression::type> + is_bind_expression<__uncvref_t<_Tp> > > {}; #if _LIBCPP_STD_VER > 14 @@ -37,9 +37,9 @@ inline constexpr size_t is_bind_expression_v = is_bind_expression<_Tp>::value; template struct is_placeholder : _If< - _IsSame<_Tp, typename __uncvref<_Tp>::type>::value, + _IsSame<_Tp, __uncvref_t<_Tp> >::value, integral_constant, - is_placeholder::type> + is_placeholder<__uncvref_t<_Tp> > > {}; #if _LIBCPP_STD_VER > 14 diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 43c15d59a1932..36f2ef7a2c7ac 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -47,7 +47,7 @@ template struct __is_hash_value_type : false_type {}; template -struct __is_hash_value_type<_One> : __is_hash_value_type_imp::type> {}; +struct __is_hash_value_type<_One> : __is_hash_value_type_imp<__uncvref_t<_One> > {}; _LIBCPP_FUNC_VIS size_t __next_prime(size_t __n); diff --git a/libcxx/include/__memory/allocator_arg_t.h b/libcxx/include/__memory/allocator_arg_t.h index d93d99dea0c92..2d63b1f905e59 100644 --- a/libcxx/include/__memory/allocator_arg_t.h +++ b/libcxx/include/__memory/allocator_arg_t.h @@ -36,7 +36,7 @@ extern _LIBCPP_EXPORTED_FROM_ABI const allocator_arg_t allocator_arg; template struct __uses_alloc_ctor_imp { - typedef _LIBCPP_NODEBUG typename __uncvref<_Alloc>::type _RawAlloc; + typedef _LIBCPP_NODEBUG __uncvref_t<_Alloc> _RawAlloc; 
static const bool __ua = uses_allocator<_Tp, _RawAlloc>::value; static const bool __ic = is_constructible<_Tp, allocator_arg_t, _Alloc, _Args...>::value; diff --git a/libcxx/include/__tree b/libcxx/include/__tree index eeb6b1107e7c0..384d3835049bb 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -545,7 +545,7 @@ template struct __is_tree_value_type : false_type {}; template -struct __is_tree_value_type<_One> : __is_tree_value_type_imp::type> {}; +struct __is_tree_value_type<_One> : __is_tree_value_type_imp<__uncvref_t<_One> > {}; template struct __tree_key_value_types { diff --git a/libcxx/include/__tuple b/libcxx/include/__tuple index 9053c56518e0e..6d13bb24c5795 100644 --- a/libcxx/include/__tuple +++ b/libcxx/include/__tuple @@ -278,7 +278,7 @@ using __type_pack_element _LIBCPP_NODEBUG = typename decltype( #endif template -struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, __tuple_types<_Types...>> +struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, __tuple_types<_Types...> > { static_assert(_Ip < sizeof...(_Types), "tuple_element index out of range"); typedef _LIBCPP_NODEBUG __type_pack_element<_Ip, _Types...> type; @@ -469,8 +469,7 @@ template struct __tuple_like_with_size_imp : integral_constant {}; -template ::type> +template > using __tuple_like_with_size _LIBCPP_NODEBUG = __tuple_like_with_size_imp< __tuple_like<_RawTuple>::value, tuple_size<_RawTuple>, _ExpectedSize diff --git a/libcxx/include/experimental/functional b/libcxx/include/experimental/functional index 087f7c95f5a77..1291894aa088f 100644 --- a/libcxx/include/experimental/functional +++ b/libcxx/include/experimental/functional @@ -208,11 +208,9 @@ public: pair<_RandomAccessIterator2, _RandomAccessIterator2> operator ()(_RandomAccessIterator2 __f, _RandomAccessIterator2 __l) const { - static_assert ( std::is_same< - typename std::__uncvref::value_type>::type, - typename std::__uncvref::value_type>::type - >::value, - "Corpus and Pattern iterators must point to the same type" ); + 
static_assert(__is_same_uncvref::value_type, + typename iterator_traits<_RandomAccessIterator2>::value_type>::value, + "Corpus and Pattern iterators must point to the same type"); if (__f == __l ) return make_pair(__l, __l); // empty corpus if (__first_ == __last_) return make_pair(__f, __f); // empty pattern @@ -360,11 +358,9 @@ public: pair<_RandomAccessIterator2, _RandomAccessIterator2> operator ()(_RandomAccessIterator2 __f, _RandomAccessIterator2 __l) const { - static_assert ( std::is_same< - typename std::__uncvref::value_type>::type, - typename std::__uncvref::value_type>::type - >::value, - "Corpus and Pattern iterators must point to the same type" ); + static_assert(__is_same_uncvref::value_type, + typename std::iterator_traits<_RandomAccessIterator2>::value_type>::value, + "Corpus and Pattern iterators must point to the same type"); if (__f == __l ) return make_pair(__l, __l); // empty corpus if (__first_ == __last_) return make_pair(__f, __f); // empty pattern diff --git a/libcxx/include/future b/libcxx/include/future index 00f5a68ac6b65..15685dfb22cbf 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -1885,25 +1885,11 @@ public: _LIBCPP_INLINE_VISIBILITY packaged_task() _NOEXCEPT : __p_(nullptr) {} template ::type, - packaged_task - >::value - >::type - > + class = __enable_if_t, packaged_task>::value> > _LIBCPP_INLINE_VISIBILITY explicit packaged_task(_Fp&& __f) : __f_(_VSTD::forward<_Fp>(__f)) {} template ::type, - packaged_task - >::value - >::type - > + class = __enable_if_t, packaged_task>::value> > _LIBCPP_INLINE_VISIBILITY packaged_task(allocator_arg_t, const _Allocator& __a, _Fp&& __f) : __f_(allocator_arg, __a, _VSTD::forward<_Fp>(__f)), @@ -2014,25 +2000,11 @@ public: _LIBCPP_INLINE_VISIBILITY packaged_task() _NOEXCEPT : __p_(nullptr) {} template ::type, - packaged_task - >::value - >::type - > + class = __enable_if_t, packaged_task>::value> > _LIBCPP_INLINE_VISIBILITY explicit packaged_task(_Fp&& __f) : 
__f_(_VSTD::forward<_Fp>(__f)) {} template ::type, - packaged_task - >::value - >::type - > + class = __enable_if_t, packaged_task>::value> > _LIBCPP_INLINE_VISIBILITY packaged_task(allocator_arg_t, const _Allocator& __a, _Fp&& __f) : __f_(allocator_arg, __a, _VSTD::forward<_Fp>(__f)), diff --git a/libcxx/include/thread b/libcxx/include/thread index 5aa698e1e136d..b059cfb459905 100644 --- a/libcxx/include/thread +++ b/libcxx/include/thread @@ -228,11 +228,7 @@ public: thread() _NOEXCEPT : __t_(_LIBCPP_NULL_THREAD) {} #ifndef _LIBCPP_CXX03_LANG template ::type, thread>::value - >::type - > + class = __enable_if_t, thread>::value> > _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS explicit thread(_Fp&& __f, _Args&&... __args); #else // _LIBCPP_CXX03_LANG diff --git a/libcxx/include/tuple b/libcxx/include/tuple index a28a2e81db3ec..08ded9c916eb5 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -1455,9 +1455,10 @@ struct __tuple_cat_return_1 template struct __tuple_cat_return_1, true, _Tuple0> { - typedef _LIBCPP_NODEBUG typename __tuple_cat_type, - typename __make_tuple_types::type>::type>::type - type; + using type _LIBCPP_NODEBUG = typename __tuple_cat_type< + tuple<_Types...>, + typename __make_tuple_types<__uncvref_t<_Tuple0> >::type + >::type; }; template @@ -1465,7 +1466,7 @@ struct __tuple_cat_return_1, true, _Tuple0, _Tuple1, _Tuples... 
: public __tuple_cat_return_1< typename __tuple_cat_type< tuple<_Types...>, - typename __make_tuple_types::type>::type + typename __make_tuple_types<__uncvref_t<_Tuple0> >::type >::type, __tuple_like::type>::value, _Tuple1, _Tuples...> diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index 3098b3d79f051..54df709874e15 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -1274,33 +1274,25 @@ template decltype(__declval<_Tp>(0)) declval() _NOEXCEPT; -// __uncvref - -template -struct __uncvref { - typedef _LIBCPP_NODEBUG typename remove_cv::type>::type type; -}; - template struct __unconstref { typedef _LIBCPP_NODEBUG typename remove_const::type>::type type; }; -#ifndef _LIBCPP_CXX03_LANG template -using __uncvref_t _LIBCPP_NODEBUG = typename __uncvref<_Tp>::type; -#endif +using __uncvref_t _LIBCPP_NODEBUG = typename remove_cv::type>::type; // __is_same_uncvref template -struct __is_same_uncvref : _IsSame::type, - typename __uncvref<_Up>::type> {}; +struct __is_same_uncvref : _IsSame<__uncvref_t<_Tp>, __uncvref_t<_Up> > {}; #if _LIBCPP_STD_VER > 17 // remove_cvref - same as __uncvref template -struct remove_cvref : public __uncvref<_Tp> {}; +struct remove_cvref { + using type _LIBCPP_NODEBUG = __uncvref_t<_Tp>; +}; template using remove_cvref_t = typename remove_cvref<_Tp>::type; #endif diff --git a/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp b/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp new file mode 100644 index 0000000000000..2bc5c62758add --- /dev/null +++ b/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// UNSUPPORTED: libcpp-has-no-threads + +// + +// class packaged_task +// template +// packaged_task(allocator_arg_t, const Allocator& a, F&& f); +// These constructors shall not participate in overload resolution if +// decay::type is the same type as std::packaged_task. + +#include +#include + +#include "test_allocator.h" + +struct A {}; +using PT = std::packaged_task; +using VPT = volatile std::packaged_task; + +static_assert(!std::is_constructible, VPT>::value, ""); + +using PA = std::packaged_task; +using PI = std::packaged_task; + +static_assert(!std::is_constructible, const PA&>::value, ""); +static_assert(!std::is_constructible, const PA&&>::value, ""); +static_assert(!std::is_constructible, volatile PA&>::value, ""); +static_assert(!std::is_constructible, volatile PA&&>::value, ""); + +static_assert( std::is_constructible, const PI&>::value, ""); +static_assert( std::is_constructible, const PI&&>::value, ""); +static_assert( std::is_constructible, volatile PI&>::value, ""); +static_assert( std::is_constructible, volatile PI&&>::value, ""); diff --git a/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.fail.cpp b/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.fail.cpp deleted file mode 100644 index 578ac661675ae..0000000000000 --- a/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.fail.cpp +++ /dev/null @@ -1,35 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// UNSUPPORTED: libcpp-has-no-threads -// UNSUPPORTED: c++03 - -// - -// class packaged_task -// template -// packaged_task(allocator_arg_t, const Allocator& a, F&& f); -// These constructors shall not participate in overload resolution if -// decay::type is the same type as std::packaged_task. - -#include -#include - -#include "test_allocator.h" - -struct A {}; -typedef std::packaged_task PT; -typedef volatile std::packaged_task VPT; - -int main(int, char**) -{ - PT p { std::allocator_arg_t{}, test_allocator{}, VPT {}}; // expected-error {{no matching constructor for initialization of 'PT' (aka 'packaged_task')}} - // expected-note-re@future:* 1 {{candidate template ignored: {{(disabled by 'enable_if')|(requirement '.*' was not satisfied)}}}} - - return 0; -} From 0e219af475430ab338c9d76a101a78304a64f78a Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 17 Feb 2022 13:57:46 -0800 Subject: [PATCH 182/748] [clang] Remove Address::deprecated() call in CGExprCXX.cpp --- clang/lib/CodeGen/CGExprCXX.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 9596ed34e5e9d..54c87a7361b1c 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -2101,10 +2101,9 @@ void CodeGenFunction::EmitCXXDeleteExpr(const CXXDeleteExpr *E) { GEP.push_back(Zero); } - Ptr = Address::deprecated(Builder.CreateInBoundsGEP(Ptr.getElementType(), - Ptr.getPointer(), GEP, - "del.first"), - Ptr.getAlignment()); + Ptr = Address(Builder.CreateInBoundsGEP(Ptr.getElementType(), + Ptr.getPointer(), GEP, "del.first"), + ConvertTypeForMem(DeleteTy), Ptr.getAlignment()); } assert(ConvertTypeForMem(DeleteTy) == Ptr.getElementType()); From e217ebcc961b75f78e7647b9b93b928c572f1338 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan 
Date: Thu, 17 Feb 2022 21:55:07 +0000 Subject: [PATCH 183/748] [NFC][Flang] Add colon to CHECK-LABEL to exercise the check Reviewed By: clementval Differential Revision: https://reviews.llvm.org/D119995 --- flang/test/Lower/stop-statement.f90 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flang/test/Lower/stop-statement.f90 b/flang/test/Lower/stop-statement.f90 index 8b8935fc0bfdf..555750854187c 100644 --- a/flang/test/Lower/stop-statement.f90 +++ b/flang/test/Lower/stop-statement.f90 @@ -1,6 +1,6 @@ ! RUN: bbc %s -emit-fir --canonicalize -o - | FileCheck %s -! CHECK-LABEL stop_test +! CHECK-LABEL: stop_test subroutine stop_test() ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : i32 ! CHECK-DAG: %[[false:.*]] = arith.constant false @@ -10,7 +10,7 @@ subroutine stop_test() end subroutine -! CHECK-LABEL stop_error +! CHECK-LABEL: stop_error subroutine stop_error() error stop ! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : i32 @@ -20,7 +20,7 @@ subroutine stop_error() ! CHECK-NEXT: fir.unreachable end subroutine -! CHECK-LABEL stop_code +! CHECK-LABEL: stop_code subroutine stop_code() stop 42 ! CHECK-DAG: %[[c42:.*]] = arith.constant 42 : i32 @@ -29,7 +29,7 @@ subroutine stop_code() ! CHECK-NEXT: fir.unreachable end subroutine -! CHECK-LABEL stop_quiet_constant +! CHECK-LABEL: stop_quiet_constant subroutine stop_quiet_constant() stop, quiet = .true. ! CHECK-DAG: %[[true:.*]] = arith.constant true @@ -39,7 +39,7 @@ subroutine stop_quiet_constant() ! CHECK-NEXT: fir.unreachable end subroutine -! CHECK-LABEL stop_quiet +! CHECK-LABEL: stop_quiet subroutine stop_quiet() logical :: b stop, quiet = b @@ -52,7 +52,7 @@ subroutine stop_quiet() ! CHECK-NEXT: fir.unreachable end subroutine -! CHECK-LABEL stop_char_lit +! CHECK-LABEL: stop_char_lit subroutine stop_char_lit ! CHECK-DAG: %[[false:.*]] = arith.constant false ! 
CHECK-DAG: %[[five:.*]] = arith.constant 5 : index From 38d25aecdf72177d72ed40f3dfbbf1d3c726dc8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 25 Jan 2022 13:58:43 +0000 Subject: [PATCH 184/748] [libcxx] [test] Use proper UTF-8 locales on Windows Since Windows 10 version 1803 (10.0.17134.0) (or Windows Server 2019), the Windows Universal C Runtime (UCRT) actually does support UTF-8 locales - they're available e.g. via the same names as commonly on Unices, e.g. "en_US.UTF-8". The UTF-8 locale support unfortunately has a bug which breaks a couple tests that were passing previously. That bug is fixed in the very latest version of the UCRT (in UCRT 10.0.20348.0, available in Windows 11 or Windows Server 2022), so it will get resolved at some point eventually, provided that the CI environment does get upgraded to a newer version of Windows Server. While the net number of xfailed/passing tests in this patch is a loss, this does allow fixing a lot more locale tests properly for Windows in later patches. Intentionally not touching the ISO-8859-1/2 locales used for testing; they're not detected and tested/used right now, and fixing that up is another project. 
Differential Revision: https://reviews.llvm.org/D119930 --- libcxx/test/libcxx/selftest/dsl/dsl.sh.py | 22 ++++++++++++ .../filebuf.virtuals/overflow.pass.cpp | 2 -- .../filebuf.virtuals/underflow.pass.cpp | 2 -- .../locale.ctype.byname/tolower_1.pass.cpp | 1 + .../locale.ctype.byname/tolower_many.pass.cpp | 1 + .../locale.ctype.byname/toupper_1.pass.cpp | 1 + .../locale.ctype.byname/toupper_many.pass.cpp | 1 + .../get_monthname.pass.cpp | 2 -- .../get_weekday.pass.cpp | 2 -- .../re/re.traits/translate_nocase.pass.cpp | 1 + libcxx/test/support/platform_support.h | 35 +++++++------------ libcxx/utils/libcxx/test/dsl.py | 22 +++++++++--- libcxx/utils/libcxx/test/features.py | 12 +++++++ 13 files changed, 69 insertions(+), 35 deletions(-) diff --git a/libcxx/test/libcxx/selftest/dsl/dsl.sh.py b/libcxx/test/libcxx/selftest/dsl/dsl.sh.py index 5da2cc9f045ba..f4c48549a7c2d 100644 --- a/libcxx/test/libcxx/selftest/dsl/dsl.sh.py +++ b/libcxx/test/libcxx/selftest/dsl/dsl.sh.py @@ -216,6 +216,28 @@ def test_program_stderr_is_not_conflated_with_stdout(self): self.assertEqual(dsl.programOutput(self.config, source), "STDOUT-OUTPUT") +class TestProgramSucceeds(SetupConfigs): + """ + Tests for libcxx.test.dsl.programSucceeds + """ + def test_success(self): + source = """ + int main(int, char**) { return 0; } + """ + self.assertTrue(dsl.programSucceeds(self.config, source)) + + def test_failure(self): + source = """ + int main(int, char**) { return 1; } + """ + self.assertFalse(dsl.programSucceeds(self.config, source)) + + def test_compile_failure(self): + source = """ + this does not compile + """ + self.assertRaises(dsl.ConfigurationCompilationError, lambda: dsl.programSucceeds(self.config, source)) + class TestHasLocale(SetupConfigs): """ Tests for libcxx.test.dsl.hasLocale diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.pass.cpp index 
d4f8f0f2c322c..90161432a2d34 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.pass.cpp @@ -8,8 +8,6 @@ // REQUIRES: locale.en_US.UTF-8 -// XFAIL: LIBCXX-WINDOWS-FIXME - // // int_type overflow(int_type c = traits::eof()); diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp index 7adc07da9d380..21418bd1958a4 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp @@ -9,8 +9,6 @@ // REQUIRES: locale.en_US.UTF-8 // FILE_DEPENDENCIES: underflow.dat, underflow_utf8.dat -// XFAIL: LIBCXX-WINDOWS-FIXME - // // int_type underflow(); diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_1.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_1.pass.cpp index a85f89f46a527..5e2b81d06e5aa 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_1.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_1.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: locale.en_US.UTF-8 +// XFAIL: broken-utf8-wchar-ctype // diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_many.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_many.pass.cpp index 8363b57a504d3..289d569ab9fb0 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_many.pass.cpp +++ 
b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/tolower_many.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: locale.en_US.UTF-8 +// XFAIL: broken-utf8-wchar-ctype // diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_1.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_1.pass.cpp index 503cb198e3142..d957e1ee09919 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_1.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_1.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: locale.en_US.UTF-8 +// XFAIL: broken-utf8-wchar-ctype // diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_many.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_many.pass.cpp index ffb3b7ecdc1e2..7893a32204a9a 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_many.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.ctype.byname/toupper_many.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: locale.en_US.UTF-8 +// XFAIL: broken-utf8-wchar-ctype // diff --git a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_monthname.pass.cpp b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_monthname.pass.cpp index db6f845271f40..f2153f0c6620f 100644 --- a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_monthname.pass.cpp +++ 
b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_monthname.pass.cpp @@ -10,8 +10,6 @@ // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.zh_CN.UTF-8 -// XFAIL: LIBCXX-WINDOWS-FIXME - // // class time_get_byname diff --git a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_weekday.pass.cpp b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_weekday.pass.cpp index c2d706855a5e8..4a452f23fcd1b 100644 --- a/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_weekday.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.time/locale.time.get.byname/get_weekday.pass.cpp @@ -11,8 +11,6 @@ // REQUIRES: locale.ru_RU.UTF-8 // REQUIRES: locale.zh_CN.UTF-8 -// XFAIL: LIBCXX-WINDOWS-FIXME - // // class time_get_byname diff --git a/libcxx/test/std/re/re.traits/translate_nocase.pass.cpp b/libcxx/test/std/re/re.traits/translate_nocase.pass.cpp index d7eabc919812d..257c627f20eed 100644 --- a/libcxx/test/std/re/re.traits/translate_nocase.pass.cpp +++ b/libcxx/test/std/re/re.traits/translate_nocase.pass.cpp @@ -13,6 +13,7 @@ // charT translate_nocase(charT c) const; // REQUIRES: locale.en_US.UTF-8 +// XFAIL: broken-utf8-wchar-ctype #include #include diff --git a/libcxx/test/support/platform_support.h b/libcxx/test/support/platform_support.h index f8183d2fb4f09..7cfb21f794944 100644 --- a/libcxx/test/support/platform_support.h +++ b/libcxx/test/support/platform_support.h @@ -15,30 +15,21 @@ #define PLATFORM_SUPPORT_H // locale names -#ifdef _WIN32 - // WARNING: Windows does not support UTF-8 codepages. 
- // Locales are "converted" using https://docs.moodle.org/dev/Table_of_locales -# define LOCALE_en_US "en-US" -# define LOCALE_en_US_UTF_8 "en-US" -# define LOCALE_cs_CZ_ISO8859_2 "cs-CZ" -# define LOCALE_fr_FR_UTF_8 "fr-FR" -# define LOCALE_fr_CA_ISO8859_1 "fr-CA" -# define LOCALE_ru_RU_UTF_8 "ru-RU" -# define LOCALE_zh_CN_UTF_8 "zh-CN" +#define LOCALE_en_US "en_US" +#define LOCALE_en_US_UTF_8 "en_US.UTF-8" +#define LOCALE_fr_FR_UTF_8 "fr_FR.UTF-8" +#ifdef __linux__ +# define LOCALE_fr_CA_ISO8859_1 "fr_CA.ISO-8859-1" +# define LOCALE_cs_CZ_ISO8859_2 "cs_CZ.ISO-8859-2" +#elif defined(_WIN32) +# define LOCALE_fr_CA_ISO8859_1 "fr-CA" +# define LOCALE_cs_CZ_ISO8859_2 "cs-CZ" #else -# define LOCALE_en_US "en_US" -# define LOCALE_en_US_UTF_8 "en_US.UTF-8" -# define LOCALE_fr_FR_UTF_8 "fr_FR.UTF-8" -# ifdef __linux__ -# define LOCALE_fr_CA_ISO8859_1 "fr_CA.ISO-8859-1" -# define LOCALE_cs_CZ_ISO8859_2 "cs_CZ.ISO-8859-2" -# else -# define LOCALE_fr_CA_ISO8859_1 "fr_CA.ISO8859-1" -# define LOCALE_cs_CZ_ISO8859_2 "cs_CZ.ISO8859-2" -# endif -# define LOCALE_ru_RU_UTF_8 "ru_RU.UTF-8" -# define LOCALE_zh_CN_UTF_8 "zh_CN.UTF-8" +# define LOCALE_fr_CA_ISO8859_1 "fr_CA.ISO8859-1" +# define LOCALE_cs_CZ_ISO8859_2 "cs_CZ.ISO8859-2" #endif +#define LOCALE_ru_RU_UTF_8 "ru_RU.UTF-8" +#define LOCALE_zh_CN_UTF_8 "zh_CN.UTF-8" #include #include diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py index c50a7508cab30..791edb3406fc9 100644 --- a/libcxx/utils/libcxx/test/dsl.py +++ b/libcxx/utils/libcxx/test/dsl.py @@ -175,6 +175,22 @@ def programOutput(config, program, args=None): actualOut = actualOut.group(1) if actualOut else "" return actualOut +@_memoizeExpensiveOperation(lambda c, p, args=None: (c.substitutions, c.environment, p, args)) +def programSucceeds(config, program, args=None): + """ + Compiles a program for the test target, run it on the test target and return + whether it completed successfully. 
+ + Note that execution of the program is done through the %{exec} substitution, + which means that the program may be run on a remote host depending on what + %{exec} does. + """ + try: + programOutput(config, program, args) + except ConfigurationRuntimeError: + return False + return True + @_memoizeExpensiveOperation(lambda c, f: (c.substitutions, c.environment, f)) def hasCompileFlag(config, flag): """ @@ -229,11 +245,7 @@ def hasAnyLocale(config, locales): } #endif """ - try: - programOutput(config, program, args=[pipes.quote(l) for l in locales]) - except ConfigurationRuntimeError: - return False - return True + return programSucceeds(config, program, args=[pipes.quote(l) for l in locales]) @_memoizeExpensiveOperation(lambda c, flags='': (c.substitutions, c.environment, flags)) def compilerMacros(config, flags=''): diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 0dfb516ae5297..a2498cada94bb 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -73,6 +73,18 @@ void f() { new int(3); } """, ['-shared'])), + # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.20348.0): + # https://developercommunity.visualstudio.com/t/utf-8-locales-break-ctype-functions-for-wchar-type/1653678 + Feature(name='broken-utf8-wchar-ctype', + when=lambda cfg: '_WIN32' in compilerMacros(cfg) and not programSucceeds(cfg, """ + #include + #include + int main(int, char**) { + setlocale(LC_ALL, "en_US.UTF-8"); + return towlower(L'\\xDA') != L'\\xFA'; + } + """)), + # Whether Bash can run on the executor. # This is not always the case, for example when running on embedded systems. # From 8d58cb62da0f4fa67d49aa0614ae47bc233ff5d4 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Thu, 17 Feb 2022 12:23:58 -0800 Subject: [PATCH 185/748] [libcxx][test] Let the library indicate support for int128 Define `TEST_HAS_NO_INT128` accordingly. 
Differential Revision: https://reviews.llvm.org/D120010 --- libcxx/test/support/test_macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index dde0802c4c261..130f3d4f0a948 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -366,7 +366,7 @@ inline void DoNotOptimize(Tp const& value) { # define TEST_HAS_NO_UNICODE #endif -#if defined(_LIBCPP_HAS_NO_INT128) || defined(TEST_COMPILER_MSVC) +#if defined(_LIBCPP_HAS_NO_INT128) || defined(_MSVC_STL_VERSION) # define TEST_HAS_NO_INT128 #endif From 4986a41f58220e2b597de3ecf45de3714bb8ee23 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 17 Feb 2022 14:13:52 -0800 Subject: [PATCH 186/748] [M68k] Adopt VarLenCodeEmitter for bits instructions And introduce operand encoding fragments (i.e. MxEncMemOp record) for addressing modes 'o' and 'e'. --- llvm/lib/Target/M68k/M68kInstrBits.td | 75 ++++++++++++++---------- llvm/lib/Target/M68k/M68kInstrFormats.td | 10 ++++ llvm/test/MC/Disassembler/M68k/bits.txt | 3 + 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/M68k/M68kInstrBits.td b/llvm/lib/Target/M68k/M68kInstrBits.td index 0d12781023788..abd2ab3cf012c 100644 --- a/llvm/lib/Target/M68k/M68kInstrBits.td +++ b/llvm/lib/Target/M68k/M68kInstrBits.td @@ -32,9 +32,15 @@ /// ------------+---------+---------+---------+--------- /// 0 0 0 0 | REG | 1 0 0 | MODE | REG /// ------------+---------+---------+---------+--------- -class MxBTSTEnc_R - : MxEncoding, REG, MxBead4Bits<0b0000>, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; +class MxBTSTEnc_R { + dag Value = (ascend + (descend 0b0000, + (operand "$"#bitno_name, 3), + 0b100, dst_enc.EA + ), + dst_enc.Supplement + ); +} /// -------------------------------+---------+--------- /// F E D C B A 9 8 . 
7 6 | 5 4 3 | 2 1 0 @@ -43,33 +49,40 @@ class MxBTSTEnc_R /// ------------------------+------+---------+--------- /// 0 0 0 0 0 0 0 0 | BIT NUMBER /// ------------------------+-------------------------- -class MxBTSTEnc_I - : MxEncoding, - MxBead4Bits<0b1000>, MxBead4Bits<0b0000>, IMM, - EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; +class MxBTSTEnc_I { + dag Value = (ascend + (descend 0b0000100000, dst_enc.EA), + (descend 0b00000000, (operand "$"#bitno_name, 8)), + dst_enc.Supplement + ); +} let Defs = [CCR] in { class MxBTST_RR : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.VT:$bitno))], - MxBTSTEnc_R, MxEncEAd_0, MxExtEmpty>>; + [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.VT:$bitno))]> { + let Inst = MxBTSTEnc_R, "bitno">.Value; +} class MxBTST_RI : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.IPat:$bitno))], - MxBTSTEnc_I, MxEncEAd_0, MxExtEmpty>>; + [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.IPat:$bitno))]> { + let Inst = MxBTSTEnc_I, "bitno">.Value; +} class MxBTST_MR + MxEncMemOp DST_ENC> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))], - MxBTSTEnc_R, EA, EXT>>; + [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))]> { + let Inst = MxBTSTEnc_R.Value; +} class MxBTST_MI + MxEncMemOp DST_ENC> : MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst", - [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))], - MxBTSTEnc_I, EA, EXT>>; + [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))]> { + let Inst = MxBTSTEnc_I.Value; +} } // Defs = [CCR] // Register BTST limited to 32 bits only @@ -78,31 +91,31 @@ def BTST32di : MxBTST_RI; // Memory BTST limited to 8 bits only def BTST8jd : MxBTST_MR; + MxEncAddrMode_j<"dst">>; def BTST8od : MxBTST_MR; + MxEncAddrMode_o<"dst">>; def BTST8ed : MxBTST_MR; + 
MxEncAddrMode_e<"dst">>; def BTST8pd : MxBTST_MR; + MxEncAddrMode_p<"dst">>; def BTST8fd : MxBTST_MR; + MxEncAddrMode_f<"dst">>; def BTST8qd : MxBTST_MR; + MxEncAddrMode_q<"dst">>; def BTST8kd : MxBTST_MR; + MxEncAddrMode_k<"dst">>; def BTST8ji : MxBTST_MI; + MxEncAddrMode_j<"dst">>; def BTST8oi : MxBTST_MI; + MxEncAddrMode_o<"dst">>; def BTST8ei : MxBTST_MI; + MxEncAddrMode_e<"dst">>; def BTST8pi : MxBTST_MI; + MxEncAddrMode_p<"dst">>; def BTST8fi : MxBTST_MI; + MxEncAddrMode_f<"dst">>; def BTST8qi : MxBTST_MI; + MxEncAddrMode_q<"dst">>; def BTST8ki : MxBTST_MI; + MxEncAddrMode_k<"dst">>; diff --git a/llvm/lib/Target/M68k/M68kInstrFormats.td b/llvm/lib/Target/M68k/M68kInstrFormats.td index 0518faa77a283..4fe17b1fe656f 100644 --- a/llvm/lib/Target/M68k/M68kInstrFormats.td +++ b/llvm/lib/Target/M68k/M68kInstrFormats.td @@ -338,6 +338,16 @@ class MxEncAddrMode_abs : MxEncMemOp { ); } +class MxEncAddrMode_o : MxEncMemOp { + let EA = (descend /*MODE*/0b011, + /*REGISTER*/(operand "$"#reg_opnd, 3)); +} + +class MxEncAddrMode_e : MxEncMemOp { + let EA = (descend /*MODE*/0b100, + /*REGISTER*/(operand "$"#reg_opnd, 3)); +} + // Allows you to specify each bit of opcode class MxEncOpMode { MxBead B0 = b0; diff --git a/llvm/test/MC/Disassembler/M68k/bits.txt b/llvm/test/MC/Disassembler/M68k/bits.txt index c0a3001ffd265..f47693131d0c3 100644 --- a/llvm/test/MC/Disassembler/M68k/bits.txt +++ b/llvm/test/MC/Disassembler/M68k/bits.txt @@ -1,4 +1,7 @@ # RUN: llvm-mc -disassemble -triple m68k %s | FileCheck %s +# Disable this particular test until migration to the new code emitter is +# finished. 
+# XFAIL: * # CHECK: btst #0, %d3 0x08 0x03 0x00 0x00 From ba9944ea1dff507839df8e4cf9897a5d4916ec68 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 17 Feb 2022 14:22:39 -0800 Subject: [PATCH 187/748] [clang] Remove Address::deprecated() in CGCXXABI.h --- clang/lib/CodeGen/CGCXXABI.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGCXXABI.h b/clang/lib/CodeGen/CGCXXABI.h index ba073b3ff4e52..a46f7f37141f0 100644 --- a/clang/lib/CodeGen/CGCXXABI.h +++ b/clang/lib/CodeGen/CGCXXABI.h @@ -56,7 +56,10 @@ class CGCXXABI { return CGF.CXXABIThisValue; } Address getThisAddress(CodeGenFunction &CGF) { - return Address::deprecated(CGF.CXXABIThisValue, CGF.CXXABIThisAlignment); + return Address( + CGF.CXXABIThisValue, + CGF.ConvertTypeForMem(CGF.CXXABIThisDecl->getType()->getPointeeType()), + CGF.CXXABIThisAlignment); } /// Issue a diagnostic about unsupported features in the ABI. From 08a6229e214bd75aed975c610aaff6026c5d2880 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Thu, 17 Feb 2022 14:30:18 -0800 Subject: [PATCH 188/748] [LLDB] Adding skipif for arm linux for TestStructuredBinding.py --- .../API/lang/cpp/structured-binding/TestStructuredBinding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py index 9f57d45dd9fc1..603c2df7163e4 100644 --- a/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py +++ b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py @@ -7,6 +7,7 @@ class TestStructuredBinding(TestBase): mydir = TestBase.compute_mydir(__file__) + @skipIf(oslist=["linux"], archs=["arm"]) @skipIf(compiler="clang", compiler_version=['<', '14.0']) def test(self): self.build() From 9ce09099bba4be68d2a269b0bfd2b1dcc67f02d4 Mon Sep 17 00:00:00 2001 From: Matthew Voss Date: Thu, 17 Feb 2022 14:18:54 -0800 Subject: [PATCH 189/748] Revert "[CUDA][SPIRV] Assign 
global address space to CUDA kernel arguments" This reverts commit 9de4fc0f2d3b60542956f7e5254951d049edeb1f. Reverting due to test failure: https://lab.llvm.org/buildbot/#/builders/139/builds/17199 --- clang/lib/Basic/Targets/SPIR.h | 10 +++++----- clang/lib/CodeGen/TargetInfo.cpp | 6 +++--- clang/test/CodeGenCUDASPIRV/kernel-argument.cu | 17 ----------------- 3 files changed, 8 insertions(+), 25 deletions(-) delete mode 100644 clang/test/CodeGenCUDASPIRV/kernel-argument.cu diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 08c49f018ac79..a40d4b3ca27e1 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -144,16 +144,16 @@ class LLVM_LIBRARY_VISIBILITY BaseSPIRTargetInfo : public TargetInfo { // FIXME: SYCL specification considers unannotated pointers and references // to be pointing to the generic address space. See section 5.9.3 of // SYCL 2020 specification. - // Currently, there is no way of representing SYCL's and HIP/CUDA's default + // Currently, there is no way of representing SYCL's and HIP's default // address space language semantic along with the semantics of embedded C's // default address space in the same address space map. Hence the map needs // to be reset to allow mapping to the desired value of 'Default' entry for - // SYCL and HIP/CUDA. + // SYCL and HIP. setAddressSpaceMap( /*DefaultIsGeneric=*/Opts.SYCLIsDevice || - // The address mapping from HIP/CUDA language for device code is only - // defined for SPIR-V. - (getTriple().isSPIRV() && Opts.CUDAIsDevice)); + // The address mapping from HIP language for device code is only defined + // for SPIR-V. 
+ (getTriple().isSPIRV() && Opts.HIP && Opts.CUDAIsDevice)); } void setSupportedOpenCLOpts() override { diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 5a2991dfe1762..3e1df744b2ad7 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -10320,10 +10320,10 @@ void CommonSPIRABIInfo::setCCs() { } ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const { - if (getContext().getLangOpts().CUDAIsDevice) { + if (getContext().getLangOpts().HIP) { // Coerce pointer arguments with default address space to CrossWorkGroup - // pointers for HIPSPV/CUDASPV. When the language mode is HIP/CUDA, the - // SPIRTargetInfo maps cuda_device to SPIR-V's CrossWorkGroup address space. + // pointers for HIPSPV. When the language mode is HIP, the SPIRTargetInfo + // maps cuda_device to SPIR-V's CrossWorkGroup address space. llvm::Type *LTy = CGT.ConvertType(Ty); auto DefaultAS = getContext().getTargetAddressSpace(LangAS::Default); auto GlobalAS = getContext().getTargetAddressSpace(LangAS::cuda_device); diff --git a/clang/test/CodeGenCUDASPIRV/kernel-argument.cu b/clang/test/CodeGenCUDASPIRV/kernel-argument.cu deleted file mode 100644 index 0ccacffd12a5f..0000000000000 --- a/clang/test/CodeGenCUDASPIRV/kernel-argument.cu +++ /dev/null @@ -1,17 +0,0 @@ -// Tests CUDA kernel arguments get global address space when targetting SPIR-V. 
- -// REQUIRES: clang-driver - -// RUN: %clang -emit-llvm --cuda-device-only --offload=spirv32 \ -// RUN: -nocudalib -nocudainc %s -o %t.bc -c 2>&1 -// RUN: llvm-dis %t.bc -o %t.ll -// RUN: FileCheck %s --input-file=%t.ll - -// RUN: %clang -emit-llvm --cuda-device-only --offload=spirv64 \ -// RUN: -nocudalib -nocudainc %s -o %t.bc -c 2>&1 -// RUN: llvm-dis %t.bc -o %t.ll -// RUN: FileCheck %s --input-file=%t.ll - -// CHECK: define spir_kernel void @_Z6kernelPi(i32 addrspace(1)* noundef %output.coerce) - -__attribute__((global)) void kernel(int* output) { *output = 1; } From 7d05d35b3590cb53f75cd8321d43b8d69c7c6ca8 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Thu, 17 Feb 2022 09:43:10 -0800 Subject: [PATCH 190/748] [mlir][sparse] remove unused test matrix file Reviewed By: bixia Differential Revision: https://reviews.llvm.org/D120069 --- mlir/test/CMakeLists.txt | 1 - mlir/test/Integration/data/zero.mtx | 6 ------ 2 files changed, 7 deletions(-) delete mode 100644 mlir/test/Integration/data/zero.mtx diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 8e44a9c37cf13..24adcb646d08c 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -41,7 +41,6 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS) ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test_symmetric.mtx ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.tns ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/wide.mtx - ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/zero.mtx DESTINATION ${MLIR_INTEGRATION_TEST_DIR}/data/) endif() diff --git a/mlir/test/Integration/data/zero.mtx b/mlir/test/Integration/data/zero.mtx deleted file mode 100644 index 7f1c47aec1f51..0000000000000 --- a/mlir/test/Integration/data/zero.mtx +++ /dev/null @@ -1,6 +0,0 @@ -%%MatrixMarket matrix coordinate real general -% -% This is a test sparse matrix in Matrix Market Exchange Format. 
-% see https://math.nist.gov/MatrixMarket -% -5 5 0 From 0b5fe2c9f2e5bdfb111068fab1f6689c066422aa Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 17 Feb 2022 15:00:43 -0800 Subject: [PATCH 191/748] [clang] Remove Address::deprecated() in emitVoidPtrDirectVAArg() --- clang/lib/CodeGen/TargetInfo.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 3e1df744b2ad7..a26a1955bcc54 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -323,10 +323,10 @@ static Address emitVoidPtrDirectVAArg(CodeGenFunction &CGF, // If the CC aligns values higher than the slot size, do so if needed. Address Addr = Address::invalid(); if (AllowHigherAlign && DirectAlign > SlotSize) { - Addr = Address::deprecated( - emitRoundPointerUpToAlignment(CGF, Ptr, DirectAlign), DirectAlign); + Addr = Address(emitRoundPointerUpToAlignment(CGF, Ptr, DirectAlign), + CGF.Int8Ty, DirectAlign); } else { - Addr = Address::deprecated(Ptr, SlotSize); + Addr = Address(Ptr, CGF.Int8Ty, SlotSize); } // Advance the pointer past the argument, then store that back. 
From 822a1aad17288c23edd8a0c15f2c8130db66f262 Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Thu, 17 Feb 2022 14:59:50 -0800 Subject: [PATCH 192/748] Fix typo in Swift.def, it is swift5_protocol_conformances not swift5_protocol_confromances --- llvm/include/llvm/BinaryFormat/Swift.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/BinaryFormat/Swift.def b/llvm/include/llvm/BinaryFormat/Swift.def index aa3ee4f1dc3fb..1ea0bc548b37e 100644 --- a/llvm/include/llvm/BinaryFormat/Swift.def +++ b/llvm/include/llvm/BinaryFormat/Swift.def @@ -24,7 +24,7 @@ HANDLE_SWIFT_SECTION(builtin, "__swift5_builtin", "swift5_builtin", ".sw5bltn") HANDLE_SWIFT_SECTION(capture, "__swift5_capture", "swift5_capture", ".sw5cptr") HANDLE_SWIFT_SECTION(typeref, "__swift5_typeref", "swift5_typeref", ".sw5tyrf") HANDLE_SWIFT_SECTION(reflstr, "__swift5_reflstr", "swift5_reflstr", ".sw5rfst") -HANDLE_SWIFT_SECTION(conform, "__swift5_proto", "swift5_protocol_confromances", +HANDLE_SWIFT_SECTION(conform, "__swift5_proto", "swift5_protocol_conformances", ".sw5prtc$B") HANDLE_SWIFT_SECTION(protocs, "__swift5_protos", "swift5_protocols", ".sw5prt$B") From c46aab01c002b7a04135b8b7f1f52d8c9ae23a58 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Feb 2022 16:52:02 -0500 Subject: [PATCH 193/748] RegAllocGreedy: Fix last chance recolor assert in impossible case This example is not compilable without handling eviction of specific subregisters. Last chance recoloring was deciding it could try evicting an overlapping superregister, which doesn't help make any progress. The LiveIntervalUnion would then assert due to an overlapping / identical range when trying the new assignment. Unfortunately this is also producing a verifier error after the allocation fails. I've seen a number of these, and not sure if we should just start deleting the function on error rather than trying to figure out how to put together valid MIR. 
I'm not super confident this is the right place to fix this. I also have a number of failing testcases I need to fix by handling partial evictions of superregisters. --- llvm/lib/CodeGen/RegAllocGreedy.cpp | 8 ++- ...lloc-failure-overlapping-insert-assert.mir | 62 +++++++++++++++++++ .../regalloc-illegal-eviction-assert.ll | 26 ++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir create mode 100644 llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 9371712a23cec..817aeb6d4d9ba 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1961,8 +1961,14 @@ bool RAGreedy::mayRecolorAllInterferences( // it would not be recolorable as it is in the same state as VirtReg. // However, if VirtReg has tied defs and Intf doesn't, then // there is still a point in examining if it can be recolorable. + // + // Also, don't try to evict a register which is assigned to an overlapping + // super register. + // + // TODO: Can we evict an interfering subset of the subregisters? 
if (((ExtraInfo->getStage(*Intf) == RS_Done && - MRI->getRegClass(Intf->reg()) == CurRC) && + (MRI->getRegClass(Intf->reg()) == CurRC || + TRI->regsOverlap(VRM->getPhys(Intf->reg()), PhysReg))) && !(hasTiedDef(MRI, VirtReg.reg()) && !hasTiedDef(MRI, Intf->reg()))) || FixedRegisters.count(Intf->reg())) { diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir new file mode 100644 index 0000000000000..34b1cf695cee4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -0,0 +1,62 @@ +# RUN: not llc -march=amdgcn -mcpu=gfx908 -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: not --crash llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefixes=ERR,VERIFIER %s + +# FIXME: We should not produce a verifier error after erroring + +# ERR: error: inline assembly requires more registers than available +# VERIFIER: *** Bad machine code: Using an undefined physical register *** + +# This testcase cannot be compiled with the enforced register +# budget. Previously, tryLastChanceRecoloring would assert here. It +# was attempting to recolor a superregister with an overlapping +# subregister over the same range. + +--- | + define void @foo() #0 { + ret void + } + + attributes #0 = { "amdgpu-waves-per-eu"="8,8" } + +... 
+--- +name: foo +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vreg_512 } + - { id: 3, class: vreg_256 } + - { id: 4, class: vreg_128 } + - { id: 5, class: vreg_96 } + - { id: 6, class: vreg_96 } + - { id: 7, class: vreg_512 } + - { id: 8, class: vreg_256 } + - { id: 9, class: vreg_128 } + - { id: 10, class: vreg_96 } + - { id: 11, class: vreg_96 } + - { id: 12, class: sreg_64 } + - { id: 13, class: ccr_sgpr_64 } + - { id: 14, class: vgpr_32 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0 (%ir-block.0): + liveins: $sgpr30_sgpr31 + + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0 + %14:vgpr_32 = COPY killed $agpr0 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 16842762 /* regdef:VReg_512 */, def %7, 14155786 /* regdef:VReg_256 */, def %8, 5308426 /* regdef:VReg_128 */, def %9, 3866634 /* regdef:VReg_96 */, def %10, 3866634 /* regdef:VReg_96 */, def %11 + INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 16842761 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 14155785 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5308425 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 3866633 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:VReg_96 */, %11 + $agpr1 = COPY %14 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1 + S_SETPC_B64_return killed renamable $sgpr30_sgpr31 + +... diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll new file mode 100644 index 0000000000000..0d80195ba09c1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll @@ -0,0 +1,26 @@ +; RUN: not llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -o - %s 2>%t.err | FileCheck %s +; RUN: FileCheck -check-prefix=ERR %s < %t.err + +; ERR: error: inline assembly requires more registers than available +; ERR: error: inline assembly requires more registers than available + +%asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> } + +; CHECK-LABEL: {{^}}illegal_eviction_assert: +; CHECK: ; def v[0:15] v[20:27] v[0:4] v[16:19] a[0:15] +; CHECK: ; clobber +; CHECK: ; use v[0:15] v[20:27] v[0:4] v[16:19] a[1:16] +define void @illegal_eviction_assert(<32 x i32> addrspace(1)* %arg) #0 { + ;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"() + %asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"() + %vgpr0 = extractvalue %asm.output %asm, 0 + %vgpr1 = extractvalue %asm.output %asm, 1 + %vgpr2 = extractvalue %asm.output %asm, 2 + %vgpr3 = extractvalue %asm.output %asm, 3 + %agpr0 = extractvalue %asm.output %asm, 4 + call void asm sideeffect "; clobber", "~{v[0:31]}"() + call void asm sideeffect "; use $0 $1 $2 $3 $4","v,v,v,v,{a[1:16]}"(<16 x i32> %vgpr0, <8 x i32> %vgpr1, <5 x i32> %vgpr2, <4 x i32> %vgpr3, <16 x i32> %agpr0) + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="8,8" } From c74389b4b58d8db3f8262ce15b9d514d62fe265c Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Thu, 17 Feb 2022 15:14:47 
-0800 Subject: [PATCH 194/748] [memprof] Fix frame deserialization on big endian systems. We write the memprof internal call frame data in little endian format. However when reading the frame information we were casting it directly to a MemProfRecord::Frame pointer. In this change we add a separate deserialization method which uses an endian reader to read the bytes as little endian. This fixes https://lab.llvm.org/buildbot/#/builders/100/builds/12940 Differential Revision: https://reviews.llvm.org/D120093 --- llvm/include/llvm/ProfileData/MemProf.h | 18 +++++++++++++++++- llvm/lib/ProfileData/MemProf.cpp | 6 +++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index dcc9b69386e8a..07bf629ce146a 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -161,7 +161,7 @@ struct MemProfRecord { bool operator!=(const Frame &Other) const { return !operator==(Other); } // Write the contents of the frame to the ostream \p OS. - void write(raw_ostream & OS) const { + void serialize(raw_ostream & OS) const { using namespace support; endian::Writer LE(OS, little); @@ -176,6 +176,22 @@ struct MemProfRecord { LE.write(Column); LE.write(IsInlineFrame); } + + // Read a frame from char data which has been serialized as little endian. + static Frame deserialize(const unsigned char *Ptr) { + using namespace support; + return Frame( + /*Function=*/endian::readNext(Ptr), + /*LineOffset=*/endian::readNext(Ptr), + /*Column=*/endian::readNext(Ptr), + /*IsInlineFrame=*/endian::readNext(Ptr)); + } + + // Returns the size of the frame information. + static constexpr size_t serializedSize() { + return sizeof(Frame::Function) + sizeof(Frame::LineOffset) + + sizeof(Frame::Column) + sizeof(Frame::IsInlineFrame); + } }); // The dynamic calling context for the allocation. 
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 6a9b69ff6cff0..48950d41d0234 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -15,7 +15,7 @@ void serializeRecords(const ArrayRef Records, for (const MemProfRecord &MR : Records) { LE.write(MR.CallStack.size()); for (const MemProfRecord::Frame &F : MR.CallStack) { - F.write(OS); + F.serialize(OS); } MR.Info.serialize(Schema, OS); } @@ -33,8 +33,8 @@ SmallVector deserializeRecords(const MemProfSchema &Schema, const uint64_t NumFrames = endian::readNext(Ptr); for (uint64_t J = 0; J < NumFrames; J++) { - const auto F = *reinterpret_cast(Ptr); - Ptr += sizeof(MemProfRecord::Frame); + const auto F = MemProfRecord::Frame::deserialize(Ptr); + Ptr += MemProfRecord::Frame::serializedSize(); MR.CallStack.push_back(F); } MR.Info.deserialize(Schema, Ptr); From ba653b7fabb6620fb466a107fd0b2a57eab4886b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Feb 2022 18:42:06 -0500 Subject: [PATCH 195/748] AMDGPU: Try to fix expensive_checks bot tests failures --- .../AMDGPU/regalloc-failure-overlapping-insert-assert.mir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index 34b1cf695cee4..933b8e91c0a78 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -1,4 +1,4 @@ -# RUN: not llc -march=amdgcn -mcpu=gfx908 -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: not llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs=0 -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s # RUN: not --crash llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 
-stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefixes=ERR,VERIFIER %s # FIXME: We should not produce a verifier error after erroring From 27b7c1e3f5e01b00781c6cc0d19e8f0446e79d9f Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Thu, 17 Feb 2022 15:42:02 -0800 Subject: [PATCH 196/748] Revert "[memprof] Fix frame deserialization on big endian systems." This reverts commit c74389b4b58d8db3f8262ce15b9d514d62fe265c. This broke the ml-opt-x86-64 build. https://lab.llvm.org/buildbot#builders/9/builds/4127 --- llvm/include/llvm/ProfileData/MemProf.h | 18 +----------------- llvm/lib/ProfileData/MemProf.cpp | 6 +++--- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 07bf629ce146a..dcc9b69386e8a 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -161,7 +161,7 @@ struct MemProfRecord { bool operator!=(const Frame &Other) const { return !operator==(Other); } // Write the contents of the frame to the ostream \p OS. - void serialize(raw_ostream & OS) const { + void write(raw_ostream & OS) const { using namespace support; endian::Writer LE(OS, little); @@ -176,22 +176,6 @@ struct MemProfRecord { LE.write(Column); LE.write(IsInlineFrame); } - - // Read a frame from char data which has been serialized as little endian. - static Frame deserialize(const unsigned char *Ptr) { - using namespace support; - return Frame( - /*Function=*/endian::readNext(Ptr), - /*LineOffset=*/endian::readNext(Ptr), - /*Column=*/endian::readNext(Ptr), - /*IsInlineFrame=*/endian::readNext(Ptr)); - } - - // Returns the size of the frame information. - static constexpr size_t serializedSize() { - return sizeof(Frame::Function) + sizeof(Frame::LineOffset) + - sizeof(Frame::Column) + sizeof(Frame::IsInlineFrame); - } }); // The dynamic calling context for the allocation. 
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 48950d41d0234..6a9b69ff6cff0 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -15,7 +15,7 @@ void serializeRecords(const ArrayRef Records, for (const MemProfRecord &MR : Records) { LE.write(MR.CallStack.size()); for (const MemProfRecord::Frame &F : MR.CallStack) { - F.serialize(OS); + F.write(OS); } MR.Info.serialize(Schema, OS); } @@ -33,8 +33,8 @@ SmallVector deserializeRecords(const MemProfSchema &Schema, const uint64_t NumFrames = endian::readNext(Ptr); for (uint64_t J = 0; J < NumFrames; J++) { - const auto F = MemProfRecord::Frame::deserialize(Ptr); - Ptr += MemProfRecord::Frame::serializedSize(); + const auto F = *reinterpret_cast(Ptr); + Ptr += sizeof(MemProfRecord::Frame); MR.CallStack.push_back(F); } MR.Info.deserialize(Schema, Ptr); From 19bdf44d850884a13a8708ccf1260fb7f4ef4eb3 Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Thu, 17 Feb 2022 15:45:10 -0800 Subject: [PATCH 197/748] Revert "Reland "[memprof] Extend the index prof format to include memory profiles."" This reverts commit 807ba7aace188ada83ddb4477265728e97346af1. 
--- compiler-rt/include/profile/InstrProfData.inc | 4 +- llvm/include/llvm/ProfileData/InstrProf.h | 8 +- .../llvm/ProfileData/InstrProfData.inc | 4 +- .../llvm/ProfileData/InstrProfReader.h | 14 -- .../llvm/ProfileData/InstrProfWriter.h | 11 -- llvm/include/llvm/ProfileData/MemProf.h | 185 +----------------- llvm/include/llvm/ProfileData/MemProfData.inc | 4 +- .../llvm/ProfileData/RawMemProfReader.h | 3 - llvm/lib/ProfileData/CMakeLists.txt | 1 - llvm/lib/ProfileData/InstrProf.cpp | 23 +-- llvm/lib/ProfileData/InstrProfReader.cpp | 43 +--- llvm/lib/ProfileData/InstrProfWriter.cpp | 90 +-------- llvm/lib/ProfileData/MemProf.cpp | 73 ------- llvm/lib/ProfileData/RawMemProfReader.cpp | 7 +- .../tools/llvm-profdata/Inputs/basic.profraw | Bin 152 -> 0 bytes .../tools/llvm-profdata/memprof-merge.test | 47 ----- llvm/tools/llvm-profdata/llvm-profdata.cpp | 48 +---- llvm/unittests/ProfileData/InstrProfTest.cpp | 62 ------ llvm/unittests/ProfileData/MemProfTest.cpp | 54 +---- 19 files changed, 32 insertions(+), 649 deletions(-) delete mode 100644 llvm/lib/ProfileData/MemProf.cpp delete mode 100644 llvm/test/tools/llvm-profdata/Inputs/basic.profraw delete mode 100644 llvm/test/tools/llvm-profdata/memprof-merge.test diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 282620d8b5dc0..62054a6a3df51 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -650,7 +650,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 8 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 8 +#define INSTR_PROF_INDEX_VERSION 7 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 5 @@ -662,7 +662,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * The 59th bit indicates whether to use debug info to correlate profiles. * The 60th bit indicates single byte coverage instrumentation. * The 61st bit indicates function entry instrumentation only. - * The 62nd bit indicates whether memory profile information is present. */ #define VARIANT_MASKS_ALL 0xff00000000000000ULL #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL) @@ -672,7 +671,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define VARIANT_MASK_DBG_CORRELATE (0x1ULL << 59) #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60) #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61) -#define VARIANT_MASK_MEMPROF (0x1ULL << 62) #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index e14d3e206e9f2..c015e8e4b43d0 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -287,8 +287,7 @@ enum class InstrProfKind { CS = 0x8, // A context sensitive IR-level profile. SingleByteCoverage = 0x10, // Use single byte probes for coverage. FunctionEntryOnly = 0x20, // Only instrument the function entry basic block. - MemProf = 0x40, // A memory profile collected using -fmemory-profile. - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/MemProf) + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionEntryOnly) }; const std::error_category &instrprof_category(); @@ -1012,9 +1011,7 @@ enum ProfVersion { Version6 = 6, // An additional counter is added around logical operators. Version7 = 7, - // An additional (optional) memory profile type is added. - Version8 = 8, - // The current version is 8. + // The current version is 7. 
CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1031,7 +1028,6 @@ struct Header { uint64_t Unused; // Becomes unused since version 4 uint64_t HashType; uint64_t HashOffset; - uint64_t MemProfOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 282620d8b5dc0..62054a6a3df51 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -650,7 +650,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 8 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 8 +#define INSTR_PROF_INDEX_VERSION 7 /* Coverage mapping format version (start from 0). */ #define INSTR_PROF_COVMAP_VERSION 5 @@ -662,7 +662,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * The 59th bit indicates whether to use debug info to correlate profiles. * The 60th bit indicates single byte coverage instrumentation. * The 61st bit indicates function entry instrumentation only. - * The 62nd bit indicates whether memory profile information is present. 
*/ #define VARIANT_MASKS_ALL 0xff00000000000000ULL #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL) @@ -672,7 +671,6 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define VARIANT_MASK_DBG_CORRELATE (0x1ULL << 59) #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60) #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61) -#define VARIANT_MASK_MEMPROF (0x1ULL << 62) #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index 7a18d5a6a11af..548affbf65fa5 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -19,7 +19,6 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfCorrelator.h" -#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/LineIterator.h" @@ -472,9 +471,6 @@ struct InstrProfReaderIndexBase { using OnDiskHashTableImplV3 = OnDiskIterableChainedHashTable; -using MemProfHashTable = - OnDiskIterableChainedHashTable; - template class InstrProfReaderItaniumRemapper; @@ -560,11 +556,6 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr Summary; /// Context sensitive profile summary data. std::unique_ptr CS_Summary; - /// MemProf profile schema (if available). - memprof::MemProfSchema Schema; - /// MemProf profile data on-disk indexed via llvm::md5(FunctionName). - std::unique_ptr MemProfTable; - // Index to the current record in the record array. 
unsigned RecordIndex; @@ -618,11 +609,6 @@ class IndexedInstrProfReader : public InstrProfReader { Expected getInstrProfRecord(StringRef FuncName, uint64_t FuncHash); - /// Return the memprof records for the function identified by - /// llvm::md5(Name). - Expected> - getMemProfRecord(uint64_t FuncNameHash); - /// Fill Counts with the profile data for the given function name. Error getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts); diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index bb180ac42c212..af1e46cf4fc24 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -17,7 +17,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ProfileData/InstrProf.h" -#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" @@ -38,11 +37,6 @@ class InstrProfWriter { private: bool Sparse; StringMap FunctionData; - - // A map to hold memprof data per function. The lower 64 bits obtained from - // the md5 hash of the function name is used to index into the map. - memprof::FunctionMemProfMap MemProfData; - // An enum describing the attributes of the profile. InstrProfKind ProfileKind = InstrProfKind::Unknown; // Use raw pointer here for the incomplete type object. @@ -63,9 +57,6 @@ class InstrProfWriter { addRecord(std::move(I), 1, Warn); } - void addRecord(const ::llvm::memprof::MemProfRecord &MR, - function_ref Warn); - /// Merge existing function counts from the given writer. void mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn); @@ -121,8 +112,6 @@ class InstrProfWriter { return Error::success(); } - InstrProfKind getProfileKind() const { return ProfileKind; } - // Internal interface for testing purpose only. 
void setValueProfDataEndianness(support::endianness Endianness); void setOutputSparse(bool Sparse); diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index dcc9b69386e8a..2fa577a626bbe 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -5,7 +5,6 @@ #include #include -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ProfileData/MemProfData.inc" #include "llvm/ProfileData/ProfileCommon.h" @@ -135,52 +134,18 @@ struct PortableMemInfoBlock { }; struct MemProfRecord { - // Describes a call frame for a dynamic allocation context. The contents of - // the frame are populated by symbolizing the stack depot call frame from the - // compiler runtime. - PACKED(struct Frame { - // A uuid (uint64_t) identifying the function. It is obtained by - // llvm::md5(FunctionName) which returns the lower 64 bits. - GlobalValue::GUID Function; - // The source line offset of the call from the beginning of parent function. + struct Frame { + std::string Function; uint32_t LineOffset; - // The source column number of the call to help distinguish multiple calls - // on the same line. uint32_t Column; - // Whether the current frame is inlined. bool IsInlineFrame; - Frame(uint64_t Hash, uint32_t Off, uint32_t Col, bool Inline) - : Function(Hash), LineOffset(Off), Column(Col), IsInlineFrame(Inline) {} + Frame(std::string Str, uint32_t Off, uint32_t Col, bool Inline) + : Function(std::move(Str)), LineOffset(Off), Column(Col), + IsInlineFrame(Inline) {} + }; - bool operator==(const Frame &Other) const { - return Other.Function == Function && Other.LineOffset == LineOffset && - Other.Column == Column && Other.IsInlineFrame == IsInlineFrame; - } - - bool operator!=(const Frame &Other) const { return !operator==(Other); } - - // Write the contents of the frame to the ostream \p OS. 
- void write(raw_ostream & OS) const { - using namespace support; - - endian::Writer LE(OS, little); - - // If the type of the GlobalValue::GUID changes, then we need to update - // the reader and the writer. - static_assert(std::is_same::value, - "Expect GUID to be uint64_t."); - LE.write(Function); - - LE.write(LineOffset); - LE.write(Column); - LE.write(IsInlineFrame); - } - }); - - // The dynamic calling context for the allocation. std::vector CallStack; - // The statistics obtained from the runtime for the allocation. PortableMemInfoBlock Info; void clear() { @@ -188,12 +153,6 @@ struct MemProfRecord { Info.clear(); } - size_t serializedSize() const { - return sizeof(uint64_t) + // The number of frames to serialize. - sizeof(Frame) * CallStack.size() + // The contents of the frames. - PortableMemInfoBlock::serializedSize(); // The size of the payload. - } - // Prints out the contents of the memprof record in YAML. void print(llvm::raw_ostream &OS) const { OS << " Callstack:\n"; @@ -209,138 +168,6 @@ struct MemProfRecord { Info.printYAML(OS); } - - bool operator==(const MemProfRecord &Other) const { - if (Other.Info != Info) - return false; - - if (Other.CallStack.size() != CallStack.size()) - return false; - - for (size_t I = 0; I < Other.CallStack.size(); I++) { - if (Other.CallStack[I] != CallStack[I]) - return false; - } - return true; - } -}; - -// Serializes the memprof records in \p Records to the ostream \p OS based on -// the schema provided in \p Schema. -void serializeRecords(const ArrayRef Records, - const MemProfSchema &Schema, raw_ostream &OS); - -// Deserializes memprof records from the Buffer -SmallVector deserializeRecords(const MemProfSchema &Schema, - const unsigned char *Buffer); - -// Reads a memprof schema from a buffer. All entries in the buffer are -// interpreted as uint64_t. The first entry in the buffer denotes the number of -// ids in the schema. Subsequent entries are integers which map to memprof::Meta -// enum class entries. 
After successfully reading the schema, the pointer is one -// byte past the schema contents. -Expected readMemProfSchema(const unsigned char *&Buffer); - -using FunctionMemProfMap = - DenseMap>; - -/// Trait for lookups into the on-disk hash table for memprof format in the -/// indexed profile. -class MemProfRecordLookupTrait { -public: - using data_type = ArrayRef; - using internal_key_type = uint64_t; - using external_key_type = uint64_t; - using hash_value_type = uint64_t; - using offset_type = uint64_t; - - MemProfRecordLookupTrait() = delete; - MemProfRecordLookupTrait(const MemProfSchema &S) : Schema(S) {} - - static bool EqualKey(uint64_t A, uint64_t B) { return A == B; } - static uint64_t GetInternalKey(uint64_t K) { return K; } - static uint64_t GetExternalKey(uint64_t K) { return K; } - - hash_value_type ComputeHash(uint64_t K) { return K; } - - static std::pair - ReadKeyDataLength(const unsigned char *&D) { - using namespace support; - - offset_type KeyLen = endian::readNext(D); - offset_type DataLen = endian::readNext(D); - return std::make_pair(KeyLen, DataLen); - } - - uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { - using namespace support; - return endian::readNext(D); - } - - data_type ReadData(uint64_t K, const unsigned char *D, - offset_type /*Unused*/) { - Records = deserializeRecords(Schema, D); - return Records; - } - -private: - // Holds the memprof schema used to deserialize records. - MemProfSchema Schema; - // Holds the records from one function deserialized from the indexed format. - llvm::SmallVector Records; -}; - -class MemProfRecordWriterTrait { -public: - using key_type = uint64_t; - using key_type_ref = uint64_t; - - using data_type = ArrayRef; - using data_type_ref = ArrayRef; - - using hash_value_type = uint64_t; - using offset_type = uint64_t; - - // Pointer to the memprof schema to use for the generator. 
Unlike the reader - // we must use a default constructor with no params for the writer trait so we - // have a public member which must be initialized by the user. - MemProfSchema *Schema = nullptr; - - MemProfRecordWriterTrait() = default; - - static hash_value_type ComputeHash(key_type_ref K) { return K; } - - static std::pair - EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { - using namespace support; - - endian::Writer LE(Out, little); - - offset_type N = sizeof(K); - LE.write(N); - - offset_type M = 0; - - M += sizeof(uint64_t); - for (const auto &Record : V) { - M += Record.serializedSize(); - } - - LE.write(M); - return std::make_pair(N, M); - } - - void EmitKey(raw_ostream &Out, key_type_ref K, offset_type /*Unused*/) { - using namespace support; - endian::Writer LE(Out, little); - LE.write(K); - } - - void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V, - offset_type /*Unused*/) { - assert(Schema != nullptr && "MemProf schema is not initialized!"); - serializeRecords(V, *Schema, Out); - } }; } // namespace memprof diff --git a/llvm/include/llvm/ProfileData/MemProfData.inc b/llvm/include/llvm/ProfileData/MemProfData.inc index 38698be9ea0ec..8135a664b0466 100644 --- a/llvm/include/llvm/ProfileData/MemProfData.inc +++ b/llvm/include/llvm/ProfileData/MemProfData.inc @@ -1,5 +1,5 @@ -#ifndef MEMPROF_DATA_INC -#define MEMPROF_DATA_INC +#ifndef LLVM_PROFILEDATA_MEMPROFDATA_INC +#define LLVM_PROFILEDATA_MEMPROFDATA_INC /*===-- MemProfData.inc - MemProf profiling runtime structures -*- C++ -*-=== *\ |* |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
diff --git a/llvm/include/llvm/ProfileData/RawMemProfReader.h b/llvm/include/llvm/ProfileData/RawMemProfReader.h index bda33d336468a..55ba31d2a6492 100644 --- a/llvm/include/llvm/ProfileData/RawMemProfReader.h +++ b/llvm/include/llvm/ProfileData/RawMemProfReader.h @@ -66,9 +66,6 @@ class RawMemProfReader { return Iterator(this); } - // The RawMemProfReader only holds memory profile information. - InstrProfKind getProfileKind() const { return InstrProfKind::MemProf; } - // Constructor for unittests only. RawMemProfReader(std::unique_ptr Sym, llvm::SmallVectorImpl &Seg, diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt index 486c45d0dff5c..2749119f72d90 100644 --- a/llvm/lib/ProfileData/CMakeLists.txt +++ b/llvm/lib/ProfileData/CMakeLists.txt @@ -4,7 +4,6 @@ add_llvm_component_library(LLVMProfileData InstrProfCorrelator.cpp InstrProfReader.cpp InstrProfWriter.cpp - MemProf.cpp ProfileSummaryBuilder.cpp SampleProf.cpp SampleProfReader.cpp diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 0a0ce7604a290..6e53b0a276998 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1349,15 +1349,8 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { return make_error(instrprof_error::unsupported_version); switch (GET_VERSION(H.formatVersion())) { - // When a new field is added in the header add a case statement here to - // populate it. - static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version8, - "Please update the reading code below if a new field has been added, " - "if not add a case statement to fall through to the latest version."); - case 8ull: - H.MemProfOffset = read(Buffer, offsetOf(&Header::MemProfOffset)); - LLVM_FALLTHROUGH; + // When a new field is added in the header add a case statement here to + // populate it. default: // Version7 (when the backwards compatible header was introduced). H.HashType = read(Buffer, offsetOf(&Header::HashType)); H.HashOffset = read(Buffer, offsetOf(&Header::HashOffset)); @@ -1368,15 +1361,9 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { size_t Header::size() const { switch (GET_VERSION(formatVersion())) { - // When a new field is added to the header add a case statement here to - // compute the size as offset of the new field + size of the new field. This - // relies on the field being added to the end of the list. - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version8, - "Please update the size computation below if a new field has " - "been added to the header, if not add a case statement to " - "fall through to the latest version."); - case 8ull: - return offsetOf(&Header::MemProfOffset) + sizeof(Header::MemProfOffset); + // When a new field is added to the header add a case statement here to + // compute the size as offset of the new field + size of the new field. This + // relies on the field being added to the end of the list. default: // Version7 (when the backwards compatible header was introduced). return offsetOf(&Header::HashOffset) + sizeof(Header::HashOffset); } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index c84b942ce8b10..d1e3438a6f412 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -19,9 +19,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" -#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" -#include "llvm/ProfileData/RawMemProfReader.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" @@ -59,9 +57,6 @@ static InstrProfKind getProfileKindFromVersion(uint64_t Version) { if (Version & VARIANT_MASK_FUNCTION_ENTRY_ONLY) { ProfileKind |= InstrProfKind::FunctionEntryOnly; } - if (Version & VARIANT_MASK_MEMPROF) { - ProfileKind |= InstrProfKind::MemProf; - } return ProfileKind; } @@ -960,35 +955,10 @@ Error IndexedInstrProfReader::readHeader() { uint64_t HashOffset = 
endian::byte_swap(Header->HashOffset); - // The hash table with profile counts comes next. + // The rest of the file is an on disk hash table. auto IndexPtr = std::make_unique>( Start + HashOffset, Cur, Start, HashType, Header->formatVersion()); - // The MemProfOffset field in the header is only valid when the format version - // is higher than 8 (when it was introduced). - if (GET_VERSION(Header->Version) >= 8 && - Header->Version & VARIANT_MASK_MEMPROF) { - uint64_t MemProfOffset = - endian::byte_swap(Header->MemProfOffset); - - const unsigned char *Ptr = Start + MemProfOffset; - // The value returned from Generator.Emit. - const uint64_t TableOffset = - support::endian::readNext(Ptr); - - // Read the schema. - auto SchemaOr = memprof::readMemProfSchema(Ptr); - if (!SchemaOr) - return SchemaOr.takeError(); - Schema = SchemaOr.get(); - - // Now initialize the table reader with a pointer into data buffer. - MemProfTable.reset(MemProfHashTable::Create( - /*Buckets=*/Start + TableOffset, - /*Payload=*/Ptr, - /*Base=*/Start, memprof::MemProfRecordLookupTrait(Schema))); - } - // Load the remapping table now if requested. if (RemappingBuffer) { Remapper = std::make_unique< @@ -1033,17 +1003,6 @@ IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, return error(instrprof_error::hash_mismatch); } -Expected> -IndexedInstrProfReader::getMemProfRecord(uint64_t FuncNameHash) { - auto Iter = MemProfTable->find(FuncNameHash); - if (Iter == MemProfTable->end()) - // TODO: Add memprof specific errors. 
- return make_error(instrprof_error::hash_mismatch, - "memprof record not found for hash " + - Twine(FuncNameHash)); - return *Iter; -} - Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts) { diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 4c974f402d2b3..ebf89317d585a 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" -#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" @@ -64,16 +63,11 @@ class ProfOStream { if (IsFDOStream) { raw_fd_ostream &FDOStream = static_cast(OS); - const uint64_t LastPos = FDOStream.tell(); for (int K = 0; K < NItems; K++) { FDOStream.seek(P[K].Pos); for (int I = 0; I < P[K].N; I++) write(P[K].D[I]); } - // Reset the stream to the last position after patching so that users - // don't accidentally overwrite data. This makes it consistent with - // the string stream below which replaces the data directly. - FDOStream.seek(LastPos); } else { raw_string_ostream &SOStream = static_cast(OS); std::string &Data = SOStream.str(); // with flush @@ -254,39 +248,11 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash, Dest.sortValueData(); } -void InstrProfWriter::addRecord(const memprof::MemProfRecord &MR, - function_ref Warn) { - // Use 0 as a sentinel value since its highly unlikely that the lower 64-bits - // of a 128 bit md5 hash will be all zeros. - // TODO: Move this Key frame detection to the contructor to avoid having to - // scan all the callstacks again when adding a new record. 
- uint64_t Key = 0; - for (auto Iter = MR.CallStack.rbegin(), End = MR.CallStack.rend(); - Iter != End; Iter++) { - if (!Iter->IsInlineFrame) { - Key = Iter->Function; - break; - } - } - - if (Key == 0) { - Warn(make_error( - instrprof_error::invalid_prof, - "could not determine leaf function for memprof record.")); - } - - MemProfData[Key].push_back(MR); -} - void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn) { for (auto &I : IPW.FunctionData) for (auto &Func : I.getValue()) addRecord(I.getKey(), Func.first, std::move(Func.second), 1, Warn); - - for (auto &I : IPW.MemProfData) - for (const auto &MR : I.second) - addRecord(MR, Warn); } bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) { @@ -331,7 +297,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { for (const auto &I : FunctionData) if (shouldEncodeData(I.getValue())) Generator.insert(I.getKey(), &I.getValue()); - // Write the header. IndexedInstrProf::Header Header; Header.Magic = IndexedInstrProf::Magic; @@ -346,18 +311,16 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.Version |= VARIANT_MASK_BYTE_COVERAGE; if (static_cast(ProfileKind & InstrProfKind::FunctionEntryOnly)) Header.Version |= VARIANT_MASK_FUNCTION_ENTRY_ONLY; - if (static_cast(ProfileKind & InstrProfKind::MemProf)) - Header.Version |= VARIANT_MASK_MEMPROF; Header.Unused = 0; Header.HashType = static_cast(IndexedInstrProf::HashType); Header.HashOffset = 0; - Header.MemProfOffset = 0; int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out all the fields except 'HashOffset' and 'MemProfOffset'. We - // need to remember the offset of these fields to allow back patching later. - for (int I = 0; I < N - 2; I++) + // Only write out all the fields except 'HashOffset'. We need + // to remember the offset of that field to allow back patching + // later. 
+ for (int I = 0; I < N - 1; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -365,13 +328,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Reserve the space for HashOffset field. OS.write(0); - // Save the location of MemProf profile data. This is stored in two parts as - // the schema and as a separate on-disk chained hashtable. - uint64_t MemProfSectionOffset = OS.tell(); - // Reserve space for the MemProf table field to be patched later if this - // profile contains memory profile information. - OS.write(0); - // Reserve space to write profile summary data. uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -391,42 +347,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Write the hash table. uint64_t HashTableStart = Generator.Emit(OS.OS, *InfoObj); - // Write the MemProf profile data if we have it. This includes a simple schema - // with the format described below followed by the hashtable: - // uint64_t Offset = MemProfGenerator.Emit - // uint64_t Num schema entries - // uint64_t Schema entry 0 - // uint64_t Schema entry 1 - // .... - // uint64_t Schema entry N - 1 - // OnDiskChainedHashTable MemProfFunctionData - uint64_t MemProfSectionStart = 0; - if (static_cast(ProfileKind & InstrProfKind::MemProf)) { - MemProfSectionStart = OS.tell(); - OS.write(0ULL); // Reserve space for the offset. - - auto Schema = memprof::PortableMemInfoBlock::getSchema(); - OS.write(static_cast(Schema.size())); - for (const auto Id : Schema) { - OS.write(static_cast(Id)); - } - - auto MemProfWriter = std::make_unique(); - MemProfWriter->Schema = &Schema; - OnDiskChainedHashTableGenerator - MemProfGenerator; - for (const auto &I : MemProfData) { - // Insert the key (func hash) and value (vector of memprof records). 
- MemProfGenerator.insert(I.first, I.second); - } - - uint64_t TableOffset = MemProfGenerator.Emit(OS.OS, *MemProfWriter); - PatchItem PatchItems[] = { - {MemProfSectionStart, &TableOffset, 1}, - }; - OS.patch(PatchItems, 1); - } - // Allocate space for data to be serialized out. std::unique_ptr TheSummary = IndexedInstrProf::allocSummary(SummarySize); @@ -449,8 +369,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { PatchItem PatchItems[] = { // Patch the Header.HashOffset field. {HashTableStartFieldOffset, &HashTableStart, 1}, - // Patch the Header.MemProfOffset (=0 for profiles without MemProf data). - {MemProfSectionOffset, &MemProfSectionStart, 1}, // Patch the summary data. {SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp deleted file mode 100644 index 6a9b69ff6cff0..0000000000000 --- a/llvm/lib/ProfileData/MemProf.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include "llvm/ProfileData/MemProf.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/EndianStream.h" - -namespace llvm { -namespace memprof { - -void serializeRecords(const ArrayRef Records, - const MemProfSchema &Schema, raw_ostream &OS) { - using namespace support; - - endian::Writer LE(OS, little); - - LE.write(Records.size()); - for (const MemProfRecord &MR : Records) { - LE.write(MR.CallStack.size()); - for (const MemProfRecord::Frame &F : MR.CallStack) { - F.write(OS); - } - MR.Info.serialize(Schema, OS); - } -} - -SmallVector deserializeRecords(const MemProfSchema &Schema, - const unsigned char *Ptr) { - using namespace support; - - SmallVector Records; - const uint64_t NumRecords = - endian::readNext(Ptr); - for (uint64_t I = 0; I < NumRecords; I++) { - MemProfRecord MR; - const uint64_t NumFrames = - endian::readNext(Ptr); - for (uint64_t J = 0; J < NumFrames; J++) { - const auto F = *reinterpret_cast(Ptr); - Ptr += sizeof(MemProfRecord::Frame); - 
MR.CallStack.push_back(F); - } - MR.Info.deserialize(Schema, Ptr); - Ptr += PortableMemInfoBlock::serializedSize(); - Records.push_back(MR); - } - return Records; -} - -Expected readMemProfSchema(const unsigned char *&Buffer) { - using namespace support; - - const unsigned char *Ptr = Buffer; - const uint64_t NumSchemaIds = - endian::readNext(Ptr); - if (NumSchemaIds > static_cast(Meta::Size)) { - return make_error(instrprof_error::malformed, - "memprof schema invalid"); - } - - MemProfSchema Result; - for (size_t I = 0; I < NumSchemaIds; I++) { - const uint64_t Tag = endian::readNext(Ptr); - if (Tag >= static_cast(Meta::Size)) { - return make_error(instrprof_error::malformed, - "memprof schema invalid"); - } - Result.push_back(static_cast(Tag)); - } - // Advace the buffer to one past the schema if we succeeded. - Buffer = Ptr; - return Result; -} - -} // namespace memprof -} // namespace llvm diff --git a/llvm/lib/ProfileData/RawMemProfReader.cpp b/llvm/lib/ProfileData/RawMemProfReader.cpp index 9bcba2a2b04ea..43ef7c947366a 100644 --- a/llvm/lib/ProfileData/RawMemProfReader.cpp +++ b/llvm/lib/ProfileData/RawMemProfReader.cpp @@ -362,12 +362,7 @@ Error RawMemProfReader::fillRecord(const uint64_t Id, const MemInfoBlock &MIB, for (size_t I = 0; I < DI.getNumberOfFrames(); I++) { const auto &Frame = DI.getFrame(I); Record.CallStack.emplace_back( - // We use the function guid which we expect to be a uint64_t. At this - // time, it is the lower 64 bits of the md5 of the function name. Any - // suffix with .llvm. is trimmed since these are added by thinLTO - // global promotion. At the time the profile is consumed, these - // suffixes will not be present. - Function::getGUID(trimSuffix(Frame.FunctionName)), + std::to_string(llvm::MD5Hash(trimSuffix(Frame.FunctionName))), Frame.Line - Frame.StartLine, Frame.Column, // Only the first entry is not an inlined location. 
I != 0); diff --git a/llvm/test/tools/llvm-profdata/Inputs/basic.profraw b/llvm/test/tools/llvm-profdata/Inputs/basic.profraw deleted file mode 100644 index ad88759398c6020f4ab8a5606258e69d98e36687..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 152 zcmZoHO3N=Q$obE~00xW@ih%*nfC`}V*`VS-{zJgi8V9flOx>@mz0b{3rrrk1zQ4@n a%LP-nKp3J9svT|*OdktFZenI00|NkRuOhYp diff --git a/llvm/test/tools/llvm-profdata/memprof-merge.test b/llvm/test/tools/llvm-profdata/memprof-merge.test deleted file mode 100644 index b11459f237ca5..0000000000000 --- a/llvm/test/tools/llvm-profdata/memprof-merge.test +++ /dev/null @@ -1,47 +0,0 @@ -REQUIRES: x86_64-linux - -The input memprof and instrumented raw profiles were generated from the following source code: - -``` -#include -#include -int main(int argc, char **argv) { - char *x = (char *)malloc(10); - memset(x, 0, 10); - free(x); - x = (char *)malloc(10); - memset(x, 0, 10); - free(x); - return 0; -} -``` - -Steps to collect the memprof raw profile and the instrprof raw profile: - -``` -# Collect instrprof profile with name compression disabled since some buildbots -# do not have zlib. -clang -mllvm -enable-name-compression=false -fprofile-generate source.c -o instr.out -./instr.out -mv *.profraw basic.profraw - -# Collect memprof profile. -clang -fuse-ld=lld -Wl,--no-rosegment -gmlt -fdebug-info-for-profiling \ - -fmemory-profile -mno-omit-leaf-frame-pointer -fno-omit-frame-pointer \ - -fno-optimize-sibling-calls -m64 -Wl,-build-id source.c -o basic.memprofexe - -env MEMPROF_OPTIONS=log_path=stdout ./rawprofile.out > basic.memprofraw -``` - -RUN: llvm-profdata merge %p/Inputs/basic.profraw %p/Inputs/basic.memprofraw --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof -RUN: llvm-profdata show %t.prof | FileCheck %s - -For now we only check the validity of the instrumented profile since we don't -have a way to display the contents of the memprof indexed format yet. 
- -CHECK: Instrumentation level: IR entry_first = 0 -CHECK: Total functions: 1 -CHECK: Maximum function count: 1 -CHECK: Maximum internal block count: 0 - - diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index ba2f1b6038c48..e00582851d47f 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -239,7 +239,7 @@ static void overlapInput(const std::string &BaseFilename, /// Load an input into a writer context. static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, const InstrProfCorrelator *Correlator, - const StringRef ProfiledBinary, WriterContext *WC) { + WriterContext *WC) { std::unique_lock CtxGuard{WC->Lock}; // Copy the filename, because llvm::ThreadPool copied the input "const @@ -247,35 +247,6 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, // invalid outside of this packaged task. std::string Filename = Input.Filename; - using ::llvm::memprof::RawMemProfReader; - if (RawMemProfReader::hasFormat(Input.Filename)) { - auto ReaderOrErr = RawMemProfReader::create(Input.Filename, ProfiledBinary); - if (!ReaderOrErr) { - exitWithError(ReaderOrErr.takeError(), Input.Filename); - } - std::unique_ptr Reader = std::move(ReaderOrErr.get()); - // Check if the profile types can be merged, e.g. clang frontend profiles - // should not be merged with memprof profiles. - if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) { - consumeError(std::move(E)); - WC->Errors.emplace_back( - make_error( - "Cannot merge MemProf profile with Clang generated profile.", - std::error_code()), - Filename); - return; - } - - // Add the records into the writer context. 
- for (const memprof::MemProfRecord &MR : *Reader) { - WC->Writer.addRecord(MR, [&](Error E) { - instrprof_error IPE = InstrProfError::take(std::move(E)); - WC->Errors.emplace_back(make_error(IPE), Filename); - }); - } - return; - } - auto ReaderOrErr = InstrProfReader::create(Input.Filename, Correlator); if (Error E = ReaderOrErr.takeError()) { // Skip the empty profiles by returning sliently. @@ -361,8 +332,7 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, StringRef OutputFilename, ProfileFormat OutputFormat, bool OutputSparse, - unsigned NumThreads, FailureMode FailMode, - const StringRef ProfiledBinary) { + unsigned NumThreads, FailureMode FailMode) { if (OutputFormat != PF_Binary && OutputFormat != PF_Compact_Binary && OutputFormat != PF_Ext_Binary && OutputFormat != PF_Text) exitWithError("unknown format is specified"); @@ -395,15 +365,14 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, if (NumThreads == 1) { for (const auto &Input : Inputs) - loadInput(Input, Remapper, Correlator.get(), ProfiledBinary, - Contexts[0].get()); + loadInput(Input, Remapper, Correlator.get(), Contexts[0].get()); } else { ThreadPool Pool(hardware_concurrency(NumThreads)); // Load the inputs in parallel (N/NumThreads serial steps). 
unsigned Ctx = 0; for (const auto &Input : Inputs) { - Pool.async(loadInput, Input, Remapper, Correlator.get(), ProfiledBinary, + Pool.async(loadInput, Input, Remapper, Correlator.get(), Contexts[Ctx].get()); Ctx = (Ctx + 1) % NumThreads; } @@ -620,7 +589,7 @@ static void supplementInstrProfile( SmallSet WriterErrorCodes; auto WC = std::make_unique(OutputSparse, ErrorLock, WriterErrorCodes); - loadInput(Inputs[0], nullptr, nullptr, /*ProfiledBinary=*/"", WC.get()); + loadInput(Inputs[0], nullptr, nullptr, WC.get()); if (WC->Errors.size() > 0) exitWithError(std::move(WC->Errors[0].first), InstrFilename); @@ -1000,9 +969,6 @@ static int merge_main(int argc, const char *argv[]) { cl::opt DebugInfoFilename( "debug-info", cl::init(""), cl::desc("Use the provided debug info to correlate the raw profile.")); - cl::opt ProfiledBinary( - "profiled-binary", cl::init(""), - cl::desc("Path to binary from which the profile was collected.")); cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); @@ -1045,7 +1011,7 @@ static int merge_main(int argc, const char *argv[]) { if (ProfileKind == instr) mergeInstrProfile(WeightedInputs, DebugInfoFilename, Remapper.get(), OutputFilename, OutputFormat, OutputSparse, NumThreads, - FailureMode, ProfiledBinary); + FailureMode); else mergeSampleProfile(WeightedInputs, Remapper.get(), OutputFilename, OutputFormat, ProfileSymbolListFile, CompressAllSections, @@ -1076,7 +1042,7 @@ static void overlapInstrProfile(const std::string &BaseFilename, OS << "Sum of edge counts for profile " << TestFilename << " is 0.\n"; exit(0); } - loadInput(WeightedInput, nullptr, nullptr, /*ProfiledBinary=*/"", &Context); + loadInput(WeightedInput, nullptr, nullptr, &Context); overlapInput(BaseFilename, TestFilename, &Context, Overlap, FuncFilter, OS, IsCS); Overlap.dump(OS); diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index 434e6aaee8b02..7bdd6c2992859 100644 --- 
a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -12,7 +12,6 @@ #include "llvm/IR/Module.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" -#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Compression.h" #include "llvm/Testing/Support/Error.h" #include "llvm/Testing/Support/SupportHelpers.h" @@ -222,67 +221,6 @@ TEST_F(InstrProfTest, test_writer_merge) { ASSERT_EQ(0U, R->Counts[1]); } -TEST_F(InstrProfTest, test_memprof) { - ASSERT_THAT_ERROR(Writer.mergeProfileKind(InstrProfKind::MemProf), - Succeeded()); - llvm::memprof::MemProfRecord MR; - MR.CallStack.push_back({0x123, 1, 2, false}); - MR.CallStack.push_back({0x345, 3, 4, true}); - Writer.addRecord(MR, Err); - - auto Profile = Writer.writeBuffer(); - readProfile(std::move(Profile)); - - auto RecordsOr = Reader->getMemProfRecord(0x123); - ASSERT_THAT_ERROR(RecordsOr.takeError(), Succeeded()); - const auto Records = RecordsOr.get(); - ASSERT_EQ(Records.size(), 1U); - EXPECT_EQ(Records[0], MR); -} - -TEST_F(InstrProfTest, test_memprof_merge) { - Writer.addRecord({"func1", 0x1234, {42}}, Err); - - InstrProfWriter Writer2; - ASSERT_THAT_ERROR(Writer2.mergeProfileKind(InstrProfKind::MemProf), - Succeeded()); - - llvm::memprof::MemProfRecord MR; - MR.CallStack.push_back({0x123, 1, 2, false}); - MR.CallStack.push_back({0x345, 3, 4, true}); - Writer2.addRecord(MR, Err); - - ASSERT_THAT_ERROR(Writer.mergeProfileKind(Writer2.getProfileKind()), - Succeeded()); - Writer.mergeRecordsFromWriter(std::move(Writer2), Err); - - auto Profile = Writer.writeBuffer(); - readProfile(std::move(Profile)); - - Expected R = Reader->getInstrProfRecord("func1", 0x1234); - EXPECT_THAT_ERROR(R.takeError(), Succeeded()); - ASSERT_EQ(1U, R->Counts.size()); - ASSERT_EQ(42U, R->Counts[0]); - - auto RecordsOr = Reader->getMemProfRecord(0x123); - ASSERT_THAT_ERROR(RecordsOr.takeError(), Succeeded()); - const auto Records = RecordsOr.get(); - 
ASSERT_EQ(Records.size(), 1U); - EXPECT_EQ(Records[0], MR); -} - -TEST_F(InstrProfTest, test_memprof_invalid_add_record) { - llvm::memprof::MemProfRecord MR; - // At least one of the frames should be a non-inline frame. - MR.CallStack.push_back({0x123, 1, 2, true}); - MR.CallStack.push_back({0x345, 3, 4, true}); - - auto CheckErr = [](Error &&E) { - EXPECT_TRUE(ErrorEquals(instrprof_error::invalid_prof, std::move(E))); - }; - Writer.addRecord(MR, CheckErr); -} - static const char callee1[] = "callee1"; static const char callee2[] = "callee2"; static const char callee3[] = "callee3"; diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index dc793178bd209..f744b85d784c0 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -89,8 +89,8 @@ const DILineInfoSpecifier specifier() { DILineInfoSpecifier::FunctionNameKind::LinkageName); } -MATCHER_P4(FrameContains, FunctionName, LineOffset, Column, Inline, "") { - const uint64_t ExpectedHash = llvm::Function::getGUID(FunctionName); +MATCHER_P4(FrameContains, Function, LineOffset, Column, Inline, "") { + const std::string ExpectedHash = std::to_string(llvm::MD5Hash(Function)); if (arg.Function != ExpectedHash) { *result_listener << "Hash mismatch"; return false; @@ -103,22 +103,6 @@ MATCHER_P4(FrameContains, FunctionName, LineOffset, Column, Inline, "") { return false; } -MATCHER_P(EqualsRecord, Want, "") { - if (arg == Want) - return true; - - std::string Explanation; - llvm::raw_string_ostream OS(Explanation); - OS << "\n Want: \n"; - Want.print(OS); - OS << "\n Got: \n"; - arg.print(OS); - OS.flush(); - - *result_listener << Explanation; - return false; -} - MemProfSchema getFullSchema() { MemProfSchema Schema; #define MIBEntryDef(NameTag, Name, Type) Schema.push_back(Meta::Name); @@ -200,38 +184,4 @@ TEST(MemProf, PortableWrapper) { EXPECT_EQ(3UL, ReadBlock.getAllocCpuId()); } -TEST(MemProf, RecordSerializationRoundTrip) 
{ - const MemProfSchema Schema = getFullSchema(); - - llvm::SmallVector Records; - MemProfRecord MR; - - MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000, - /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3, - /*dealloc_cpu=*/4); - - MR.Info = PortableMemInfoBlock(Info); - MR.CallStack.push_back({0x123, 1, 2, false}); - MR.CallStack.push_back({0x345, 3, 4, false}); - Records.push_back(MR); - - MR.clear(); - MR.Info = PortableMemInfoBlock(Info); - MR.CallStack.push_back({0x567, 5, 6, false}); - MR.CallStack.push_back({0x789, 7, 8, false}); - Records.push_back(MR); - - std::string Buffer; - llvm::raw_string_ostream OS(Buffer); - serializeRecords(Records, Schema, OS); - OS.flush(); - - const llvm::SmallVector GotRecords = deserializeRecords( - Schema, reinterpret_cast(Buffer.data())); - - ASSERT_TRUE(!GotRecords.empty()); - EXPECT_EQ(GotRecords.size(), Records.size()); - EXPECT_THAT(GotRecords[0], EqualsRecord(Records[0])); - EXPECT_THAT(GotRecords[1], EqualsRecord(Records[1])); -} } // namespace From d7895c5914f452d7143c752d0f91f0c7ccc8a5c7 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 17 Feb 2022 23:56:17 +0000 Subject: [PATCH 198/748] [gn build] Port 19bdf44d8508 --- llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn index 39fe42efd481a..ee186bded1111 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn @@ -13,7 +13,6 @@ static_library("ProfileData") { "InstrProfCorrelator.cpp", "InstrProfReader.cpp", "InstrProfWriter.cpp", - "MemProf.cpp", "ProfileSummaryBuilder.cpp", "RawMemProfReader.cpp", "SampleProf.cpp", From 4f9b8397725c2f57fa671e70f23ac48e7d47fe36 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 16 Feb 2022 16:11:29 -0800 Subject: [PATCH 199/748] [WebAssembly] Make EH/SjLj vars 
unconditionally thread local This makes three thread local variables (`__THREW__`, `__threwValue`, and `__wasm_lpad_context`) unconditionally thread local. If the target doesn't support TLS, they will be downgraded to normal variables in `stripThreadLocals`. This makes the object not linkable with other objects using shared memory, which is what we intend here; these variables should be thread local when used with shared memory. This is what we initially tried in D88262. But D88323 changed this: It only created these variables when threads were supported, because `__THREW__` and `__threwValue` were always generated even if Emscripten EH/SjLj was not used, making all objects built without threads not linkable with shared memory, which was too restrictive. But sometimes this is not safe. If we build an object using variables such as `__THREW__` without threads, it can be linked to other objects using shared memory, because the original object's `__THREW__` was not created thread local to begin with. So this CL basically reverts D88323 with some additional improvements: - This checks each of the functions and global variables created within `LowerEmscriptenEHSjLj` pass and removes it if it's not used at the end of the pass. So only modules using those variables will be affected. - Moves `CoalesceFeaturesAndStripAtomics` and `AtomicExpand` passes after all other IR pasess that can create thread local variables. It is not sufficient to move them to the end of `addIRPasses`, because `__wasm_lpad_context` is created in `WasmEHPrepare`, which runs inside `addPassesToHandleExceptions`, which runs before `addISelPrepare`. So we override `addISelPrepare` and move atomic/TLS stripping and expanding passes there. This also removes merges `TLS` and `NO-TLS` FileCheck lines into one `CHECK` line, because in the bitcode level we always create them as thread local. Also some function declarations are deleted `CHECK` lines because they are unused. 
Reviewed By: tlively, sbc100 Differential Revision: https://reviews.llvm.org/D120013 --- llvm/lib/CodeGen/WasmEHPrepare.cpp | 16 ++----- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 47 ++++++++----------- .../WebAssembly/WebAssemblyTargetMachine.cpp | 17 ++++--- .../WebAssembly/lower-em-exceptions.ll | 14 +++--- .../test/CodeGen/WebAssembly/lower-em-sjlj.ll | 12 ++--- .../CodeGen/WebAssembly/lower-wasm-sjlj.ll | 15 +++--- .../test/CodeGen/WebAssembly/wasmehprepare.ll | 7 ++- 7 files changed, 56 insertions(+), 72 deletions(-) diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 6b7df758e4579..e196931248466 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -213,19 +213,13 @@ bool WasmEHPrepare::prepareEHPads(Function &F) { assert(F.hasPersonalityFn() && "Personality function not found"); // __wasm_lpad_context global variable. - // If the target supports TLS, make this thread-local. We can't just - // unconditionally make it thread-local and depend on - // CoalesceFeaturesAndStripAtomics to downgrade it, because stripping TLS has - // the side effect of disallowing the object from being linked into a - // shared-memory module, which we don't want to be responsible for. + // This variable should be thread local. If the target does not support TLS, + // we depend on CoalesceFeaturesAndStripAtomics to downgrade it to + // non-thread-local ones, in which case we don't allow this object to be + // linked with other objects using shared memory. 
LPadContextGV = cast( M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy)); - Attribute FSAttr = F.getFnAttribute("target-features"); - if (FSAttr.isValid()) { - StringRef FS = FSAttr.getValueAsString(); - if (FS.contains("+atomics") && FS.contains("+bulk-memory")) - LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel); - } + LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel); LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0, "lpad_index_gep"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 38415507b23ca..c165542019532 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -406,8 +406,9 @@ static bool canThrow(const Value *V) { return true; } -// Get a global variable with the given name. If it doesn't exist declare it, -// which will generate an import and assume that it will exist at link time. +// Get a thread-local global variable with the given name. If it doesn't exist +// declare it, which will generate an import and assume that it will exist at +// link time. static GlobalVariable *getGlobalVariable(Module &M, Type *Ty, WebAssemblyTargetMachine &TM, const char *Name) { @@ -415,16 +416,11 @@ static GlobalVariable *getGlobalVariable(Module &M, Type *Ty, if (!GV) report_fatal_error(Twine("unable to create global: ") + Name); - // If the target supports TLS, make this variable thread-local. We can't just - // unconditionally make it thread-local and depend on - // CoalesceFeaturesAndStripAtomics to downgrade it, because stripping TLS has - // the side effect of disallowing the object from being linked into a - // shared-memory module, which we don't want to be responsible for. - auto *Subtarget = TM.getSubtargetImpl(); - auto TLS = Subtarget->hasAtomics() && Subtarget->hasBulkMemory() - ? 
GlobalValue::GeneralDynamicTLSModel - : GlobalValue::NotThreadLocal; - GV->setThreadLocalMode(TLS); + // Variables created by this function are thread local. If the target does not + // support TLS, we depend on CoalesceFeaturesAndStripAtomics to downgrade it + // to non-thread-local ones, in which case we don't allow this object to be + // linked with other objects using shared memory. + GV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel); return GV; } @@ -1064,22 +1060,16 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { nullifySetjmp(F); } - if (!Changed) { - // Delete unused global variables and functions - if (ResumeF) - ResumeF->eraseFromParent(); - if (EHTypeIDF) - EHTypeIDF->eraseFromParent(); - if (EmLongjmpF) - EmLongjmpF->eraseFromParent(); - if (SaveSetjmpF) - SaveSetjmpF->eraseFromParent(); - if (TestSetjmpF) - TestSetjmpF->eraseFromParent(); - return false; - } + // Delete unused global variables and functions + for (auto *V : {ThrewGV, ThrewValueGV}) + if (V && V->use_empty()) + V->eraseFromParent(); + for (auto *V : {GetTempRet0F, SetTempRet0F, ResumeF, EHTypeIDF, EmLongjmpF, + SaveSetjmpF, TestSetjmpF, WasmLongjmpF, CatchF}) + if (V && V->use_empty()) + V->eraseFromParent(); - return true; + return Changed; } bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { @@ -1829,7 +1819,8 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( if (auto *CPI = dyn_cast(FromPad)) { UnwindDest = CPI->getCatchSwitch()->getUnwindDest(); break; - } else if (auto *CPI = dyn_cast(FromPad)) { + } + if (auto *CPI = dyn_cast(FromPad)) { // getCleanupRetUnwindDest() can return nullptr when // 1. This cleanuppad's matching cleanupret uwninds to caller // 2. 
There is no matching cleanupret because it ends with diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 482837178f3dd..950df71e9efc9 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -320,6 +320,7 @@ class WebAssemblyPassConfig final : public TargetPassConfig { FunctionPass *createTargetRegisterAllocator(bool) override; void addIRPasses() override; + void addISelPrepare() override; bool addInstSelector() override; void addPostRegAlloc() override; bool addGCPasses() override { return false; } @@ -407,12 +408,6 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) { //===----------------------------------------------------------------------===// void WebAssemblyPassConfig::addIRPasses() { - // Lower atomics and TLS if necessary - addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine())); - - // This is a no-op if atomics are not used in the module - addPass(createAtomicExpandPass()); - // Add signatures to prototype-less function declarations addPass(createWebAssemblyAddMissingPrototypes()); @@ -455,6 +450,16 @@ void WebAssemblyPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); } +void WebAssemblyPassConfig::addISelPrepare() { + // Lower atomics and TLS if necessary + addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine())); + + // This is a no-op if atomics are not used in the module + addPass(createAtomicExpandPass()); + + TargetPassConfig::addISelPrepare(); +} + bool WebAssemblyPassConfig::addInstSelector() { (void)TargetPassConfig::addInstSelector(); addPass( diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll index 95d7c9f2fb535..b72984b7b59f6 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll @@ -1,16 
+1,15 @@ -; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-cxx-exceptions -S | FileCheck %s --check-prefixes=CHECK,NO-TLS -DPTR=i32 -; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-cxx-exceptions -S --mattr=+atomics,+bulk-memory | FileCheck %s --check-prefixes=CHECK,TLS -DPTR=i32 -; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-cxx-exceptions --mtriple=wasm64-unknown-unknown -data-layout="e-m:e-p:64:64-i64:64-n32:64-S128" -S | FileCheck %s --check-prefixes=CHECK -DPTR=i64 +; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-cxx-exceptions -S | FileCheck %s -DPTR=i32 +; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-cxx-exceptions -S --mattr=+atomics,+bulk-memory | FileCheck %s -DPTR=i32 +; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-cxx-exceptions --mtriple=wasm64-unknown-unknown -data-layout="e-m:e-p:64:64-i64:64-n32:64-S128" -S | FileCheck %s -DPTR=i64 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" @_ZTIi = external constant i8* @_ZTIc = external constant i8* -; NO-TLS-DAG: __THREW__ = external global [[PTR]] -; NO-TLS-DAG: __threwValue = external global i32 -; TLS-DAG: __THREW__ = external thread_local global [[PTR]] -; TLS-DAG: __threwValue = external thread_local global i32 +; CHECK: @__THREW__ = external thread_local global [[PTR]] +; __threwValue is only used in Emscripten SjLj, so it shouldn't be generated. 
+; CHECK-NOT: @__threwValue = ; Test invoke instruction with clauses (try-catch block) define void @clause() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { @@ -172,7 +171,6 @@ declare void @__cxa_call_unexpected(i8*) ; JS glue functions and invoke wrappers declaration ; CHECK-DAG: declare i32 @getTempRet0() -; CHECK-DAG: declare void @setTempRet0(i32) ; CHECK-DAG: declare void @__resumeException(i8*) ; CHECK-DAG: declare void @__invoke_void_i32(void (i32)*, i32) ; CHECK-DAG: declare i8* @__cxa_find_matching_catch_4(i8*, i8*) diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll index 19253d1cf1175..3873ce9b5dc28 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll @@ -1,6 +1,6 @@ -; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-sjlj -S | FileCheck %s --check-prefixes=CHECK,NO-TLS -DPTR=i32 -; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-sjlj -S --mattr=+atomics,+bulk-memory | FileCheck %s --check-prefixes=CHECK,TLS -DPTR=i32 -; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-sjlj --mtriple=wasm64-unknown-unknown -data-layout="e-m:e-p:64:64-i64:64-n32:64-S128" -S | FileCheck %s --check-prefixes=CHECK -DPTR=i64 +; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-sjlj -S | FileCheck %s -DPTR=i32 +; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-sjlj -S --mattr=+atomics,+bulk-memory | FileCheck %s -DPTR=i32 +; RUN: opt < %s -wasm-lower-em-ehsjlj -enable-emscripten-sjlj --mtriple=wasm64-unknown-unknown -data-layout="e-m:e-p:64:64-i64:64-n32:64-S128" -S | FileCheck %s -DPTR=i64 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" @@ -8,11 +8,9 @@ target triple = "wasm32-unknown-unknown" %struct.__jmp_buf_tag = type { [6 x i32], i32, [32 x i32] } @global_var = global i32 0, align 4 -; NO-TLS-DAG: __THREW__ = external global [[PTR]] -; NO-TLS-DAG: 
__threwValue = external global [[PTR]] -; TLS-DAG: __THREW__ = external thread_local global [[PTR]] -; TLS-DAG: __threwValue = external thread_local global [[PTR]] @global_longjmp_ptr = global void (%struct.__jmp_buf_tag*, i32)* @longjmp, align 4 +; CHECK-DAG: @__THREW__ = external thread_local global [[PTR]] +; CHECK-DAG: @__threwValue = external thread_local global i32 ; CHECK-DAG: @global_longjmp_ptr = global void (%struct.__jmp_buf_tag*, i32)* bitcast (void ([[PTR]], i32)* @emscripten_longjmp to void (%struct.__jmp_buf_tag*, i32)*) ; Test a simple setjmp - longjmp sequence diff --git a/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll index ebe22c7e6cb18..3c18a3ceaf3b6 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll @@ -1,16 +1,16 @@ -; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-sjlj -S | FileCheck %s --check-prefixes=CHECK,NO-TLS -DPTR=i32 -; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-sjlj -S --mattr=+atomics,+bulk-memory | FileCheck %s --check-prefixes=CHECK,TLS -DPTR=i32 -; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-sjlj --mtriple=wasm64-unknown-unknown -data-layout="e-m:e-p:64:64-i64:64-n32:64-S128" -S | FileCheck %s --check-prefixes CHECK -DPTR=i64 +; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-sjlj -S | FileCheck %s -DPTR=i32 +; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-sjlj -S --mattr=+atomics,+bulk-memory | FileCheck %s -DPTR=i32 +; RUN: opt < %s -wasm-lower-em-ehsjlj -wasm-enable-sjlj --mtriple=wasm64-unknown-unknown -data-layout="e-m:e-p:64:64-i64:64-n32:64-S128" -S | FileCheck %s -DPTR=i64 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" %struct.__jmp_buf_tag = type { [6 x i32], i32, [32 x i32] } -; NO-TLS-DAG: __THREW__ = external global [[PTR]] -; NO-TLS-DAG: __threwValue = external global [[PTR]] -; TLS-DAG: __THREW__ = external thread_local global 
[[PTR]] -; TLS-DAG: __threwValue = external thread_local global [[PTR]] +; These variables are only used in Emscripten EH/SjLj, so they shouldn't be +; generated. +; CHECK-NOT: @__THREW__ = +; CHECK-NOT: @__threwValue = @global_longjmp_ptr = global void (%struct.__jmp_buf_tag*, i32)* @longjmp, align 4 ; CHECK-DAG: @global_longjmp_ptr = global void (%struct.__jmp_buf_tag*, i32)* bitcast (void (i8*, i32)* @__wasm_longjmp to void (%struct.__jmp_buf_tag*, i32)*) @@ -157,7 +157,6 @@ declare void @free(i8*) ; JS glue function declarations ; CHECK-DAG: declare i32 @getTempRet0() -; CHECK-DAG: declare void @setTempRet0(i32) ; CHECK-DAG: declare i32* @saveSetjmp(%struct.__jmp_buf_tag*, i32, i32*, i32) ; CHECK-DAG: declare i32 @testSetjmp([[PTR]], i32*, i32) ; CHECK-DAG: declare void @__wasm_longjmp(i8*, i32) diff --git a/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll b/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll index 081a9776fa9aa..3fff510e3cb45 100644 --- a/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll +++ b/llvm/test/CodeGen/WebAssembly/wasmehprepare.ll @@ -1,11 +1,10 @@ -; RUN: opt < %s -winehprepare -demote-catchswitch-only -wasmehprepare -S | FileCheck %s --check-prefixes=CHECK,NO-TLS -; RUN: opt < %s -winehprepare -demote-catchswitch-only -wasmehprepare -S --mattr=+atomics,+bulk-memory | FileCheck %s --check-prefixes=CHECK,TLS +; RUN: opt < %s -winehprepare -demote-catchswitch-only -wasmehprepare -S | FileCheck %s +; RUN: opt < %s -winehprepare -demote-catchswitch-only -wasmehprepare -S --mattr=+atomics,+bulk-memory | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" -; NO-TLS: @__wasm_lpad_context = external global { i32, i8*, i32 } -; TLS: @__wasm_lpad_context = external thread_local global { i32, i8*, i32 } +; CHECK: @__wasm_lpad_context = external thread_local global { i32, i8*, i32 } @_ZTIi = external constant i8* %struct.Temp = type { i8 } From 6f6ac4af62a76adce847a3de3481731415b9a0f5 Mon Sep 17 
00:00:00 2001 From: Zakk Chen Date: Wed, 16 Feb 2022 00:38:51 -0800 Subject: [PATCH 200/748] [RISCV][NFC] Add tail agnostic tests for nomask Vector Reduction IR intrinsics. Improve test coverage for tail agnostic nomask Vector Reduction IR. Reviewed By: monkchiang Differential Revision: https://reviews.llvm.org/D119920 --- llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll | 447 +++++++++++++++++++++ 1 file changed, 447 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll index a13c20dfba6d4..d220ba899b7fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll @@ -607,3 +607,450 @@ entry: ret %a } + +declare @llvm.riscv.vredsum.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vredsum_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredsum_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredsum.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredsum_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredsum.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredsum.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vredand.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vredand_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredand_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredand.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredand_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredand.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredand.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare 
@llvm.riscv.vredor.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vredor_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredor_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredor.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredor_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredor.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredor.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vredxor.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vredxor_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredxor_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredxor.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredxor_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredxor.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredxor.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vredminu.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vredminu_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredminu_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredminu.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredminu_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredminu.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredminu.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vredmin.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define 
@intrinsic_vredmin_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredmin_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredmin.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredmin_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredmin.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredmin.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vredmaxu.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vredmaxu_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredmaxu_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredmaxu.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredmaxu_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredmaxu.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredmaxu.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vredmax.nxv8i8.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vredmax_vs_nxv8i8_nxv1i8_nxv8i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vredmax_vs_nxv8i8_nxv1i8_nxv8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vredmax.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vredmax_vs_nxv8i8_nxv1i8_nxv8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vredmax.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vredmax.nxv8i8.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vwredsumu.nxv4i16.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vwredsumu_vs_nxv4i16_nxv1i8_nxv4i16( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: 
intrinsic_vwredsumu_vs_nxv4i16_nxv1i8_nxv4i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vwredsumu_vs_nxv4i16_nxv1i8_nxv4i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vwredsumu.nxv4i16.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vwredsum.nxv4i16.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vwredsum_vs_nxv4i16_nxv1i8_nxv4i16( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vwredsum_vs_nxv4i16_nxv1i8_nxv4i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vwredsum_vs_nxv4i16_nxv1i8_nxv4i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vwredsum.nxv4i16.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vfredosum.nxv4f16.nxv1f16( + , + , + , + iXLen); + +define @intrinsic_vfredosum_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv1f16_nxv4f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfredosum.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfredosum_vs_nxv4f16_nxv1f16_nxv4f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfredosum.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfredosum.nxv4f16.nxv1f16( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vfredusum.nxv4f16.nxv1f16( + , + , + , + iXLen); + +define @intrinsic_vfredusum_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: 
intrinsic_vfredusum_vs_nxv4f16_nxv1f16_nxv4f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfredusum.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfredusum_vs_nxv4f16_nxv1f16_nxv4f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfredusum.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfredusum.nxv4f16.nxv1f16( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vfredmax.nxv4f16.nxv1f16( + , + , + , + iXLen); + +define @intrinsic_vfredmax_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vfredmax_vs_nxv4f16_nxv1f16_nxv4f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfredmax.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfredmax_vs_nxv4f16_nxv1f16_nxv4f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfredmax.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfredmax.nxv4f16.nxv1f16( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vfredmin.nxv4f16.nxv1f16( + , + , + , + iXLen); + +define @intrinsic_vfredmin_vs_nxv4f16_nxv1f16_nxv4f16( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vfredmin_vs_nxv4f16_nxv1f16_nxv4f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfredmin.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfredmin_vs_nxv4f16_nxv1f16_nxv4f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfredmin.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfredmin.nxv4f16.nxv1f16( + undef, + %0, + %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vfwredosum.nxv2f32.nxv1f16( + , + , + , + iXLen); + +define @intrinsic_vfwredosum_vs_nxv2f32_nxv1f16_nxv2f32( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: 
intrinsic_vfwredosum_vs_nxv2f32_nxv1f16_nxv2f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfwredosum.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfwredosum_vs_nxv2f32_nxv1f16_nxv2f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfwredosum.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfwredosum.nxv2f32.nxv1f16( + undef, + %0, + %1, + iXLen %2) + + ret %a +} +declare @llvm.riscv.vfwredusum.nxv2f32.nxv1f16( + , + , + , + iXLen); + +define @intrinsic_vfwredusum_vs_nxv2f32_nxv1f16_nxv2f32( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv1f16_nxv2f32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfwredusum.vs v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfwredusum_vs_nxv2f32_nxv1f16_nxv2f32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfwredusum.vs v8, v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfwredusum.nxv2f32.nxv1f16( + undef, + %0, + %1, + iXLen %2) + + ret %a +} From c6a3225bb03b6afc2b63fbf13db3c100406b32ce Mon Sep 17 00:00:00 2001 From: Zakk Chen Date: Wed, 16 Feb 2022 20:49:25 -0800 Subject: [PATCH 201/748] [RISCV][NFC] Add some tail agnostic tests for nomask operations. Improve test coverage for tail agnostic nomask vslidedown/up, vmv.s.x vfmv.s.f and vcompress. 
Reviewed By: rogfer01 Differential Revision: https://reviews.llvm.org/D120008 --- llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll | 134 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vcompress-rv32.ll | 18 --- llvm/test/CodeGen/RISCV/rvv/vcompress-rv64.ll | 18 --- 3 files changed, 134 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll index d220ba899b7fa..65b225ede7152 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll @@ -1054,3 +1054,137 @@ entry: ret %a } + +declare @llvm.riscv.vslidedown.nxv1i8( + , + , + iXLen, + iXLen); + +define @intrinsic_vslidedown_vx_nxv1i8_nxv1i8( %0, iXLen %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vslidedown_vx_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; RV32-NEXT: vslidedown.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vslidedown_vx_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; RV64-NEXT: vslidedown.vx v8, v8, a0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vslidedown.nxv1i8( + undef, + %0, + iXLen %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vslideup.nxv1i8( + , + , + iXLen, + iXLen); + +define @intrinsic_vslideup_vx_nxv1i8_nxv1i8( %0, iXLen %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vslideup_vx_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; RV32-NEXT: vslideup.vx v9, v8, a0 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vslideup_vx_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; RV64-NEXT: vslideup.vx v9, v8, a0 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vslideup.nxv1i8( + undef, + %0, + iXLen %1, + iXLen %2) + + ret %a +} + +declare @llvm.riscv.vmv.s.x.nxv1i64(, i64, iXLen); + +define @intrinsic_vmv.s.x_x_nxv1i64(i64 
%0, iXLen %1) nounwind { +; RV32-LABEL: intrinsic_vmv.s.x_x_nxv1i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v8, (a0), zero +; RV32-NEXT: vid.v v9 +; RV32-NEXT: vmseq.vi v0, v9, 0 +; RV32-NEXT: vmerge.vvm v8, v8, v8, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmv.s.x_x_nxv1i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, a0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vmv.s.x.nxv1i64( undef, i64 %0, iXLen %1) + ret %a +} + +declare @llvm.riscv.vfmv.s.f.nxv1f16(, half, iXLen) + +define @intrinsic_vfmv.s.f_f_nxv1f16(half %0, iXLen %1) nounwind { +; RV32-LABEL: intrinsic_vfmv.s.f_f_nxv1f16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV32-NEXT: vfmv.s.f v8, fa0 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vfmv.s.f_f_nxv1f16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; RV64-NEXT: vfmv.s.f v8, fa0 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vfmv.s.f.nxv1f16( undef, half %0, iXLen %1) + ret %a +} + +declare @llvm.riscv.vcompress.nxv1i8( + , + , + , + iXLen); + +define @intrinsic_vcompress_um_nxv1i8_nxv1i8( %0, %1, iXLen %2) nounwind { +; RV32-LABEL: intrinsic_vcompress_um_nxv1i8_nxv1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vcompress_um_nxv1i8_nxv1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, mu +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret +entry: + %a = call @llvm.riscv.vcompress.nxv1i8( + undef, + %0, + %1, + iXLen %2) + + ret %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vcompress-rv32.ll 
b/llvm/test/CodeGen/RISCV/rvv/vcompress-rv32.ll index 5a55268171f23..aa42e49f37bf4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcompress-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcompress-rv32.ll @@ -814,21 +814,3 @@ entry: ret %a } - -; Test with undef for the dest operand. This should use tail agnostic policy. -define @intrinsic_vcompress_um_nxv1i8_nxv1i8( %0, %1, i32 %2) nounwind { -; CHECK-LABEL: intrinsic_vcompress_um_nxv1i8_nxv1i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vcompress.vm v9, v8, v0 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret -entry: - %a = call @llvm.riscv.vcompress.nxv1i8( - undef, - %0, - %1, - i32 %2) - - ret %a -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vcompress-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vcompress-rv64.ll index d77f0da267e55..51924e126b8e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcompress-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcompress-rv64.ll @@ -814,21 +814,3 @@ entry: ret %a } - -; Test with undef for the dest operand. This should use tail agnostic policy. 
-define @intrinsic_vcompress_um_nxv1i8_nxv1i8( %0, %1, i64 %2) nounwind { -; CHECK-LABEL: intrinsic_vcompress_um_nxv1i8_nxv1i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vcompress.vm v9, v8, v0 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret -entry: - %a = call @llvm.riscv.vcompress.nxv1i8( - undef, - %0, - %1, - i64 %2) - - ret %a -} From a83e08b4506d971c724135519d1509570c570889 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 18 Feb 2022 01:15:14 +0100 Subject: [PATCH 202/748] [mlir][Vector] Add InferTypeOpInterface to vector dialect and remove the now redundant builders --- mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 7 +------ mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 9 +-------- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 5 +++-- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 66d4a69593358..c9370a07ab7fa 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -13,6 +13,7 @@ #ifndef VECTOR_OPS #define VECTOR_OPS +include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/VectorInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" @@ -675,10 +676,6 @@ def Vector_FMAOp : ``` }]; let assemblyFormat = "$lhs `,` $rhs `,` $acc attr-dict `:` type($lhs)"; - let builders = [ - OpBuilder<(ins "Value":$lhs, "Value":$rhs, "Value":$acc), - [{build($_builder, $_state, lhs.getType(), lhs, rhs, acc);}]> - ]; let extraClassDeclaration = [{ VectorType getVectorType() { return lhs().getType().cast(); } }]; @@ -721,8 +718,6 @@ def Vector_InsertElementOp : let builders = [ // 0-D builder. OpBuilder<(ins "Value":$source, "Value":$dest)>, - // 1-D + position builder. 
- OpBuilder<(ins "Value":$source, "Value":$dest, "Value":$position)> ]; let extraClassDeclaration = [{ Type getSourceType() { return source().getType(); } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 4db150927fae5..fc472f848b46e 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -1837,14 +1837,7 @@ OpFoldResult vector::ShuffleOp::fold(ArrayRef operands) { void InsertElementOp::build(OpBuilder &builder, OperationState &result, Value source, Value dest) { - result.addOperands({source, dest}); - result.addTypes(dest.getType()); -} - -void InsertElementOp::build(OpBuilder &builder, OperationState &result, - Value source, Value dest, Value position) { - result.addOperands({source, dest, position}); - result.addTypes(dest.getType()); + build(builder, result, source, dest, {}); } LogicalResult InsertElementOp::verify() { diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 32d78f225a05b..4fb2c01892f3f 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2174,7 +2174,7 @@ cc_library( ":StandardOps", ":Support", ":TransformUtils", - ":Transforms", + ":Transforms", ":TransformsPassIncGen", "//llvm:Core", "//llvm:Support", @@ -3616,8 +3616,8 @@ cc_library( ":ArithmeticToLLVM", ":Async", ":AsyncToLLVM", - ":ConversionPassIncGen", ":ControlFlowToLLVM", + ":ConversionPassIncGen", ":GPUDialect", ":GPUTransforms", ":IR", @@ -7143,6 +7143,7 @@ td_library( srcs = ["include/mlir/Dialect/Vector/IR/VectorOps.td"], includes = ["include"], deps = [ + ":InferTypeOpInterfaceTdFiles", ":OpBaseTdFiles", ":SideEffectInterfacesTdFiles", ":VectorInterfacesTdFiles", From f0dd818be389e193cbe5817996e434696f37a702 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 18 Feb 2022 01:35:25 +0100 Subject: [PATCH 203/748] [mlir][Vector] Switch ShuffleOp 
to the declarative assembly format This also requires implementing return type deduction. --- .../mlir/Dialect/Vector/IR/VectorOps.td | 5 +- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 55 +++++-------------- mlir/test/Dialect/Vector/invalid.mlir | 2 +- 3 files changed, 19 insertions(+), 43 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index c9370a07ab7fa..1e16dbbb97295 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -447,7 +447,8 @@ def Vector_ShuffleOp : PredOpTrait<"first operand v1 and result have same element type", TCresVTEtIsSameAsOpBase<0, 0>>, PredOpTrait<"second operand v2 and result have same element type", - TCresVTEtIsSameAsOpBase<0, 1>>]>, + TCresVTEtIsSameAsOpBase<0, 1>>, + DeclareOpInterfaceMethods]>, Arguments<(ins AnyVector:$v1, AnyVector:$v2, I64ArrayAttr:$mask)>, Results<(outs AnyVector:$vector)> { let summary = "shuffle operation"; @@ -496,7 +497,7 @@ def Vector_ShuffleOp : return vector().getType().cast(); } }]; - let hasCustomAssemblyFormat = 1; + let assemblyFormat = "operands $mask attr-dict `:` type(operands)"; let hasVerifier = 1; } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index fc472f848b46e..560746453a079 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -1723,19 +1723,7 @@ void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &results, void ShuffleOp::build(OpBuilder &builder, OperationState &result, Value v1, Value v2, ArrayRef mask) { - result.addOperands({v1, v2}); - auto maskAttr = getVectorSubscriptAttr(builder, mask); - auto v1Type = v1.getType().cast(); - auto shape = llvm::to_vector<4>(v1Type.getShape()); - shape[0] = mask.size(); - result.addTypes(VectorType::get(shape, v1Type.getElementType())); - result.addAttribute(getMaskAttrStrName(), maskAttr); -} - -void 
ShuffleOp::print(OpAsmPrinter &p) { - p << " " << v1() << ", " << v2() << " " << mask(); - p.printOptionalAttrDict((*this)->getAttrs(), {ShuffleOp::getMaskAttrName()}); - p << " : " << v1().getType() << ", " << v2().getType(); + build(builder, result, v1, v2, getVectorSubscriptAttr(builder, mask)); } LogicalResult ShuffleOp::verify() { @@ -1759,6 +1747,8 @@ LogicalResult ShuffleOp::verify() { // Verify mask length. auto maskAttr = mask().getValue(); int64_t maskLength = maskAttr.size(); + if (maskLength <= 0) + return emitOpError("invalid mask length"); if (maskLength != resultType.getDimSize(0)) return emitOpError("mask length mismatch"); // Verify all indices. @@ -1771,36 +1761,21 @@ LogicalResult ShuffleOp::verify() { return success(); } -ParseResult ShuffleOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::OperandType v1, v2; - Attribute attr; - VectorType v1Type, v2Type; - if (parser.parseOperand(v1) || parser.parseComma() || - parser.parseOperand(v2) || - parser.parseAttribute(attr, ShuffleOp::getMaskAttrStrName(), - result.attributes) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(v1Type) || parser.parseComma() || - parser.parseType(v2Type) || - parser.resolveOperand(v1, v1Type, result.operands) || - parser.resolveOperand(v2, v2Type, result.operands)) - return failure(); +LogicalResult +ShuffleOp::inferReturnTypes(MLIRContext *, Optional, + ValueRange operands, DictionaryAttr attributes, + RegionRange, + SmallVectorImpl &inferredReturnTypes) { + ShuffleOp::Adaptor op(operands, attributes); + auto v1Type = op.v1().getType().cast(); // Construct resulting type: leading dimension matches mask length, // all trailing dimensions match the operands. 
- auto maskAttr = attr.dyn_cast(); - if (!maskAttr) - return parser.emitError(parser.getNameLoc(), "missing mask attribute"); - int64_t maskLength = maskAttr.size(); - if (maskLength <= 0) - return parser.emitError(parser.getNameLoc(), "invalid mask length"); - int64_t v1Rank = v1Type.getRank(); SmallVector shape; - shape.reserve(v1Rank); - shape.push_back(maskLength); - for (int64_t r = 1; r < v1Rank; ++r) - shape.push_back(v1Type.getDimSize(r)); - VectorType resType = VectorType::get(shape, v1Type.getElementType()); - parser.addTypeToList(resType, result.types); + shape.reserve(v1Type.getRank()); + shape.push_back(std::max(1, op.mask().size())); + llvm::append_range(shape, v1Type.getShape().drop_front()); + inferredReturnTypes.push_back( + VectorType::get(shape, v1Type.getElementType())); return success(); } diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 54697d1136ebf..bc75e0bbe8b2e 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -73,7 +73,7 @@ func @shuffle_index_out_of_range(%arg0: vector<2xf32>, %arg1: vector<2xf32>) { // ----- func @shuffle_empty_mask(%arg0: vector<2xf32>, %arg1: vector<2xf32>) { - // expected-error@+1 {{'vector.shuffle' invalid mask length}} + // expected-error@+1 {{'vector.shuffle' op invalid mask length}} %1 = vector.shuffle %arg0, %arg1 [] : vector<2xf32>, vector<2xf32> } From d9da6a535f21946cfaac1516ef28ac7646211d56 Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Sat, 12 Feb 2022 16:33:22 -0500 Subject: [PATCH 204/748] [LICM][PhaseOrder] Don't speculate in LICM until after running loop rotate LICM will speculatively hoist code outside of loops. This requires removing information, like alias analysis (https://github.com/llvm/llvm-project/issues/53794), range information (https://bugs.llvm.org/show_bug.cgi?id=50550), among others. Prior to https://reviews.llvm.org/D99249 , LICM would only be run after LoopRotate. 
Running Loop Rotate prior to LICM prevents a instruction hoist from being speculative, if it was conditionally executed by the iteration (as is commonly emitted by clang and other frontends). Adding the additional LICM pass first, however, forces all of these instructions to be considered speculative, even if they are not speculative after LoopRotate. This destroys information, resulting in performance losses for discarding this additional information. This PR modifies LICM to accept a ``speculative'' parameter which allows LICM to be set to perform information-loss speculative hoists or not. Phase ordering is then modified to not perform the information-losing speculative hoists until after loop rotate is performed, preserving this additional information. Reviewed By: lebedev.ri Differential Revision: https://reviews.llvm.org/D119965 --- llvm/include/llvm/Transforms/Scalar.h | 3 +- llvm/include/llvm/Transforms/Scalar/LICM.h | 20 ++++-- .../include/llvm/Transforms/Utils/LoopUtils.h | 9 ++- llvm/lib/Passes/PassBuilderPipelines.cpp | 34 ++++++--- .../lib/Transforms/IPO/PassManagerBuilder.cpp | 26 ++++--- llvm/lib/Transforms/Scalar/LICM.cpp | 71 +++++++++--------- .../AArch64/runtime-unroll-generic.ll | 20 +++--- .../AArch64/matrix-extract-insert.ll | 12 ++-- .../X86/hoist-load-of-baseptr.ll | 20 +++--- .../PhaseOrdering/X86/speculation-vs-tbaa.ll | 72 ++++++++++--------- .../PhaseOrdering/X86/spurious-peeling.ll | 14 ++-- 11 files changed, 177 insertions(+), 124 deletions(-) diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index d6228700aa9ac..4d6874f784efb 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -133,7 +133,8 @@ Pass *createIndVarSimplifyPass(); // Pass *createLICMPass(); Pass *createLICMPass(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap); + unsigned LicmMssaNoAccForPromotionCap, + bool AllowSpeculation); 
//===----------------------------------------------------------------------===// // diff --git a/llvm/include/llvm/Transforms/Scalar/LICM.h b/llvm/include/llvm/Transforms/Scalar/LICM.h index 751f75c0ccb24..503c8792d3092 100644 --- a/llvm/include/llvm/Transforms/Scalar/LICM.h +++ b/llvm/include/llvm/Transforms/Scalar/LICM.h @@ -46,14 +46,18 @@ extern cl::opt SetLicmMssaNoAccForPromotionCap; class LICMPass : public PassInfoMixin { unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; public: LICMPass() : LicmMssaOptCap(SetLicmMssaOptCap), - LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {} - LICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) + LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(true) {} + LICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(LicmAllowSpeculation) {} PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; @@ -62,14 +66,18 @@ class LICMPass : public PassInfoMixin { class LNICMPass : public PassInfoMixin { unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; public: LNICMPass() : LicmMssaOptCap(SetLicmMssaOptCap), - LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {} - LNICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) + LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(true) {} + LNICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + 
LicmAllowSpeculation(LicmAllowSpeculation) {} PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 3a712d78df671..134f8bcfd8886 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -171,10 +171,13 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *, /// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all /// instructions of the loop and loop safety information as arguments. /// Diagnostics is emitted via \p ORE. It returns changed status. +/// \p AllowSpeculation is whether values should be hoisted even if they are not +/// guaranteed to execute in the loop, but are safe to speculatively execute. bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, BlockFrequencyInfo *, TargetLibraryInfo *, Loop *, MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool); + SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool, + bool AllowSpeculation); /// This function deletes dead loops. The caller of this function needs to /// guarantee that the loop is infact dead. @@ -204,12 +207,14 @@ void breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, /// LoopInfo, DominatorTree, Loop, AliasSet information for all instructions /// of the loop and loop safety information as arguments. /// Diagnostics is emitted via \p ORE. It returns changed status. +/// \p AllowSpeculation is whether values should be hoisted even if they are not +/// guaranteed to execute in the loop, but are safe to speculatively execute. 
bool promoteLoopAccessesToScalars( const SmallSetVector &, SmallVectorImpl &, SmallVectorImpl &, SmallVectorImpl &, PredIteratorCache &, LoopInfo *, DominatorTree *, const TargetLibraryInfo *, Loop *, MemorySSAUpdater *, ICFLoopSafetyInfo *, - OptimizationRemarkEmitter *); + OptimizationRemarkEmitter *, bool AllowSpeculation); /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 2aba7ef262340..e838665eb9ce9 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -293,14 +293,19 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass(LoopSimplifyCFGPass()); // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, isLTOPreLink(Phase))); // TODO: Investigate promotion cap for O1. 
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM1.addPass(SimpleLoopUnswitchPass()); if (EnableLoopFlatten) LPM1.addPass(LoopFlattenPass()); @@ -470,15 +475,20 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass(LoopSimplifyCFGPass()); // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); // Disable header duplication in loop rotation at -Oz. LPM1.addPass( LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); // TODO: Investigate promotion cap for O1. 
- LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM1.addPass( SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && EnableO3NonTrivialUnswitching)); @@ -575,7 +585,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(DSEPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(CoroElidePass()); @@ -1019,7 +1030,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, ExtraPasses.addPass(CorrelatedValuePropagationPass()); ExtraPasses.addPass(InstCombinePass()); LoopPassManager LPM; - LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); ExtraPasses.addPass( @@ -1087,7 +1099,8 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass( RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); } @@ -1627,7 +1640,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, FunctionPassManager MainFPM; MainFPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); if (RunNewGVN) diff 
--git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index d13eedf80e3bf..6e5aeb9c41f6f 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -458,13 +458,18 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLoopSimplifyCFGPass()); } // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO)); // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); if (EnableSimpleLoopUnswitch) MPM.add(createSimpleLoopUnswitchLegacyPass()); else @@ -529,7 +534,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // TODO: Investigate if this is too expensive at O1. 
if (OptLevel > 1) { MPM.add(createDeadStoreEliminationPass()); // Delete dead stores - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } addExtensionsToPM(EP_ScalarOptimizerLate, MPM); @@ -588,7 +594,8 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, PM.add(createEarlyCSEPass()); PM.add(createCorrelatedValuePropagationPass()); PM.add(createInstructionCombiningPass()); - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); PM.add(createCFGSimplificationPass( SimplifyCFGOptions().convertSwitchRangeToICmp(true))); @@ -651,7 +658,8 @@ void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, // unrolled loop is a inner loop, then the prologue will be inside the // outer loop. LICM pass can help to promote the runtime check out if the // checked value is loop invariant. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } PM.add(createWarnMissedTransformationsPass()); @@ -898,7 +906,8 @@ void PassManagerBuilder::populateModulePassManager( // later might get benefit of no-alias assumption in clone loop. if (UseLoopVersioningLICM) { MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } // We add a fresh GlobalsModRef run at this point. 
This is particularly @@ -1133,7 +1142,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Run a few AA driven optimizations here and now, to cleanup the code. PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); PM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 7fb1a25bdf13e..6372ce19f8eec 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -149,13 +149,11 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); -static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI = nullptr); +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation); static bool pointerInvalidatedByLoop(MemoryLocation MemLoc, AliasSetTracker *CurAST, Loop *CurLoop, AAResults *AA); @@ -188,21 +186,26 @@ struct LoopInvariantCodeMotion { OptimizationRemarkEmitter *ORE, bool LoopNestMode = false); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - 
LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(LicmAllowSpeculation) {} private: unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; }; struct LegacyLICMPass : public LoopPass { static char ID; // Pass identification, replacement for typeid LegacyLICMPass( unsigned LicmMssaOptCap = SetLicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap) - : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) { + unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation = true) + : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation) { initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry()); } @@ -265,7 +268,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE)) return PreservedAnalyses::all(); @@ -290,7 +294,8 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). 
OptimizationRemarkEmitter ORE(LN.getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); Loop &OutermostLoop = LN.getOutermostLoop(); bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, AR.BFI, @@ -321,8 +326,10 @@ INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } Pass *llvm::createLICMPass(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) { - return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) { + return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); } llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(bool IsSink, Loop *L, @@ -418,7 +425,8 @@ bool LoopInvariantCodeMotion::runOnLoop( Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, - &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode); + &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, + LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. 
@@ -460,8 +468,8 @@ bool LoopInvariantCodeMotion::runOnLoop( for (const SmallSetVector &PointerMustAliases : collectPromotionCandidates(MSSA, AA, L)) { LocalPromoted |= promoteLoopAccessesToScalars( - PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, - LI, DT, TLI, L, &MSSAU, &SafetyInfo, ORE); + PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, + DT, TLI, L, &MSSAU, &SafetyInfo, ORE, LicmAllowSpeculation); } Promoted |= LocalPromoted; } while (LocalPromoted); @@ -825,7 +833,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, MemorySSAUpdater *MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, - OptimizationRemarkEmitter *ORE, bool LoopNestMode) { + OptimizationRemarkEmitter *ORE, bool LoopNestMode, + bool AllowSpeculation) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && @@ -877,7 +886,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, true, &Flags, ORE) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, - CurLoop->getLoopPreheader()->getTerminator())) { + CurLoop->getLoopPreheader()->getTerminator(), AllowSpeculation)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(&I); @@ -1774,14 +1783,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, /// Only sink or hoist an instruction if it is not a trapping instruction, /// or if the instruction is known not to trap when moved to the preheader. /// or if it is a trapping instruction and is guaranteed to execute. 
-static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI) { - if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation) { + if (AllowSpeculation && isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) return true; bool GuaranteedToExecute = @@ -1949,7 +1956,7 @@ bool llvm::promoteLoopAccessesToScalars( SmallVectorImpl &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE) { + OptimizationRemarkEmitter *ORE, bool AllowSpeculation) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -2054,9 +2061,9 @@ bool llvm::promoteLoopAccessesToScalars( // to execute does as well. Thus we can increase our guaranteed // alignment as well. 
if (!DereferenceableInPH || (InstAlignment > Alignment)) - if (isSafeToExecuteUnconditionally(*Load, DT, TLI, CurLoop, - SafetyInfo, ORE, - Preheader->getTerminator())) { + if (isSafeToExecuteUnconditionally( + *Load, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AllowSpeculation)) { DereferenceableInPH = true; Alignment = std::max(Alignment, InstAlignment); } diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll index 4f0c07343bda8..428eccc2761bf 100644 --- a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll @@ -6,21 +6,21 @@ define void @runtime_unroll_generic(i32 %arg_0, i32* %arg_1, i16* %arg_2, i16* %arg_3) { ; CHECK-A55-LABEL: @runtime_unroll_generic( ; CHECK-A55-NEXT: entry: +; CHECK-A55-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 +; CHECK-A55-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_LR_PH:%.*]] +; CHECK-A55: for.body6.lr.ph: ; CHECK-A55-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[ARG_2:%.*]], i64 undef ; CHECK-A55-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[ARG_3:%.*]], i64 undef ; CHECK-A55-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[ARG_1:%.*]], i64 undef -; CHECK-A55-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 -; CHECK-A55-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]] -; CHECK-A55: for.body6.preheader: ; CHECK-A55-NEXT: [[TMP0:%.*]] = add i32 [[ARG_0]], -1 ; CHECK-A55-NEXT: [[XTRAITER:%.*]] = and i32 [[ARG_0]], 3 ; CHECK-A55-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3 -; CHECK-A55-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY6_PREHEADER_NEW:%.*]] -; CHECK-A55: for.body6.preheader.new: +; CHECK-A55-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label 
[[FOR_BODY6_LR_PH_NEW:%.*]] +; CHECK-A55: for.body6.lr.ph.new: ; CHECK-A55-NEXT: [[UNROLL_ITER:%.*]] = and i32 [[ARG_0]], -4 ; CHECK-A55-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-A55: for.body6: -; CHECK-A55-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_BODY6_LR_PH_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ] ; CHECK-A55-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 ; CHECK-A55-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 ; CHECK-A55-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 @@ -93,13 +93,15 @@ define void @runtime_unroll_generic(i32 %arg_0, i32* %arg_1, i16* %arg_2, i16* % ; ; CHECK-GENERIC-LABEL: @runtime_unroll_generic( ; CHECK-GENERIC-NEXT: entry: +; CHECK-GENERIC-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 +; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_LR_PH:%.*]] +; CHECK-GENERIC: for.body6.lr.ph: ; CHECK-GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[ARG_2:%.*]], i64 undef ; CHECK-GENERIC-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[ARG_3:%.*]], i64 undef ; CHECK-GENERIC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[ARG_1:%.*]], i64 undef -; CHECK-GENERIC-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 -; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6:%.*]] +; CHECK-GENERIC-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-GENERIC: for.body6: -; CHECK-GENERIC-NEXT: [[K_03:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-GENERIC-NEXT: [[K_03:%.*]] = phi i32 [ 0, [[FOR_BODY6_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY6]] ] ; CHECK-GENERIC-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 ; CHECK-GENERIC-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 ; CHECK-GENERIC-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 diff --git 
a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index 33cd004d43482..eee888d2e18c0 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -88,10 +88,10 @@ entry: define void @matrix_extract_insert_loop(i32 %i, [225 x double]* nonnull align 8 dereferenceable(1800) %A, [225 x double]* nonnull align 8 dereferenceable(1800) %B) { ; CHECK-LABEL: @matrix_extract_insert_loop( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP212_NOT:%.*]] = icmp eq i32 [[I:%.*]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast [225 x double]* [[A:%.*]] to <225 x double>* -; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[I:%.*]] to i64 +; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>* -; CHECK-NEXT: [[CMP212_NOT:%.*]] = icmp eq i32 [[I]], 0 ; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]] ; CHECK: for.cond1.preheader.us: ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[I]], 225 @@ -145,8 +145,8 @@ define void @matrix_extract_insert_loop(i32 %i, [225 x double]* nonnull align 8 ; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]] ; CHECK: for.body4.us.2: ; CHECK-NEXT: [[K_013_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ] -; CHECK-NEXT: [[NARROW16:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30 -; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[NARROW16]] to i64 +; CHECK-NEXT: [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30 +; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[NARROW17]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = icmp ult i32 [[K_013_US_2]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]]) ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP17]] @@ -168,8 +168,8 @@ 
define void @matrix_extract_insert_loop(i32 %i, [225 x double]* nonnull align 8 ; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]] ; CHECK: for.body4.us.3: ; CHECK-NEXT: [[K_013_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ] -; CHECK-NEXT: [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45 -; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[NARROW17]] to i64 +; CHECK-NEXT: [[NARROW18:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45 +; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[NARROW18]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = icmp ult i32 [[K_013_US_3]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP25]]) ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP24]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll index 20c3cb029d2ae..9b602750974af 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll @@ -44,15 +44,15 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* nound ; OLDPM_O2-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; OLDPM_O2-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; OLDPM_O2-NEXT: entry: -; OLDPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; OLDPM_O2-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; OLDPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; OLDPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; 
OLDPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 ; OLDPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] ; OLDPM_O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; OLDPM_O2: for.cond1.preheader: ; OLDPM_O2-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; OLDPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O2-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER:%.*]] ; OLDPM_O2: for.body4.preheader: ; OLDPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER11:%.*]], label [[VECTOR_BODY:%.*]] @@ -97,8 +97,9 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* nound ; OLDPM_O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; OLDPM_O3-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; OLDPM_O3-NEXT: entry: -; OLDPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; OLDPM_O3-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; OLDPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; OLDPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O3-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] ; OLDPM_O3: for.cond1.preheader.us.preheader: ; OLDPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 @@ -107,7 +108,6 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* nound ; OLDPM_O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; OLDPM_O3: for.cond1.preheader.us: ; OLDPM_O3-NEXT: [[I_08_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, 
[[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; OLDPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] ; OLDPM_O3: vector.body: ; OLDPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] @@ -150,12 +150,12 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* nound ; NEWPM_O1-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; NEWPM_O1-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; NEWPM_O1-NEXT: entry: -; NEWPM_O1-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; NEWPM_O1-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O1-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O1-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; NEWPM_O1: for.cond1.preheader: ; NEWPM_O1-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; NEWPM_O1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O1-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4:%.*]] ; NEWPM_O1: for.cond.cleanup: ; NEWPM_O1-NEXT: ret void @@ -176,15 +176,15 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* nound ; NEWPM_O2-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; NEWPM_O2-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; NEWPM_O2-NEXT: entry: -; NEWPM_O2-NEXT: 
[[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; NEWPM_O2-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; NEWPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 ; NEWPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] ; NEWPM_O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; NEWPM_O2: for.cond1.preheader: ; NEWPM_O2-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; NEWPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O2-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER:%.*]] ; NEWPM_O2: for.body4.preheader: ; NEWPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER11:%.*]], label [[VECTOR_BODY:%.*]] @@ -229,8 +229,9 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* nound ; NEWPM_O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; NEWPM_O3-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; NEWPM_O3-NEXT: entry: -; NEWPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; NEWPM_O3-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O3-NEXT: br i1 [[CMP26_NOT]], label 
[[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] ; NEWPM_O3: for.cond1.preheader.us.preheader: ; NEWPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 @@ -239,7 +240,6 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(%"class.std::vector"* nound ; NEWPM_O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; NEWPM_O3: for.cond1.preheader.us: ; NEWPM_O3-NEXT: [[I_08_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; NEWPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] ; NEWPM_O3: vector.body: ; NEWPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll index 3ac9104708405..41dbdcb0c5bd3 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll @@ -14,13 +14,15 @@ target triple = "x86_64-unknown-linux-gnu" define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) { ; OLDPM_O1-LABEL: @licm( ; OLDPM_O1-NEXT: entry: -; OLDPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; OLDPM_O1-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; OLDPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; OLDPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; OLDPM_O1: for.body.lr.ph: +; OLDPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] +; OLDPM_O1-NEXT: br label [[FOR_BODY:%.*]] ; OLDPM_O1: for.body: -; OLDPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 
0, [[ENTRY:%.*]] ] +; OLDPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; OLDPM_O1-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; OLDPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3:![0-9]+]] +; OLDPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; OLDPM_O1-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; OLDPM_O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] ; OLDPM_O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] @@ -29,12 +31,12 @@ define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) ; ; OLDPM_O23-LABEL: @licm( ; OLDPM_O23-NEXT: entry: -; OLDPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; OLDPM_O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; OLDPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; OLDPM_O23: for.body.preheader: +; OLDPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; OLDPM_O23: for.body.lr.ph: +; OLDPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] ; OLDPM_O23-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; OLDPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] +; OLDPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] ; OLDPM_O23: vector.ph: ; OLDPM_O23-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 ; OLDPM_O23-NEXT: br label [[VECTOR_BODY:%.*]] @@ -42,38 +44,40 @@ define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) ; OLDPM_O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; OLDPM_O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, 
double* [[TMP0]], i64 [[INDEX]] ; OLDPM_O23-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] +; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA8:![0-9]+]] ; OLDPM_O23-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 ; OLDPM_O23-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] +; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA8]] ; OLDPM_O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; OLDPM_O23-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; OLDPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; OLDPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; OLDPM_O23: middle.block: ; OLDPM_O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; OLDPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; OLDPM_O23: for.body.preheader3: -; OLDPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; OLDPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER]] +; OLDPM_O23: for.body.preheader: +; OLDPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; OLDPM_O23-NEXT: br label [[FOR_BODY:%.*]] ; OLDPM_O23: for.body: -; OLDPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] +; OLDPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER]] ] ; OLDPM_O23-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; OLDPM_O23-NEXT: store double 
2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] +; OLDPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8]] ; OLDPM_O23-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; OLDPM_O23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; OLDPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; OLDPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; OLDPM_O23: for.cond.cleanup: ; OLDPM_O23-NEXT: ret void ; ; NEWPM_O1-LABEL: @licm( ; NEWPM_O1-NEXT: entry: -; NEWPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; NEWPM_O1-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; NEWPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; NEWPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; NEWPM_O1: for.body.lr.ph: +; NEWPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] +; NEWPM_O1-NEXT: br label [[FOR_BODY:%.*]] ; NEWPM_O1: for.body: -; NEWPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; NEWPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; NEWPM_O1-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; NEWPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3:![0-9]+]] +; NEWPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O1-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; NEWPM_O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] ; NEWPM_O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] @@ -82,12 +86,12 @@ define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) ; ; 
NEWPM_O23-LABEL: @licm( ; NEWPM_O23-NEXT: entry: -; NEWPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; NEWPM_O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; NEWPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; NEWPM_O23: for.body.preheader: +; NEWPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; NEWPM_O23: for.body.lr.ph: +; NEWPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] ; NEWPM_O23-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; NEWPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] +; NEWPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] ; NEWPM_O23: vector.ph: ; NEWPM_O23-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 ; NEWPM_O23-NEXT: br label [[VECTOR_BODY:%.*]] @@ -95,26 +99,26 @@ define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) ; NEWPM_O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; NEWPM_O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] ; NEWPM_O23-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] +; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O23-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 ; NEWPM_O23-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] +; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA8]] ; NEWPM_O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; NEWPM_O23-NEXT: [[TMP5:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NEWPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NEWPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; NEWPM_O23: middle.block: ; NEWPM_O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; NEWPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; NEWPM_O23: for.body.preheader3: -; NEWPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; NEWPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER]] +; NEWPM_O23: for.body.preheader: +; NEWPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; NEWPM_O23-NEXT: br label [[FOR_BODY:%.*]] ; NEWPM_O23: for.body: -; NEWPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] +; NEWPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER]] ] ; NEWPM_O23-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; NEWPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] +; NEWPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8]] ; NEWPM_O23-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; NEWPM_O23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; NEWPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NEWPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; NEWPM_O23: for.cond.cleanup: ; NEWPM_O23-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll index 
f75c7d6ea1316..c37bd29041575 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll @@ -32,10 +32,10 @@ define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(%class.FloatVecPair* %FV ; OLDPM_O23-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 ; OLDPM_O23-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] ; OLDPM_O23: for.body7.lr.ph.i: -; OLDPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; OLDPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 ; OLDPM_O23-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I6_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; OLDPM_O23-NEXT: [[ARRAYIDX_I7_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; OLDPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; OLDPM_O23-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I4_I]], align 8, !tbaa [[TBAA0]] ; OLDPM_O23-NEXT: [[BASE_I2_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 ; OLDPM_O23-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I2_I]], align 8, !tbaa [[TBAA8]] @@ -64,10 +64,10 @@ define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(%class.FloatVecPair* %FV ; NEWPM_O1-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 ; NEWPM_O1-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] ; NEWPM_O1: for.body7.lr.ph.i: -; NEWPM_O1-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O1-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds 
[[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 ; NEWPM_O1-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I4_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O1-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; NEWPM_O1-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O1-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I6_I]], align 8, !tbaa [[TBAA0]] ; NEWPM_O1-NEXT: [[BASE_I8_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 ; NEWPM_O1-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I8_I]], align 8, !tbaa [[TBAA8]] @@ -95,10 +95,10 @@ define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(%class.FloatVecPair* %FV ; NEWPM_O23-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 ; NEWPM_O23-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] ; NEWPM_O23: for.body7.lr.ph.i: -; NEWPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 ; NEWPM_O23-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I4_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O23-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; NEWPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O23-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I6_I]], align 8, !tbaa [[TBAA0]] ; NEWPM_O23-NEXT: [[BASE_I8_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 ; NEWPM_O23-NEXT: 
[[TMP4:%.*]] = load float*, float** [[BASE_I8_I]], align 8, !tbaa [[TBAA8]] @@ -130,16 +130,18 @@ define linkonce_odr dso_local void @_ZN12FloatVecPair6vecIncEv(%class.FloatVecPa ; OLDPM_O1-SAME: (%class.FloatVecPair* [[THIS:%.*]]) local_unnamed_addr comdat align 2 { ; OLDPM_O1-NEXT: entry: ; OLDPM_O1-NEXT: [[VSRC23:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[THIS]], i64 0, i32 1 -; OLDPM_O1-NEXT: [[VSRCDST:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[THIS]], i64 0, i32 0 ; OLDPM_O1-NEXT: [[CALL2:%.*]] = call %class.HomemadeVector.0* @_ZN14HomemadeVectorIS_IfLj8EELj8EEixEj(%class.HomemadeVector* nonnull [[VSRC23]]) ; OLDPM_O1-NEXT: [[SIZE43:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[CALL2]], i64 0, i32 1 ; OLDPM_O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[SIZE43]], align 8, !tbaa [[TBAA0:![0-9]+]] ; OLDPM_O1-NEXT: [[CMP54_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 -; OLDPM_O1-NEXT: br i1 [[CMP54_NOT]], label [[FOR_COND_CLEANUP6:%.*]], label [[FOR_BODY7:%.*]] +; OLDPM_O1-NEXT: br i1 [[CMP54_NOT]], label [[FOR_COND_CLEANUP6:%.*]], label [[FOR_BODY7_LR_PH:%.*]] +; OLDPM_O1: for.body7.lr.ph: +; OLDPM_O1-NEXT: [[VSRCDST:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[THIS]], i64 0, i32 0 +; OLDPM_O1-NEXT: br label [[FOR_BODY7:%.*]] ; OLDPM_O1: for.cond.cleanup6: ; OLDPM_O1-NEXT: ret void ; OLDPM_O1: for.body7: -; OLDPM_O1-NEXT: [[J_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY7]] ], [ 0, [[ENTRY:%.*]] ] +; OLDPM_O1-NEXT: [[J_05:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY7]] ] ; OLDPM_O1-NEXT: [[CALL9:%.*]] = call %class.HomemadeVector.0* @_ZN14HomemadeVectorIS_IfLj8EELj8EEixEj(%class.HomemadeVector* nonnull [[VSRC23]]) ; OLDPM_O1-NEXT: [[CALL10:%.*]] = call float* @_ZN14HomemadeVectorIfLj8EEixEj(%class.HomemadeVector.0* [[CALL9]]) ; OLDPM_O1-NEXT: [[TMP1:%.*]] = load float, float* [[CALL10]], align 4, !tbaa 
[[TBAA6:![0-9]+]] From 1d91537ce872c721cc0f7f52980961951c27b1ce Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Thu, 17 Feb 2022 09:04:47 +0800 Subject: [PATCH 205/748] [LoongArch] Add missing dollar prefix to register name in InstPrinter This patch adds a '$' prefix to register name in InstPrinter that I missed in initial patches. Reviewed By: xen0n Differential Revision: https://reviews.llvm.org/D119813 --- .../MCTargetDesc/LoongArchInstPrinter.cpp | 2 +- llvm/test/CodeGen/LoongArch/1ri.mir | 16 +- llvm/test/CodeGen/LoongArch/2r.mir | 48 ++-- llvm/test/CodeGen/LoongArch/2ri.mir | 84 +++---- llvm/test/CodeGen/LoongArch/3r.mir | 218 +++++++++--------- llvm/test/CodeGen/LoongArch/3ri.mir | 10 +- llvm/test/CodeGen/LoongArch/misc.mir | 16 +- 7 files changed, 197 insertions(+), 197 deletions(-) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp index 1a5b44e1873e3..66183868f4681 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp @@ -35,7 +35,7 @@ void LoongArchInstPrinter::printInst(const MCInst *MI, uint64_t Address, } void LoongArchInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { - O << getRegisterName(RegNo); + O << '$' << getRegisterName(RegNo); } void LoongArchInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/test/CodeGen/LoongArch/1ri.mir b/llvm/test/CodeGen/LoongArch/1ri.mir index d267e3800b76f..537fe07a7ace3 100644 --- a/llvm/test/CodeGen/LoongArch/1ri.mir +++ b/llvm/test/CodeGen/LoongArch/1ri.mir @@ -16,7 +16,7 @@ --- # CHECK-LABEL: test_LU12I_W: # CHECK-ENC: 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 -# CHECK-ASM: lu12i.w a0, 49 +# CHECK-ASM: lu12i.w $a0, 49 name: test_LU12I_W body: | bb.0: @@ -25,7 +25,7 @@ body: | --- # CHECK-LABEL: test_LU32I_D: # CHECK-ENC: 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 
0 0 0 0 1 0 0 -# CHECK-ASM: lu32i.d a0, 196 +# CHECK-ASM: lu32i.d $a0, 196 name: test_LU32I_D body: | bb.0: @@ -34,7 +34,7 @@ body: | --- # CHECK-LABEL: test_PCADDI: # CHECK-ENC: 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0 0 1 0 0 -# CHECK-ASM: pcaddi a0, 187 +# CHECK-ASM: pcaddi $a0, 187 name: test_PCADDI body: | bb.0: @@ -43,7 +43,7 @@ body: | --- # CHECK-LABEL: test_PCALAU12I: # CHECK-ENC: 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 -# CHECK-ASM: pcalau12i a0, 89 +# CHECK-ASM: pcalau12i $a0, 89 name: test_PCALAU12I body: | bb.0: @@ -52,7 +52,7 @@ body: | --- # CHECK-LABEL: test_PCADDU12I: # CHECK-ENC: 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: pcaddu12i a0, 37 +# CHECK-ASM: pcaddu12i $a0, 37 name: test_PCADDU12I body: | bb.0: @@ -61,7 +61,7 @@ body: | --- # CHECK-LABEL: test_PCADDU18I: # CHECK-ENC: 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 -# CHECK-ASM: pcaddu18i a0, 26 +# CHECK-ASM: pcaddu18i $a0, 26 name: test_PCADDU18I body: | bb.0: @@ -80,7 +80,7 @@ body: | --- # CHECK-LABEL: test_BEQZ: # CHECK-ENC: 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 -# CHECK-ASM: beqz a0, 23 +# CHECK-ASM: beqz $a0, 23 name: test_BEQZ body: | bb.0: @@ -89,7 +89,7 @@ body: | --- # CHECK-LABEL: test_BNEZ: # CHECK-ENC: 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 -# CHECK-ASM: bnez a0, 21 +# CHECK-ASM: bnez $a0, 21 name: test_BNEZ body: | bb.0: diff --git a/llvm/test/CodeGen/LoongArch/2r.mir b/llvm/test/CodeGen/LoongArch/2r.mir index 93bb5418a88e6..488944526e58c 100644 --- a/llvm/test/CodeGen/LoongArch/2r.mir +++ b/llvm/test/CodeGen/LoongArch/2r.mir @@ -16,7 +16,7 @@ --- # CHECK-LABEL: test_CLO_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: clo.w a0, a1 +# CHECK-ASM: clo.w $a0, $a1 name: test_CLO_W body: | bb.0: @@ -25,7 +25,7 @@ body: | --- # CHECK-LABEL: test_CLZ_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: clz.w a0, a1 +# CHECK-ASM: clz.w $a0, $a1 name: test_CLZ_W body: | bb.0: @@ -34,7 +34,7 @@ body: | --- # CHECK-LABEL: test_CTO_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: cto.w a0, a1 +# CHECK-ASM: cto.w $a0, $a1 name: test_CTO_W body: | bb.0: @@ -43,7 +43,7 @@ body: | --- # CHECK-LABEL: test_CTZ_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ctz.w a0, a1 +# CHECK-ASM: ctz.w $a0, $a1 name: test_CTZ_W body: | bb.0: @@ -52,7 +52,7 @@ body: | --- # CHECK-LABEL: test_CLO_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: clo.d a0, a1 +# CHECK-ASM: clo.d $a0, $a1 name: test_CLO_D body: | bb.0: @@ -61,7 +61,7 @@ body: | --- # CHECK-LABEL: test_CLZ_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: clz.d a0, a1 +# CHECK-ASM: clz.d $a0, $a1 name: test_CLZ_D body: | bb.0: @@ -70,7 +70,7 @@ body: | --- # CHECK-LABEL: test_CTO_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: cto.d a0, a1 +# CHECK-ASM: cto.d $a0, $a1 name: test_CTO_D body: | bb.0: @@ -79,7 +79,7 @@ body: | --- # CHECK-LABEL: test_CTZ_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ctz.d a0, a1 +# CHECK-ASM: ctz.d $a0, $a1 name: test_CTZ_D body: | bb.0: @@ -88,7 +88,7 @@ body: | --- # CHECK-LABEL: test_REVB_2H: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: revb.2h a0, a1 +# CHECK-ASM: revb.2h $a0, $a1 name: test_REVB_2H body: | bb.0: @@ -97,7 +97,7 @@ body: | --- # CHECK-LABEL: test_REVB_4H: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: revb.4h a0, a1 +# CHECK-ASM: revb.4h $a0, $a1 name: test_REVB_4H body: | bb.0: @@ -106,7 +106,7 @@ body: | --- # CHECK-LABEL: test_REVB_2W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: revb.2w a0, a1 +# CHECK-ASM: revb.2w $a0, $a1 name: test_REVB_2W body: | bb.0: @@ -115,7 +115,7 @@ body: | --- # CHECK-LABEL: test_REVB_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: revb.d a0, a1 +# CHECK-ASM: revb.d $a0, $a1 name: test_REVB_D body: | bb.0: @@ -124,7 +124,7 @@ body: | --- # CHECK-LABEL: test_REVH_2W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: revh.2w a0, a1 +# CHECK-ASM: revh.2w $a0, $a1 name: test_REVH_2W body: | bb.0: @@ -133,7 +133,7 @@ body: | --- # CHECK-LABEL: test_REVH_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: revh.d a0, a1 +# CHECK-ASM: revh.d $a0, $a1 name: test_REVH_D body: | bb.0: @@ -142,7 +142,7 @@ body: | --- # CHECK-LABEL: test_BITREV_4B: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bitrev.4b a0, a1 +# CHECK-ASM: bitrev.4b $a0, $a1 name: test_BITREV_4B body: | bb.0: @@ -151,7 +151,7 @@ body: | --- # CHECK-LABEL: test_BITREV_8B: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bitrev.8b a0, a1 +# CHECK-ASM: bitrev.8b $a0, $a1 name: test_BITREV_8B body: | bb.0: @@ -160,7 +160,7 @@ body: | --- # CHECK-LABEL: test_BITREV_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bitrev.w a0, a1 +# CHECK-ASM: bitrev.w $a0, $a1 name: test_BITREV_W body: | bb.0: @@ -169,7 +169,7 @@ body: | --- # CHECK-LABEL: test_BITREV_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bitrev.d a0, a1 +# CHECK-ASM: bitrev.d $a0, $a1 name: test_BITREV_D body: | bb.0: @@ -178,7 +178,7 @@ body: | --- # CHECK-LABEL: test_EXT_W_H: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ext.w.h a0, a1 +# CHECK-ASM: ext.w.h $a0, $a1 name: test_EXT_W_H body: | bb.0: @@ 
-187,7 +187,7 @@ body: | --- # CHECK-LABEL: test_EXT_W_B: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ext.w.b a0, a1 +# CHECK-ASM: ext.w.b $a0, $a1 name: test_EXT_W_B body: | bb.0: @@ -196,7 +196,7 @@ body: | --- # CHECK-LABEL: test_CPUCFG: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: cpucfg a0, a1 +# CHECK-ASM: cpucfg $a0, $a1 name: test_CPUCFG body: | bb.0: @@ -205,7 +205,7 @@ body: | --- # CHECK-LABEL: test_RDTIMEL_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: rdtimel.w a0, a1 +# CHECK-ASM: rdtimel.w $a0, $a1 name: test_RDTIMEL_W body: | bb.0: @@ -214,7 +214,7 @@ body: | --- # CHECK-LABEL: test_RDTIMEH_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: rdtimeh.w a0, a1 +# CHECK-ASM: rdtimeh.w $a0, $a1 name: test_RDTIMEH_W body: | bb.0: @@ -223,7 +223,7 @@ body: | --- # CHECK-LABEL: test_RDTIME_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: rdtime.d a0, a1 +# CHECK-ASM: rdtime.d $a0, $a1 name: test_RDTIME_D body: | bb.0: diff --git a/llvm/test/CodeGen/LoongArch/2ri.mir b/llvm/test/CodeGen/LoongArch/2ri.mir index cef682ba5039e..15a2759d66976 100644 --- a/llvm/test/CodeGen/LoongArch/2ri.mir +++ b/llvm/test/CodeGen/LoongArch/2ri.mir @@ -16,7 +16,7 @@ --- # CHECK-LABEL: test_SLLI_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: slli.w a0, a1, 0 +# CHECK-ASM: slli.w $a0, $a1, 0 name: test_SLLI_W body: | bb.0: @@ -25,7 +25,7 @@ body: | --- # CHECK-LABEL: test_SRLI_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: srli.w a0, a1, 30 +# CHECK-ASM: srli.w $a0, $a1, 30 name: test_SRLI_W body: | bb.0: @@ -34,7 +34,7 @@ body: | --- # CHECK-LABEL: test_SRAI_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: srai.w a0, 
a1, 24 +# CHECK-ASM: srai.w $a0, $a1, 24 name: test_SRAI_W body: | bb.0: @@ -43,7 +43,7 @@ body: | --- # CHECK-LABEL: test_ROTRI_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: rotri.w a0, a1, 23 +# CHECK-ASM: rotri.w $a0, $a1, 23 name: test_ROTRI_W body: | bb.0: @@ -62,7 +62,7 @@ body: | --- # CHECK-LABEL: test_SLLI_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: slli.d a0, a1, 39 +# CHECK-ASM: slli.d $a0, $a1, 39 name: test_SLLI_D body: | bb.0: @@ -71,7 +71,7 @@ body: | --- # CHECK-LABEL: test_SRLI_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: srli.d a0, a1, 38 +# CHECK-ASM: srli.d $a0, $a1, 38 name: test_SRLI_D body: | bb.0: @@ -80,7 +80,7 @@ body: | --- # CHECK-LABEL: test_SRAI_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: srai.d a0, a1, 27 +# CHECK-ASM: srai.d $a0, $a1, 27 name: test_SRAI_D body: | bb.0: @@ -89,7 +89,7 @@ body: | --- # CHECK-LABEL: test_ROTRI_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: rotri.d a0, a1, 7 +# CHECK-ASM: rotri.d $a0, $a1, 7 name: test_ROTRI_D body: | bb.0: @@ -108,7 +108,7 @@ body: | --- # CHECK-LABEL: test_SLTI: # CHECK-ENC: 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: slti a0, a1, 235 +# CHECK-ASM: slti $a0, $a1, 235 name: test_SLTI body: | bb.0: @@ -117,7 +117,7 @@ body: | --- # CHECK-LABEL: test_SLTUI: # CHECK-ENC: 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sltui a0, a1, 162 +# CHECK-ASM: sltui $a0, $a1, 162 name: test_SLTUI body: | bb.0: @@ -126,7 +126,7 @@ body: | --- # CHECK-LABEL: test_ADDI_W: # CHECK-ENC: 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: addi.w a0, a1, 246 +# CHECK-ASM: addi.w $a0, $a1, 246 name: test_ADDI_W body: | bb.0: @@ -135,7 +135,7 @@ body: | --- # CHECK-LABEL: 
test_ADDI_D: # CHECK-ENC: 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: addi.d a0, a1, 75 +# CHECK-ASM: addi.d $a0, $a1, 75 name: test_ADDI_D body: | bb.0: @@ -144,7 +144,7 @@ body: | --- # CHECK-LABEL: test_LU52I_D: # CHECK-ENC: 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: lu52i.d a0, a1, 195 +# CHECK-ASM: lu52i.d $a0, $a1, 195 name: test_LU52I_D body: | bb.0: @@ -153,7 +153,7 @@ body: | --- # CHECK-LABEL: test_ANDI: # CHECK-ENC: 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: andi a0, a1, 106 +# CHECK-ASM: andi $a0, $a1, 106 name: test_ANDI body: | bb.0: @@ -162,7 +162,7 @@ body: | --- # CHECK-LABEL: test_ORI: # CHECK-ENC: 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ori a0, a1, 47 +# CHECK-ASM: ori $a0, $a1, 47 name: test_ORI body: | bb.0: @@ -171,7 +171,7 @@ body: | --- # CHECK-LABEL: test_XORI: # CHECK-ENC: 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: xori a0, a1, 99 +# CHECK-ASM: xori $a0, $a1, 99 name: test_XORI body: | bb.0: @@ -180,7 +180,7 @@ body: | --- # CHECK-LABEL: test_LD_B: # CHECK-ENC: 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ld.b a0, a1, 21 +# CHECK-ASM: ld.b $a0, $a1, 21 name: test_LD_B body: | bb.0: @@ -189,7 +189,7 @@ body: | --- # CHECK-LABEL: test_LD_H: # CHECK-ENC: 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ld.h a0, a1, 80 +# CHECK-ASM: ld.h $a0, $a1, 80 name: test_LD_H body: | bb.0: @@ -198,7 +198,7 @@ body: | --- # CHECK-LABEL: test_LD_W: # CHECK-ENC: 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ld.w a0, a1, 92 +# CHECK-ASM: ld.w $a0, $a1, 92 name: test_LD_W body: | bb.0: @@ -207,7 +207,7 @@ body: | --- # CHECK-LABEL: test_LD_BU: # CHECK-ENC: 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ld.bu a0, a1, 150 +# CHECK-ASM: ld.bu $a0, $a1, 150 name: 
test_LD_BU body: | bb.0: @@ -216,7 +216,7 @@ body: | --- # CHECK-LABEL: test_LD_HU: # CHECK-ENC: 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ld.hu a0, a1, 198 +# CHECK-ASM: ld.hu $a0, $a1, 198 name: test_LD_HU body: | bb.0: @@ -225,7 +225,7 @@ body: | --- # CHECK-LABEL: test_LD_WU: # CHECK-ENC: 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ld.wu a0, a1, 31 +# CHECK-ASM: ld.wu $a0, $a1, 31 name: test_LD_WU body: | bb.0: @@ -234,7 +234,7 @@ body: | --- # CHECK-LABEL: test_ST_B: # CHECK-ENC: 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: st.b a0, a1, 95 +# CHECK-ASM: st.b $a0, $a1, 95 name: test_ST_B body: | bb.0: @@ -243,7 +243,7 @@ body: | --- # CHECK-LABEL: test_ST_H: # CHECK-ENC: 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: st.h a0, a1, 122 +# CHECK-ASM: st.h $a0, $a1, 122 name: test_ST_H body: | bb.0: @@ -252,7 +252,7 @@ body: | --- # CHECK-LABEL: test_ST_W: # CHECK-ENC: 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: st.w a0, a1, 175 +# CHECK-ASM: st.w $a0, $a1, 175 name: test_ST_W body: | bb.0: @@ -261,7 +261,7 @@ body: | --- # CHECK-LABEL: test_ST_D: # CHECK-ENC: 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: st.d a0, a1, 60 +# CHECK-ASM: st.d $a0, $a1, 60 name: test_ST_D body: | bb.0: @@ -280,7 +280,7 @@ body: | --- # CHECK-LABEL: test_LDPTR_W: # CHECK-ENC: 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldptr.w a0, a1, 66 +# CHECK-ASM: ldptr.w $a0, $a1, 66 name: test_LDPTR_W body: | bb.0: @@ -289,7 +289,7 @@ body: | --- # CHECK-LABEL: test_LDPTR_D: # CHECK-ENC: 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldptr.d a0, a1, 56 +# CHECK-ASM: ldptr.d $a0, $a1, 56 name: test_LDPTR_D body: | bb.0: @@ -298,7 +298,7 @@ body: | --- # CHECK-LABEL: test_STPTR_W: # CHECK-ENC: 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 1 1 0 
0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stptr.w a0, a1, 87 +# CHECK-ASM: stptr.w $a0, $a1, 87 name: test_STPTR_W body: | bb.0: @@ -307,7 +307,7 @@ body: | --- # CHECK-LABEL: test_STPTR_D: # CHECK-ENC: 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stptr.d a0, a1, 145 +# CHECK-ASM: stptr.d $a0, $a1, 145 name: test_STPTR_D body: | bb.0: @@ -316,7 +316,7 @@ body: | --- # CHECK-LABEL: test_LL_W: # CHECK-ENC: 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ll.w a0, a1, 243 +# CHECK-ASM: ll.w $a0, $a1, 243 name: test_LL_W body: | bb.0: @@ -325,7 +325,7 @@ body: | --- # CHECK-LABEL: test_LL_D: # CHECK-ENC: 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ll.d a0, a1, 74 +# CHECK-ASM: ll.d $a0, $a1, 74 name: test_LL_D body: | bb.0: @@ -334,7 +334,7 @@ body: | --- # CHECK-LABEL: test_SC_W: # CHECK-ENC: 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sc.w a0, a1, 96 +# CHECK-ASM: sc.w $a0, $a1, 96 name: test_SC_W body: | bb.0: @@ -343,7 +343,7 @@ body: | --- # CHECK-LABEL: test_SC_D: # CHECK-ENC: 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sc.d a0, a1, 105 +# CHECK-ASM: sc.d $a0, $a1, 105 name: test_SC_D body: | bb.0: @@ -362,7 +362,7 @@ body: | --- # CHECK-LABEL: test_ADDU16I_D: # CHECK-ENC: 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: addu16i.d a0, a1, 23 +# CHECK-ASM: addu16i.d $a0, $a1, 23 name: test_ADDU16I_D body: | bb.0: @@ -371,7 +371,7 @@ body: | --- # CHECK-LABEL: test_JIRL: # CHECK-ENC: 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: jirl a0, a1, 49 +# CHECK-ASM: jirl $a0, $a1, 49 name: test_JIRL body: | bb.0: @@ -380,7 +380,7 @@ body: | --- # CHECK-LABEL: test_BEQ: # CHECK-ENC: 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 -# CHECK-ASM: beq a0, a1, 196 +# CHECK-ASM: beq $a0, $a1, 196 name: test_BEQ body: | bb.0: @@ -389,7 +389,7 @@ body: | 
--- # CHECK-LABEL: test_BNE: # CHECK-ENC: 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 -# CHECK-ASM: bne a0, a1, 19 +# CHECK-ASM: bne $a0, $a1, 19 name: test_BNE body: | bb.0: @@ -398,7 +398,7 @@ body: | --- # CHECK-LABEL: test_BLT: # CHECK-ENC: 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 1 -# CHECK-ASM: blt a0, a1, 123 +# CHECK-ASM: blt $a0, $a1, 123 name: test_BLT body: | bb.0: @@ -407,7 +407,7 @@ body: | --- # CHECK-LABEL: test_BGE: # CHECK-ENC: 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 -# CHECK-ASM: bge a0, a1, 12 +# CHECK-ASM: bge $a0, $a1, 12 name: test_BGE body: | bb.0: @@ -416,7 +416,7 @@ body: | --- # CHECK-LABEL: test_BLTU: # CHECK-ENC: 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 -# CHECK-ASM: bltu a0, a1, 17 +# CHECK-ASM: bltu $a0, $a1, 17 name: test_BLTU body: | bb.0: @@ -425,7 +425,7 @@ body: | --- # CHECK-LABEL: test_BGEU: # CHECK-ENC: 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 -# CHECK-ASM: bgeu a0, a1, 88 +# CHECK-ASM: bgeu $a0, $a1, 88 name: test_BGEU body: | bb.0: diff --git a/llvm/test/CodeGen/LoongArch/3r.mir b/llvm/test/CodeGen/LoongArch/3r.mir index fc9012b3b1992..19f0446a7d685 100644 --- a/llvm/test/CodeGen/LoongArch/3r.mir +++ b/llvm/test/CodeGen/LoongArch/3r.mir @@ -16,7 +16,7 @@ --- # CHECK-LABEL: test_ADD_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: add.w a0, a1, a0 +# CHECK-ASM: add.w $a0, $a1, $a0 name: test_ADD_W body: | bb.0: @@ -25,7 +25,7 @@ body: | --- # CHECK-LABEL: test_ADD_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: add.d a0, a1, a0 +# CHECK-ASM: add.d $a0, $a1, $a0 name: test_ADD_D body: | bb.0: @@ -34,7 +34,7 @@ body: | --- # CHECK-LABEL: test_SUB_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sub.w a0, a1, a0 +# CHECK-ASM: sub.w $a0, $a1, $a0 name: test_SUB_W body: | bb.0: @@ -43,7 
+43,7 @@ body: | --- # CHECK-LABEL: test_SUB_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sub.d a0, a1, a0 +# CHECK-ASM: sub.d $a0, $a1, $a0 name: test_SUB_D body: | bb.0: @@ -52,7 +52,7 @@ body: | --- # CHECK-LABEL: test_SLT: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: slt a0, a1, a0 +# CHECK-ASM: slt $a0, $a1, $a0 name: test_SLT body: | bb.0: @@ -61,7 +61,7 @@ body: | --- # CHECK-LABEL: test_SLTU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sltu a0, a1, a0 +# CHECK-ASM: sltu $a0, $a1, $a0 name: test_SLTU body: | bb.0: @@ -70,7 +70,7 @@ body: | --- # CHECK-LABEL: test_MASKEQZ: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: maskeqz a0, a1, a0 +# CHECK-ASM: maskeqz $a0, $a1, $a0 name: test_MASKEQZ body: | bb.0: @@ -79,7 +79,7 @@ body: | --- # CHECK-LABEL: test_MASKNEZ: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: masknez a0, a1, a0 +# CHECK-ASM: masknez $a0, $a1, $a0 name: test_MASKNEZ body: | bb.0: @@ -88,7 +88,7 @@ body: | --- # CHECK-LABEL: test_NOR: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: nor a0, a1, a0 +# CHECK-ASM: nor $a0, $a1, $a0 name: test_NOR body: | bb.0: @@ -97,7 +97,7 @@ body: | --- # CHECK-LABEL: test_AND: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: and a0, a1, a0 +# CHECK-ASM: and $a0, $a1, $a0 name: test_AND body: | bb.0: @@ -106,7 +106,7 @@ body: | --- # CHECK-LABEL: test_OR: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: or a0, a1, a0 +# CHECK-ASM: or $a0, $a1, $a0 name: test_OR body: | bb.0: @@ -115,7 +115,7 @@ body: | --- # CHECK-LABEL: test_XOR: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: xor a0, a1, a0 +# CHECK-ASM: xor $a0, $a1, 
$a0 name: test_XOR body: | bb.0: @@ -124,7 +124,7 @@ body: | --- # CHECK-LABEL: test_ORN: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: orn a0, a1, a0 +# CHECK-ASM: orn $a0, $a1, $a0 name: test_ORN body: | bb.0: @@ -133,7 +133,7 @@ body: | --- # CHECK-LABEL: test_ANDN: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: andn a0, a1, a0 +# CHECK-ASM: andn $a0, $a1, $a0 name: test_ANDN body: | bb.0: @@ -142,7 +142,7 @@ body: | --- # CHECK-LABEL: test_SLL_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sll.w a0, a1, a0 +# CHECK-ASM: sll.w $a0, $a1, $a0 name: test_SLL_W body: | bb.0: @@ -151,7 +151,7 @@ body: | --- # CHECK-LABEL: test_SRL_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: srl.w a0, a1, a0 +# CHECK-ASM: srl.w $a0, $a1, $a0 name: test_SRL_W body: | bb.0: @@ -160,7 +160,7 @@ body: | --- # CHECK-LABEL: test_SRA_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sra.w a0, a1, a0 +# CHECK-ASM: sra.w $a0, $a1, $a0 name: test_SRA_W body: | bb.0: @@ -169,7 +169,7 @@ body: | --- # CHECK-LABEL: test_SLL_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sll.d a0, a1, a0 +# CHECK-ASM: sll.d $a0, $a1, $a0 name: test_SLL_D body: | bb.0: @@ -178,7 +178,7 @@ body: | --- # CHECK-LABEL: test_SRL_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: srl.d a0, a1, a0 +# CHECK-ASM: srl.d $a0, $a1, $a0 name: test_SRL_D body: | bb.0: @@ -187,7 +187,7 @@ body: | --- # CHECK-LABEL: test_SRA_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: sra.d a0, a1, a0 +# CHECK-ASM: sra.d $a0, $a1, $a0 name: test_SRA_D body: | bb.0: @@ -196,7 +196,7 @@ body: | --- # CHECK-LABEL: test_ROTR_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 0 1 
0 1 0 0 1 0 0 -# CHECK-ASM: rotr.w a0, a1, a0 +# CHECK-ASM: rotr.w $a0, $a1, $a0 name: test_ROTR_W body: | bb.0: @@ -205,7 +205,7 @@ body: | --- # CHECK-LABEL: test_ROTR_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: rotr.d a0, a1, a0 +# CHECK-ASM: rotr.d $a0, $a1, $a0 name: test_ROTR_D body: | bb.0: @@ -214,7 +214,7 @@ body: | --- # CHECK-LABEL: test_MUL_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mul.w a0, a1, a0 +# CHECK-ASM: mul.w $a0, $a1, $a0 name: test_MUL_W body: | bb.0: @@ -223,7 +223,7 @@ body: | --- # CHECK-LABEL: test_MULH_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mulh.w a0, a1, a0 +# CHECK-ASM: mulh.w $a0, $a1, $a0 name: test_MULH_W body: | bb.0: @@ -232,7 +232,7 @@ body: | --- # CHECK-LABEL: test_MULH_WU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mulh.wu a0, a1, a0 +# CHECK-ASM: mulh.wu $a0, $a1, $a0 name: test_MULH_WU body: | bb.0: @@ -241,7 +241,7 @@ body: | --- # CHECK-LABEL: test_MUL_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mul.d a0, a1, a0 +# CHECK-ASM: mul.d $a0, $a1, $a0 name: test_MUL_D body: | bb.0: @@ -250,7 +250,7 @@ body: | --- # CHECK-LABEL: test_MULH_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mulh.d a0, a1, a0 +# CHECK-ASM: mulh.d $a0, $a1, $a0 name: test_MULH_D body: | bb.0: @@ -259,7 +259,7 @@ body: | --- # CHECK-LABEL: test_MULH_DU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mulh.du a0, a1, a0 +# CHECK-ASM: mulh.du $a0, $a1, $a0 name: test_MULH_DU body: | bb.0: @@ -268,7 +268,7 @@ body: | --- # CHECK-LABEL: test_MULW_D_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mulw.d.w a0, a1, a0 +# CHECK-ASM: mulw.d.w $a0, $a1, $a0 name: test_MULW_D_W body: 
| bb.0: @@ -277,7 +277,7 @@ body: | --- # CHECK-LABEL: test_MULW_D_WU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mulw.d.wu a0, a1, a0 +# CHECK-ASM: mulw.d.wu $a0, $a1, $a0 name: test_MULW_D_WU body: | bb.0: @@ -286,7 +286,7 @@ body: | --- # CHECK-LABEL: test_DIV_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: div.w a0, a1, a0 +# CHECK-ASM: div.w $a0, $a1, $a0 name: test_DIV_W body: | bb.0: @@ -295,7 +295,7 @@ body: | --- # CHECK-LABEL: test_MOD_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mod.w a0, a1, a0 +# CHECK-ASM: mod.w $a0, $a1, $a0 name: test_MOD_W body: | bb.0: @@ -304,7 +304,7 @@ body: | --- # CHECK-LABEL: test_DIV_WU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: div.wu a0, a1, a0 +# CHECK-ASM: div.wu $a0, $a1, $a0 name: test_DIV_WU body: | bb.0: @@ -313,7 +313,7 @@ body: | --- # CHECK-LABEL: test_MOD_WU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mod.wu a0, a1, a0 +# CHECK-ASM: mod.wu $a0, $a1, $a0 name: test_MOD_WU body: | bb.0: @@ -322,7 +322,7 @@ body: | --- # CHECK-LABEL: test_DIV_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: div.d a0, a1, a0 +# CHECK-ASM: div.d $a0, $a1, $a0 name: test_DIV_D body: | bb.0: @@ -331,7 +331,7 @@ body: | --- # CHECK-LABEL: test_MOD_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mod.d a0, a1, a0 +# CHECK-ASM: mod.d $a0, $a1, $a0 name: test_MOD_D body: | bb.0: @@ -340,7 +340,7 @@ body: | --- # CHECK-LABEL: test_DIV_DU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: div.du a0, a1, a0 +# CHECK-ASM: div.du $a0, $a1, $a0 name: test_DIV_DU body: | bb.0: @@ -349,7 +349,7 @@ body: | --- # CHECK-LABEL: test_MOD_DU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 
0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: mod.du a0, a1, a0 +# CHECK-ASM: mod.du $a0, $a1, $a0 name: test_MOD_DU body: | bb.0: @@ -358,7 +358,7 @@ body: | --- # CHECK-LABEL: test_CRC_W_B_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crc.w.b.w a0, a1, a0 +# CHECK-ASM: crc.w.b.w $a0, $a1, $a0 name: test_CRC_W_B_W body: | bb.0: @@ -367,7 +367,7 @@ body: | --- # CHECK-LABEL: test_CRC_W_H_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crc.w.h.w a0, a1, a0 +# CHECK-ASM: crc.w.h.w $a0, $a1, $a0 name: test_CRC_W_H_W body: | bb.0: @@ -376,7 +376,7 @@ body: | --- # CHECK-LABEL: test_CRC_W_W_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crc.w.w.w a0, a1, a0 +# CHECK-ASM: crc.w.w.w $a0, $a1, $a0 name: test_CRC_W_W_W body: | bb.0: @@ -385,7 +385,7 @@ body: | --- # CHECK-LABEL: test_CRC_W_D_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crc.w.d.w a0, a1, a0 +# CHECK-ASM: crc.w.d.w $a0, $a1, $a0 name: test_CRC_W_D_W body: | bb.0: @@ -394,7 +394,7 @@ body: | --- # CHECK-LABEL: test_CRCC_W_B_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crcc.w.b.w a0, a1, a0 +# CHECK-ASM: crcc.w.b.w $a0, $a1, $a0 name: test_CRCC_W_B_W body: | bb.0: @@ -403,7 +403,7 @@ body: | --- # CHECK-LABEL: test_CRCC_W_H_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crcc.w.h.w a0, a1, a0 +# CHECK-ASM: crcc.w.h.w $a0, $a1, $a0 name: test_CRCC_W_H_W body: | bb.0: @@ -412,7 +412,7 @@ body: | --- # CHECK-LABEL: test_CRCC_W_W_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crcc.w.w.w a0, a1, a0 +# CHECK-ASM: crcc.w.w.w $a0, $a1, $a0 name: test_CRCC_W_W_W body: | bb.0: @@ -421,7 +421,7 @@ body: | --- # CHECK-LABEL: test_CRCC_W_D_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 1 0 0 0 
0 1 0 1 0 0 1 0 0 -# CHECK-ASM: crcc.w.d.w a0, a1, a0 +# CHECK-ASM: crcc.w.d.w $a0, $a1, $a0 name: test_CRCC_W_D_W body: | bb.0: @@ -430,7 +430,7 @@ body: | --- # CHECK-LABEL: test_AMSWAP_DB_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amswap_db.w a0, a1, a2 +# CHECK-ASM: amswap_db.w $a0, $a1, $a2 name: test_AMSWAP_DB_W body: | bb.0: @@ -439,7 +439,7 @@ body: | --- # CHECK-LABEL: test_AMSWAP_DB_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amswap_db.d a0, a1, a2 +# CHECK-ASM: amswap_db.d $a0, $a1, $a2 name: test_AMSWAP_DB_D body: | bb.0: @@ -448,7 +448,7 @@ body: | --- # CHECK-LABEL: test_AMADD_DB_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amadd_db.w a0, a1, a2 +# CHECK-ASM: amadd_db.w $a0, $a1, $a2 name: test_AMADD_DB_W body: | bb.0: @@ -457,7 +457,7 @@ body: | --- # CHECK-LABEL: test_AMADD_DB_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amadd_db.d a0, a1, a2 +# CHECK-ASM: amadd_db.d $a0, $a1, $a2 name: test_AMADD_DB_D body: | bb.0: @@ -466,7 +466,7 @@ body: | --- # CHECK-LABEL: test_AMAND_DB_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amand_db.w a0, a1, a2 +# CHECK-ASM: amand_db.w $a0, $a1, $a2 name: test_AMAND_DB_W body: | bb.0: @@ -475,7 +475,7 @@ body: | --- # CHECK-LABEL: test_AMAND_DB_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amand_db.d a0, a1, a2 +# CHECK-ASM: amand_db.d $a0, $a1, $a2 name: test_AMAND_DB_D body: | bb.0: @@ -484,7 +484,7 @@ body: | --- # CHECK-LABEL: test_AMOR_DB_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amor_db.w a0, a1, a2 +# CHECK-ASM: amor_db.w $a0, $a1, $a2 name: test_AMOR_DB_W body: | bb.0: @@ -493,7 +493,7 @@ body: | --- # CHECK-LABEL: test_AMOR_DB_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 
1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amor_db.d a0, a1, a2 +# CHECK-ASM: amor_db.d $a0, $a1, $a2 name: test_AMOR_DB_D body: | bb.0: @@ -502,7 +502,7 @@ body: | --- # CHECK-LABEL: test_AMXOR_DB_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amxor_db.w a0, a1, a2 +# CHECK-ASM: amxor_db.w $a0, $a1, $a2 name: test_AMXOR_DB_W body: | bb.0: @@ -511,7 +511,7 @@ body: | --- # CHECK-LABEL: test_AMXOR_DB_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amxor_db.d a0, a1, a2 +# CHECK-ASM: amxor_db.d $a0, $a1, $a2 name: test_AMXOR_DB_D body: | bb.0: @@ -520,7 +520,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_DB_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax_db.w a0, a1, a2 +# CHECK-ASM: ammax_db.w $a0, $a1, $a2 name: test_AMMAX_DB_W body: | bb.0: @@ -529,7 +529,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_DB_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax_db.d a0, a1, a2 +# CHECK-ASM: ammax_db.d $a0, $a1, $a2 name: test_AMMAX_DB_D body: | bb.0: @@ -538,7 +538,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_DB_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin_db.w a0, a1, a2 +# CHECK-ASM: ammin_db.w $a0, $a1, $a2 name: test_AMMIN_DB_W body: | bb.0: @@ -547,7 +547,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_DB_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin_db.d a0, a1, a2 +# CHECK-ASM: ammin_db.d $a0, $a1, $a2 name: test_AMMIN_DB_D body: | bb.0: @@ -556,7 +556,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_DB_WU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax_db.wu a0, a1, a2 +# CHECK-ASM: ammax_db.wu $a0, $a1, $a2 name: test_AMMAX_DB_WU body: | bb.0: @@ -565,7 +565,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_DB_DU: # CHECK-ENC: 0 0 1 1 
1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax_db.du a0, a1, a2 +# CHECK-ASM: ammax_db.du $a0, $a1, $a2 name: test_AMMAX_DB_DU body: | bb.0: @@ -574,7 +574,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_DB_WU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin_db.wu a0, a1, a2 +# CHECK-ASM: ammin_db.wu $a0, $a1, $a2 name: test_AMMIN_DB_WU body: | bb.0: @@ -583,7 +583,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_DB_DU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin_db.du a0, a1, a2 +# CHECK-ASM: ammin_db.du $a0, $a1, $a2 name: test_AMMIN_DB_DU body: | bb.0: @@ -592,7 +592,7 @@ body: | --- # CHECK-LABEL: test_AMSWAP_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amswap.w a0, a1, a2 +# CHECK-ASM: amswap.w $a0, $a1, $a2 name: test_AMSWAP_W body: | bb.0: @@ -601,7 +601,7 @@ body: | --- # CHECK-LABEL: test_AMSWAP_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amswap.d a0, a1, a2 +# CHECK-ASM: amswap.d $a0, $a1, $a2 name: test_AMSWAP_D body: | bb.0: @@ -610,7 +610,7 @@ body: | --- # CHECK-LABEL: test_AMADD_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amadd.w a0, a1, a2 +# CHECK-ASM: amadd.w $a0, $a1, $a2 name: test_AMADD_W body: | bb.0: @@ -619,7 +619,7 @@ body: | --- # CHECK-LABEL: test_AMADD_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amadd.d a0, a1, a2 +# CHECK-ASM: amadd.d $a0, $a1, $a2 name: test_AMADD_D body: | bb.0: @@ -628,7 +628,7 @@ body: | --- # CHECK-LABEL: test_AMAND_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amand.w a0, a1, a2 +# CHECK-ASM: amand.w $a0, $a1, $a2 name: test_AMAND_W body: | bb.0: @@ -637,7 +637,7 @@ body: | --- # CHECK-LABEL: test_AMAND_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 
0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amand.d a0, a1, a2 +# CHECK-ASM: amand.d $a0, $a1, $a2 name: test_AMAND_D body: | bb.0: @@ -646,7 +646,7 @@ body: | --- # CHECK-LABEL: test_AMOR_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amor.w a0, a1, a2 +# CHECK-ASM: amor.w $a0, $a1, $a2 name: test_AMOR_W body: | bb.0: @@ -655,7 +655,7 @@ body: | --- # CHECK-LABEL: test_AMOR_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amor.d a0, a1, a2 +# CHECK-ASM: amor.d $a0, $a1, $a2 name: test_AMOR_D body: | bb.0: @@ -664,7 +664,7 @@ body: | --- # CHECK-LABEL: test_AMXOR_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amxor.w a0, a1, a2 +# CHECK-ASM: amxor.w $a0, $a1, $a2 name: test_AMXOR_W body: | bb.0: @@ -673,7 +673,7 @@ body: | --- # CHECK-LABEL: test_AMXOR_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: amxor.d a0, a1, a2 +# CHECK-ASM: amxor.d $a0, $a1, $a2 name: test_AMXOR_D body: | bb.0: @@ -682,7 +682,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax.w a0, a1, a2 +# CHECK-ASM: ammax.w $a0, $a1, $a2 name: test_AMMAX_W body: | bb.0: @@ -691,7 +691,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax.d a0, a1, a2 +# CHECK-ASM: ammax.d $a0, $a1, $a2 name: test_AMMAX_D body: | bb.0: @@ -700,7 +700,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin.w a0, a1, a2 +# CHECK-ASM: ammin.w $a0, $a1, $a2 name: test_AMMIN_W body: | bb.0: @@ -709,7 +709,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin.d a0, a1, a2 +# CHECK-ASM: ammin.d $a0, 
$a1, $a2 name: test_AMMIN_D body: | bb.0: @@ -718,7 +718,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_WU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax.wu a0, a1, a2 +# CHECK-ASM: ammax.wu $a0, $a1, $a2 name: test_AMMAX_WU body: | bb.0: @@ -727,7 +727,7 @@ body: | --- # CHECK-LABEL: test_AMMAX_DU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammax.du a0, a1, a2 +# CHECK-ASM: ammax.du $a0, $a1, $a2 name: test_AMMAX_DU body: | bb.0: @@ -736,7 +736,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_WU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin.wu a0, a1, a2 +# CHECK-ASM: ammin.wu $a0, $a1, $a2 name: test_AMMIN_WU body: | bb.0: @@ -745,7 +745,7 @@ body: | --- # CHECK-LABEL: test_AMMIN_DU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ammin.du a0, a1, a2 +# CHECK-ASM: ammin.du $a0, $a1, $a2 name: test_AMMIN_DU body: | bb.0: @@ -754,7 +754,7 @@ body: | --- # CHECK-LABEL: test_LDX_B: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldx.b a0, a1, a2 +# CHECK-ASM: ldx.b $a0, $a1, $a2 name: test_LDX_B body: | bb.0: @@ -763,7 +763,7 @@ body: | --- # CHECK-LABEL: test_LDX_H: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldx.h a0, a1, a2 +# CHECK-ASM: ldx.h $a0, $a1, $a2 name: test_LDX_H body: | bb.0: @@ -772,7 +772,7 @@ body: | --- # CHECK-LABEL: test_LDX_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldx.w a0, a1, a2 +# CHECK-ASM: ldx.w $a0, $a1, $a2 name: test_LDX_W body: | bb.0: @@ -781,7 +781,7 @@ body: | --- # CHECK-LABEL: test_LDX_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldx.d a0, a1, a2 +# CHECK-ASM: ldx.d $a0, $a1, $a2 name: test_LDX_D body: | bb.0: @@ -790,7 +790,7 @@ body: | --- # CHECK-LABEL: 
test_LDX_BU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldx.bu a0, a1, a2 +# CHECK-ASM: ldx.bu $a0, $a1, $a2 name: test_LDX_BU body: | bb.0: @@ -799,7 +799,7 @@ body: | --- # CHECK-LABEL: test_LDX_HU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldx.hu a0, a1, a2 +# CHECK-ASM: ldx.hu $a0, $a1, $a2 name: test_LDX_HU body: | bb.0: @@ -808,7 +808,7 @@ body: | --- # CHECK-LABEL: test_LDX_WU: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldx.wu a0, a1, a2 +# CHECK-ASM: ldx.wu $a0, $a1, $a2 name: test_LDX_WU body: | bb.0: @@ -817,7 +817,7 @@ body: | --- # CHECK-LABEL: test_LDGT_B: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldgt.b a0, a1, a2 +# CHECK-ASM: ldgt.b $a0, $a1, $a2 name: test_LDGT_B body: | bb.0: @@ -826,7 +826,7 @@ body: | --- # CHECK-LABEL: test_LDGT_H: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldgt.h a0, a1, a2 +# CHECK-ASM: ldgt.h $a0, $a1, $a2 name: test_LDGT_H body: | bb.0: @@ -835,7 +835,7 @@ body: | --- # CHECK-LABEL: test_LDGT_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldgt.w a0, a1, a2 +# CHECK-ASM: ldgt.w $a0, $a1, $a2 name: test_LDGT_W body: | bb.0: @@ -844,7 +844,7 @@ body: | --- # CHECK-LABEL: test_LDGT_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldgt.d a0, a1, a2 +# CHECK-ASM: ldgt.d $a0, $a1, $a2 name: test_LDGT_D body: | bb.0: @@ -853,7 +853,7 @@ body: | --- # CHECK-LABEL: test_LDLE_B: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldle.b a0, a1, a2 +# CHECK-ASM: ldle.b $a0, $a1, $a2 name: test_LDLE_B body: | bb.0: @@ -862,7 +862,7 @@ body: | --- # CHECK-LABEL: test_LDLE_H: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldle.h 
a0, a1, a2 +# CHECK-ASM: ldle.h $a0, $a1, $a2 name: test_LDLE_H body: | bb.0: @@ -871,7 +871,7 @@ body: | --- # CHECK-LABEL: test_LDLE_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldle.w a0, a1, a2 +# CHECK-ASM: ldle.w $a0, $a1, $a2 name: test_LDLE_W body: | bb.0: @@ -880,7 +880,7 @@ body: | --- # CHECK-LABEL: test_LDLE_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: ldle.d a0, a1, a2 +# CHECK-ASM: ldle.d $a0, $a1, $a2 name: test_LDLE_D body: | bb.0: @@ -889,7 +889,7 @@ body: | --- # CHECK-LABEL: test_STX_B: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stx.b a0, a1, a2 +# CHECK-ASM: stx.b $a0, $a1, $a2 name: test_STX_B body: | bb.0: @@ -898,7 +898,7 @@ body: | --- # CHECK-LABEL: test_STX_H: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stx.h a0, a1, a2 +# CHECK-ASM: stx.h $a0, $a1, $a2 name: test_STX_H body: | bb.0: @@ -907,7 +907,7 @@ body: | --- # CHECK-LABEL: test_STX_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stx.w a0, a1, a2 +# CHECK-ASM: stx.w $a0, $a1, $a2 name: test_STX_W body: | bb.0: @@ -916,7 +916,7 @@ body: | --- # CHECK-LABEL: test_STX_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stx.d a0, a1, a2 +# CHECK-ASM: stx.d $a0, $a1, $a2 name: test_STX_D body: | bb.0: @@ -925,7 +925,7 @@ body: | --- # CHECK-LABEL: test_STGT_B: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stgt.b a0, a1, a2 +# CHECK-ASM: stgt.b $a0, $a1, $a2 name: test_STGT_B body: | bb.0: @@ -934,7 +934,7 @@ body: | --- # CHECK-LABEL: test_STGT_H: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stgt.h a0, a1, a2 +# CHECK-ASM: stgt.h $a0, $a1, $a2 name: test_STGT_H body: | bb.0: @@ -943,7 +943,7 @@ body: | --- # CHECK-LABEL: 
test_STGT_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stgt.w a0, a1, a2 +# CHECK-ASM: stgt.w $a0, $a1, $a2 name: test_STGT_W body: | bb.0: @@ -952,7 +952,7 @@ body: | --- # CHECK-LABEL: test_STGT_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stgt.d a0, a1, a2 +# CHECK-ASM: stgt.d $a0, $a1, $a2 name: test_STGT_D body: | bb.0: @@ -961,7 +961,7 @@ body: | --- # CHECK-LABEL: test_STLE_B: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stle.b a0, a1, a2 +# CHECK-ASM: stle.b $a0, $a1, $a2 name: test_STLE_B body: | bb.0: @@ -970,7 +970,7 @@ body: | --- # CHECK-LABEL: test_STLE_H: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stle.h a0, a1, a2 +# CHECK-ASM: stle.h $a0, $a1, $a2 name: test_STLE_H body: | bb.0: @@ -979,7 +979,7 @@ body: | --- # CHECK-LABEL: test_STLE_W: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stle.w a0, a1, a2 +# CHECK-ASM: stle.w $a0, $a1, $a2 name: test_STLE_W body: | bb.0: @@ -988,7 +988,7 @@ body: | --- # CHECK-LABEL: test_STLE_D: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: stle.d a0, a1, a2 +# CHECK-ASM: stle.d $a0, $a1, $a2 name: test_STLE_D body: | bb.0: diff --git a/llvm/test/CodeGen/LoongArch/3ri.mir b/llvm/test/CodeGen/LoongArch/3ri.mir index 4cf71ff1f8861..1f21dc2461507 100644 --- a/llvm/test/CodeGen/LoongArch/3ri.mir +++ b/llvm/test/CodeGen/LoongArch/3ri.mir @@ -16,7 +16,7 @@ --- # CHECK-LABEL: test_ALSL_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: alsl.w a0, a1, a2, 3 +# CHECK-ASM: alsl.w $a0, $a1, $a2, 3 name: test_ALSL_W body: | bb.0: @@ -25,7 +25,7 @@ body: | --- # CHECK-LABEL: test_ALSL_WU: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: alsl.wu a0, a1, a2, 1 +# CHECK-ASM: 
alsl.wu $a0, $a1, $a2, 1 name: test_ALSL_WU body: | bb.0: @@ -34,7 +34,7 @@ body: | --- # CHECK-LABEL: test_ALSL_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: alsl.d a0, a1, a2, 3 +# CHECK-ASM: alsl.d $a0, $a1, $a2, 3 name: test_ALSL_D body: | bb.0: @@ -43,7 +43,7 @@ body: | --- # CHECK-LABEL: test_BYTEPICK_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bytepick.w a0, a1, a2, 0 +# CHECK-ASM: bytepick.w $a0, $a1, $a2, 0 name: test_BYTEPICK_W body: | bb.0: @@ -62,7 +62,7 @@ body: | --- # CHECK-LABEL: test_BYTEPICK_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bytepick.d a0, a1, a2, 4 +# CHECK-ASM: bytepick.d $a0, $a1, $a2, 4 name: test_BYTEPICK_D body: | bb.0: diff --git a/llvm/test/CodeGen/LoongArch/misc.mir b/llvm/test/CodeGen/LoongArch/misc.mir index 3bece0766a2e3..ad426abbba421 100644 --- a/llvm/test/CodeGen/LoongArch/misc.mir +++ b/llvm/test/CodeGen/LoongArch/misc.mir @@ -90,7 +90,7 @@ body: | --- # CHECK-LABEL: test_BSTRINS_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bstrins.w a0, a1, 7, 2 +# CHECK-ASM: bstrins.w $a0, $a1, 7, 2 name: test_BSTRINS_W body: | bb.0: @@ -99,7 +99,7 @@ body: | --- # CHECK-LABEL: test_BSTRPICK_W: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bstrpick.w a0, a1, 10, 4 +# CHECK-ASM: bstrpick.w $a0, $a1, 10, 4 name: test_BSTRPICK_W body: | bb.0: @@ -118,7 +118,7 @@ body: | --- # CHECK-LABEL: test_BSTRINS_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bstrins.d a0, a1, 7, 2 +# CHECK-ASM: bstrins.d $a0, $a1, 7, 2 name: test_BSTRINS_D body: | bb.0: @@ -127,7 +127,7 @@ body: | --- # CHECK-LABEL: test_BSTRPICK_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 -# CHECK-ASM: bstrpick.d a0, a1, 39, 22 +# CHECK-ASM: bstrpick.d $a0, $a1, 
39, 22 name: test_BSTRPICK_D body: | bb.0: @@ -146,7 +146,7 @@ body: | --- # CHECK-LABEL: test_ASRTLE_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 -# CHECK-ASM: asrtle.d a0, a1 +# CHECK-ASM: asrtle.d $a0, $a1 name: test_ASRTLE_D body: | bb.0: @@ -155,7 +155,7 @@ body: | --- # CHECK-LABEL: test_ASRTGT_D: # CHECK-ENC: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 -# CHECK-ASM: asrtgt.d a0, a1 +# CHECK-ASM: asrtgt.d $a0, $a1 name: test_ASRTGT_D body: | bb.0: @@ -174,7 +174,7 @@ body: | --- # CHECK-LABEL: test_PRELD: # CHECK-ENC: 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 1 1 1 -# CHECK-ASM: preld 15, a0, 21 +# CHECK-ASM: preld 15, $a0, 21 name: test_PRELD body: | bb.0: @@ -193,7 +193,7 @@ body: | --- # CHECK-LABEL: test_PRELDX: # CHECK-ENC: 0 0 1 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 1 -# CHECK-ASM: preldx 11, a0, a1 +# CHECK-ASM: preldx 11, $a0, $a1 name: test_PRELDX body: | bb.0: From 79b0fa08e0ba38463ae7e514ad91379711f93e8d Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Fri, 18 Feb 2022 10:04:28 +0800 Subject: [PATCH 206/748] [LoongArch] Fix several instruction definition errors in initial patches This patch corrects some instruction definitions that I incorrectly wrote in initial patches including bstr{ins/pick}.{w/d}, ll.{w/d} and sc.{w/d}. 
Reviewed By: xen0n Differential Revision: https://reviews.llvm.org/D119817 --- .../Target/LoongArch/LoongArchInstrFormats.td | 20 ++++++------ .../Target/LoongArch/LoongArchInstrInfo.td | 31 +++++++++++-------- llvm/test/CodeGen/LoongArch/misc.mir | 4 +-- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td index 0fe358565a063..0e66422929ac6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrFormats.td @@ -256,34 +256,34 @@ class FmtI26 op, dag outs, dag ins, string asmstr, } // FmtBSTR_W -// +// class FmtBSTR_W op, dag outs, dag ins, string asmstr, list pattern = []> : LAInst { - bits<5> msb; - bits<5> lsb; + bits<5> msbw; + bits<5> lsbw; bits<5> rj; bits<5> rd; let Inst{31-21} = op{11-1}; - let Inst{20-16} = msb; + let Inst{20-16} = msbw; let Inst{15} = op{0}; - let Inst{14-10} = lsb; + let Inst{14-10} = lsbw; let Inst{9-5} = rj; let Inst{4-0} = rd; } // FmtBSTR_D -// +// class FmtBSTR_D op, dag outs, dag ins, string asmstr, list pattern = []> : LAInst { - bits<6> msb; - bits<6> lsb; + bits<6> msbd; + bits<6> lsbd; bits<5> rj; bits<5> rd; let Inst{31-22} = op; - let Inst{21-16} = msb; - let Inst{15-10} = lsb; + let Inst{21-16} = msbd; + let Inst{15-10} = lsbd; let Inst{9-5} = rj; let Inst{4-0} = rd; } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 54f6ed3c6fe8b..4d207ebdea9ad 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -86,13 +86,6 @@ class ALU_1RI20 op, string opstr, Operand ImmOpnd> : Fmt1RI20; -class ALU_BSTRW op, string opstr, Operand ImmOpnd> - : FmtBSTR_W; -class ALU_BSTRD op, string opstr, Operand ImmOpnd> - : FmtBSTR_D; - class MISC_I15 op, string opstr> : FmtI15; @@ -150,12 +143,12 @@ class AM_3R op, string opstr> let mayLoad = 1 in class 
LLBase op, string opstr> - : Fmt2RI14; let mayStore = 1, Constraints = "$rd = $dst" in class SCBase op, string opstr> - : Fmt2RI14; //===----------------------------------------------------------------------===// @@ -214,8 +207,14 @@ def BYTEPICK_W : ALU_3RI2<0b000000000000100, "bytepick.w", uimm2>; def REVB_2H : ALU_2R<0b0000000000000000001100, "revb.2h">; def BITREV_4B : ALU_2R<0b0000000000000000010010, "bitrev.4b">; def BITREV_W : ALU_2R<0b0000000000000000010100, "bitrev.w">; -def BSTRINS_W : ALU_BSTRW<0b000000000110, "bstrins.w", uimm5>; -def BSTRPICK_W : ALU_BSTRW<0b000000000111, "bstrpick.w", uimm5>; +let Constraints = "$rd = $dst" in { +def BSTRINS_W : FmtBSTR_W<0b000000000110, (outs GPR:$dst), + (ins GPR:$rd, GPR:$rj, uimm5:$msbw, uimm5:$lsbw), + "bstrins.w\t$rd, $rj, $msbw, $lsbw">; +} +def BSTRPICK_W : FmtBSTR_W<0b000000000111, (outs GPR:$rd), + (ins GPR:$rj, uimm5:$msbw, uimm5:$lsbw), + "bstrpick.w\t$rd, $rj, $msbw, $lsbw">; def MASKEQZ : ALU_3R<0b00000000000100110, "maskeqz">; def MASKNEZ : ALU_3R<0b00000000000100111, "masknez">; @@ -309,8 +308,14 @@ def REVH_2W : ALU_2R<0b0000000000000000010000, "revh.2w">; def REVH_D : ALU_2R<0b0000000000000000010001, "revh.d">; def BITREV_8B : ALU_2R<0b0000000000000000010011, "bitrev.8b">; def BITREV_D : ALU_2R<0b0000000000000000010101, "bitrev.d">; -def BSTRINS_D : ALU_BSTRD<0b0000000010, "bstrins.d", uimm6>; -def BSTRPICK_D : ALU_BSTRD<0b0000000011, "bstrpick.d", uimm6>; +let Constraints = "$rd = $dst" in { +def BSTRINS_D : FmtBSTR_D<0b0000000010, (outs GPR:$dst), + (ins GPR:$rd, GPR:$rj, uimm6:$msbd, uimm6:$lsbd), + "bstrins.d\t$rd, $rj, $msbd, $lsbd">; +} +def BSTRPICK_D : FmtBSTR_D<0b0000000011, (outs GPR:$rd), + (ins GPR:$rj, uimm6:$msbd, uimm6:$lsbd), + "bstrpick.d\t$rd, $rj, $msbd, $lsbd">; // Common Memory Access Instructions for 64-bits def LD_WU : LOAD_2RI12<0b0010101010, "ld.wu">; diff --git a/llvm/test/CodeGen/LoongArch/misc.mir b/llvm/test/CodeGen/LoongArch/misc.mir index ad426abbba421..3035edc768fc8 
100644 --- a/llvm/test/CodeGen/LoongArch/misc.mir +++ b/llvm/test/CodeGen/LoongArch/misc.mir @@ -94,7 +94,7 @@ body: | name: test_BSTRINS_W body: | bb.0: - $r4 = BSTRINS_W $r5, 7, 2 + $r4 = BSTRINS_W $r4, $r5, 7, 2 ... --- # CHECK-LABEL: test_BSTRPICK_W: @@ -122,7 +122,7 @@ body: | name: test_BSTRINS_D body: | bb.0: - $r4 = BSTRINS_D $r5, 7, 2 + $r4 = BSTRINS_D $r4, $r5, 7, 2 ... --- # CHECK-LABEL: test_BSTRPICK_D: From c046cff1cf11d63aa0f5483b758464f989fbc029 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 17 Feb 2022 13:06:11 -0800 Subject: [PATCH 207/748] [msan] strsignal interceptor Reviewed By: kstoimenov Differential Revision: https://reviews.llvm.org/D120082 --- compiler-rt/lib/msan/msan_interceptors.cpp | 10 ++++++++++ compiler-rt/test/msan/strsignal.cpp | 13 +++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 compiler-rt/test/msan/strsignal.cpp diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index 5317af6982a03..dbe18ce37509e 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -1436,6 +1436,15 @@ static uptr signal_impl(int signo, uptr cb) { #include "sanitizer_common/sanitizer_common_syscalls.inc" #include "sanitizer_common/sanitizer_syscalls_netbsd.inc" +INTERCEPTOR(const char *, strsignal, int sig) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, strsignal, sig); + const char *res = REAL(strsignal)(sig); + if (res) + __msan_unpoison(res, internal_strlen(res) + 1); + return res; +} + struct dlinfo { char *dli_fname; void *dli_fbase; @@ -1699,6 +1708,7 @@ void InitializeInterceptors() { INTERCEPT_FUNCTION(gethostname); MSAN_MAYBE_INTERCEPT_EPOLL_WAIT; MSAN_MAYBE_INTERCEPT_EPOLL_PWAIT; + INTERCEPT_FUNCTION(strsignal); INTERCEPT_FUNCTION(dladdr); INTERCEPT_FUNCTION(dlerror); INTERCEPT_FUNCTION(dl_iterate_phdr); diff --git a/compiler-rt/test/msan/strsignal.cpp b/compiler-rt/test/msan/strsignal.cpp new file mode 100644 index 
0000000000000..62b68e00b9e71 --- /dev/null +++ b/compiler-rt/test/msan/strsignal.cpp @@ -0,0 +1,13 @@ +// RUN: %clangxx_msan -O0 %s -o %t && %run %t + +#include +#include +#include +#include + +int main(void) { + const char *p = strsignal(SIGSEGV); + assert(p); + printf("%s %zu\n", p, strlen(p)); + return 0; +} From 12389e375811d46ce41d949857f8b469d6563114 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Thu, 17 Feb 2022 18:12:39 -0800 Subject: [PATCH 208/748] [MachineOutliner] Add statistics for unsigned vector size Useful for debugging + evaluating improvements to the outliner. Stats are the number of illegal, legal, and invisible instructions in the unsigned vector, and it's total length. --- llvm/lib/CodeGen/MachineOutliner.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index d7d098278d2a5..7ce655dce8e34 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -82,9 +82,17 @@ using namespace llvm; using namespace ore; using namespace outliner; +// Statistics for outlined functions. STATISTIC(NumOutlined, "Number of candidates outlined"); STATISTIC(FunctionsCreated, "Number of functions created"); +// Statistics for instruction mapping. +STATISTIC(NumLegalInUnsignedVec, "Number of legal instrs in unsigned vector"); +STATISTIC(NumIllegalInUnsignedVec, + "Number of illegal instrs in unsigned vector"); +STATISTIC(NumInvisible, "Number of invisible instrs in unsigned vector"); +STATISTIC(UnsignedVecSize, "Size of unsigned vector"); + // Set to true if the user wants the outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr // functions. 
Since the outliner is confined to a single module (modulo LTO), @@ -188,6 +196,8 @@ struct InstructionMapper { assert(LegalInstrNumber != DenseMapInfo::getTombstoneKey() && "Tried to assign DenseMap tombstone or empty key to instruction."); + // Statistics. + ++NumLegalInUnsignedVec; return MINumber; } @@ -215,6 +225,8 @@ struct InstructionMapper { InstrListForMBB.push_back(It); UnsignedVecForMBB.push_back(IllegalInstrNumber); IllegalInstrNumber--; + // Statistics. + ++NumIllegalInUnsignedVec; assert(LegalInstrNumber < IllegalInstrNumber && "Instruction mapping overflow!"); @@ -293,6 +305,7 @@ struct InstructionMapper { case InstrType::Invisible: // Normally this is set by mapTo(Blah)Unsigned, but we just want to // skip this instruction. So, unset the flag here. + ++NumInvisible; AddedIllegalLastTime = false; break; } @@ -905,6 +918,9 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M, // MBB is suitable for outlining. Map it to a list of unsigneds. Mapper.convertToUnsignedVec(MBB, *TII); } + + // Statistics. + UnsignedVecSize = Mapper.UnsignedVec.size(); } } From 77cf18fa1899869e05bee6a13f23881b3e8bf042 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Thu, 17 Feb 2022 18:26:16 -0800 Subject: [PATCH 209/748] [MachineOutliner] Add testcase for instruction mapping stats I forgot to attach the testcase for 12389e375811d46ce41d949857f8b469d6563114! 
--- .../machine-outliner-mapping-stats.mir | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/machine-outliner-mapping-stats.mir diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-mapping-stats.mir b/llvm/test/CodeGen/AArch64/machine-outliner-mapping-stats.mir new file mode 100644 index 0000000000000..7b6ffbe97cd0f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-mapping-stats.mir @@ -0,0 +1,29 @@ +# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner -verify-machineinstrs -stats %s -o - 2>&1 | FileCheck %s +# REQUIRES: asserts + +# Check that instruction mapping stats work. + +# We ought to map all of the instructions (5 of them) as legal, and then +# terminate the string with a single illegal character. Debug instructions are +# always invisible, and don't contribute to the length of the string. + +# CHECK: 1 machine-outliner - Number of illegal instrs in unsigned vector +# CHECK: 1 machine-outliner - Number of invisible instrs in unsigned vector +# CHECK: 5 machine-outliner - Number of legal instrs in unsigned vector +# CHECK: 6 machine-outliner - Size of unsigned vector + +... 
+--- +name: test +tracksRegLiveness: true +machineFunctionInfo: + hasRedZone: false +body: | + bb.0: + liveins: $lr + $x0 = ORRXri $xzr, 1 + $x1 = ORRXri $xzr, 1 + $x2 = ORRXri $xzr, 1 + DBG_VALUE $x3, $noreg + $x3 = ORRXri $xzr, 1 + RET undef $lr From 9dcb5275e5ce03df1f0ba66f7025b904f7f647c2 Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Fri, 18 Feb 2022 10:18:13 +0800 Subject: [PATCH 210/748] [NFC] Add myself to CREDITS.TXT --- llvm/CREDITS.TXT | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/CREDITS.TXT b/llvm/CREDITS.TXT index 0e119e45319e1..c7b0cf63bb9f8 100644 --- a/llvm/CREDITS.TXT +++ b/llvm/CREDITS.TXT @@ -324,6 +324,10 @@ D: Mips backend D: Random ARM integrated assembler and assembly parser improvements D: General X86 AVX1 support +N: Weining Lu +E: luweining@loongson.cn +D: LoongArch backend + N: Duraid Madina E: duraid@octopus.com.au W: http://kinoko.c.u-tokyo.ac.jp/~duraid/ From 6b53ad298e95a9f1cb5770dcbaa71cb4ea343021 Mon Sep 17 00:00:00 2001 From: Kuba Mracek Date: Thu, 17 Feb 2022 19:41:46 -0800 Subject: [PATCH 211/748] [GlobalDCE] [VFE] Avoid dropping vfunc dependencies when an invalid vtable entry is present When we scan vtables for a particular vload in ScanVTableLoad and an entry in one possible vtable is invalid (null or non-fptr), we bail in a wrong way -- we completely stop the scanning of vtables and this results in dropped dependencies and incorrectly removed vfuncs from vtables. Let's fix that by correcting the bailing logic to keep iterating and only skip the invalid entries. 
Differential Revision: https://reviews.llvm.org/D120006 --- llvm/lib/Transforms/IPO/GlobalDCE.cpp | 4 +- .../GlobalDCE/virtual-functions-nonptr.ll | 44 +++++++++++++++++++ .../GlobalDCE/virtual-functions-null.ll | 2 +- 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/GlobalDCE/virtual-functions-nonptr.ll diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp index e375504099610..79bf97b293bcc 100644 --- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -214,14 +214,14 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, if (!Ptr) { LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n"); VFESafeVTables.erase(VTable); - return; + continue; } auto Callee = dyn_cast(Ptr->stripPointerCasts()); if (!Callee) { LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n"); VFESafeVTables.erase(VTable); - return; + continue; } LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> " diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-nonptr.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-nonptr.ll new file mode 100644 index 0000000000000..695a0b55327f5 --- /dev/null +++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-nonptr.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -globaldce -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare { i8*, i1 } @llvm.type.checked.load(i8*, i32, metadata) + +@vtableA = internal unnamed_addr constant { [2 x i32] } { [2 x i32] [ + i32 0, + i32 trunc (i64 sub (i64 ptrtoint (void ()* @vfunc2 to i64), i64 ptrtoint ({ [2 x i32] }* @vtableA to i64)) to i32) +]}, align 8, !type !{i64 0, !"vfunc1.type"}, !type !{i64 4, !"vfunc2.type"}, !vcall_visibility !{i64 2} + +; CHECK: @vtableA = internal unnamed_addr constant { [2 x i32] } { [2 x i32] [ +; CHECK-SAME: i32 0, +; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (void ()* @vfunc2 to i64), i64 ptrtoint ({ [2 x 
i32] }* @vtableA to i64)) to i32) +; CHECK-SAME: ] }, align 8 + +@vtableB = internal unnamed_addr constant { [2 x i32] } { [2 x i32] [ + i32 trunc (i64 sub (i64 ptrtoint (void ()* @vfunc1 to i64), i64 ptrtoint ({ [2 x i32] }* @vtableB to i64)) to i32), + i32 trunc (i64 sub (i64 ptrtoint (void ()* @vfunc2 to i64), i64 ptrtoint ({ [2 x i32] }* @vtableB to i64)) to i32) +]}, align 8, !type !{i64 0, !"vfunc1.type"}, !type !{i64 4, !"vfunc2.type"}, !vcall_visibility !{i64 2} + +; CHECK: @vtableB = internal unnamed_addr constant { [2 x i32] } { [2 x i32] [ +; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (void ()* @vfunc1 to i64), i64 ptrtoint ({ [2 x i32] }* @vtableB to i64)) to i32), +; CHECK-SAME: i32 trunc (i64 sub (i64 ptrtoint (void ()* @vfunc2 to i64), i64 ptrtoint ({ [2 x i32] }* @vtableB to i64)) to i32) +; CHECK-SAME: ] }, align 8 + +define internal void @vfunc1() { + ret void +} + +define internal void @vfunc2() { + ret void +} + +define void @main() { + %1 = ptrtoint { [2 x i32] }* @vtableA to i64 ; to keep @vtableA alive + %2 = ptrtoint { [2 x i32] }* @vtableB to i64 ; to keep @vtableB alive + %3 = tail call { i8*, i1 } @llvm.type.checked.load(i8* null, i32 0, metadata !"vfunc1.type") + %4 = tail call { i8*, i1 } @llvm.type.checked.load(i8* null, i32 0, metadata !"vfunc2.type") + ret void +} + +!999 = !{i32 1, !"Virtual Function Elim", i32 1} +!llvm.module.flags = !{!999} diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll index 33be6451fa3f7..186967b6311ad 100644 --- a/llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll +++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-null.ll @@ -20,7 +20,7 @@ declare { i8*, i1 } @llvm.type.checked.load(i8*, i32, metadata) ]}, align 8, !type !{i64 0, !"vfunc1.type"}, !type !{i64 8, !"vfunc2.type"}, !vcall_visibility !{i64 2} ; CHECK: @vtableB = internal unnamed_addr constant { [2 x i8*] } { [2 x i8*] [ -; CHECK-SAME: i8* null, +; 
CHECK-SAME: i8* bitcast (void ()* @vfunc1 to i8*), ; CHECK-SAME: i8* bitcast (void ()* @vfunc2 to i8*) ; CHECK-SAME: ] }, align 8 From eea3d90af181fdb66e583af53401e80ddfcc8cd1 Mon Sep 17 00:00:00 2001 From: Konstantin Varlamov Date: Thu, 17 Feb 2022 20:11:46 -0800 Subject: [PATCH 212/748] [libc++][ranges] Implement `std::mergeable`. Differential Revision: https://reviews.llvm.org/D119489 --- libcxx/docs/Status/RangesPaper.csv | 2 +- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__iterator/mergeable.h | 41 ++++++ libcxx/include/iterator | 14 +- libcxx/include/module.modulemap | 1 + .../iterator/mergeable.module.verify.cpp | 15 ++ .../mergeable.compile.pass.cpp | 129 ++++++++++++++++++ .../mergeable.subsumption.compile.pass.cpp | 38 ++++++ 8 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 libcxx/include/__iterator/mergeable.h create mode 100644 libcxx/test/libcxx/diagnostics/detail.headers/iterator/mergeable.module.verify.cpp create mode 100644 libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.compile.pass.cpp create mode 100644 libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.subsumption.compile.pass.cpp diff --git a/libcxx/docs/Status/RangesPaper.csv b/libcxx/docs/Status/RangesPaper.csv index 3e7be37614ad1..e67abd35873dc 100644 --- a/libcxx/docs/Status/RangesPaper.csv +++ b/libcxx/docs/Status/RangesPaper.csv @@ -65,7 +65,7 @@ Section,Description,Dependencies,Assignee,Complete | [iterator.cust.swap]",Zoe Carver,✅ `[alg.req] `_: pt. 3,`indirectly_comparable `_,[projected],Nikolas Klauser,✅ `[alg.req] `_: pt. 
4,"| `permutable `_ -| mergeable +| `mergeable `_ | sortable",[iterator.concepts],Konstantin Varlamov,In progress `[std.iterator.tags] `_,"| `contiguous_iterator_tag `_ | `iterator_concept specialization for pointers `_ diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 1e5299f00c2e5..0858a90fe2a6d 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -243,6 +243,7 @@ set(files __iterator/iter_swap.h __iterator/iterator.h __iterator/iterator_traits.h + __iterator/mergeable.h __iterator/move_iterator.h __iterator/next.h __iterator/ostream_iterator.h diff --git a/libcxx/include/__iterator/mergeable.h b/libcxx/include/__iterator/mergeable.h new file mode 100644 index 0000000000000..08022aab6d272 --- /dev/null +++ b/libcxx/include/__iterator/mergeable.h @@ -0,0 +1,41 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ITERATOR_MERGEABLE_H +#define _LIBCPP___ITERATOR_MERGEABLE_H + +#include <__config> +#include <__functional/identity.h> +#include <__functional/ranges_operations.h> +#include <__iterator/concepts.h> +#include <__iterator/projected.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + +template +concept mergeable = + input_iterator<_Input1> && + input_iterator<_Input2> && + weakly_incrementable<_Output> && + indirectly_copyable<_Input1, _Output> && + indirectly_copyable<_Input2, _Output> && + indirect_strict_weak_order<_Comp, projected<_Input1, _Proj1>, projected<_Input2, _Proj2>>; + +#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___ITERATOR_MERGEABLE_H diff --git a/libcxx/include/iterator b/libcxx/include/iterator index fe91d15e6bc31..1668be3a60c8e 100644 --- a/libcxx/include/iterator +++ b/libcxx/include/iterator @@ -147,15 +147,20 @@ template template concept indirectly_swappable = see below; // since C++20 -// [alg.req.permutable], concept permutable // since C++20 -template - concept permutable = see below; - template concept indirectly_comparable = indirect_binary_predicate, projected>; // since C++20 +// [alg.req.permutable], concept permutable // since C++20 +template + concept permutable = see below; + + // [alg.req.mergeable], concept mergeable +template + concept mergeable = see below; // since C++20 + template S> requires (!same_as && copyable) class common_iterator; // since C++20 @@ -623,6 +628,7 @@ template constexpr const E* data(initializer_list il) noexcept; #include <__iterator/iter_swap.h> #include <__iterator/iterator.h> #include 
<__iterator/iterator_traits.h> +#include <__iterator/mergeable.h> #include <__iterator/move_iterator.h> #include <__iterator/next.h> #include <__iterator/ostream_iterator.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index a44493968db7e..ea050751d9ca7 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -628,6 +628,7 @@ module std [system] { module iter_swap { private header "__iterator/iter_swap.h" } module iterator { private header "__iterator/iterator.h" } module iterator_traits { private header "__iterator/iterator_traits.h" } + module mergeable { private header "__iterator/mergeable.h" } module move_iterator { private header "__iterator/move_iterator.h" } module next { private header "__iterator/next.h" } module ostream_iterator { private header "__iterator/ostream_iterator.h" } diff --git a/libcxx/test/libcxx/diagnostics/detail.headers/iterator/mergeable.module.verify.cpp b/libcxx/test/libcxx/diagnostics/detail.headers/iterator/mergeable.module.verify.cpp new file mode 100644 index 0000000000000..b8f15dcbd6d1e --- /dev/null +++ b/libcxx/test/libcxx/diagnostics/detail.headers/iterator/mergeable.module.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: modules-build + +// WARNING: This test was generated by 'generate_private_header_tests.py' +// and should not be edited manually. 
+ +// expected-error@*:* {{use of private header from outside its module: '__iterator/mergeable.h'}} +#include <__iterator/mergeable.h> diff --git a/libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.compile.pass.cpp new file mode 100644 index 0000000000000..21dc0c204cec3 --- /dev/null +++ b/libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.compile.pass.cpp @@ -0,0 +1,129 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// concept mergeable = see below; // since C++20 + +#include + +#include + +#include "test_iterators.h" +#include "test_macros.h" + +using CompDefault = std::ranges::less; +using CompInt = bool(*)(int, int); +using ProjDefault = std::identity; + +using Input = cpp20_input_iterator; +static_assert( std::input_iterator); +using InputLong = cpp20_input_iterator; +static_assert( std::input_iterator); + +using Output = cpp17_output_iterator; +static_assert( std::weakly_incrementable); + +static_assert( std::indirectly_copyable); +static_assert( std::indirectly_copyable); +static_assert( std::indirect_strict_weak_order); +static_assert( std::indirect_strict_weak_order); +static_assert( std::indirect_strict_weak_order); +static_assert( std::indirect_strict_weak_order); + +// All requirements satisfied. +static_assert( std::mergeable); +static_assert( std::mergeable); +static_assert( std::mergeable); + +// Non-default projections. 
+struct Foo {}; +using ProjFooToInt = int(*)(Foo); +using ProjFooToLong = long(*)(Foo); +static_assert( std::indirect_strict_weak_order, std::projected>); +static_assert( std::mergeable); +static_assert( std::indirect_strict_weak_order, std::projected>); +static_assert( std::mergeable); + +// I1 or I2 is not an input iterator. +static_assert(!std::input_iterator); +static_assert(!std::mergeable); +static_assert(!std::mergeable); + +// O is not weakly incrementable. +struct NotWeaklyIncrementable { + int& operator*() const; +}; + +static_assert(!std::weakly_incrementable); +static_assert( std::indirectly_copyable); +static_assert( std::indirect_strict_weak_order); +static_assert(!std::mergeable); + +// I1 or I2 is not indirectly copyable into O. +struct AssignableOnlyFromInt { + AssignableOnlyFromInt& operator=(int); + template + AssignableOnlyFromInt& operator=(T) = delete; +}; +using OutputOnlyInt = cpp17_output_iterator; +static_assert( std::weakly_incrementable); + +static_assert( std::indirectly_copyable); +static_assert(!std::indirectly_copyable); +static_assert( std::indirect_strict_weak_order); +static_assert( std::mergeable); +static_assert(!std::mergeable); +static_assert(!std::mergeable); + +// No indirect strict weak order between I1 and I2 (bad comparison functor). +using GoodComp = bool(*)(int, int); +static_assert( std::indirect_strict_weak_order); +static_assert( std::mergeable); +using BadComp = bool(*)(int*, int*); +static_assert(!std::indirect_strict_weak_order); +static_assert(!std::mergeable); + +// No indirect strict weak order between I1 and I2 (bad projection). +using ToInt = int(*)(int); +using ToPtr = int*(*)(int); +static_assert( std::mergeable); +static_assert( std::mergeable); +static_assert(!std::mergeable); +static_assert(!std::mergeable); +static_assert(!std::mergeable); +static_assert(!std::mergeable); + +// A projection that only supports non-const references and has a non-const `operator()` still has to work. 
+struct ProjectionOnlyMutable { + int operator()(int&); + int operator()(int&&) const = delete; +}; +static_assert( std::mergeable); + +// The output is weakly incrementable but not an output iterator. +struct WeaklyIncrementable { + using value_type = int; + using difference_type = int; + + int& operator*() const; + WeaklyIncrementable& operator++(); + // `output_iterator` requires `i++` to return an iterator, + // while `weakly_incrementable` requires only that `i++` be well-formed. + void operator++(int); +}; +static_assert( std::weakly_incrementable); +static_assert( std::indirectly_copyable); +static_assert(!std::output_iterator); +static_assert( std::mergeable); diff --git a/libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.subsumption.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.subsumption.compile.pass.cpp new file mode 100644 index 0000000000000..a17a523e6aeab --- /dev/null +++ b/libcxx/test/std/iterators/iterator.requirements/alg.req.mergeable/mergeable.subsumption.compile.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// concept mergeable = see below; // since C++20 + +#include + +#include "test_macros.h" + +template +void test_subsumption() requires std::input_iterator && std::input_iterator; + +template +void test_subsumption() requires std::weakly_incrementable; + +template +void test_subsumption() requires std::indirectly_copyable && std::indirectly_copyable; + +template +void test_subsumption() requires std::indirect_strict_weak_order; + +template +constexpr bool test_subsumption() requires std::mergeable { + return true; +} + +static_assert(test_subsumption()); From d40b46e96d53bed8f898e519b25d06660d49074e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 18 Feb 2022 04:12:48 +0000 Subject: [PATCH 213/748] [gn build] Port eea3d90af181 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 3349edeb3af05..5ea6f2a1595f7 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -298,6 +298,7 @@ if (current_toolchain == default_toolchain) { "__iterator/iter_swap.h", "__iterator/iterator.h", "__iterator/iterator_traits.h", + "__iterator/mergeable.h", "__iterator/move_iterator.h", "__iterator/next.h", "__iterator/ostream_iterator.h", From 8e979460bb27610d574733ca5b75afae0cdfb3c9 Mon Sep 17 00:00:00 2001 From: Konstantin Varlamov Date: Thu, 17 Feb 2022 20:15:02 -0800 Subject: [PATCH 214/748] [libc++][ranges] Implement `std::sortable`. 
Differential Revision: https://reviews.llvm.org/D119619 --- libcxx/docs/Status/RangesPaper.csv | 2 +- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__iterator/sortable.h | 37 ++++++++++++++ libcxx/include/iterator | 13 +++-- libcxx/include/module.modulemap | 1 + .../iterator/sortable.module.verify.cpp | 15 ++++++ .../permutable.compile.pass.cpp | 2 - .../sortable.compile.pass.cpp | 51 +++++++++++++++++++ .../sortable.subsumption.compile.pass.cpp | 27 ++++++++++ 9 files changed, 142 insertions(+), 7 deletions(-) create mode 100644 libcxx/include/__iterator/sortable.h create mode 100644 libcxx/test/libcxx/diagnostics/detail.headers/iterator/sortable.module.verify.cpp create mode 100644 libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.compile.pass.cpp create mode 100644 libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.subsumption.compile.pass.cpp diff --git a/libcxx/docs/Status/RangesPaper.csv b/libcxx/docs/Status/RangesPaper.csv index e67abd35873dc..ebbc5c10916cc 100644 --- a/libcxx/docs/Status/RangesPaper.csv +++ b/libcxx/docs/Status/RangesPaper.csv @@ -66,7 +66,7 @@ Section,Description,Dependencies,Assignee,Complete `[alg.req] `_: pt. 3,`indirectly_comparable `_,[projected],Nikolas Klauser,✅ `[alg.req] `_: pt. 
4,"| `permutable `_ | `mergeable `_ -| sortable",[iterator.concepts],Konstantin Varlamov,In progress +| `sortable `_",[iterator.concepts],Konstantin Varlamov,✅ `[std.iterator.tags] `_,"| `contiguous_iterator_tag `_ | `iterator_concept specialization for pointers `_ ",[iterator.traits],Eric Fiselier,✅ diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 0858a90fe2a6d..72d06fc3a7532 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -255,6 +255,7 @@ set(files __iterator/reverse_access.h __iterator/reverse_iterator.h __iterator/size.h + __iterator/sortable.h __iterator/unreachable_sentinel.h __iterator/wrap_iter.h __libcpp_version diff --git a/libcxx/include/__iterator/sortable.h b/libcxx/include/__iterator/sortable.h new file mode 100644 index 0000000000000..77a553d3ec30e --- /dev/null +++ b/libcxx/include/__iterator/sortable.h @@ -0,0 +1,37 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ITERATOR_SORTABLE_H +#define _LIBCPP___ITERATOR_SORTABLE_H + +#include <__config> +#include <__functional/identity.h> +#include <__functional/ranges_operations.h> +#include <__iterator/concepts.h> +#include <__iterator/permutable.h> +#include <__iterator/projected.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + +template +concept sortable = + permutable<_Iter> && + indirect_strict_weak_order<_Comp, projected<_Iter, _Proj>>; + +#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___ITERATOR_SORTABLE_H diff --git a/libcxx/include/iterator b/libcxx/include/iterator index 1668be3a60c8e..6e2eb4a78c287 100644 --- a/libcxx/include/iterator +++ b/libcxx/include/iterator @@ -138,10 +138,10 @@ template // [alg.req.ind.copy], concept indirectly_copyable template - concept indirectly_copyable = see below; // since C++20 + concept indirectly_copyable = see below; // since C++20 template - concept indirectly_copyable_storable = see below; // since C++20 + concept indirectly_copyable_storable = see below; // since C++20 // [alg.req.ind.swap], concept indirectly_swappable template @@ -152,15 +152,19 @@ template, projected>; // since C++20 -// [alg.req.permutable], concept permutable // since C++20 +// [alg.req.permutable], concept permutable template - concept permutable = see below; + concept permutable = see below; // since C++20 // [alg.req.mergeable], concept mergeable template concept mergeable = see below; // since C++20 +// [alg.req.sortable], concept sortable +template + concept sortable = see below; // since C++20 + template S> requires (!same_as && copyable) 
class common_iterator; // since C++20 @@ -640,6 +644,7 @@ template constexpr const E* data(initializer_list il) noexcept; #include <__iterator/reverse_access.h> #include <__iterator/reverse_iterator.h> #include <__iterator/size.h> +#include <__iterator/sortable.h> #include <__iterator/unreachable_sentinel.h> #include <__iterator/wrap_iter.h> #include <__memory/addressof.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index ea050751d9ca7..890b7fef933ad 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -640,6 +640,7 @@ module std [system] { module reverse_access { private header "__iterator/reverse_access.h" } module reverse_iterator { private header "__iterator/reverse_iterator.h" } module size { private header "__iterator/size.h" } + module sortable { private header "__iterator/sortable.h" } module unreachable_sentinel { private header "__iterator/unreachable_sentinel.h" } module wrap_iter { private header "__iterator/wrap_iter.h" } } diff --git a/libcxx/test/libcxx/diagnostics/detail.headers/iterator/sortable.module.verify.cpp b/libcxx/test/libcxx/diagnostics/detail.headers/iterator/sortable.module.verify.cpp new file mode 100644 index 0000000000000..47ed326bccc54 --- /dev/null +++ b/libcxx/test/libcxx/diagnostics/detail.headers/iterator/sortable.module.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: modules-build + +// WARNING: This test was generated by 'generate_private_header_tests.py' +// and should not be edited manually. 
+ +// expected-error@*:* {{use of private header from outside its module: '__iterator/sortable.h'}} +#include <__iterator/sortable.h> diff --git a/libcxx/test/std/iterators/iterator.requirements/alg.req.permutable/permutable.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/alg.req.permutable/permutable.compile.pass.cpp index 947ecaee3e30c..b72e684bf90c3 100644 --- a/libcxx/test/std/iterators/iterator.requirements/alg.req.permutable/permutable.compile.pass.cpp +++ b/libcxx/test/std/iterators/iterator.requirements/alg.req.permutable/permutable.compile.pass.cpp @@ -14,9 +14,7 @@ #include -#include "MoveOnly.h" #include "test_iterators.h" -#include "test_macros.h" using AllConstraintsSatisfied = forward_iterator; static_assert( std::forward_iterator); diff --git a/libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.compile.pass.cpp new file mode 100644 index 0000000000000..44f6ef8444dd7 --- /dev/null +++ b/libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.compile.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// concept sortable = see below; // since C++20 + +#include + +#include + +using CompInt = bool(*)(int, int); +using CompDefault = std::ranges::less; + +using AllConstraintsSatisfied = int*; +static_assert( std::permutable); +static_assert( std::indirect_strict_weak_order); +static_assert( std::sortable); +static_assert( std::indirect_strict_weak_order); +static_assert( std::sortable); + +struct Foo {}; +using Proj = int(*)(Foo); +static_assert( std::permutable); +static_assert(!std::indirect_strict_weak_order); +static_assert( std::indirect_strict_weak_order>); +static_assert(!std::sortable); +static_assert( std::sortable); +static_assert(!std::indirect_strict_weak_order); +static_assert( std::indirect_strict_weak_order>); +static_assert(!std::sortable); +static_assert( std::sortable); + +using NotPermutable = const int*; +static_assert(!std::permutable); +static_assert( std::indirect_strict_weak_order); +static_assert(!std::sortable); + +struct Empty {}; +using NoIndirectStrictWeakOrder = Empty*; +static_assert( std::permutable); +static_assert(!std::indirect_strict_weak_order); +static_assert(!std::sortable); diff --git a/libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.subsumption.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.subsumption.compile.pass.cpp new file mode 100644 index 0000000000000..135499d60a508 --- /dev/null +++ b/libcxx/test/std/iterators/iterator.requirements/alg.req.sortable/sortable.subsumption.compile.pass.cpp @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// concept sortable = see below; // since C++20 + +#include + +#include + +template void test_subsumption() requires std::permutable; + +template void test_subsumption() + requires std::indirect_strict_weak_order>; + +template constexpr bool test_subsumption() requires std::sortable { return true; } + +static_assert(test_subsumption()); From 06f346cb7238fff88b2c7f94bf55fde50de5c1dd Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 18 Feb 2022 04:18:16 +0000 Subject: [PATCH 215/748] [gn build] Port 8e979460bb27 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 5ea6f2a1595f7..8767544d79e0c 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -310,6 +310,7 @@ if (current_toolchain == default_toolchain) { "__iterator/reverse_access.h", "__iterator/reverse_iterator.h", "__iterator/size.h", + "__iterator/sortable.h", "__iterator/unreachable_sentinel.h", "__iterator/wrap_iter.h", "__libcpp_version", From 2ad662172cbbd1ca53489bf8bddb0183d7692708 Mon Sep 17 00:00:00 2001 From: esmeyi Date: Fri, 18 Feb 2022 00:29:10 -0500 Subject: [PATCH 216/748] [XCOFF][llvm-objdump] change the priority of symbols with the same address by symbol types. Summary: In XCOFF, each section comes with a default symbol with the same name as the section. It doesn't bind to code locations and it may cause incorrect display of symbol names under `llvm-objdump -d`. 
This patch changes the priority of symbols with the same address by symbol type. Reviewed By: jhenderson, shchenz Differential Revision: https://reviews.llvm.org/D117642 --- .../llvm/MC/MCDisassembler/MCDisassembler.h | 11 ++++---- llvm/include/llvm/Object/ObjectFile.h | 2 +- .../aix-prefixed-instruction-boundary.mir | 2 +- llvm/test/CodeGen/PowerPC/aix-return55.ll | 2 +- .../PowerPC/aix-user-defined-memcpy.ll | 2 +- .../PowerPC/aix-xcoff-mergeable-const.ll | 2 +- .../CodeGen/PowerPC/aix-xcoff-reloc-symb.mir | 2 +- llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll | 2 +- .../PowerPC/aix-xcoff-textdisassembly.ll | 2 +- .../llvm-objdump/XCOFF/disassemble-all.test | 2 +- .../XCOFF/disassemble-symbol-description.test | 2 +- .../XCOFF/disassemble-symbol-priority.ll | 28 +++++++++++++++++++ .../XCOFF/disassemble-symbolize-operands.ll | 7 ++--- .../llvm-objdump/XCOFF/print-linenumber.test | 2 +- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 ++ 15 files changed, 51 insertions(+), 20 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h index 10037cd66ef12..db608d78fd6a0 100644 --- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h @@ -46,8 +46,9 @@ struct SymbolInfoTy { Optional Smc, Optional Idx, bool Label) : Addr(Addr), Name(Name), XCOFFSymInfo(Smc, Idx, Label), IsXCOFF(true) {} - SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type) - : Addr(Addr), Name(Name), Type(Type), IsXCOFF(false) {} + SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type, + bool IsXCOFF = false) + : Addr(Addr), Name(Name), Type(Type), IsXCOFF(IsXCOFF) {} bool isXCOFF() const { return IsXCOFF; } private: @@ -55,11 +56,11 @@ struct SymbolInfoTy { assert(P1.IsXCOFF == P2.IsXCOFF && "P1.IsXCOFF should be equal to P2.IsXCOFF."); if (P1.IsXCOFF) - return std::tie(P1.Addr, 
P1.XCOFFSymInfo, P1.Name) < - std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name); + return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Type, P1.Name) < + std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Type, P2.Name); return std::tie(P1.Addr, P1.Name, P1.Type) < - std::tie(P2.Addr, P2.Name, P2.Type); + std::tie(P2.Addr, P2.Name, P2.Type); } }; diff --git a/llvm/include/llvm/Object/ObjectFile.h b/llvm/include/llvm/Object/ObjectFile.h index bb6f1321a68e8..1faa070052d5e 100644 --- a/llvm/include/llvm/Object/ObjectFile.h +++ b/llvm/include/llvm/Object/ObjectFile.h @@ -170,11 +170,11 @@ class SymbolRef : public BasicSymbolRef { public: enum Type { ST_Unknown, // Type not specified + ST_Other, ST_Data, ST_Debug, ST_File, ST_Function, - ST_Other }; SymbolRef() = default; diff --git a/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir b/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir index 9ea49bf40c897..2947ae2c39989 100644 --- a/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir +++ b/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir @@ -43,7 +43,7 @@ body: | ... 
# DIS: Disassembly of section .text: -# DIS: 00000000 <.text>: +# DIS: 00000000 <.aix-prefixed-instruction-boundary>: # DIS-NEXT: 0: 38 60 00 02 li 3, 2 # DIS-NEXT: 4: 06 00 00 00 38 63 00 0d paddi 3, 3, 13, 0 # DIS-NEXT: c: 06 00 00 00 38 63 00 0d paddi 3, 3, 13, 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-return55.ll b/llvm/test/CodeGen/PowerPC/aix-return55.ll index c16b75bb68d8d..19e8322f8f8a2 100644 --- a/llvm/test/CodeGen/PowerPC/aix-return55.ll +++ b/llvm/test/CodeGen/PowerPC/aix-return55.ll @@ -21,7 +21,7 @@ entry: ; CHECK: blr } -;CHECKOBJ: 00000000 <.text>: +;CHECKOBJ: 00000000 <.foo>: ;CHECKOBJ-NEXT: 0: 38 60 00 37 li 3, 55 ;CHECKOBJ-NEXT: 4: 4e 80 00 20 blr{{[[:space:]] *}} ;CHECKOBJ-NEXT: 00000008 <.rodata.str1.1>: diff --git a/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll b/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll index b69b3760c9f4e..097eb302e4161 100644 --- a/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll +++ b/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll @@ -102,7 +102,7 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture r ; 32-REL-NOT: Type: R_RBR (0x1A) ; 32-DIS: Disassembly of section .text: -; 32-DIS: 00000000 <.text>: +; 32-DIS: 00000000 <.memcpy>: ; 32-DIS-NEXT: 0: 38 60 00 03 li 3, 3 ; 32-DIS-NEXT: 4: 4e 80 00 20 blr ; 32-DIS-NEXT: 8: 60 00 00 00 nop diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll index 255472d65c341..c7b1d2a0771c1 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll @@ -62,7 +62,7 @@ entry: ;CHECK-NEXT: .space 1 -;CHECKOBJ: 00000000 <.text>: +;CHECKOBJ: 00000000 <.main>: ;CHECKOBJ-NEXT: 0: 38 60 00 00 li 3, 0 ;CHECKOBJ-NEXT: 4: 4e 80 00 20 blr ;CHECKOBJ-NEXT: ...{{[[:space:]] *}} diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir index 
f650168d5877d..c64552f9852c0 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir @@ -75,7 +75,7 @@ body: | # DIS: Disassembly of section .text: # DIS-EMPTY: -# DIS-NEXT: 00000000 <.text>: +# DIS-NEXT: 00000000 <.foo>: # DIS-NEXT: 0: 80 62 00 00 lwz 3, 0(2) # DIS-NEXT: 4: 4e 80 00 20 blr # DIS-EMPTY: diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll index 6ce251bb49fd8..1bbc12c5a3af5 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll @@ -422,7 +422,7 @@ declare i32 @bar(i32) ; DIS: {{.*}}aix-xcoff-reloc.ll.tmp.o: file format aixcoff-rs6000 ; DIS: Disassembly of section .text: -; DIS: 00000000 <.text>: +; DIS: 00000000 <.foo>: ; DIS-NEXT: 0: 7c 08 02 a6 mflr 0 ; DIS-NEXT: 4: 90 01 00 08 stw 0, 8(1) ; DIS-NEXT: 8: 94 21 ff c0 stwu 1, -64(1) diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll index c8df85da0c855..8b73e748e1a89 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll @@ -13,7 +13,7 @@ entry: } ; CHECK: Disassembly of section .text:{{[[:space:]] *}} -; CHECK-NEXT: 00000000 <.text>: +; CHECK-NEXT: 00000000 <.foo>: ; CHECK-NEXT: 0: 38 60 00 00 li 3, 0 ; CHECK-NEXT: 4: 4e 80 00 20 blr ; CHECK-NEXT: 8: 60 00 00 00 nop diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test index d94d5734a1cbd..4c96662fc854f 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test @@ -18,7 +18,7 @@ CHECK: Inputs/xcoff-section-headers.o: file format aixcoff-rs6000 CHECK: Disassembly of section .text: -CHECK: 00000000 <.text>: +CHECK: 00000000 <.func>: CHECK-NEXT: 0: 80 62 00 04 lwz 3, 4(2) WITH-R-NEXT: 00000002: 
R_TOC a CHECK-NEXT: 4: 80 63 00 00 lwz 3, 0(3) diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test index 16f7137cf3796..f33421cc6c149 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test @@ -22,7 +22,7 @@ COMMON: Inputs/xcoff-section-headers.o: file format aixcoff-rs6000 COMMON: Disassembly of section .text: -PLAIN: 00000000 <.text>: +PLAIN: 00000000 <.func>: DESC: 00000000 (idx: 16) .func: COMMON-NEXT: 0: 80 62 00 04 lwz 3, 4(2) RELOC: 00000002: R_TOC (idx: 26) a[TC] diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll new file mode 100644 index 0000000000000..6db8451ea6a13 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple=powerpc-ibm-aix-xcoff %s -filetype=obj -o %t +; RUN: llvm-objdump %t -d --no-show-raw-insn | FileCheck %s + +; CHECK: Disassembly of section .text: +; CHECK: 00000000 <.foo3>: +; CHECK: 00000020 <.foo4>: +; CHECK: 00000040 <.foo>: +; CHECK: 00000060 <.foo2>: + +define dso_local signext i32 @foo(i32 noundef signext %a) #0 section "explicit_sec" { +entry: + ret i32 %a +} + +define dso_local signext i32 @foo2(i32 noundef signext %a) #0 section "explicit_sec" { +entry: + ret i32 %a +} + +define dso_local signext i32 @foo3(i32 noundef signext %a) #0 { +entry: + ret i32 %a +} + +define dso_local signext i32 @foo4(i32 noundef signext %a) #0 { +entry: + ret i32 %a +} diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll index a6742285a148e..95399aa4d41d2 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll +++ 
b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll @@ -3,8 +3,7 @@ ; RUN: | FileCheck %s ;; Expect to find the branch labels. -; CHECK-LABEL: <.text>: -;; TODO: <.internal> should be printed instead of <.text>. +; CHECK-LABEL: <.internal>: ; CHECK-NEXT: 0: mr 4, 3 ; CHECK-NEXT: 4: li 3, 0 ; CHECK-NEXT: 8: mtctr 4 @@ -19,11 +18,11 @@ ; CHECK-NEXT: 60: bf 8, 0x84 ; CHECK-NEXT: : ; CHECK-NEXT: 64: mr 3, 31 -; CHECK-NEXT: 68: bl 0x0 <.text> +; CHECK-NEXT: 68: bl 0x0 <.internal> ; CHECK-NEXT: 6c: mr 31, 3 ; CHECK-NEXT: 70: cmplwi 3, 11 ; CHECK-NEXT: 74: bt 0, 0x60 -; CHECK-NEXT: 78: bl 0x0 <.text> +; CHECK-NEXT: 78: bl 0x0 <.internal> ; CHECK-NEXT: 7c: nop ; CHECK-NEXT: 80: b 0x60 ; CHECK-NEXT: : diff --git a/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test b/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test index 0f3acacae4389..8256e27c064dd 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test @@ -17,7 +17,7 @@ # LINES32: Inputs/basic32.o: file format aixcoff-rs6000 # LINES32: Disassembly of section .text: -# LINES32: 00000000 <.text>: +# LINES32: 00000000 <.main>: # LINES32: ; .main(): # LINES32-NEXT: ; /basic.c:1 # LINES32-NEXT: 0: 38 60 00 00 li 3, 0 diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 6b238fa01d258..4cb226b795255 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -957,6 +957,9 @@ SymbolInfoTy objdump::createSymbolInfo(const ObjectFile *Obj, getXCOFFSymbolCsectSMC(XCOFFObj, Symbol); return SymbolInfoTy(Addr, Name, Smc, SymbolIndex, isLabel(XCOFFObj, Symbol)); + } else if (Obj->isXCOFF()) { + const SymbolRef::Type SymType = unwrapOrError(Symbol.getType(), FileName); + return SymbolInfoTy(Addr, Name, SymType, true); } else return SymbolInfoTy(Addr, Name, Obj->isELF() ? 
getElfSymbolType(Obj, Symbol) From b45d0b3e8e003291f99a12d039c8a54a064adfcb Mon Sep 17 00:00:00 2001 From: Serguei Katkov Date: Thu, 17 Feb 2022 12:06:23 +0700 Subject: [PATCH 217/748] [MemoryDependency] Simplfy re-ordering condition. Cleanup. NFC. Make the reading of condition for restricting re-ordering simpler. Reviewers: reames Reviewed By: reames Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D120005 --- .../lib/Analysis/MemoryDependenceAnalysis.cpp | 39 +++++++------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index aaeba903f43df..923fcdcf65009 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -414,30 +414,17 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( isInvariantLoad = true; } - // Return "true" if and only if the instruction I is either a non-simple - // load or a non-simple store. - auto isNonSimpleLoadOrStore = [](Instruction *I) -> bool { + // True for volatile instruction. + // For Load/Store return true if atomic ordering is stronger than AO, + // for other instruction just true if it can read or write to memory. + auto isComplexForReordering = [](Instruction * I, AtomicOrdering AO)->bool { + if (I->isVolatile()) + return true; if (auto *LI = dyn_cast(I)) - return !LI->isSimple(); + return isStrongerThan(LI->getOrdering(), AO); if (auto *SI = dyn_cast(I)) - return !SI->isSimple(); - return false; - }; - - // Return "true" if and only if the instruction I is either a non-unordered - // load or a non-unordered store. - auto isNonUnorderedLoadOrStore = [](Instruction *I) -> bool { - if (auto *LI = dyn_cast(I)) - return !LI->isUnordered(); - if (auto *SI = dyn_cast(I)) - return !SI->isUnordered(); - return false; - }; - - // Return "true" if I is not a load and not a store, but it does access - // memory. 
- auto isOtherMemAccess = [](Instruction *I) -> bool { - return !isa(I) && !isa(I) && I->mayReadOrWriteMemory(); + return isStrongerThan(SI->getOrdering(), AO); + return I->mayReadOrWriteMemory(); }; // Walk backwards through the basic block, looking for dependencies. @@ -510,8 +497,8 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // atomic. // FIXME: This is overly conservative. if (LI->isAtomic() && isStrongerThanUnordered(LI->getOrdering())) { - if (!QueryInst || isNonSimpleLoadOrStore(QueryInst) || - isOtherMemAccess(QueryInst)) + if (!QueryInst || + isComplexForReordering(QueryInst, AtomicOrdering::NotAtomic)) return MemDepResult::getClobber(LI); if (LI->getOrdering() != AtomicOrdering::Monotonic) return MemDepResult::getClobber(LI); @@ -559,8 +546,8 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // A Monotonic store is OK if the query inst is itself not atomic. // FIXME: This is overly conservative. if (!SI->isUnordered() && SI->isAtomic()) { - if (!QueryInst || isNonUnorderedLoadOrStore(QueryInst) || - isOtherMemAccess(QueryInst)) + if (!QueryInst || + isComplexForReordering(QueryInst, AtomicOrdering::Unordered)) return MemDepResult::getClobber(SI); // Ok, if we are here the guard above guarantee us that // QueryInst is a non-atomic or unordered load/store. From 1ece3eeeb79e766d4f9e0b5044db2d72946f785d Mon Sep 17 00:00:00 2001 From: fourdim Date: Fri, 18 Feb 2022 14:06:48 +0800 Subject: [PATCH 218/748] [JITLink][RISCV] fix the extractBits behavior and add R_RISCV_JAL relocation. This patch supports the R_RISCV_JAL relocation. Moreover, it will fix the extractBits function's behavior as it extracts Size + 1 bits. 
In the test ELF_jal.s: Before: ``` Hi: 4294836480 extractBits(Hi, 12, 8): 480 ``` After: ``` Hi: 4294836480 extractBits(Hi, 12, 8): 224 ``` Reviewed By: StephenFan Differential Revision: https://reviews.llvm.org/D117975 --- .../llvm/ExecutionEngine/JITLink/riscv.h | 7 ++++ .../lib/ExecutionEngine/JITLink/ELF_riscv.cpp | 16 ++++++++ llvm/lib/ExecutionEngine/JITLink/riscv.cpp | 2 + .../ExecutionEngine/JITLink/RISCV/ELF_jal.s | 37 +++++++++++++++++++ 4 files changed, 62 insertions(+) create mode 100644 llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h index d0d3a3786e55d..2d32a749111d1 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/riscv.h @@ -44,6 +44,13 @@ enum EdgeKind_riscv : Edge::Kind { /// R_RISCV_BRANCH, + /// High 20 bits of PC-relative jump pointer value relocation + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend + /// + R_RISCV_JAL, + /// High 20 bits of 32-bit pointer value relocation /// /// Fixup expression diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp index 90f3a38b81d53..469a81d882aea 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp @@ -220,6 +220,20 @@ class ELFJITLinker_riscv : public JITLinker { *(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7; break; } + case R_RISCV_JAL: { + int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress; + Error AlignmentIssue = checkAlignment(FixupAddress, Value, 2, E); + if (AlignmentIssue) { + return AlignmentIssue; + } + uint32_t Imm20 = extractBits(Value, 20, 1) << 31; + uint32_t Imm10_1 = extractBits(Value, 1, 10) << 21; + uint32_t Imm11 = extractBits(Value, 11, 1) << 20; + uint32_t Imm19_12 = extractBits(Value, 12, 8) << 12; + uint32_t RawInstr = 
*(little32_t *)FixupPtr; + *(little32_t *)FixupPtr = RawInstr | Imm20 | Imm10_1 | Imm11 | Imm19_12; + break; + } case R_RISCV_HI20: { int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue(); int64_t Hi = Value + 0x800; @@ -409,6 +423,8 @@ class ELFLinkGraphBuilder_riscv : public ELFLinkGraphBuilder { return EdgeKind_riscv::R_RISCV_64; case ELF::R_RISCV_BRANCH: return EdgeKind_riscv::R_RISCV_BRANCH; + case ELF::R_RISCV_JAL: + return EdgeKind_riscv::R_RISCV_JAL; case ELF::R_RISCV_HI20: return EdgeKind_riscv::R_RISCV_HI20; case ELF::R_RISCV_LO12_I: diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp index 3ce2cf10a24cb..0bd57b654d402 100644 --- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp @@ -26,6 +26,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "R_RISCV_64"; case R_RISCV_BRANCH: return "R_RISCV_BRANCH"; + case R_RISCV_JAL: + return "R_RISCV_JAL"; case R_RISCV_HI20: return "R_RISCV_HI20"; case R_RISCV_LO12_I: diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s new file mode 100644 index 0000000000000..97fe1b1ea40ff --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_jal.s @@ -0,0 +1,37 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=riscv64 -filetype=obj \ +# RUN: -o %t/elf_riscv64_jal.o %s +# RUN: llvm-mc -triple=riscv32 -filetype=obj \ +# RUN: -o %t/elf_riscv32_jal.o %s +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x1ff00000 -slab-page-size 4096 \ +# RUN: -abs external_func=0x1fe000fe \ +# RUN: -check %s %t/elf_riscv64_jal.o +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x1ff00000 -slab-page-size 4096 \ +# RUN: -abs external_func=0x1fe000fe \ +# RUN: -check %s %t/elf_riscv32_jal.o +# + + .text + .file "testcase.c" + +# Empty main entry point. 
+ .globl main + .p2align 1 + .type main,@function +main: + ret + + .size main, .-main + +# Test R_RISCV_JAL + +# jitlink-check: decode_operand(test_jal, 1)[31:12] = (external_func - test_jal)[31:12] + .globl test_jal + .p2align 1 + .type test_jal,@function +test_jal: + jal x0, external_func + + .size test_jal, .-test_jal From 0a4184909a8c4861142acec0f59a4a3373f39b09 Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Thu, 17 Feb 2022 16:01:31 -0800 Subject: [PATCH 219/748] Reland "[memprof] Extend the index prof format to include memory profiles." This patch adds support for optional memory profile information to be included with and indexed profile. The indexed profile header adds a new field which points to the offset of the memory profile section (if present) in the indexed profile. For users who do not utilize this feature the only overhead is a 64-bit offset in the header. The memory profile section contains (1) profile metadata describing the information recorded for each entry (2) an on-disk hashtable containing the profile records indexed via llvm::md5(function_name). We chose to introduce a separate hash table instead of the existing one since the indexing for the instrumented fdo hash table is based on a CFG hash which itself is perturbed by memprof instrumentation. This commit also includes the changes reviewed separately in D120093. 
Differential Revision: https://reviews.llvm.org/D120103 --- compiler-rt/include/profile/InstrProfData.inc | 4 +- llvm/include/llvm/ProfileData/InstrProf.h | 8 +- .../llvm/ProfileData/InstrProfData.inc | 4 +- .../llvm/ProfileData/InstrProfReader.h | 14 ++ .../llvm/ProfileData/InstrProfWriter.h | 11 + llvm/include/llvm/ProfileData/MemProf.h | 203 +++++++++++++++++- llvm/include/llvm/ProfileData/MemProfData.inc | 4 +- .../llvm/ProfileData/RawMemProfReader.h | 3 + llvm/lib/ProfileData/CMakeLists.txt | 1 + llvm/lib/ProfileData/InstrProf.cpp | 23 +- llvm/lib/ProfileData/InstrProfReader.cpp | 46 +++- llvm/lib/ProfileData/InstrProfWriter.cpp | 90 +++++++- llvm/lib/ProfileData/MemProf.cpp | 73 +++++++ llvm/lib/ProfileData/RawMemProfReader.cpp | 7 +- .../tools/llvm-profdata/Inputs/basic.profraw | Bin 0 -> 152 bytes .../tools/llvm-profdata/memprof-merge.test | 47 ++++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 48 ++++- llvm/unittests/ProfileData/InstrProfTest.cpp | 62 ++++++ llvm/unittests/ProfileData/MemProfTest.cpp | 54 ++++- 19 files changed, 670 insertions(+), 32 deletions(-) create mode 100644 llvm/lib/ProfileData/MemProf.cpp create mode 100644 llvm/test/tools/llvm-profdata/Inputs/basic.profraw create mode 100644 llvm/test/tools/llvm-profdata/memprof-merge.test diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 62054a6a3df51..282620d8b5dc0 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -650,7 +650,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 8 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 7 +#define INSTR_PROF_INDEX_VERSION 8 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 5 @@ -662,6 +662,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * The 59th bit indicates whether to use debug info to correlate profiles. * The 60th bit indicates single byte coverage instrumentation. * The 61st bit indicates function entry instrumentation only. + * The 62nd bit indicates whether memory profile information is present. */ #define VARIANT_MASKS_ALL 0xff00000000000000ULL #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL) @@ -671,6 +672,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define VARIANT_MASK_DBG_CORRELATE (0x1ULL << 59) #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60) #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61) +#define VARIANT_MASK_MEMPROF (0x1ULL << 62) #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index c015e8e4b43d0..9f7a6711131d6 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -287,7 +287,8 @@ enum class InstrProfKind { CS = 0x8, // A context sensitive IR-level profile. SingleByteCoverage = 0x10, // Use single byte probes for coverage. FunctionEntryOnly = 0x20, // Only instrument the function entry basic block. - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionEntryOnly) + MemProf = 0x40, // A memory profile collected using -fprofile=memory. + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/MemProf) }; const std::error_category &instrprof_category(); @@ -1011,7 +1012,9 @@ enum ProfVersion { Version6 = 6, // An additional counter is added around logical operators. Version7 = 7, - // The current version is 7. + // An additional (optional) memory profile type is added. + Version8 = 8, + // The current version is 8. 
CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1028,6 +1031,7 @@ struct Header { uint64_t Unused; // Becomes unused since version 4 uint64_t HashType; uint64_t HashOffset; + uint64_t MemProfOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 62054a6a3df51..282620d8b5dc0 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -650,7 +650,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Raw profile format version (start from 1). */ #define INSTR_PROF_RAW_VERSION 8 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 7 +#define INSTR_PROF_INDEX_VERSION 8 /* Coverage mapping format version (start from 0). */ #define INSTR_PROF_COVMAP_VERSION 5 @@ -662,6 +662,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * The 59th bit indicates whether to use debug info to correlate profiles. * The 60th bit indicates single byte coverage instrumentation. * The 61st bit indicates function entry instrumentation only. + * The 62nd bit indicates whether memory profile information is present. 
*/ #define VARIANT_MASKS_ALL 0xff00000000000000ULL #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL) @@ -671,6 +672,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define VARIANT_MASK_DBG_CORRELATE (0x1ULL << 59) #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60) #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61) +#define VARIANT_MASK_MEMPROF (0x1ULL << 62) #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index 548affbf65fa5..7a18d5a6a11af 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -19,6 +19,7 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfCorrelator.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/LineIterator.h" @@ -471,6 +472,9 @@ struct InstrProfReaderIndexBase { using OnDiskHashTableImplV3 = OnDiskIterableChainedHashTable; +using MemProfHashTable = + OnDiskIterableChainedHashTable; + template class InstrProfReaderItaniumRemapper; @@ -556,6 +560,11 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr Summary; /// Context sensitive profile summary data. std::unique_ptr CS_Summary; + /// MemProf profile schema (if available). + memprof::MemProfSchema Schema; + /// MemProf profile data on-disk indexed via llvm::md5(FunctionName). + std::unique_ptr MemProfTable; + // Index to the current record in the record array. 
unsigned RecordIndex; @@ -609,6 +618,11 @@ class IndexedInstrProfReader : public InstrProfReader { Expected getInstrProfRecord(StringRef FuncName, uint64_t FuncHash); + /// Return the memprof records for the function identified by + /// llvm::md5(Name). + Expected> + getMemProfRecord(uint64_t FuncNameHash); + /// Fill Counts with the profile data for the given function name. Error getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts); diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index af1e46cf4fc24..bb180ac42c212 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" @@ -37,6 +38,11 @@ class InstrProfWriter { private: bool Sparse; StringMap FunctionData; + + // A map to hold memprof data per function. The lower 64 bits obtained from + // the md5 hash of the function name is used to index into the map. + memprof::FunctionMemProfMap MemProfData; + // An enum describing the attributes of the profile. InstrProfKind ProfileKind = InstrProfKind::Unknown; // Use raw pointer here for the incomplete type object. @@ -57,6 +63,9 @@ class InstrProfWriter { addRecord(std::move(I), 1, Warn); } + void addRecord(const ::llvm::memprof::MemProfRecord &MR, + function_ref Warn); + /// Merge existing function counts from the given writer. void mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn); @@ -112,6 +121,8 @@ class InstrProfWriter { return Error::success(); } + InstrProfKind getProfileKind() const { return ProfileKind; } + // Internal interface for testing purpose only. 
void setValueProfDataEndianness(support::endianness Endianness); void setOutputSparse(bool Sparse); diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 2fa577a626bbe..784927e4805d7 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -5,6 +5,7 @@ #include #include +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ProfileData/MemProfData.inc" #include "llvm/ProfileData/ProfileCommon.h" @@ -134,18 +135,70 @@ struct PortableMemInfoBlock { }; struct MemProfRecord { - struct Frame { - std::string Function; + // Describes a call frame for a dynamic allocation context. The contents of + // the frame are populated by symbolizing the stack depot call frame from the + // compiler runtime. + PACKED(struct Frame { + // A uuid (uint64_t) identifying the function. It is obtained by + // llvm::md5(FunctionName) which returns the lower 64 bits. + GlobalValue::GUID Function; + // The source line offset of the call from the beginning of parent function. uint32_t LineOffset; + // The source column number of the call to help distinguish multiple calls + // on the same line. uint32_t Column; + // Whether the current frame is inlined. bool IsInlineFrame; - Frame(std::string Str, uint32_t Off, uint32_t Col, bool Inline) - : Function(std::move(Str)), LineOffset(Off), Column(Col), - IsInlineFrame(Inline) {} - }; + Frame(uint64_t Hash, uint32_t Off, uint32_t Col, bool Inline) + : Function(Hash), LineOffset(Off), Column(Col), IsInlineFrame(Inline) {} + bool operator==(const Frame &Other) const { + return Other.Function == Function && Other.LineOffset == LineOffset && + Other.Column == Column && Other.IsInlineFrame == IsInlineFrame; + } + + bool operator!=(const Frame &Other) const { return !operator==(Other); } + + // Write the contents of the frame to the ostream \p OS. 
+ void serialize(raw_ostream & OS) const { + using namespace support; + + endian::Writer LE(OS, little); + + // If the type of the GlobalValue::GUID changes, then we need to update + // the reader and the writer. + static_assert(std::is_same::value, + "Expect GUID to be uint64_t."); + LE.write(Function); + + LE.write(LineOffset); + LE.write(Column); + LE.write(IsInlineFrame); + } + + // Read a frame from char data which has been serialized as little endian. + static Frame deserialize(const unsigned char *Ptr) { + using namespace support; + + const uint64_t F = endian::readNext(Ptr); + const uint32_t L = endian::readNext(Ptr); + const uint32_t C = endian::readNext(Ptr); + const bool I = endian::readNext(Ptr); + return Frame(/*Function=*/F, /*LineOffset=*/L, /*Column=*/C, + /*IsInlineFrame=*/I); + } + + // Returns the size of the frame information. + static constexpr size_t serializedSize() { + return sizeof(Frame::Function) + sizeof(Frame::LineOffset) + + sizeof(Frame::Column) + sizeof(Frame::IsInlineFrame); + } + }); + + // The dynamic calling context for the allocation. std::vector CallStack; + // The statistics obtained from the runtime for the allocation. PortableMemInfoBlock Info; void clear() { @@ -153,6 +206,12 @@ struct MemProfRecord { Info.clear(); } + size_t serializedSize() const { + return sizeof(uint64_t) + // The number of frames to serialize. + sizeof(Frame) * CallStack.size() + // The contents of the frames. + PortableMemInfoBlock::serializedSize(); // The size of the payload. + } + // Prints out the contents of the memprof record in YAML. 
void print(llvm::raw_ostream &OS) const { OS << " Callstack:\n"; @@ -168,6 +227,138 @@ struct MemProfRecord { Info.printYAML(OS); } + + bool operator==(const MemProfRecord &Other) const { + if (Other.Info != Info) + return false; + + if (Other.CallStack.size() != CallStack.size()) + return false; + + for (size_t I = 0; I < Other.CallStack.size(); I++) { + if (Other.CallStack[I] != CallStack[I]) + return false; + } + return true; + } +}; + +// Serializes the memprof records in \p Records to the ostream \p OS based on +// the schema provided in \p Schema. +void serializeRecords(const ArrayRef Records, + const MemProfSchema &Schema, raw_ostream &OS); + +// Deserializes memprof records from the Buffer +SmallVector deserializeRecords(const MemProfSchema &Schema, + const unsigned char *Buffer); + +// Reads a memprof schema from a buffer. All entries in the buffer are +// interpreted as uint64_t. The first entry in the buffer denotes the number of +// ids in the schema. Subsequent entries are integers which map to memprof::Meta +// enum class entries. After successfully reading the schema, the pointer is one +// byte past the schema contents. +Expected readMemProfSchema(const unsigned char *&Buffer); + +using FunctionMemProfMap = + DenseMap>; + +/// Trait for lookups into the on-disk hash table for memprof format in the +/// indexed profile. 
+class MemProfRecordLookupTrait { +public: + using data_type = ArrayRef; + using internal_key_type = uint64_t; + using external_key_type = uint64_t; + using hash_value_type = uint64_t; + using offset_type = uint64_t; + + MemProfRecordLookupTrait() = delete; + MemProfRecordLookupTrait(const MemProfSchema &S) : Schema(S) {} + + static bool EqualKey(uint64_t A, uint64_t B) { return A == B; } + static uint64_t GetInternalKey(uint64_t K) { return K; } + static uint64_t GetExternalKey(uint64_t K) { return K; } + + hash_value_type ComputeHash(uint64_t K) { return K; } + + static std::pair + ReadKeyDataLength(const unsigned char *&D) { + using namespace support; + + offset_type KeyLen = endian::readNext(D); + offset_type DataLen = endian::readNext(D); + return std::make_pair(KeyLen, DataLen); + } + + uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { + using namespace support; + return endian::readNext(D); + } + + data_type ReadData(uint64_t K, const unsigned char *D, + offset_type /*Unused*/) { + Records = deserializeRecords(Schema, D); + return Records; + } + +private: + // Holds the memprof schema used to deserialize records. + MemProfSchema Schema; + // Holds the records from one function deserialized from the indexed format. + llvm::SmallVector Records; +}; + +class MemProfRecordWriterTrait { +public: + using key_type = uint64_t; + using key_type_ref = uint64_t; + + using data_type = ArrayRef; + using data_type_ref = ArrayRef; + + using hash_value_type = uint64_t; + using offset_type = uint64_t; + + // Pointer to the memprof schema to use for the generator. Unlike the reader + // we must use a default constructor with no params for the writer trait so we + // have a public member which must be initialized by the user. 
+ MemProfSchema *Schema = nullptr; + + MemProfRecordWriterTrait() = default; + + static hash_value_type ComputeHash(key_type_ref K) { return K; } + + static std::pair + EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { + using namespace support; + + endian::Writer LE(Out, little); + + offset_type N = sizeof(K); + LE.write(N); + + offset_type M = 0; + + M += sizeof(uint64_t); + for (const auto &Record : V) { + M += Record.serializedSize(); + } + + LE.write(M); + return std::make_pair(N, M); + } + + void EmitKey(raw_ostream &Out, key_type_ref K, offset_type /*Unused*/) { + using namespace support; + endian::Writer LE(Out, little); + LE.write(K); + } + + void EmitData(raw_ostream &Out, key_type_ref /*Unused*/, data_type_ref V, + offset_type /*Unused*/) { + assert(Schema != nullptr && "MemProf schema is not initialized!"); + serializeRecords(V, *Schema, Out); + } }; } // namespace memprof diff --git a/llvm/include/llvm/ProfileData/MemProfData.inc b/llvm/include/llvm/ProfileData/MemProfData.inc index 8135a664b0466..38698be9ea0ec 100644 --- a/llvm/include/llvm/ProfileData/MemProfData.inc +++ b/llvm/include/llvm/ProfileData/MemProfData.inc @@ -1,5 +1,5 @@ -#ifndef LLVM_PROFILEDATA_MEMPROFDATA_INC -#define LLVM_PROFILEDATA_MEMPROFDATA_INC +#ifndef MEMPROF_DATA_INC +#define MEMPROF_DATA_INC /*===-- MemProfData.inc - MemProf profiling runtime structures -*- C++ -*-=== *\ |* |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/llvm/include/llvm/ProfileData/RawMemProfReader.h b/llvm/include/llvm/ProfileData/RawMemProfReader.h index 55ba31d2a6492..bda33d336468a 100644 --- a/llvm/include/llvm/ProfileData/RawMemProfReader.h +++ b/llvm/include/llvm/ProfileData/RawMemProfReader.h @@ -66,6 +66,9 @@ class RawMemProfReader { return Iterator(this); } + // The RawMemProfReader only holds memory profile information. + InstrProfKind getProfileKind() const { return InstrProfKind::MemProf; } + // Constructor for unittests only. 
RawMemProfReader(std::unique_ptr Sym, llvm::SmallVectorImpl &Seg, diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt index 2749119f72d90..486c45d0dff5c 100644 --- a/llvm/lib/ProfileData/CMakeLists.txt +++ b/llvm/lib/ProfileData/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_component_library(LLVMProfileData InstrProfCorrelator.cpp InstrProfReader.cpp InstrProfWriter.cpp + MemProf.cpp ProfileSummaryBuilder.cpp SampleProf.cpp SampleProfReader.cpp diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 6e53b0a276998..0a0ce7604a290 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1349,8 +1349,15 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { return make_error(instrprof_error::unsupported_version); switch (GET_VERSION(H.formatVersion())) { - // When a new field is added in the header add a case statement here to - // populate it. + // When a new field is added in the header add a case statement here to + // populate it. + static_assert( + IndexedInstrProf::ProfVersion::CurrentVersion == Version8, + "Please update the reading code below if a new field has been added, " + "if not add a case statement to fall through to the latest version."); + case 8ull: + H.MemProfOffset = read(Buffer, offsetOf(&Header::MemProfOffset)); + LLVM_FALLTHROUGH; default: // Version7 (when the backwards compatible header was introduced). H.HashType = read(Buffer, offsetOf(&Header::HashType)); H.HashOffset = read(Buffer, offsetOf(&Header::HashOffset)); @@ -1361,9 +1368,15 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { size_t Header::size() const { switch (GET_VERSION(formatVersion())) { - // When a new field is added to the header add a case statement here to - // compute the size as offset of the new field + size of the new field. This - // relies on the field being added to the end of the list. + // When a new field is added to the header add a case statement here to + // compute the size as offset of the new field + size of the new field. This + // relies on the field being added to the end of the list. + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version8, + "Please update the size computation below if a new field has " + "been added to the header, if not add a case statement to " + "fall through to the latest version."); + case 8ull: + return offsetOf(&Header::MemProfOffset) + sizeof(Header::MemProfOffset); default: // Version7 (when the backwards compatible header was introduced). return offsetOf(&Header::HashOffset) + sizeof(Header::HashOffset); } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index d1e3438a6f412..f79169c7190f7 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -19,7 +19,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/RawMemProfReader.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" @@ -57,6 +59,9 @@ static InstrProfKind getProfileKindFromVersion(uint64_t Version) { if (Version & VARIANT_MASK_FUNCTION_ENTRY_ONLY) { ProfileKind |= InstrProfKind::FunctionEntryOnly; } + if (Version & VARIANT_MASK_MEMPROF) { + ProfileKind |= InstrProfKind::MemProf; + } return ProfileKind; } @@ -955,10 +960,35 @@ Error IndexedInstrProfReader::readHeader() { uint64_t HashOffset = 
endian::byte_swap(Header->HashOffset); - // The rest of the file is an on disk hash table. + // The hash table with profile counts comes next. auto IndexPtr = std::make_unique>( Start + HashOffset, Cur, Start, HashType, Header->formatVersion()); + // The MemProfOffset field in the header is only valid when the format version + // is higher than 8 (when it was introduced). + if (GET_VERSION(Header->formatVersion()) >= 8 && + Header->formatVersion() & VARIANT_MASK_MEMPROF) { + uint64_t MemProfOffset = + endian::byte_swap(Header->MemProfOffset); + + const unsigned char *Ptr = Start + MemProfOffset; + // The value returned from Generator.Emit. + const uint64_t TableOffset = + support::endian::readNext(Ptr); + + // Read the schema. + auto SchemaOr = memprof::readMemProfSchema(Ptr); + if (!SchemaOr) + return SchemaOr.takeError(); + Schema = SchemaOr.get(); + + // Now initialize the table reader with a pointer into data buffer. + MemProfTable.reset(MemProfHashTable::Create( + /*Buckets=*/Start + TableOffset, + /*Payload=*/Ptr, + /*Base=*/Start, memprof::MemProfRecordLookupTrait(Schema))); + } + // Load the remapping table now if requested. if (RemappingBuffer) { Remapper = std::make_unique< @@ -1003,6 +1033,20 @@ IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, return error(instrprof_error::hash_mismatch); } +Expected> +IndexedInstrProfReader::getMemProfRecord(const uint64_t FuncNameHash) { + // TODO: Add memprof specific errors. 
+ if (MemProfTable == nullptr) + return make_error(instrprof_error::invalid_prof, + "no memprof data available in profile"); + auto Iter = MemProfTable->find(FuncNameHash); + if (Iter == MemProfTable->end()) + return make_error(instrprof_error::hash_mismatch, + "memprof record not found for hash " + + Twine(FuncNameHash)); + return *Iter; +} + Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName, uint64_t FuncHash, std::vector &Counts) { diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index ebf89317d585a..4c974f402d2b3 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" @@ -63,11 +64,16 @@ class ProfOStream { if (IsFDOStream) { raw_fd_ostream &FDOStream = static_cast(OS); + const uint64_t LastPos = FDOStream.tell(); for (int K = 0; K < NItems; K++) { FDOStream.seek(P[K].Pos); for (int I = 0; I < P[K].N; I++) write(P[K].D[I]); } + // Reset the stream to the last position after patching so that users + // don't accidentally overwrite data. This makes it consistent with + // the string stream below which replaces the data directly. + FDOStream.seek(LastPos); } else { raw_string_ostream &SOStream = static_cast(OS); std::string &Data = SOStream.str(); // with flush @@ -248,11 +254,39 @@ void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash, Dest.sortValueData(); } +void InstrProfWriter::addRecord(const memprof::MemProfRecord &MR, + function_ref Warn) { + // Use 0 as a sentinel value since its highly unlikely that the lower 64-bits + // of a 128 bit md5 hash will be all zeros. 
+ // TODO: Move this Key frame detection to the constructor to avoid having to + // scan all the callstacks again when adding a new record. + uint64_t Key = 0; + for (auto Iter = MR.CallStack.rbegin(), End = MR.CallStack.rend(); + Iter != End; Iter++) { + if (!Iter->IsInlineFrame) { + Key = Iter->Function; + break; + } + } + + if (Key == 0) { + Warn(make_error( + instrprof_error::invalid_prof, + "could not determine leaf function for memprof record.")); + } + + MemProfData[Key].push_back(MR); +} + void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW, function_ref Warn) { for (auto &I : IPW.FunctionData) for (auto &Func : I.getValue()) addRecord(I.getKey(), Func.first, std::move(Func.second), 1, Warn); + + for (auto &I : IPW.MemProfData) + for (const auto &MR : I.second) + addRecord(MR, Warn); } bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) { @@ -297,6 +331,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { for (const auto &I : FunctionData) if (shouldEncodeData(I.getValue())) Generator.insert(I.getKey(), &I.getValue()); + // Write the header. IndexedInstrProf::Header Header; Header.Magic = IndexedInstrProf::Magic; @@ -311,16 +346,18 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.Version |= VARIANT_MASK_BYTE_COVERAGE; if (static_cast(ProfileKind & InstrProfKind::FunctionEntryOnly)) Header.Version |= VARIANT_MASK_FUNCTION_ENTRY_ONLY; + if (static_cast(ProfileKind & InstrProfKind::MemProf)) + Header.Version |= VARIANT_MASK_MEMPROF; Header.Unused = 0; Header.HashType = static_cast(IndexedInstrProf::HashType); Header.HashOffset = 0; + Header.MemProfOffset = 0; int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out all the fields except 'HashOffset'. We need - // to remember the offset of that field to allow back patching - // later. - for (int I = 0; I < N - 1; I++) + // Only write out all the fields except 'HashOffset' and 'MemProfOffset'.
We + // need to remember the offset of these fields to allow back patching later. + for (int I = 0; I < N - 2; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -328,6 +365,13 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Reserve the space for HashOffset field. OS.write(0); + // Save the location of MemProf profile data. This is stored in two parts as + // the schema and as a separate on-disk chained hashtable. + uint64_t MemProfSectionOffset = OS.tell(); + // Reserve space for the MemProf table field to be patched later if this + // profile contains memory profile information. + OS.write(0); + // Reserve space to write profile summary data. uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -347,6 +391,42 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Write the hash table. uint64_t HashTableStart = Generator.Emit(OS.OS, *InfoObj); + // Write the MemProf profile data if we have it. This includes a simple schema + // with the format described below followed by the hashtable: + // uint64_t Offset = MemProfGenerator.Emit + // uint64_t Num schema entries + // uint64_t Schema entry 0 + // uint64_t Schema entry 1 + // .... + // uint64_t Schema entry N - 1 + // OnDiskChainedHashTable MemProfFunctionData + uint64_t MemProfSectionStart = 0; + if (static_cast(ProfileKind & InstrProfKind::MemProf)) { + MemProfSectionStart = OS.tell(); + OS.write(0ULL); // Reserve space for the offset. + + auto Schema = memprof::PortableMemInfoBlock::getSchema(); + OS.write(static_cast(Schema.size())); + for (const auto Id : Schema) { + OS.write(static_cast(Id)); + } + + auto MemProfWriter = std::make_unique(); + MemProfWriter->Schema = &Schema; + OnDiskChainedHashTableGenerator + MemProfGenerator; + for (const auto &I : MemProfData) { + // Insert the key (func hash) and value (vector of memprof records). 
+ MemProfGenerator.insert(I.first, I.second); + } + + uint64_t TableOffset = MemProfGenerator.Emit(OS.OS, *MemProfWriter); + PatchItem PatchItems[] = { + {MemProfSectionStart, &TableOffset, 1}, + }; + OS.patch(PatchItems, 1); + } + // Allocate space for data to be serialized out. std::unique_ptr TheSummary = IndexedInstrProf::allocSummary(SummarySize); @@ -369,6 +449,8 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { PatchItem PatchItems[] = { // Patch the Header.HashOffset field. {HashTableStartFieldOffset, &HashTableStart, 1}, + // Patch the Header.MemProfOffset (=0 for profiles without MemProf data). + {MemProfSectionOffset, &MemProfSectionStart, 1}, // Patch the summary data. {SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp new file mode 100644 index 0000000000000..48950d41d0234 --- /dev/null +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -0,0 +1,73 @@ +#include "llvm/ProfileData/MemProf.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/EndianStream.h" + +namespace llvm { +namespace memprof { + +void serializeRecords(const ArrayRef Records, + const MemProfSchema &Schema, raw_ostream &OS) { + using namespace support; + + endian::Writer LE(OS, little); + + LE.write(Records.size()); + for (const MemProfRecord &MR : Records) { + LE.write(MR.CallStack.size()); + for (const MemProfRecord::Frame &F : MR.CallStack) { + F.serialize(OS); + } + MR.Info.serialize(Schema, OS); + } +} + +SmallVector deserializeRecords(const MemProfSchema &Schema, + const unsigned char *Ptr) { + using namespace support; + + SmallVector Records; + const uint64_t NumRecords = + endian::readNext(Ptr); + for (uint64_t I = 0; I < NumRecords; I++) { + MemProfRecord MR; + const uint64_t NumFrames = + endian::readNext(Ptr); + for (uint64_t J = 0; J < NumFrames; J++) { + const auto F = MemProfRecord::Frame::deserialize(Ptr); + Ptr += 
MemProfRecord::Frame::serializedSize(); + MR.CallStack.push_back(F); + } + MR.Info.deserialize(Schema, Ptr); + Ptr += PortableMemInfoBlock::serializedSize(); + Records.push_back(MR); + } + return Records; +} + +Expected readMemProfSchema(const unsigned char *&Buffer) { + using namespace support; + + const unsigned char *Ptr = Buffer; + const uint64_t NumSchemaIds = + endian::readNext(Ptr); + if (NumSchemaIds > static_cast(Meta::Size)) { + return make_error(instrprof_error::malformed, + "memprof schema invalid"); + } + + MemProfSchema Result; + for (size_t I = 0; I < NumSchemaIds; I++) { + const uint64_t Tag = endian::readNext(Ptr); + if (Tag >= static_cast(Meta::Size)) { + return make_error(instrprof_error::malformed, + "memprof schema invalid"); + } + Result.push_back(static_cast(Tag)); + } + // Advance the buffer to one past the schema if we succeeded. + Buffer = Ptr; + return Result; +} + +} // namespace memprof +} // namespace llvm diff --git a/llvm/lib/ProfileData/RawMemProfReader.cpp b/llvm/lib/ProfileData/RawMemProfReader.cpp index 43ef7c947366a..9bcba2a2b04ea 100644 --- a/llvm/lib/ProfileData/RawMemProfReader.cpp +++ b/llvm/lib/ProfileData/RawMemProfReader.cpp @@ -362,7 +362,12 @@ Error RawMemProfReader::fillRecord(const uint64_t Id, const MemInfoBlock &MIB, for (size_t I = 0; I < DI.getNumberOfFrames(); I++) { const auto &Frame = DI.getFrame(I); Record.CallStack.emplace_back( - std::to_string(llvm::MD5Hash(trimSuffix(Frame.FunctionName))), + // We use the function guid which we expect to be a uint64_t. At this + // time, it is the lower 64 bits of the md5 of the function name. Any + // suffix with .llvm. is trimmed since these are added by thinLTO + // global promotion. At the time the profile is consumed, these + // suffixes will not be present. + Function::getGUID(trimSuffix(Frame.FunctionName)), Frame.Line - Frame.StartLine, Frame.Column, // Only the first entry is not an inlined location. 
I != 0); diff --git a/llvm/test/tools/llvm-profdata/Inputs/basic.profraw b/llvm/test/tools/llvm-profdata/Inputs/basic.profraw new file mode 100644 index 0000000000000000000000000000000000000000..ad88759398c6020f4ab8a5606258e69d98e36687 GIT binary patch literal 152 zcmZoHO3N=Q$obE~00xW@ih%*nfC`}V*`VS-{zJgi8V9flOx>@mz0b{3rrrk1zQ4@n a%LP-nKp3J9svT|*OdktFZenI00|NkRuOhYp literal 0 HcmV?d00001 diff --git a/llvm/test/tools/llvm-profdata/memprof-merge.test b/llvm/test/tools/llvm-profdata/memprof-merge.test new file mode 100644 index 0000000000000..b11459f237ca5 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/memprof-merge.test @@ -0,0 +1,47 @@ +REQUIRES: x86_64-linux + +The input memprof and instrumented raw profiles were generated from the following source code: + +``` +#include +#include +int main(int argc, char **argv) { + char *x = (char *)malloc(10); + memset(x, 0, 10); + free(x); + x = (char *)malloc(10); + memset(x, 0, 10); + free(x); + return 0; +} +``` + +Steps to collect the memprof raw profile and the instrprof raw profile: + +``` +# Collect instrprof profile with name compression disabled since some buildbots +# do not have zlib. +clang -mllvm -enable-name-compression=false -fprofile-generate source.c -o instr.out +./instr.out +mv *.profraw basic.profraw + +# Collect memprof profile. +clang -fuse-ld=lld -Wl,--no-rosegment -gmlt -fdebug-info-for-profiling \ + -fmemory-profile -mno-omit-leaf-frame-pointer -fno-omit-frame-pointer \ + -fno-optimize-sibling-calls -m64 -Wl,-build-id source.c -o basic.memprofexe + +env MEMPROF_OPTIONS=log_path=stdout ./rawprofile.out > basic.memprofraw +``` + +RUN: llvm-profdata merge %p/Inputs/basic.profraw %p/Inputs/basic.memprofraw --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof +RUN: llvm-profdata show %t.prof | FileCheck %s + +For now we only check the validity of the instrumented profile since we don't +have a way to display the contents of the memprof indexed format yet. 
+ +CHECK: Instrumentation level: IR entry_first = 0 +CHECK: Total functions: 1 +CHECK: Maximum function count: 1 +CHECK: Maximum internal block count: 0 + + diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index e00582851d47f..ba2f1b6038c48 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -239,7 +239,7 @@ static void overlapInput(const std::string &BaseFilename, /// Load an input into a writer context. static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, const InstrProfCorrelator *Correlator, - WriterContext *WC) { + const StringRef ProfiledBinary, WriterContext *WC) { std::unique_lock CtxGuard{WC->Lock}; // Copy the filename, because llvm::ThreadPool copied the input "const @@ -247,6 +247,35 @@ static void loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, // invalid outside of this packaged task. std::string Filename = Input.Filename; + using ::llvm::memprof::RawMemProfReader; + if (RawMemProfReader::hasFormat(Input.Filename)) { + auto ReaderOrErr = RawMemProfReader::create(Input.Filename, ProfiledBinary); + if (!ReaderOrErr) { + exitWithError(ReaderOrErr.takeError(), Input.Filename); + } + std::unique_ptr Reader = std::move(ReaderOrErr.get()); + // Check if the profile types can be merged, e.g. clang frontend profiles + // should not be merged with memprof profiles. + if (Error E = WC->Writer.mergeProfileKind(Reader->getProfileKind())) { + consumeError(std::move(E)); + WC->Errors.emplace_back( + make_error( + "Cannot merge MemProf profile with Clang generated profile.", + std::error_code()), + Filename); + return; + } + + // Add the records into the writer context. 
+ for (const memprof::MemProfRecord &MR : *Reader) { + WC->Writer.addRecord(MR, [&](Error E) { + instrprof_error IPE = InstrProfError::take(std::move(E)); + WC->Errors.emplace_back(make_error(IPE), Filename); + }); + } + return; + } + auto ReaderOrErr = InstrProfReader::create(Input.Filename, Correlator); if (Error E = ReaderOrErr.takeError()) { // Skip the empty profiles by returning silently. @@ -332,7 +361,8 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, StringRef OutputFilename, ProfileFormat OutputFormat, bool OutputSparse, - unsigned NumThreads, FailureMode FailMode) { + unsigned NumThreads, FailureMode FailMode, + const StringRef ProfiledBinary) { if (OutputFormat != PF_Binary && OutputFormat != PF_Compact_Binary && OutputFormat != PF_Ext_Binary && OutputFormat != PF_Text) exitWithError("unknown format is specified"); @@ -365,14 +395,15 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, if (NumThreads == 1) { for (const auto &Input : Inputs) - loadInput(Input, Remapper, Correlator.get(), Contexts[0].get()); + loadInput(Input, Remapper, Correlator.get(), ProfiledBinary, + Contexts[0].get()); } else { ThreadPool Pool(hardware_concurrency(NumThreads)); // Load the inputs in parallel (N/NumThreads serial steps). 
unsigned Ctx = 0; for (const auto &Input : Inputs) { - Pool.async(loadInput, Input, Remapper, Correlator.get(), + Pool.async(loadInput, Input, Remapper, Correlator.get(), ProfiledBinary, Contexts[Ctx].get()); Ctx = (Ctx + 1) % NumThreads; } @@ -589,7 +620,7 @@ static void supplementInstrProfile( SmallSet WriterErrorCodes; auto WC = std::make_unique(OutputSparse, ErrorLock, WriterErrorCodes); - loadInput(Inputs[0], nullptr, nullptr, WC.get()); + loadInput(Inputs[0], nullptr, nullptr, /*ProfiledBinary=*/"", WC.get()); if (WC->Errors.size() > 0) exitWithError(std::move(WC->Errors[0].first), InstrFilename); @@ -969,6 +1000,9 @@ static int merge_main(int argc, const char *argv[]) { cl::opt DebugInfoFilename( "debug-info", cl::init(""), cl::desc("Use the provided debug info to correlate the raw profile.")); + cl::opt ProfiledBinary( + "profiled-binary", cl::init(""), + cl::desc("Path to binary from which the profile was collected.")); cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); @@ -1011,7 +1045,7 @@ static int merge_main(int argc, const char *argv[]) { if (ProfileKind == instr) mergeInstrProfile(WeightedInputs, DebugInfoFilename, Remapper.get(), OutputFilename, OutputFormat, OutputSparse, NumThreads, - FailureMode); + FailureMode, ProfiledBinary); else mergeSampleProfile(WeightedInputs, Remapper.get(), OutputFilename, OutputFormat, ProfileSymbolListFile, CompressAllSections, @@ -1042,7 +1076,7 @@ static void overlapInstrProfile(const std::string &BaseFilename, OS << "Sum of edge counts for profile " << TestFilename << " is 0.\n"; exit(0); } - loadInput(WeightedInput, nullptr, nullptr, &Context); + loadInput(WeightedInput, nullptr, nullptr, /*ProfiledBinary=*/"", &Context); overlapInput(BaseFilename, TestFilename, &Context, Overlap, FuncFilter, OS, IsCS); Overlap.dump(OS); diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index 7bdd6c2992859..434e6aaee8b02 100644 --- 
a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -12,6 +12,7 @@ #include "llvm/IR/Module.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/Compression.h" #include "llvm/Testing/Support/Error.h" #include "llvm/Testing/Support/SupportHelpers.h" @@ -221,6 +222,67 @@ TEST_F(InstrProfTest, test_writer_merge) { ASSERT_EQ(0U, R->Counts[1]); } +TEST_F(InstrProfTest, test_memprof) { + ASSERT_THAT_ERROR(Writer.mergeProfileKind(InstrProfKind::MemProf), + Succeeded()); + llvm::memprof::MemProfRecord MR; + MR.CallStack.push_back({0x123, 1, 2, false}); + MR.CallStack.push_back({0x345, 3, 4, true}); + Writer.addRecord(MR, Err); + + auto Profile = Writer.writeBuffer(); + readProfile(std::move(Profile)); + + auto RecordsOr = Reader->getMemProfRecord(0x123); + ASSERT_THAT_ERROR(RecordsOr.takeError(), Succeeded()); + const auto Records = RecordsOr.get(); + ASSERT_EQ(Records.size(), 1U); + EXPECT_EQ(Records[0], MR); +} + +TEST_F(InstrProfTest, test_memprof_merge) { + Writer.addRecord({"func1", 0x1234, {42}}, Err); + + InstrProfWriter Writer2; + ASSERT_THAT_ERROR(Writer2.mergeProfileKind(InstrProfKind::MemProf), + Succeeded()); + + llvm::memprof::MemProfRecord MR; + MR.CallStack.push_back({0x123, 1, 2, false}); + MR.CallStack.push_back({0x345, 3, 4, true}); + Writer2.addRecord(MR, Err); + + ASSERT_THAT_ERROR(Writer.mergeProfileKind(Writer2.getProfileKind()), + Succeeded()); + Writer.mergeRecordsFromWriter(std::move(Writer2), Err); + + auto Profile = Writer.writeBuffer(); + readProfile(std::move(Profile)); + + Expected R = Reader->getInstrProfRecord("func1", 0x1234); + EXPECT_THAT_ERROR(R.takeError(), Succeeded()); + ASSERT_EQ(1U, R->Counts.size()); + ASSERT_EQ(42U, R->Counts[0]); + + auto RecordsOr = Reader->getMemProfRecord(0x123); + ASSERT_THAT_ERROR(RecordsOr.takeError(), Succeeded()); + const auto Records = RecordsOr.get(); + 
ASSERT_EQ(Records.size(), 1U); + EXPECT_EQ(Records[0], MR); +} + +TEST_F(InstrProfTest, test_memprof_invalid_add_record) { + llvm::memprof::MemProfRecord MR; + // At least one of the frames should be a non-inline frame. + MR.CallStack.push_back({0x123, 1, 2, true}); + MR.CallStack.push_back({0x345, 3, 4, true}); + + auto CheckErr = [](Error &&E) { + EXPECT_TRUE(ErrorEquals(instrprof_error::invalid_prof, std::move(E))); + }; + Writer.addRecord(MR, CheckErr); +} + static const char callee1[] = "callee1"; static const char callee2[] = "callee2"; static const char callee3[] = "callee3"; diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index f744b85d784c0..dc793178bd209 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -89,8 +89,8 @@ const DILineInfoSpecifier specifier() { DILineInfoSpecifier::FunctionNameKind::LinkageName); } -MATCHER_P4(FrameContains, Function, LineOffset, Column, Inline, "") { - const std::string ExpectedHash = std::to_string(llvm::MD5Hash(Function)); +MATCHER_P4(FrameContains, FunctionName, LineOffset, Column, Inline, "") { + const uint64_t ExpectedHash = llvm::Function::getGUID(FunctionName); if (arg.Function != ExpectedHash) { *result_listener << "Hash mismatch"; return false; @@ -103,6 +103,22 @@ MATCHER_P4(FrameContains, Function, LineOffset, Column, Inline, "") { return false; } +MATCHER_P(EqualsRecord, Want, "") { + if (arg == Want) + return true; + + std::string Explanation; + llvm::raw_string_ostream OS(Explanation); + OS << "\n Want: \n"; + Want.print(OS); + OS << "\n Got: \n"; + arg.print(OS); + OS.flush(); + + *result_listener << Explanation; + return false; +} + MemProfSchema getFullSchema() { MemProfSchema Schema; #define MIBEntryDef(NameTag, Name, Type) Schema.push_back(Meta::Name); @@ -184,4 +200,38 @@ TEST(MemProf, PortableWrapper) { EXPECT_EQ(3UL, ReadBlock.getAllocCpuId()); } +TEST(MemProf, RecordSerializationRoundTrip) { + 
const MemProfSchema Schema = getFullSchema(); + + llvm::SmallVector Records; + MemProfRecord MR; + + MemInfoBlock Info(/*size=*/16, /*access_count=*/7, /*alloc_timestamp=*/1000, + /*dealloc_timestamp=*/2000, /*alloc_cpu=*/3, + /*dealloc_cpu=*/4); + + MR.Info = PortableMemInfoBlock(Info); + MR.CallStack.push_back({0x123, 1, 2, false}); + MR.CallStack.push_back({0x345, 3, 4, false}); + Records.push_back(MR); + + MR.clear(); + MR.Info = PortableMemInfoBlock(Info); + MR.CallStack.push_back({0x567, 5, 6, false}); + MR.CallStack.push_back({0x789, 7, 8, false}); + Records.push_back(MR); + + std::string Buffer; + llvm::raw_string_ostream OS(Buffer); + serializeRecords(Records, Schema, OS); + OS.flush(); + + const llvm::SmallVector GotRecords = deserializeRecords( + Schema, reinterpret_cast(Buffer.data())); + + ASSERT_TRUE(!GotRecords.empty()); + EXPECT_EQ(GotRecords.size(), Records.size()); + EXPECT_THAT(GotRecords[0], EqualsRecord(Records[0])); + EXPECT_THAT(GotRecords[1], EqualsRecord(Records[1])); +} } // namespace From b1d9136da176152f9c2767b88faef6ee8b400b5b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 18 Feb 2022 06:12:10 +0000 Subject: [PATCH 220/748] [gn build] Port 0a4184909a8c --- llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn index ee186bded1111..39fe42efd481a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn @@ -13,6 +13,7 @@ static_library("ProfileData") { "InstrProfCorrelator.cpp", "InstrProfReader.cpp", "InstrProfWriter.cpp", + "MemProf.cpp", "ProfileSummaryBuilder.cpp", "RawMemProfReader.cpp", "SampleProf.cpp", From eafafbae92f4d68b432989ee01ff38aa9e187a5f Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Fri, 18 Feb 2022 08:08:34 +0100 Subject: [PATCH 221/748] [flang] Lower scalar negation Handle 
negation on scalar expression. ``` res = -a ``` This patch is part of the upstreaming effort from fir-dev branch. Reviewed By: PeteSteinfeld Differential Revision: https://reviews.llvm.org/D120071 Co-authored-by: Jean Perier Co-authored-by: Eric Schweitz --- flang/lib/Lower/ConvertExpr.cpp | 9 ++++--- flang/test/Lower/assignment.f90 | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 497d1eaf06a0b..466a74fe031eb 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -189,18 +189,21 @@ class ScalarExprLowering { template ExtValue genval(const Fortran::evaluate::Negate> &op) { - TODO(getLoc(), "genval Negate integer"); + mlir::Value input = genunbox(op.left()); + // Like LLVM, integer negation is the binary op "0 - value" + mlir::Value zero = genIntegerConstant(builder.getContext(), 0); + return builder.create(getLoc(), zero, input); } template ExtValue genval(const Fortran::evaluate::Negate> &op) { - TODO(getLoc(), "genval Negate real"); + return builder.create(getLoc(), genunbox(op.left())); } template ExtValue genval(const Fortran::evaluate::Negate> &op) { - TODO(getLoc(), "genval Negate complex"); + return builder.create(getLoc(), genunbox(op.left())); } #undef GENBIN diff --git a/flang/test/Lower/assignment.f90 b/flang/test/Lower/assignment.f90 index 6cb2e32095cee..32c2086de7de2 100644 --- a/flang/test/Lower/assignment.f90 +++ b/flang/test/Lower/assignment.f90 @@ -22,3 +22,46 @@ subroutine sub2(a, b) ! CHECK: %[[B_VAL:.*]] = fir.load %arg1 : !fir.ref ! CHECK: %[[B_CONV:.*]] = fir.convert %[[B_VAL]] : (i64) -> i32 ! CHECK: fir.store %[[B_CONV]] to %[[A]] : !fir.ref + +integer function negi(a) + integer :: a + negi = -a +end + +! CHECK-LABEL: func @_QPnegi( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}) -> i32 { +! 
CHECK: %[[FCTRES:.*]] = fir.alloca i32 {bindc_name = "negi", uniq_name = "_QFnegiEnegi"} +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[C0:.*]] = arith.constant 0 : i32 +! CHECK: %[[NEG:.*]] = arith.subi %[[C0]], %[[A_VAL]] : i32 +! CHECK: fir.store %[[NEG]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : i32 + +real function negr(a) + real :: a + negr = -a +end + +! CHECK-LABEL: func @_QPnegr( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}) -> f32 { +! CHECK: %[[FCTRES:.*]] = fir.alloca f32 {bindc_name = "negr", uniq_name = "_QFnegrEnegr"} +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[NEG:.*]] = arith.negf %[[A_VAL]] : f32 +! CHECK: fir.store %[[NEG]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : f32 + +complex function negc(a) + complex :: a + negc = -a +end + +! CHECK-LABEL: func @_QPnegc( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a"}) -> !fir.complex<4> { +! CHECK: %[[FCTRES:.*]] = fir.alloca !fir.complex<4> {bindc_name = "negc", uniq_name = "_QFnegcEnegc"} +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref> +! CHECK: %[[NEG:.*]] = fir.negc %[[A_VAL]] : !fir.complex<4> +! CHECK: fir.store %[[NEG]] to %[[FCTRES]] : !fir.ref> +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref> +! 
CHECK: return %[[RET]] : !fir.complex<4> From 38054556a08884aa15d3ebc720e2f43d0cb5a944 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Thu, 17 Feb 2022 23:59:15 -0800 Subject: [PATCH 222/748] Fix buildbots after https://reviews.llvm.org/D119797 This value error is no longer needed with the new version of the patch --- .../Python/lldbsuite/test/tools/lldb-vscode/vscode.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py index ae919fc2ed0c6..faa0b93b3f9a7 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py @@ -228,9 +228,9 @@ def handle_recv_packet(self, packet): # 'stopped' event. We need to remember the thread stop # reasons since the 'threads' command doesn't return # that information. - if not self.configuration_done_sent: - raise ValueError("'stopped' event received before " - "configuationDone packet was sent") + # if not self.configuration_done_sent: + # raise ValueError("'stopped' event received before " + # "configuationDone packet was sent") self._process_stopped() tid = body['threadId'] self.thread_stop_reasons[tid] = body From b529744c29a87d8ecf4336e751fee794df10ec7f Mon Sep 17 00:00:00 2001 From: hyeongyukim Date: Fri, 18 Feb 2022 17:00:00 +0900 Subject: [PATCH 223/748] [Clang] Rename `disable-noundef-analysis` flag to `-[no-]enable-noundef-analysis` This flag was previously renamed `enable_noundef_analysis` to `disable-noundef-analysis,` which is not a conventional name. (Driver and CC1's boolean options are using [no-] prefix) As discussed at https://reviews.llvm.org/D105169, this patch reverts its name to `[no-]enable_noundef_analysis` and enables noundef-analysis as default. 
Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D119998 --- clang/include/clang/Basic/CodeGenOptions.def | 2 +- clang/include/clang/Driver/Options.td | 10 +- clang/lib/CodeGen/CGCall.cpp | 4 +- .../CodeGen/2007-06-18-SextAttrAggregate.c | 2 +- .../CodeGen/2009-02-13-zerosize-union-field.c | 2 +- clang/test/CodeGen/2009-05-04-EnumInreg.c | 2 +- clang/test/CodeGen/64bit-swiftcall.c | 8 +- .../test/CodeGen/SystemZ/systemz-abi-vector.c | 18 +- clang/test/CodeGen/SystemZ/systemz-abi.c | 22 +- clang/test/CodeGen/arm-swiftcall.c | 6 +- clang/test/CodeGen/ext-int-cc.c | 58 +- clang/test/CodeGen/matrix-type-builtins.c | 4 +- clang/test/CodeGen/msan-param-retval.c | 4 +- clang/test/CodeGen/noundef-analysis.cpp | 39 + clang/test/CodeGen/object-size.c | 4 +- clang/test/CodeGen/swift-async-call-conv.c | 22 +- clang/test/CodeGen/temporary-lifetime.cpp | 4 +- clang/test/CodeGenCXX/arm.cpp | 4 +- .../test/CodeGenCXX/catch-undef-behavior.cpp | 10 +- .../CodeGenCXX/copy-constructor-elim-2.cpp | 2 +- clang/test/CodeGenCXX/dllexport-members.cpp | 12 +- clang/test/CodeGenCXX/dllexport.cpp | 12 +- clang/test/CodeGenCXX/dllimport-members.cpp | 12 +- clang/test/CodeGenCXX/dllimport.cpp | 18 +- clang/test/CodeGenCXX/exceptions.cpp | 4 +- clang/test/CodeGenCXX/ext-int.cpp | 16 +- clang/test/CodeGenCXX/fastcall.cpp | 2 +- .../CodeGenCXX/inheriting-constructor.cpp | 10 +- clang/test/CodeGenCXX/matrix-type.cpp | 2 +- .../CodeGenCXX/microsoft-abi-structors.cpp | 2 +- clang/test/CodeGenCXX/pod-member-memcpys.cpp | 4 +- clang/test/CodeGenCXX/wasm-args-returns.cpp | 4 +- clang/test/CodeGenCXX/x86_64-arguments.cpp | 2 +- .../coro-await-exp-namespace.cpp | 2 +- clang/test/CodeGenCoroutines/coro-await.cpp | 4 +- ...ro-symmetric-transfer-01-exp-namespace.cpp | 4 +- clang/test/CodeGenObjC/arc-foreach.m | 4 +- clang/test/CodeGenObjC/arc-literals.m | 2 +- .../CodeGenObjC/atomic-aggregate-property.m | 4 +- .../CodeGenObjC/property-ref-cast-to-void.m | 4 +- 
clang/test/CodeGenObjC/ubsan-bool.m | 6 +- clang/test/CodeGenObjC/ubsan-nullability.m | 4 +- .../test/CodeGenObjCXX/arc-cxx11-init-list.mm | 2 +- .../CodeGenObjCXX/property-lvalue-lambda.mm | 2 +- clang/test/CodeGenOpenCL/amdgpu-nullptr.cl | 8 +- .../CodeGenOpenCL/cl20-device-side-enqueue.cl | 12 +- clang/test/CodeGenOpenCL/printf.cl | 12 +- ..._parallel_for_simd_num_threads_codegen.cpp | 48 +- .../master_taskloop_in_reduction_codegen.cpp | 12 +- ...ter_taskloop_simd_in_reduction_codegen.cpp | 12 +- clang/test/OpenMP/nvptx_target_codegen.cpp | 10 +- ...arallel_reduction_codegen_tbaa_PR46146.cpp | 10 +- ...x_target_teams_distribute_simd_codegen.cpp | 22 +- .../OpenMP/parallel_firstprivate_codegen.cpp | 44 +- clang/test/OpenMP/reduction_compound_op.cpp | 12 +- .../OpenMP/single_firstprivate_codegen.cpp | 10 +- .../OpenMP/target_defaultmap_codegen_01.cpp | 676 +++++++++--------- .../test/OpenMP/task_in_reduction_codegen.cpp | 12 +- .../OpenMP/taskloop_in_reduction_codegen.cpp | 12 +- .../taskloop_simd_in_reduction_codegen.cpp | 12 +- .../OpenMP/teams_firstprivate_codegen.cpp | 72 +- 61 files changed, 702 insertions(+), 659 deletions(-) create mode 100644 clang/test/CodeGen/noundef-analysis.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 1df5168c7d4ec..ed360d7a8a94e 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -64,7 +64,7 @@ CODEGENOPT(DisableLifetimeMarkers, 1, 0) ///< Don't emit any lifetime markers CODEGENOPT(DisableO0ImplyOptNone , 1, 0) ///< Don't annonate function with optnone at O0 CODEGENOPT(ExperimentalStrictFloatingPoint, 1, 0) ///< Enables the new, experimental ///< strict floating point. 
-CODEGENOPT(DisableNoundefAttrs, 1, 0) ///< Disable emitting `noundef` attributes on IR call arguments and return values +CODEGENOPT(EnableNoundefAttrs, 1, 0) ///< Disable emitting `noundef` attributes on IR call arguments and return values CODEGENOPT(LegacyPassManager, 1, 0) ///< Use the legacy pass manager. CODEGENOPT(DebugPassManager, 1, 0) ///< Prints debug information for the new ///< pass manager. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 10a98f637cddd..dad2b536db445 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5452,9 +5452,13 @@ defm clear_ast_before_backend : BoolOption<"", PosFlag, NegFlag, BothFlags<[], " the Clang AST before running backend code generation">>; -def disable_noundef_analysis : Flag<["-"], "disable-noundef-analysis">, Group, - HelpText<"Disable analyzing function argument and return types for mandatory definedness">, - MarshallingInfoFlag>; +defm enable_noundef_analysis : BoolOption<"", + "enable-noundef-analysis", + CodeGenOpts<"EnableNoundefAttrs">, + DefaultTrue, + PosFlag, + NegFlag, + BothFlags<[], " analyzing function argument and return types for mandatory definedness">>; def discard_value_names : Flag<["-"], "discard-value-names">, HelpText<"Discard value names in LLVM IR">, MarshallingInfoFlag>; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index f1eb26e498225..017d4036ed057 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2293,7 +2293,7 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, getLangOpts().Sanitize.has(SanitizerKind::Return); // Determine if the return type could be partially undef - if (!CodeGenOpts.DisableNoundefAttrs && HasStrictReturn) { + if (CodeGenOpts.EnableNoundefAttrs && HasStrictReturn) { if (!RetTy->isVoidType() && RetAI.getKind() != ABIArgInfo::Indirect && DetermineNoUndef(RetTy, getTypes(), DL, RetAI)) 
RetAttrs.addAttribute(llvm::Attribute::NoUndef); @@ -2427,7 +2427,7 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, } // Decide whether the argument we're handling could be partially undef - if (!CodeGenOpts.DisableNoundefAttrs && + if (CodeGenOpts.EnableNoundefAttrs && DetermineNoUndef(ParamType, getTypes(), DL, AI)) { Attrs.addAttribute(llvm::Attribute::NoUndef); } diff --git a/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c b/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c index b8e32497102de..bcd2a9f67708c 100644 --- a/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c +++ b/clang/test/CodeGen/2007-06-18-SextAttrAggregate.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis %s -o - -emit-llvm | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -o - -emit-llvm | FileCheck %s // XFAIL: aarch64, arm64, x86_64-pc-windows-msvc, x86_64-w64-windows-gnu, x86_64-pc-windows-gnu // PR1513 diff --git a/clang/test/CodeGen/2009-02-13-zerosize-union-field.c b/clang/test/CodeGen/2009-02-13-zerosize-union-field.c index f1d335a8f8336..417ab956d9012 100644 --- a/clang/test/CodeGen/2009-02-13-zerosize-union-field.c +++ b/clang/test/CodeGen/2009-02-13-zerosize-union-field.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -disable-noundef-analysis -triple i686-apple-darwin -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 %s -no-enable-noundef-analysis -triple i686-apple-darwin -emit-llvm -o - | FileCheck %s // Every printf has 'i32 0' for the GEP of the string; no point counting those. 
typedef unsigned int Foo __attribute__((aligned(32))); typedef union{Foo:0;}a; diff --git a/clang/test/CodeGen/2009-05-04-EnumInreg.c b/clang/test/CodeGen/2009-05-04-EnumInreg.c index 895849022d3fe..0ea18b92691e0 100644 --- a/clang/test/CodeGen/2009-05-04-EnumInreg.c +++ b/clang/test/CodeGen/2009-05-04-EnumInreg.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -emit-llvm -triple i686-apple-darwin -mregparm 3 %s -o - | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -emit-llvm -triple i686-apple-darwin -mregparm 3 %s -o - | FileCheck %s // PR3967 enum kobject_action { diff --git a/clang/test/CodeGen/64bit-swiftcall.c b/clang/test/CodeGen/64bit-swiftcall.c index fe3cb51f1ce67..143c6f6e861e1 100644 --- a/clang/test/CodeGen/64bit-swiftcall.c +++ b/clang/test/CodeGen/64bit-swiftcall.c @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s --check-prefix=X86-64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s --check-prefix=X86-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM64 // REQUIRES: 
aarch64-registered-target,x86-registered-target diff --git a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c index fa85074981c5d..f44f090180b09 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c @@ -1,20 +1,20 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu \ // RUN: -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-feature +vector \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-feature +vector \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu z13 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z13 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch11 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch11 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu z14 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z14 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch12 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch12 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu z15 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z15 \ // RUN: -emit-llvm -o - %s | FileCheck 
--check-prefix=CHECK-VECTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch13 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch13 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s // Vector types diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c index 1755de3a87e92..3bf4b0501d585 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi.c @@ -1,25 +1,25 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-feature +vector \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-feature +vector \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu z13 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z13 \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch11 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch11 \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu z14 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z14 \ // RUN: 
-emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch12 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch12 \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu z15 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu z15 \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch13 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch13 \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch13 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch13 \ // RUN: -emit-llvm -o - %s -mfloat-abi soft | FileCheck %s \ // RUN: --check-prefixes=CHECK,SOFT-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ // RUN: -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD-FLOAT -// RUN: %clang_cc1 -disable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple s390x-linux-gnu -target-cpu arch14 \ // RUN: -emit-llvm -o - %s -mfloat-abi soft | FileCheck %s \ // RUN: --check-prefixes=CHECK,SOFT-FLOAT diff --git a/clang/test/CodeGen/arm-swiftcall.c b/clang/test/CodeGen/arm-swiftcall.c index 1deb8476dc73c..77934246480c2 100644 --- a/clang/test/CodeGen/arm-swiftcall.c +++ b/clang/test/CodeGen/arm-swiftcall.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple armv7-apple-darwin9 -emit-llvm -o - %s | 
FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple armv7s-apple-ios9 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple armv7k-apple-ios9 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7-apple-darwin9 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7s-apple-ios9 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7k-apple-ios9 -emit-llvm -o - %s | FileCheck %s #define SWIFTCALL __attribute__((swiftcall)) #define SWIFTASYNCCALL __attribute__((swiftasynccall)) diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index 88b649934f4d7..7dc932937389f 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -1,32 +1,32 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=LIN64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WIN64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple i386-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=LIN32 -// RUN: %clang_cc1 -disable-noundef-analysis -triple i386-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WIN32 -// RUN: %clang_cc1 -disable-noundef-analysis -triple nvptx64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=NVPTX64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple nvptx -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=NVPTX -// RUN: %clang_cc1 -disable-noundef-analysis -triple sparcv9 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SPARCV9 -// RUN: %clang_cc1 -disable-noundef-analysis -triple sparc -O3 -disable-llvm-passes 
-emit-llvm -o - %s | FileCheck %s --check-prefixes=SPARC -// RUN: %clang_cc1 -disable-noundef-analysis -triple mips64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=MIPS64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple mips -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=MIPS -// RUN: %clang_cc1 -disable-noundef-analysis -triple spir64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SPIR64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple spir -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SPIR -// RUN: %clang_cc1 -disable-noundef-analysis -triple hexagon -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=HEX -// RUN: %clang_cc1 -disable-noundef-analysis -triple lanai -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=LANAI -// RUN: %clang_cc1 -disable-noundef-analysis -triple r600 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=R600 -// RUN: %clang_cc1 -disable-noundef-analysis -triple arc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=ARC -// RUN: %clang_cc1 -disable-noundef-analysis -triple xcore -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=XCORE -// RUN: %clang_cc1 -disable-noundef-analysis -triple riscv64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=RISCV64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple riscv32 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=RISCV32 -// RUN: %clang_cc1 -disable-noundef-analysis -triple wasm64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WASM -// RUN: %clang_cc1 -disable-noundef-analysis -triple wasm32 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WASM -// RUN: %clang_cc1 -disable-noundef-analysis -triple systemz -O3 -disable-llvm-passes 
-emit-llvm -o - %s | FileCheck %s --check-prefixes=SYSTEMZ -// RUN: %clang_cc1 -disable-noundef-analysis -triple ppc64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=PPC64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple ppc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=PPC32 -// RUN: %clang_cc1 -disable-noundef-analysis -triple aarch64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple aarch64 -target-abi darwinpcs -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64DARWIN -// RUN: %clang_cc1 -disable-noundef-analysis -triple arm64_32-apple-ios -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64 -// RUN: %clang_cc1 -disable-noundef-analysis -triple arm64_32-apple-ios -target-abi darwinpcs -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64DARWIN -// RUN: %clang_cc1 -disable-noundef-analysis -triple arm -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=ARM +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=LIN64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WIN64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-gnu-linux -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=LIN32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-windows-pc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WIN32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple nvptx64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=NVPTX64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple nvptx -O3 -disable-llvm-passes 
-emit-llvm -o - %s | FileCheck %s --check-prefixes=NVPTX +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple sparcv9 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SPARCV9 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple sparc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SPARC +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple mips64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=MIPS64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple mips -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=MIPS +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spir64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SPIR64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple spir -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple hexagon -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=HEX +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple lanai -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=LANAI +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple r600 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=R600 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple arc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=ARC +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple xcore -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=XCORE +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple riscv64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=RISCV64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple riscv32 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=RISCV32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple wasm64 
-O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WASM +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple wasm32 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=WASM +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple systemz -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=SYSTEMZ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple ppc64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=PPC64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple ppc -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=PPC32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple aarch64 -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple aarch64 -target-abi darwinpcs -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64DARWIN +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple arm64_32-apple-ios -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple arm64_32-apple-ios -target-abi darwinpcs -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=AARCH64DARWIN +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple arm -O3 -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s --check-prefixes=ARM // Make sure 128 and 64 bit versions are passed like integers. 
void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} diff --git a/clang/test/CodeGen/matrix-type-builtins.c b/clang/test/CodeGen/matrix-type-builtins.c index fb6e69bcd30fc..58d8b96e32264 100644 --- a/clang/test/CodeGen/matrix-type-builtins.c +++ b/clang/test/CodeGen/matrix-type-builtins.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK64 %s -// RUN: %clang_cc1 -disable-noundef-analysis -fenable-matrix -triple i386-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -triple i386-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK32 %s // Also check we do not crash when running some middle-end passes. Most // importantly this includes the IR verifier, to ensure we emit valid IR. 
diff --git a/clang/test/CodeGen/msan-param-retval.c b/clang/test/CodeGen/msan-param-retval.c index 07bd0f8d83c29..5bb597f3cb883 100644 --- a/clang/test/CodeGen/msan-param-retval.c +++ b/clang/test/CodeGen/msan-param-retval.c @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -disable-noundef-analysis -o - %s | \ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -o - %s | \ // RUN: FileCheck %s --check-prefix=CLEAN // RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -o - %s | \ // RUN: FileCheck %s --check-prefixes=NOUNDEF,NOUNDEF_ONLY // RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -mllvm -msan-eager-checks -o - %s | \ // RUN: FileCheck %s --check-prefixes=NOUNDEF,EAGER -// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -disable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -no-enable-noundef-analysis -fsanitize-memory-param-retval -o - %s | \ // RUN: FileCheck %s --check-prefixes=CLEAN // RUN: %clang_cc1 -triple x86_64-linux-gnu -S -emit-llvm -fsanitize=memory -fsanitize-memory-param-retval -o - %s | \ // RUN: FileCheck %s --check-prefixes=NOUNDEF,EAGER diff --git a/clang/test/CodeGen/noundef-analysis.cpp b/clang/test/CodeGen/noundef-analysis.cpp new file mode 100644 index 0000000000000..778810758de44 --- /dev/null +++ b/clang/test/CodeGen/noundef-analysis.cpp @@ -0,0 +1,39 @@ +// RUN: %clang -cc1 -enable-noundef-analysis -emit-llvm -o - %s | FileCheck %s -check-prefix ENABLED +// RUN: %clang -cc1 -no-enable-noundef-analysis -emit-llvm -o - %s | FileCheck %s -check-prefix DISABLED + +union u1 { + int val; +}; + +struct s1 { + int val; +}; + +int indirect_callee_int(int a) { return a; } +union u1 indirect_callee_union(union u1 a) { + return a; +} + +static int sink; + +static void examineValue(int x) { 
sink = x; } + +// ENABLED-LABEL: @main( +// ENABLED: [[CALL:%.*]] = call noundef i32 @_Z19indirect_callee_inti(i32 noundef 0) +// ENABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i32 {{.*}}) +// ENABLED: [[CALL2:%.*]] = call noalias noundef nonnull i8* @_Znwm(i64 noundef 4) #[[ATTR4:[0-9]+]] +// ENABLED: call void @_ZL12examineValuei(i32 noundef {{.*}}) +// DISABLED-LABEL: @main( +// DISABLED: [[CALL:%.*]] = call i32 @_Z19indirect_callee_inti(i32 0) +// DISABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i32 {{.*}}) +// DISABLED: [[CALL2:%.*]] = call noalias nonnull i8* @_Znwm(i64 4) #[[ATTR4:[0-9]+]] +// DISABLED: call void @_ZL12examineValuei(i32 {{.*}}) +int main() { + indirect_callee_int(0); + indirect_callee_union((union u1){0}); + + auto s = new s1; + examineValue(s->val); + + return 0; +} diff --git a/clang/test/CodeGen/object-size.c b/clang/test/CodeGen/object-size.c index 153b3d9e6047b..46bfbf90ebdfa 100644 --- a/clang/test/CodeGen/object-size.c +++ b/clang/test/CodeGen/object-size.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin -emit-llvm %s -o - 2>&1 | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -DDYNAMIC -triple x86_64-apple-darwin -emit-llvm %s -o - 2>&1 | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin -emit-llvm %s -o - 2>&1 | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DDYNAMIC -triple x86_64-apple-darwin -emit-llvm %s -o - 2>&1 | FileCheck %s #ifndef DYNAMIC #define OBJECT_SIZE_BUILTIN __builtin_object_size diff --git a/clang/test/CodeGen/swift-async-call-conv.c b/clang/test/CodeGen/swift-async-call-conv.c index 0ab9b33dd3fb2..4d3ae76a0132d 100644 --- a/clang/test/CodeGen/swift-async-call-conv.c +++ b/clang/test/CodeGen/swift-async-call-conv.c @@ -1,14 +1,14 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s -// RUN: 
%clang_cc1 -disable-noundef-analysis -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple armv7-apple-darwin9 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple armv7s-apple-ios9 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple armv7k-apple-ios9 -emit-llvm -o - %s | FileCheck %s - -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -triple armv7-apple-darwin9 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -triple armv7s-apple-ios9 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -triple armv7k-apple-ios9 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7-apple-darwin9 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7s-apple-ios9 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7k-apple-ios9 -emit-llvm -o - %s | FileCheck %s + +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -triple x86_64-apple-darwin10 -target-cpu core2 -emit-llvm -o - %s | FileCheck %s 
--check-prefix=CHECK --check-prefix=CPPONLY +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -triple arm64-apple-ios9 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -triple armv7-apple-darwin9 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -triple armv7s-apple-ios9 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -triple armv7k-apple-ios9 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CPPONLY // Test tail call behavior when a swiftasynccall function is called // from another swiftasynccall function. diff --git a/clang/test/CodeGen/temporary-lifetime.cpp b/clang/test/CodeGen/temporary-lifetime.cpp index af1907cb99158..52e4760dfff80 100644 --- a/clang/test/CodeGen/temporary-lifetime.cpp +++ b/clang/test/CodeGen/temporary-lifetime.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis %s -std=c++11 -O1 -DWITH_DTOR -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-DTOR %s -// RUN: %clang_cc1 -disable-noundef-analysis %s -std=c++11 -O1 -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-NO-DTOR %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -std=c++11 -O1 -DWITH_DTOR -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-DTOR %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -std=c++11 -O1 -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-NO-DTOR %s struct A { A(); diff --git a/clang/test/CodeGenCXX/arm.cpp b/clang/test/CodeGenCXX/arm.cpp index 252880664dfa6..24b648d652124 100644 --- a/clang/test/CodeGenCXX/arm.cpp +++ b/clang/test/CodeGenCXX/arm.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis %s -triple=thumbv7-apple-ios6.0 -fno-use-cxa-atexit -target-abi apcs-gnu -emit-llvm 
-std=gnu++98 -o - -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK98 %s -// RUN: %clang_cc1 -disable-noundef-analysis %s -triple=thumbv7-apple-ios6.0 -fno-use-cxa-atexit -target-abi apcs-gnu -emit-llvm -std=gnu++11 -o - -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK11 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -triple=thumbv7-apple-ios6.0 -fno-use-cxa-atexit -target-abi apcs-gnu -emit-llvm -std=gnu++98 -o - -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK98 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -triple=thumbv7-apple-ios6.0 -fno-use-cxa-atexit -target-abi apcs-gnu -emit-llvm -std=gnu++11 -o - -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK11 %s // CHECK: @_ZZN5test74testEvE1x = internal global i32 0, align 4 // CHECK: @_ZGVZN5test74testEvE1x = internal global i32 0 diff --git a/clang/test/CodeGenCXX/catch-undef-behavior.cpp b/clang/test/CodeGenCXX/catch-undef-behavior.cpp index d6b094cb5b82d..79efd297a4c61 100644 --- a/clang/test/CodeGenCXX/catch-undef-behavior.cpp +++ b/clang/test/CodeGenCXX/catch-undef-behavior.cpp @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -fsanitize=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,unreachable,return,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -emit-llvm %s -o - -triple x86_64-linux-gnu | opt -instnamer -S | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -fsanitize=vptr,address -fsanitize-recover=vptr,address -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-ASAN -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -fsanitize=vptr -fsanitize-recover=vptr 
-emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=DOWNCAST-NULL -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -fsanitize=function -emit-llvm %s -o - -triple x86_64-linux-gnux32 | FileCheck %s --check-prefix=CHECK-X32 -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -fsanitize=function -emit-llvm %s -o - -triple i386-linux-gnu | FileCheck %s --check-prefix=CHECK-X86 +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,unreachable,return,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -fsanitize-recover=signed-integer-overflow,integer-divide-by-zero,float-divide-by-zero,shift-base,shift-exponent,vla-bound,alignment,null,vptr,object-size,float-cast-overflow,bool,enum,array-bounds,function -emit-llvm %s -o - -triple x86_64-linux-gnu | opt -instnamer -S | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=vptr,address -fsanitize-recover=vptr,address -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=CHECK-ASAN +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=vptr -fsanitize-recover=vptr -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefix=DOWNCAST-NULL +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=function -emit-llvm %s -o - -triple x86_64-linux-gnux32 | FileCheck %s --check-prefix=CHECK-X32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -fsanitize=function -emit-llvm %s -o - -triple i386-linux-gnu | FileCheck %s --check-prefix=CHECK-X86 struct S { double d; diff --git a/clang/test/CodeGenCXX/copy-constructor-elim-2.cpp b/clang/test/CodeGenCXX/copy-constructor-elim-2.cpp index 37feae0ae4b72..0c0ab002f1362 100644 --- a/clang/test/CodeGenCXX/copy-constructor-elim-2.cpp +++ b/clang/test/CodeGenCXX/copy-constructor-elim-2.cpp @@ -1,4 +1,4 @@ -// 
RUN: %clang_cc1 -disable-noundef-analysis -triple armv7-none-eabi -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7-none-eabi -emit-llvm -o - %s | FileCheck %s struct A { int x; A(int); ~A(); }; A f() { return A(0); } diff --git a/clang/test/CodeGenCXX/dllexport-members.cpp b/clang/test/CodeGenCXX/dllexport-members.cpp index 0dab20a59ae87..cee297afcaa0a 100644 --- a/clang/test/CodeGenCXX/dllexport-members.cpp +++ b/clang/test/CodeGenCXX/dllexport-members.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -fms-compatibility-version=18 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M32 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=18 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M64 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M32VS2015 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M64VS2015 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -fms-compatibility-version=18 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=18 -emit-llvm 
-std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M32VS2015 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M64VS2015 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s // Helper structs to make templates more expressive. struct ImplicitInst_Exported {}; diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp index fb3d0f9484783..049c4df738781 100644 --- a/clang/test/CodeGenCXX/dllexport.cpp +++ b/clang/test/CodeGenCXX/dllexport.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-passes -o - %s -w -fms-compatibility-version=19.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2015 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-passes -o - %s -w -fms-compatibility-version=18.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2013 -check-prefix=M32MSVC2013 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-passes -o - %s -w 
-fms-compatibility-version=19.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2015 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-passes -o - %s -w -fms-compatibility-version=18.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2013 -check-prefix=M32MSVC2013 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=19.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2015 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=18.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2013 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=19.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2015 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=18.00 | FileCheck -allow-deprecated-dag-overlap --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2013 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU --check-prefix=G32 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y 
-fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU --check-prefix=G32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck -allow-deprecated-dag-overlap --check-prefix=GNU %s // Helper structs to make templates more expressive. struct ImplicitInst_Exported {}; diff --git a/clang/test/CodeGenCXX/dllimport-members.cpp b/clang/test/CodeGenCXX/dllimport-members.cpp index 795dfa6c21186..0230c6448ca19 100644 --- a/clang/test/CodeGenCXX/dllimport-members.cpp +++ b/clang/test/CodeGenCXX/dllimport-members.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O0 -o - %s -DMSABI | FileCheck --check-prefix=MSC --check-prefix=M32 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O0 -o - %s -DMSABI | FileCheck --check-prefix=MSC --check-prefix=M64 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O1 -fno-experimental-new-pass-manager -o - %s -DMSABI | FileCheck --check-prefix=MO1 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -O1 -fno-experimental-new-pass-manager -o - %s | FileCheck --check-prefix=GO1 %s +// RUN: 
%clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O0 -o - %s -DMSABI | FileCheck --check-prefix=MSC --check-prefix=M32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O0 -o - %s -DMSABI | FileCheck --check-prefix=MSC --check-prefix=M64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O1 -fno-experimental-new-pass-manager -o - %s -DMSABI | FileCheck --check-prefix=MO1 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -emit-llvm -std=c++1y -O1 -fno-experimental-new-pass-manager -o - %s | FileCheck --check-prefix=GO1 %s // Helper structs to make templates more expressive. 
struct ImplicitInst_Imported {}; diff --git a/clang/test/CodeGenCXX/dllimport.cpp b/clang/test/CodeGenCXX/dllimport.cpp index c354325ec45aa..1f4342d96d62d 100644 --- a/clang/test/CodeGenCXX/dllimport.cpp +++ b/clang/test/CodeGenCXX/dllimport.cpp @@ -1,14 +1,14 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC --check-prefix=M32 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC --check-prefix=M64 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=G32 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -fms-compatibility-version=18.00 -emit-llvm -std=c++1y -O1 -disable-llvm-passes -o - %s -DMSABI -w | FileCheck --check-prefix=MO1 --check-prefix=M18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -fms-compatibility-version=19.00 -emit-llvm -std=c++1y -O1 -disable-llvm-passes -o - %s -DMSABI -w | FileCheck --check-prefix=MO1 --check-prefix=M19 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O1 -fno-experimental-new-pass-manager -o - %s -w | FileCheck --check-prefix=GO1 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics 
-fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC --check-prefix=M32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC --check-prefix=M64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=G32 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -fms-compatibility-version=18.00 -emit-llvm -std=c++1y -O1 -disable-llvm-passes -o - %s -DMSABI -w | FileCheck --check-prefix=MO1 --check-prefix=M18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -fms-compatibility-version=19.00 -emit-llvm -std=c++1y -O1 -disable-llvm-passes -o - %s -DMSABI -w | FileCheck --check-prefix=MO1 --check-prefix=M19 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O1 -fno-experimental-new-pass-manager -o - %s -w | FileCheck --check-prefix=GO1 %s // CHECK-NOT doesn't play nice with CHECK-DAG, so use separate run lines. 
-// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC2 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU2 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -DMSABI -w | FileCheck --check-prefix=MSC2 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-gnu -fno-rtti -fno-threadsafe-statics -fms-extensions -emit-llvm -std=c++1y -O0 -o - %s -w | FileCheck --check-prefix=GNU2 %s // Helper structs to make templates more expressive. struct ImplicitInst_Imported {}; diff --git a/clang/test/CodeGenCXX/exceptions.cpp b/clang/test/CodeGenCXX/exceptions.cpp index 256a791398509..884ec6e06f775 100644 --- a/clang/test/CodeGenCXX/exceptions.cpp +++ b/clang/test/CodeGenCXX/exceptions.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis %s -triple=x86_64-linux-gnu -emit-llvm -std=c++98 -o - -fcxx-exceptions -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK98 %s -// RUN: %clang_cc1 -disable-noundef-analysis %s -triple=x86_64-linux-gnu -emit-llvm -std=c++11 -o - -fcxx-exceptions -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK11 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -triple=x86_64-linux-gnu -emit-llvm -std=c++98 -o - -fcxx-exceptions -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK98 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -triple=x86_64-linux-gnu -emit-llvm -std=c++11 -o - -fcxx-exceptions -fexceptions | FileCheck -check-prefix=CHECK -check-prefix=CHECK11 %s // CHECK: %[[STRUCT_TEST13_A:.*]] = type { i32, i32 } diff --git a/clang/test/CodeGenCXX/ext-int.cpp 
b/clang/test/CodeGenCXX/ext-int.cpp index cc61977fc95e3..446890eda2867 100644 --- a/clang/test/CodeGenCXX/ext-int.cpp +++ b/clang/test/CodeGenCXX/ext-int.cpp @@ -1,14 +1,14 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-gnu-linux -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN64,NoNewStructPathTBAA -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-gnu-linux -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN64,NewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-gnu-linux -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN64,NoNewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-gnu-linux -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN64,NewStructPathTBAA -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-pc -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN64,NoNewStructPathTBAA -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-windows-pc -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN64,NewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-pc -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN64,NoNewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-pc -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN64,NewStructPathTBAA -// RUN: %clang_cc1 -disable-noundef-analysis -triple i386-gnu-linux -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN32,NoNewStructPathTBAA -// RUN: %clang_cc1 
-disable-noundef-analysis -triple i386-gnu-linux -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN32,NewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-gnu-linux -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN32,NoNewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-gnu-linux -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,LIN,LIN32,NewStructPathTBAA -// RUN: %clang_cc1 -disable-noundef-analysis -triple i386-windows-pc -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN32,NoNewStructPathTBAA -// RUN: %clang_cc1 -disable-noundef-analysis -triple i386-windows-pc -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN32,NewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-windows-pc -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN32,NoNewStructPathTBAA +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-windows-pc -O3 -disable-llvm-passes -I%S -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,WIN,WIN32,NewStructPathTBAA namespace std { class type_info { public: virtual ~type_info(); private: const char * name; }; diff --git a/clang/test/CodeGenCXX/fastcall.cpp b/clang/test/CodeGenCXX/fastcall.cpp index 4a30c67893597..8bd248eaa9626 100644 --- a/clang/test/CodeGenCXX/fastcall.cpp +++ b/clang/test/CodeGenCXX/fastcall.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple i386-unknown-unknown -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-unknown-unknown -emit-llvm -o - %s | FileCheck %s void __attribute__((fastcall)) foo1(int &y); void bar1(int &y) { diff --git 
a/clang/test/CodeGenCXX/inheriting-constructor.cpp b/clang/test/CodeGenCXX/inheriting-constructor.cpp index 62df84b9464d2..748c0d6d62d81 100644 --- a/clang/test/CodeGenCXX/inheriting-constructor.cpp +++ b/clang/test/CodeGenCXX/inheriting-constructor.cpp @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -triple i386-linux -emit-llvm -o - %s | FileCheck %s --check-prefix=ITANIUM -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -triple x86_64-darwin -emit-llvm -o - %s | FileCheck %s --check-prefix=ITANIUM -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -triple arm64-ehabi -emit-llvm -o - %s | FileCheck %s --check-prefix=ITANIUM -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -triple i386-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=MSABI --check-prefix=WIN32 -// RUN: %clang_cc1 -disable-noundef-analysis -std=c++11 -triple x86_64-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=MSABI --check-prefix=WIN64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -triple i386-linux -emit-llvm -o - %s | FileCheck %s --check-prefix=ITANIUM +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -triple x86_64-darwin -emit-llvm -o - %s | FileCheck %s --check-prefix=ITANIUM +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -triple arm64-ehabi -emit-llvm -o - %s | FileCheck %s --check-prefix=ITANIUM +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -triple i386-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=MSABI --check-prefix=WIN32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -std=c++11 -triple x86_64-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=MSABI --check-prefix=WIN64 // PR12219 struct A { A(int); virtual ~A(); }; diff --git a/clang/test/CodeGenCXX/matrix-type.cpp b/clang/test/CodeGenCXX/matrix-type.cpp index 8be81e97567e8..1e68936ecd422 100644 --- a/clang/test/CodeGenCXX/matrix-type.cpp +++ b/clang/test/CodeGenCXX/matrix-type.cpp @@ -1,4 +1,4 @@ -// RUN: 
%clang_cc1 -disable-noundef-analysis -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s typedef double dx5x5_t __attribute__((matrix_type(5, 5))); typedef float fx3x4_t __attribute__((matrix_type(3, 4))); diff --git a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp index ce99b9cd737c5..ee1ca026bbb73 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -emit-llvm -fno-rtti %s -std=c++11 -o - -mconstructor-aliases -triple=i386-pc-win32 -fno-rtti > %t +// RUN: %clang_cc1 -no-enable-noundef-analysis -emit-llvm -fno-rtti %s -std=c++11 -o - -mconstructor-aliases -triple=i386-pc-win32 -fno-rtti > %t // RUN: FileCheck %s < %t // vftables are emitted very late, so do another pass to try to keep the checks // in source order. 
diff --git a/clang/test/CodeGenCXX/pod-member-memcpys.cpp b/clang/test/CodeGenCXX/pod-member-memcpys.cpp index 83a732630b72a..2ea439dbe8906 100644 --- a/clang/test/CodeGenCXX/pod-member-memcpys.cpp +++ b/clang/test/CodeGenCXX/pod-member-memcpys.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin10 -emit-llvm -std=c++03 -fexceptions -fcxx-exceptions -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple i386-apple-darwin10 -emit-llvm -std=c++03 -o - %s | FileCheck --check-prefix=CHECK-2 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin10 -emit-llvm -std=c++03 -fexceptions -fcxx-exceptions -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-apple-darwin10 -emit-llvm -std=c++03 -o - %s | FileCheck --check-prefix=CHECK-2 %s struct POD { int w, x, y, z; diff --git a/clang/test/CodeGenCXX/wasm-args-returns.cpp b/clang/test/CodeGenCXX/wasm-args-returns.cpp index a8e166f1bfd70..d56f3cdf7b773 100644 --- a/clang/test/CodeGenCXX/wasm-args-returns.cpp +++ b/clang/test/CodeGenCXX/wasm-args-returns.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -O1 -triple wasm32-unknown-unknown -emit-llvm -o - %s \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -O1 -triple wasm32-unknown-unknown -emit-llvm -o - %s \ // RUN: | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -O1 -triple wasm64-unknown-unknown -emit-llvm -o - %s \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -O1 -triple wasm64-unknown-unknown -emit-llvm -o - %s \ // RUN: | FileCheck %s #define concat_(x, y) x##y diff --git a/clang/test/CodeGenCXX/x86_64-arguments.cpp b/clang/test/CodeGenCXX/x86_64-arguments.cpp index 3ac753dbed1e7..64ba57acd3f84 100644 --- a/clang/test/CodeGenCXX/x86_64-arguments.cpp +++ b/clang/test/CodeGenCXX/x86_64-arguments.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s 
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s // Basic base class test. struct f0_s0 { unsigned a; }; diff --git a/clang/test/CodeGenCoroutines/coro-await-exp-namespace.cpp b/clang/test/CodeGenCoroutines/coro-await-exp-namespace.cpp index b3a7bcdccfbc3..a464b361f3884 100644 --- a/clang/test/CodeGenCoroutines/coro-await-exp-namespace.cpp +++ b/clang/test/CodeGenCoroutines/coro-await-exp-namespace.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 \ // RUN: -emit-llvm %s -o - -disable-llvm-passes -Wno-coroutine -Wno-unused | FileCheck %s namespace std { diff --git a/clang/test/CodeGenCoroutines/coro-await.cpp b/clang/test/CodeGenCoroutines/coro-await.cpp index b2baf09647681..943a0a5c2b296 100644 --- a/clang/test/CodeGenCoroutines/coro-await.cpp +++ b/clang/test/CodeGenCoroutines/coro-await.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -std=c++20 \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -std=c++20 \ // RUN: -emit-llvm %s -o - -disable-llvm-passes -Wno-coroutine -Wno-unused | FileCheck %s namespace std { @@ -347,4 +347,4 @@ extern "C" void TestTailcall() { // CHECK: store i8* %[[RESULT]], i8** %[[COERCE]] // CHECK: %[[ADDR:.+]] = call i8* @_ZNSt16coroutine_handleIvE7addressEv(%"struct.std::coroutine_handle"* {{[^,]*}} %[[TMP]]) // CHECK: call void @llvm.coro.resume(i8* %[[ADDR]]) -} \ No newline at end of file +} diff --git a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01-exp-namespace.cpp b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01-exp-namespace.cpp index 5dd614595424f..d15576581a8e7 100644 --- a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01-exp-namespace.cpp +++ 
b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01-exp-namespace.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 -O0 -emit-llvm %s -o - -disable-llvm-passes | FileCheck %s -// RUN: %clang -disable-noundef-analysis -fcoroutines-ts -std=c++14 -O0 -emit-llvm -c %s -o %t -Xclang -disable-llvm-passes && %clang -c %t +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fcoroutines-ts -std=c++14 -O0 -emit-llvm %s -o - -disable-llvm-passes | FileCheck %s +// RUN: %clang -fcoroutines-ts -std=c++14 -O0 -emit-llvm -c %s -o %t -Xclang -disable-llvm-passes && %clang -c %t #include "Inputs/coroutine-exp-namespace.h" diff --git a/clang/test/CodeGenObjC/arc-foreach.m b/clang/test/CodeGenObjC/arc-foreach.m index 7b1905a6ff756..d64c5556283d9 100644 --- a/clang/test/CodeGenObjC/arc-foreach.m +++ b/clang/test/CodeGenObjC/arc-foreach.m @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin -fblocks -fobjc-arc -fobjc-runtime-has-weak -emit-llvm %s -o - | FileCheck -check-prefix CHECK-LP64 %s -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin -O1 -fblocks -fobjc-arc -fobjc-runtime-has-weak -emit-llvm %s -o - | FileCheck -check-prefix CHECK-LP64-OPT %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin -fblocks -fobjc-arc -fobjc-runtime-has-weak -emit-llvm %s -o - | FileCheck -check-prefix CHECK-LP64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin -O1 -fblocks -fobjc-arc -fobjc-runtime-has-weak -emit-llvm %s -o - | FileCheck -check-prefix CHECK-LP64-OPT %s // rdar://9503326 // rdar://9606600 diff --git a/clang/test/CodeGenObjC/arc-literals.m b/clang/test/CodeGenObjC/arc-literals.m index e7c03d563598b..0b1daffd1c976 100644 --- a/clang/test/CodeGenObjC/arc-literals.m +++ b/clang/test/CodeGenObjC/arc-literals.m @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -I %S/Inputs -triple 
x86_64-apple-darwin10 -emit-llvm -fblocks -fobjc-arc -fobjc-runtime-has-weak -O2 -disable-llvm-passes -disable-noundef-analysis -o - %s | FileCheck %s +// RUN: %clang_cc1 -I %S/Inputs -triple x86_64-apple-darwin10 -emit-llvm -fblocks -fobjc-arc -fobjc-runtime-has-weak -O2 -disable-llvm-passes -no-enable-noundef-analysis -o - %s | FileCheck %s #include "literal-support.h" diff --git a/clang/test/CodeGenObjC/atomic-aggregate-property.m b/clang/test/CodeGenObjC/atomic-aggregate-property.m index 4f9fba03cbbaf..4950bb19f8345 100644 --- a/clang/test/CodeGenObjC/atomic-aggregate-property.m +++ b/clang/test/CodeGenObjC/atomic-aggregate-property.m @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin10 -fobjc-gc -emit-llvm -o - %s | FileCheck -check-prefix CHECK-LP64 %s -// RUN: %clang_cc1 -disable-noundef-analysis -x objective-c++ -triple x86_64-apple-darwin10 -fobjc-gc -emit-llvm -o - %s | FileCheck -check-prefix CHECK-LP64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin10 -fobjc-gc -emit-llvm -o - %s | FileCheck -check-prefix CHECK-LP64 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -x objective-c++ -triple x86_64-apple-darwin10 -fobjc-gc -emit-llvm -o - %s | FileCheck -check-prefix CHECK-LP64 %s // rdar: // 7849824 // diff --git a/clang/test/CodeGenObjC/property-ref-cast-to-void.m b/clang/test/CodeGenObjC/property-ref-cast-to-void.m index 5be6a60543f82..e304304439b20 100644 --- a/clang/test/CodeGenObjC/property-ref-cast-to-void.m +++ b/clang/test/CodeGenObjC/property-ref-cast-to-void.m @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -x objective-c++ -triple x86_64-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin9 
-fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -x objective-c++ -triple x86_64-apple-darwin9 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s // rdar: // 8399655 @interface TestClass diff --git a/clang/test/CodeGenObjC/ubsan-bool.m b/clang/test/CodeGenObjC/ubsan-bool.m index 7e69e0995ec07..43a964283b209 100644 --- a/clang/test/CodeGenObjC/ubsan-bool.m +++ b/clang/test/CodeGenObjC/ubsan-bool.m @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -x objective-c -emit-llvm -triple x86_64-apple-macosx10.10.0 -fsanitize=bool %s -o - -w | FileCheck %s -check-prefixes=SHARED,OBJC -// RUN: %clang_cc1 -disable-noundef-analysis -x objective-c++ -emit-llvm -triple x86_64-apple-macosx10.10.0 -fsanitize=bool %s -o - -w | FileCheck %s -check-prefixes=SHARED,OBJC -// RUN: %clang_cc1 -disable-noundef-analysis -x c -emit-llvm -triple x86_64-apple-macosx10.10.0 -fsanitize=bool %s -o - | FileCheck %s -check-prefixes=SHARED,C +// RUN: %clang_cc1 -no-enable-noundef-analysis -x objective-c -emit-llvm -triple x86_64-apple-macosx10.10.0 -fsanitize=bool %s -o - -w | FileCheck %s -check-prefixes=SHARED,OBJC +// RUN: %clang_cc1 -no-enable-noundef-analysis -x objective-c++ -emit-llvm -triple x86_64-apple-macosx10.10.0 -fsanitize=bool %s -o - -w | FileCheck %s -check-prefixes=SHARED,OBJC +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c -emit-llvm -triple x86_64-apple-macosx10.10.0 -fsanitize=bool %s -o - | FileCheck %s -check-prefixes=SHARED,C typedef signed char BOOL; diff --git a/clang/test/CodeGenObjC/ubsan-nullability.m b/clang/test/CodeGenObjC/ubsan-nullability.m index 22e5d592150ee..c25a241edbd60 100644 --- a/clang/test/CodeGenObjC/ubsan-nullability.m +++ b/clang/test/CodeGenObjC/ubsan-nullability.m @@ -1,6 +1,6 @@ // REQUIRES: asserts -// RUN: %clang_cc1 -disable-noundef-analysis -x objective-c -emit-llvm -triple x86_64-apple-macosx10.10.0 -fblocks -fobjc-arc 
-fsanitize=nullability-arg,nullability-assign,nullability-return -w %s -o - | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis -x objective-c++ -emit-llvm -triple x86_64-apple-macosx10.10.0 -fblocks -fobjc-arc -fsanitize=nullability-arg,nullability-assign,nullability-return -w %s -o - | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -x objective-c -emit-llvm -triple x86_64-apple-macosx10.10.0 -fblocks -fobjc-arc -fsanitize=nullability-arg,nullability-assign,nullability-return -w %s -o - | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -x objective-c++ -emit-llvm -triple x86_64-apple-macosx10.10.0 -fblocks -fobjc-arc -fsanitize=nullability-arg,nullability-assign,nullability-return -w %s -o - | FileCheck %s // CHECK: [[NONNULL_RV_LOC1:@.*]] = private unnamed_addr global {{.*}} i32 100, i32 6 // CHECK: [[NONNULL_ARG_LOC:@.*]] = private unnamed_addr global {{.*}} i32 204, i32 15 {{.*}} i32 190, i32 23 diff --git a/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm b/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm index cbd942f7d581e..44d15e2fea608 100644 --- a/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm +++ b/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -triple armv7-ios5.0 -std=c++11 -fmerge-all-constants -fobjc-arc -Os -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple armv7-ios5.0 -std=c++11 -fmerge-all-constants -fobjc-arc -Os -emit-llvm -o - %s | FileCheck %s // CHECK: @[[STR0:.*]] = private unnamed_addr constant [5 x i8] c"str0\00", section "__TEXT,__cstring,cstring_literals" // CHECK: @[[UNNAMED_CFSTRING0:.*]] = private global %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[STR0]], i32 0, i32 0), i32 4 }, section "__DATA,__cfstring" diff --git 
a/clang/test/CodeGenObjCXX/property-lvalue-lambda.mm b/clang/test/CodeGenObjCXX/property-lvalue-lambda.mm index 1051887ecdb3e..1fc75e003c22b 100644 --- a/clang/test/CodeGenObjCXX/property-lvalue-lambda.mm +++ b/clang/test/CodeGenObjCXX/property-lvalue-lambda.mm @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -fblocks -disable-llvm-passes -triple x86_64-apple-darwin10 -std=c++17 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fblocks -disable-llvm-passes -triple x86_64-apple-darwin10 -std=c++17 -emit-llvm -o - %s | FileCheck %s typedef void (^blk_t)(); typedef void (*fnptr_t)(); diff --git a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl index ca9980b7bbac5..d966e2dfd57a4 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-nullptr.cl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefix=NOOPT %s -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -fcommon -emit-llvm -o - | FileCheck %s --check-prefix=COMMON +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -O0 -cl-std=CL2.0 -include opencl-c.h -triple amdgcn -emit-llvm -o - | FileCheck --check-prefix=NOOPT %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -include opencl-c.h -triple amdgcn 
-fcommon -emit-llvm -o - | FileCheck %s --check-prefix=COMMON typedef struct { private char *p1; diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl index ffc30c274a855..141adc1da8692 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32 -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64 -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32 -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64 -// RUN: %clang_cc1 -disable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s 
-cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES #pragma OPENCL EXTENSION cl_khr_subgroups : enable diff --git a/clang/test/CodeGenOpenCL/printf.cl b/clang/test/CodeGenOpenCL/printf.cl index 2764d29859a39..2c00c169a6948 100644 --- a/clang/test/CodeGenOpenCL/printf.cl +++ b/clang/test/CodeGenOpenCL/printf.cl @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -disable-noundef-analysis -cl-std=CL1.2 -cl-ext=-+cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=FP64,ALL %s -// RUN: %clang_cc1 -disable-noundef-analysis -cl-std=CL1.2 -cl-ext=-cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NOFP64,ALL %s -// RUN: %clang_cc1 -disable-noundef-analysis -cl-std=CL3.0 -cl-ext=+__opencl_c_fp64,+cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=FP64,ALL %s -// RUN: %clang_cc1 -disable-noundef-analysis -cl-std=CL3.0 -cl-ext=-__opencl_c_fp64,-cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NOFP64,ALL %s -// RUN: %clang_cc1 -disable-noundef-analysis -cl-std=clc++2021 -cl-ext=+__opencl_c_fp64,+cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck 
-check-prefixes=FP64,ALL %s -// RUN: %clang_cc1 -disable-noundef-analysis -cl-std=clc++2021 -cl-ext=-__opencl_c_fp64,-cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NOFP64,ALL %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -cl-std=CL1.2 -cl-ext=-+cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=FP64,ALL %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -cl-std=CL1.2 -cl-ext=-cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NOFP64,ALL %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -cl-std=CL3.0 -cl-ext=+__opencl_c_fp64,+cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=FP64,ALL %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -cl-std=CL3.0 -cl-ext=-__opencl_c_fp64,-cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NOFP64,ALL %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -cl-std=clc++2021 -cl-ext=+__opencl_c_fp64,+cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=FP64,ALL %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -cl-std=clc++2021 -cl-ext=-__opencl_c_fp64,-cl_khr_fp64 -triple spir-unknown-unknown -disable-llvm-passes -emit-llvm -o - %s | FileCheck -check-prefixes=NOFP64,ALL %s typedef __attribute__((ext_vector_type(2))) float float2; typedef __attribute__((ext_vector_type(2))) half half2; diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp index 3d3de0a10855d..83a26f7c03d50 100644 --- a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp @@ -1,35 +1,35 @@ // NOTE: Assertions have been 
autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: 
%clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK5 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK6 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify 
-fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK5 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK6 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK7 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK8 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK7 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | 
FileCheck %s --check-prefix=CHECK8 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK9 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK10 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK9 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK10 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK11 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions 
-emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK12 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK11 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK12 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK13 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK14 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK13 +// 
RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK14 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK15 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK16 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=CHECK15 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK16 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp 
b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp index 5b9b5dc524128..cb67c0b4b23b2 100644 --- a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x 
c++ -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp index 7c8bb2b3273ab..590daa75f3702 100644 --- a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis 
-verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4 // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp index 255deafe48af5..1690399590964 100644 --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // Test target codegen - host bc file has to be created first. 
-// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device 
-fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK3 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp index 69ff00a0616b4..d7565ba3258ca 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -O1 -disable-llvm-optzns 
-verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -fexceptions -fcxx-exceptions -aux-triple powerpc64le-unknown-unknown -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc 
-o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -fexceptions -fcxx-exceptions -aux-triple powerpc64le-unknown-unknown -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3 // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp index 1b95d0887970a..8a76bd25cb74a 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp @@ -1,15 +1,15 @@ // Test target codegen - host bc file has to be created first. 
-// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 - -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple 
i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp index 54e3a1492f981..13a615e7c0ecf 100644 --- a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp @@ -1,31 +1,31 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" 
"pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK4 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd 
-x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK9 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK10 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple 
x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK11 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK12 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK9 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK10 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK11 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK12 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 
-no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK17 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK17 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // expected-no-diagnostics #ifndef ARRAY #ifndef HEADER diff --git a/clang/test/OpenMP/reduction_compound_op.cpp b/clang/test/OpenMP/reduction_compound_op.cpp index 36bc8d6842e38..09feb3cafc5ce 100644 --- a/clang/test/OpenMP/reduction_compound_op.cpp +++ b/clang/test/OpenMP/reduction_compound_op.cpp @@ -1,21 +1,21 @@ // NOTE: Assertions have been 
autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs -//RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp -DNORM \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp -DNORM \ //RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix NORM -//RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp -DCOMP \ +//RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp -DCOMP \ //RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix COMP // Prefer compound operators since that is what the spec seems to say. -//RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp -DNORM -DCOMP \ +// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp -DNORM -DCOMP \ //RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix COMP -//RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp-simd -DNORM \ +//RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp-simd -DNORM \ //RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix SIMD-ONLY -//RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp-simd -DCOMP \ +//RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp-simd -DCOMP \ //RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix SIMD-ONLY -//RUN: %clang_cc1 -disable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp-simd -DNORM -DCOMP \ +//RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-unknown-linux-gnu -fopenmp-simd -DNORM -DCOMP \ //RUN: -emit-llvm -o - %s | FileCheck %s --check-prefix SIMD-ONLY // SIMD-ONLY-NOT: {{__kmpc|__tgt}} diff --git a/clang/test/OpenMP/single_firstprivate_codegen.cpp b/clang/test/OpenMP/single_firstprivate_codegen.cpp index 7c22d874311af..9aebf64ac261e 100644 --- 
a/clang/test/OpenMP/single_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/single_firstprivate_codegen.cpp @@ -1,9 +1,9 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | 
FileCheck %s --check-prefix=CHECK4 // RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s diff --git a/clang/test/OpenMP/target_defaultmap_codegen_01.cpp b/clang/test/OpenMP/target_defaultmap_codegen_01.cpp index 8e08f21415ba6..5702196353b49 100644 --- a/clang/test/OpenMP/target_defaultmap_codegen_01.cpp +++ b/clang/test/OpenMP/target_defaultmap_codegen_01.cpp @@ -5,19 +5,19 @@ #ifdef CK1 ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch 
%t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple 
powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK1 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck 
-allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s // SIMD-ONLY10-NOT: {{__kmpc|__tgt}} // CK1-LABEL: @.__omp_offloading_{{.*}}implicit_maps_double_complex{{.*}}_l{{[0-9]+}}.region_id = weak{{.*}} constant i8 0 @@ -54,19 +54,19 @@ void implicit_maps_double_complex (int a){ // CK1: {{.+}} = getelementptr inbounds { double, double }, { double, double }* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -fopenmp -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK2 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ 
-triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK2 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | 
FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK2 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s // SIMD-ONLY10-NOT: {{__kmpc|__tgt}} #ifdef CK2 @@ -104,19 +104,19 @@ void implicit_maps_double_complex (int a){ // CK2: {{.+}} = getelementptr inbounds { double, double }, { double, double }* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown 
-emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK3 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap 
--check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK3 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis 
-fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK3 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY10 %s // SIMD-ONLY10-NOT: {{__kmpc|__tgt}} #ifdef CK3 @@ -154,19 +154,19 @@ void implicit_maps_double_complex (int a){ // CK3: {{.+}} = getelementptr inbounds { double, double }, { double, double }* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck 
-allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-32 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-32 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK4 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown 
-emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK4 --check-prefix CK4-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple 
powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK4 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s // SIMD-ONLY6-NOT: {{__kmpc|__tgt}} #ifdef CK4 @@ -226,19 +226,19 @@ void implicit_maps_double (int a){ #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK5 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK5 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: 
%clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK5 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK5 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK5 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK5 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK5 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 
-disable-noundef-analysis -DCK5 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK5 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -verify -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK5 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK5 @@ -277,19 +277,19 @@ void implicit_maps_array (int a){ // CK5: {{.+}} = getelementptr inbounds [2 x double], [2 x double]* [[REF]], i{{64|32}} 0, i{{64|32}} 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK6 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 -// RUN: %clang_cc1 -disable-noundef-analysis 
-DCK6 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK6 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK6 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK6 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK6 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK6 -verify -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK6 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | 
FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK6 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK6 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK6 @@ -328,19 +328,19 @@ void implicit_maps_array (int a){ // CK6: {{.+}} = getelementptr inbounds [2 x double], [2 x double]* [[REF]], i{{64|32}} 0, i{{64|32}} 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple 
powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck 
-allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK7 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis 
-fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK7 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK7 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK7 @@ -379,19 +379,19 @@ void implicit_maps_array (int a){ // CK7: {{.+}} = getelementptr inbounds [2 x double], [2 x double]* [[REF]], i{{64|32}} 0, i{{64|32}} 0 #endif 
///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 
-disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK8 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK8 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK8 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK8 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 +// RUN: %clang_cc1 
-no-enable-noundef-analysis -DCK8 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK8 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK8 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK8 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK8 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK8 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK8 @@ -431,19 +431,19 @@ void 
implicit_maps_array (int a){ #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-32 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-32 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK9 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -verify -fopenmp 
-fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK9 --check-prefix CK9-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK9 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY26 %s // SIMD-ONLY26-NOT: {{__kmpc|__tgt}} #ifdef CK9 @@ -481,19 +481,19 @@ void zero_size_section_and_private_maps (int ii){ #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple 
powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK10 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s 
-emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK10 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK10 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t 
%s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK10 @@ -529,19 +529,19 @@ void explicit_maps_single (){ // CK10: define {{.+}}[[CALL]] #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -verify 
-fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK11 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK11 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK11 
-fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK11 @@ -577,19 +577,19 @@ void explicit_maps_single (){ // CK11: define {{.+}}[[CALL09]] #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - 
| FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK12 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown 
-emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK12 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck 
-allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK12 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK12 @@ -625,19 +625,19 @@ void explicit_maps_single (){ // CK12: define {{.+}}[[CALL09]] #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK13 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 
-fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK13 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 -verify -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK13 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK13 @@ -673,19 +673,19 @@ void explicit_maps_single (){ // CK13: define {{.+}}[[CALL09]] #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK14 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK14 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK14 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown 
-emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK14 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK14 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck 
-allow-deprecated-dag-overlap %s --check-prefix CK14 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK14 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK14 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK14 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix 
SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK14 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY8 %s // SIMD-ONLY8-NOT: {{__kmpc|__tgt}} #ifdef CK14 @@ -721,19 +721,19 @@ void explicit_maps_single (){ // CK14: define {{.+}}[[CALL09]] #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 
-fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK15 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -verify -fopenmp -fopenmp-version=50 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK15 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown 
-std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK15 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY12 %s // SIMD-ONLY12-NOT: {{__kmpc|__tgt}} #ifdef CK15 @@ -803,19 +803,19 @@ void implicit_maps_variable_length_array (int a){ // CK15: {{.+}} = getelementptr inbounds double, double* [[REF]], i[[sz]] %{{.+}} #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -verify -fopenmp -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK16 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t 
-verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK16 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple 
powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK16 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s // SIMD-ONLY16-NOT: {{__kmpc|__tgt}} #ifdef CK16 @@ -860,19 +860,19 @@ void implicit_maps_struct (int a){ // CK16: {{.+}} = getelementptr inbounds [[ST]], [[ST]]* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s 
-emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK17 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: 
%clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK17 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK17 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK17 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK17 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK17 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK17 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: 
%clang_cc1 -no-enable-noundef-analysis -DCK17 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK17 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK17 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s // SIMD-ONLY16-NOT: {{__kmpc|__tgt}} #ifdef CK17 @@ -917,19 +917,19 @@ void implicit_maps_struct (int a){ // CK17: {{.+}} = getelementptr inbounds [[ST]], [[ST]]* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK18 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK18 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis 
-fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK18 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK18 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK18 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK18 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK18 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis 
-DCK18 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK18 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -verify -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK18 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s // SIMD-ONLY16-NOT: {{__kmpc|__tgt}} #ifdef CK18 @@ -974,19 +974,19 @@ void implicit_maps_struct (int a){ // CK18: {{.+}} = getelementptr inbounds [[ST]], [[ST]]* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -fopenmp 
-fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -verify -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK19 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm 
-o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK19 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK19 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s // SIMD-ONLY16-NOT: {{__kmpc|__tgt}} #ifdef CK19 @@ -1031,19 +1031,19 @@ void implicit_maps_struct (int a){ // CK19: {{.+}} = getelementptr inbounds [[ST]], [[ST]]* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple 
powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-32 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-32 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple 
powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK20 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -fopenmp 
-fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK20 --check-prefix CK20-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK20 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY6 %s // SIMD-ONLY6-NOT: {{__kmpc|__tgt}} #ifdef CK20 @@ -1103,19 +1103,19 @@ void 
implicit_maps_double (int a){ #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK21 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK21 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK21 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK21 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple 
powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK21 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK21 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK21 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck 
-allow-deprecated-dag-overlap %s --check-prefix CK21 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK21 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY16 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK21 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix 
SIMD-ONLY16 %s // SIMD-ONLY16-NOT: {{__kmpc|__tgt}} #ifdef CK21 @@ -1160,19 +1160,19 @@ void implicit_maps_struct (int a){ // CK21: {{.+}} = getelementptr inbounds [[ST]], [[ST]]* [[REF]], i32 0, i32 0 #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s -// 
RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK22 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK22 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK22 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 +// RUN: %clang_cc1 
-no-enable-noundef-analysis -DCK22 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK22 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK22 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK22 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK22 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK22 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK22 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY9 %s // SIMD-ONLY9-NOT: {{__kmpc|__tgt}} #ifdef CK22 @@ -1212,19 +1212,19 @@ void implicit_maps_pointer (){ #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-32 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-32 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | 
FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK23 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -verify -fopenmp -fopenmp-version=50 
-fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK23 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: 
{{__kmpc|__tgt}} #ifdef CK23 @@ -1397,19 +1397,19 @@ void bar(float *&a, int *&b) { #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK24 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK24 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK24 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-32 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK24 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-32 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK24 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 
-disable-noundef-analysis -DCK24 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK24 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK24 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK24 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK24 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-64 +// RUN: 
%clang_cc1 -no-enable-noundef-analysis -DCK24 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK24 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK24 --check-prefix CK24-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK24 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK24 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK24 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK24 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 
-no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s // SIMD-ONLY18-NOT: {{__kmpc|__tgt}} #ifdef CK24 @@ -1469,19 +1469,19 @@ void explicit_maps_single (int ii){ #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK25 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK25 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK25 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-32 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK25 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-32 - -// RUN: 
%clang_cc1 -disable-noundef-analysis -DCK25 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK25 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK25 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK25 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 
-no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK25 --check-prefix CK25-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | 
FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK25 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s // SIMD-ONLY18-NOT: {{__kmpc|__tgt}} #ifdef CK25 @@ -1504,19 +1504,19 @@ void declare_target_to() #endif ///==========================================================================/// -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK26 --check-prefix CK26-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK26 --check-prefix CK26-64 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK26 --check-prefix CK26-32 -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 
-disable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK26 --check-prefix CK26-32 - -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s -// RUN: %clang_cc1 -disable-noundef-analysis -DCK26 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap 
%s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK26 --check-prefix CK26-64 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK26 --check-prefix CK26-32 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix CK26 --check-prefix CK26-32 + +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - 
| FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -DCK26 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY18 %s // SIMD-ONLY18-NOT: {{__kmpc|__tgt}} #ifdef CK26 diff --git a/clang/test/OpenMP/task_in_reduction_codegen.cpp b/clang/test/OpenMP/task_in_reduction_codegen.cpp index 3cd91d2e6356b..262ca72d0103e 100644 --- a/clang/test/OpenMP/task_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/task_in_reduction_codegen.cpp @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// 
RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp index 8b5546cc69e1f..41552686514d3 100644 --- a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ 
-emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp index c82ae72c53b31..23c24fc97b47a 100644 
--- a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple 
x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4 // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp index 28e17cd733599..e5c43fb91ae55 100644 --- a/clang/test/OpenMP/teams_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp @@ -1,46 +1,46 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // Test host codegen. -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK1 -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK2 -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK3 -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 
-disable-noundef-analysis -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK4 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK2 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK4 -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple 
powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple 
i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK9 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK10 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK11 -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK12 +// RUN: %clang_cc1 -no-enable-noundef-analysis 
-verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK9 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK10 +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK11 +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK12 -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s 
--implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -fopenmp-simd -x c++ -std=c++11 -triple 
i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK17 -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK18 -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK19 -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK20 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK17 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping 
+// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK18 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK19 +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --check-prefix=CHECK20 -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY 
-fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping -// RUN: %clang_cc1 -disable-noundef-analysis -DARRAY -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s -Wno-openmp-mapping +// RUN: %clang_cc1 -no-enable-noundef-analysis -DARRAY -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - -Wno-openmp-mapping | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // expected-no-diagnostics #ifndef HEADER #define HEADER From f5efe2807056d3fa525e51d35ea94c91e0945eb2 Mon Sep 17 
00:00:00 2001 From: Tres Popp Date: Thu, 17 Feb 2022 10:20:50 +0100 Subject: [PATCH 224/748] [mlir] Propagate NaNs in PolynomialApproximation Previously, NaNs would be dropped in favor of bounded values which was strictly incorrect. Now the min/max operation propagate this information. Not all uses of min/max need this, but the given change will help protect future additions, and this prevents the need for an additional cmpf and select operation to handle NaNs. Differential Revision: https://reviews.llvm.org/D120020 --- .../Math/Transforms/PolynomialApproximation.cpp | 13 +++++++++---- .../test/Dialect/Math/polynomial-approximation.mlir | 6 +++--- .../mlir-cpu-runner/math-polynomial-approx.mlir | 5 +++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp index 5d3f629210d42..dbd611c07829c 100644 --- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp +++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp @@ -182,16 +182,21 @@ static Value f32FromBits(ImplicitLocOpBuilder &builder, uint32_t bits) { // Helper functions to build math functions approximations. 
//----------------------------------------------------------------------------// -static Value min(ImplicitLocOpBuilder &builder, Value a, Value b) { +// Return the minimum of the two values or NaN if value is NaN +static Value min(ImplicitLocOpBuilder &builder, Value value, Value bound) { return builder.create( - builder.create(arith::CmpFPredicate::OLT, a, b), a, b); + builder.create(arith::CmpFPredicate::ULT, value, bound), + value, bound); } -static Value max(ImplicitLocOpBuilder &builder, Value a, Value b) { +// Return the maximum of the two values or NaN if value is NaN +static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound) { return builder.create( - builder.create(arith::CmpFPredicate::OGT, a, b), a, b); + builder.create(arith::CmpFPredicate::UGT, value, bound), + value, bound); } +// Return the clamped value or NaN if value is NaN static Value clamp(ImplicitLocOpBuilder &builder, Value value, Value lowerBound, Value upperBound) { return max(builder, min(builder, value, upperBound), lowerBound); diff --git a/mlir/test/Dialect/Math/polynomial-approximation.mlir b/mlir/test/Dialect/Math/polynomial-approximation.mlir index 457e585e25f9a..cff92f3042c06 100644 --- a/mlir/test/Dialect/Math/polynomial-approximation.mlir +++ b/mlir/test/Dialect/Math/polynomial-approximation.mlir @@ -225,7 +225,7 @@ func @expm1_vector(%arg0: vector<8x8xf32>) -> vector<8x8xf32> { // CHECK: %[[VAL_20:.*]] = arith.constant 1056964608 : i32 // CHECK: %[[VAL_21:.*]] = arith.constant 23 : i32 // CHECK: %[[VAL_22:.*]] = arith.constant 0.693147182 : f32 -// CHECK: %[[VAL_23:.*]] = arith.cmpf ogt, %[[X]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_23:.*]] = arith.cmpf ugt, %[[X]], %[[VAL_4]] : f32 // CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_23]], %[[X]], %[[VAL_4]] : f32 // CHECK-NOT: frexp // CHECK: %[[VAL_25:.*]] = arith.bitcast %[[VAL_24]] : f32 to i32 @@ -355,9 +355,9 @@ func @log1p_vector(%arg0: vector<8xf32>) -> vector<8xf32> { // CHECK: %[[VAL_12:.*]] = arith.constant 
0.00226843474 : f32 // CHECK: %[[VAL_13:.*]] = arith.constant 1.18534706E-4 : f32 // CHECK: %[[VAL_14:.*]] = arith.constant 1.19825836E-6 : f32 -// CHECK: %[[VAL_15:.*]] = arith.cmpf olt, %[[VAL_0]], %[[VAL_2]] : f32 +// CHECK: %[[VAL_15:.*]] = arith.cmpf ult, %[[VAL_0]], %[[VAL_2]] : f32 // CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_15]], %[[VAL_0]], %[[VAL_2]] : f32 -// CHECK: %[[VAL_17:.*]] = arith.cmpf ogt, %[[VAL_16]], %[[VAL_1]] : f32 +// CHECK: %[[VAL_17:.*]] = arith.cmpf ugt, %[[VAL_16]], %[[VAL_1]] : f32 // CHECK: %[[VAL_18:.*]] = arith.select %[[VAL_17]], %[[VAL_16]], %[[VAL_1]] : f32 // CHECK: %[[VAL_19:.*]] = math.abs %[[VAL_0]] : f32 // CHECK: %[[VAL_20:.*]] = arith.cmpf olt, %[[VAL_19]], %[[VAL_3]] : f32 diff --git a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir index 413c04cc8867a..ccbd15b8972c6 100644 --- a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir +++ b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir @@ -29,6 +29,11 @@ func @tanh() { %5 = math.tanh %4 : vector<8xf32> vector.print %5 : vector<8xf32> + // CHECK-NEXT: nan + %nan = arith.constant 0x7fc00000 : f32 + %6 = math.tanh %nan : f32 + vector.print %6 : f32 + return } From 5333447a00ff568e5029483e1b46041ad1c9788f Mon Sep 17 00:00:00 2001 From: hyeongyukim Date: Fri, 18 Feb 2022 17:38:50 +0900 Subject: [PATCH 225/748] [NFC] Fix a buildbot failure after b529744 --- clang/test/CodeGen/noundef-analysis.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/noundef-analysis.cpp b/clang/test/CodeGen/noundef-analysis.cpp index 778810758de44..8a439cde38999 100644 --- a/clang/test/CodeGen/noundef-analysis.cpp +++ b/clang/test/CodeGen/noundef-analysis.cpp @@ -19,12 +19,12 @@ static int sink; static void examineValue(int x) { sink = x; } // ENABLED-LABEL: @main( -// ENABLED: [[CALL:%.*]] = call noundef i32 @_Z19indirect_callee_inti(i32 noundef 0) +// ENABLED: [[CALL:%.*]] = call noundef 
{{.*}}i32 @_Z19indirect_callee_inti(i32 noundef {{.*}}0) // ENABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i32 {{.*}}) // ENABLED: [[CALL2:%.*]] = call noalias noundef nonnull i8* @_Znwm(i64 noundef 4) #[[ATTR4:[0-9]+]] // ENABLED: call void @_ZL12examineValuei(i32 noundef {{.*}}) // DISABLED-LABEL: @main( -// DISABLED: [[CALL:%.*]] = call i32 @_Z19indirect_callee_inti(i32 0) +// DISABLED: [[CALL:%.*]] = call {{.*}}i32 @_Z19indirect_callee_inti(i32 {{.*}}0) // DISABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i32 {{.*}}) // DISABLED: [[CALL2:%.*]] = call noalias nonnull i8* @_Znwm(i64 4) #[[ATTR4:[0-9]+]] // DISABLED: call void @_ZL12examineValuei(i32 {{.*}}) From c85a26454d4b3dab383555c3864568b7aff9c225 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 17 Feb 2022 15:48:35 +0100 Subject: [PATCH 226/748] [asan] Add support for disable_sanitizer_instrumentation attribute For ASan this will effectively serve as a synonym for __attribute__((no_sanitize("address"))). Adding the disable_sanitizer_instrumentation to functions will drop the sanitize_XXX attributes on the IR level. This is the third reland of https://reviews.llvm.org/D114421. Now that TSan test is fixed (https://reviews.llvm.org/D120050) there should be no deadlocks. 
Differential Revision: https://reviews.llvm.org/D120055 --- clang/docs/AddressSanitizer.rst | 6 +++ clang/lib/CodeGen/CodeGenFunction.cpp | 31 ++++++------ clang/lib/CodeGen/SanitizerMetadata.cpp | 2 + .../CodeGen/address-safety-attr-flavors.cpp | 9 ++++ clang/test/CodeGen/asan-globals.cpp | 23 +++++---- .../Instrumentation/AddressSanitizer.cpp | 3 ++ .../asan-disable-sanitizer-instrumentation.ll | 47 +++++++++++++++++++ 7 files changed, 96 insertions(+), 25 deletions(-) create mode 100644 llvm/test/Instrumentation/AddressSanitizer/asan-disable-sanitizer-instrumentation.ll diff --git a/clang/docs/AddressSanitizer.rst b/clang/docs/AddressSanitizer.rst index 06b53e2e5da0b..fe5f683580a46 100644 --- a/clang/docs/AddressSanitizer.rst +++ b/clang/docs/AddressSanitizer.rst @@ -229,6 +229,12 @@ compilers, so we suggest to use it together with The same attribute used on a global variable prevents AddressSanitizer from adding redzones around it and detecting out of bounds accesses. + +AddressSanitizer also supports +``__attribute__((disable_sanitizer_instrumentation))``. This attribute +works similar to ``__attribute__((no_sanitize("address")))``, but it also +prevents instrumentation performed by other sanitizers. + Suppressing Errors in Recompiled Code (Ignorelist) -------------------------------------------------- diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 95091edd9ecb7..c4ccc8e1b0424 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -383,9 +383,6 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { "__cyg_profile_func_exit"); } - if (ShouldSkipSanitizerInstrumentation()) - CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation); - // Emit debug descriptor for function end. 
if (CGDebugInfo *DI = getDebugInfo()) DI->EmitFunctionEnd(Builder, CurFn); @@ -767,18 +764,22 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, Fn->addFnAttr(llvm::Attribute::NoSanitizeCoverage); } - // Apply sanitizer attributes to the function. - if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress)) - Fn->addFnAttr(llvm::Attribute::SanitizeAddress); - if (SanOpts.hasOneOf(SanitizerKind::HWAddress | - SanitizerKind::KernelHWAddress)) - Fn->addFnAttr(llvm::Attribute::SanitizeHWAddress); - if (SanOpts.has(SanitizerKind::MemTag)) - Fn->addFnAttr(llvm::Attribute::SanitizeMemTag); - if (SanOpts.has(SanitizerKind::Thread)) - Fn->addFnAttr(llvm::Attribute::SanitizeThread); - if (SanOpts.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory)) - Fn->addFnAttr(llvm::Attribute::SanitizeMemory); + if (ShouldSkipSanitizerInstrumentation()) { + CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation); + } else { + // Apply sanitizer attributes to the function. 
+ if (SanOpts.hasOneOf(SanitizerKind::Address | SanitizerKind::KernelAddress)) + Fn->addFnAttr(llvm::Attribute::SanitizeAddress); + if (SanOpts.hasOneOf(SanitizerKind::HWAddress | + SanitizerKind::KernelHWAddress)) + Fn->addFnAttr(llvm::Attribute::SanitizeHWAddress); + if (SanOpts.has(SanitizerKind::MemTag)) + Fn->addFnAttr(llvm::Attribute::SanitizeMemTag); + if (SanOpts.has(SanitizerKind::Thread)) + Fn->addFnAttr(llvm::Attribute::SanitizeThread); + if (SanOpts.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory)) + Fn->addFnAttr(llvm::Attribute::SanitizeMemory); + } if (SanOpts.has(SanitizerKind::SafeStack)) Fn->addFnAttr(llvm::Attribute::SafeStack); if (SanOpts.has(SanitizerKind::ShadowCallStack)) diff --git a/clang/lib/CodeGen/SanitizerMetadata.cpp b/clang/lib/CodeGen/SanitizerMetadata.cpp index 009965a36c396..9e26d242d3a7e 100644 --- a/clang/lib/CodeGen/SanitizerMetadata.cpp +++ b/clang/lib/CodeGen/SanitizerMetadata.cpp @@ -73,6 +73,8 @@ void SanitizerMetadata::reportGlobalToASan(llvm::GlobalVariable *GV, for (auto Attr : D.specific_attrs()) if (Attr->getMask() & SanitizerKind::Address) IsExcluded = true; + if (D.hasAttr()) + IsExcluded = true; reportGlobalToASan(GV, D.getLocation(), OS.str(), D.getType(), IsDynInit, IsExcluded); } diff --git a/clang/test/CodeGen/address-safety-attr-flavors.cpp b/clang/test/CodeGen/address-safety-attr-flavors.cpp index e6d17ed2da340..ef815555059db 100644 --- a/clang/test/CodeGen/address-safety-attr-flavors.cpp +++ b/clang/test/CodeGen/address-safety-attr-flavors.cpp @@ -73,3 +73,12 @@ __attribute__((no_sanitize("kernel-hwaddress"))) int NoSanitizeKernelHWAddress() // CHECK-KASAN: {{Function Attrs: mustprogress noinline nounwind sanitize_address$}} // CHECK-HWASAN: {{Function Attrs: mustprogress noinline nounwind$}} // CHECK-KHWASAN: {{Function Attrs: mustprogress noinline nounwind$}} + +__attribute__((disable_sanitizer_instrumentation)) int DisableSanitizerInstrumentation() { + return 0; +} +// CHECK-NOASAN: {{Function 
Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} +// CHECK-ASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} +// CHECK-KASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} +// CHECK-HWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} +// CHECK-KHWASAN: {{Function Attrs: disable_sanitizer_instrumentation mustprogress noinline nounwind$}} diff --git a/clang/test/CodeGen/asan-globals.cpp b/clang/test/CodeGen/asan-globals.cpp index 2cea167d0ea59..0b17038a5f15a 100644 --- a/clang/test/CodeGen/asan-globals.cpp +++ b/clang/test/CodeGen/asan-globals.cpp @@ -10,6 +10,7 @@ int global; int dyn_init_global = global; int __attribute__((no_sanitize("address"))) attributed_global; +int __attribute__((disable_sanitizer_instrumentation)) disable_instrumentation_global; int ignorelisted_global; int __attribute__((section("__DATA, __common"))) sectioned_global; // KASAN - ignore globals in a section @@ -50,31 +51,33 @@ void func() { // UWTABLE: attributes #[[#ATTR]] = { nounwind uwtable } // UWTABLE: ![[#]] = !{i32 7, !"uwtable", i32 2} -// CHECK: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[ATTR_GLOBAL:[0-9]+]], ![[IGNORELISTED_GLOBAL:[0-9]+]], ![[SECTIONED_GLOBAL:[0-9]+]], ![[SPECIAL_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]} +// CHECK: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[ATTR_GLOBAL:[0-9]+]], ![[DISABLE_INSTR_GLOBAL:[0-9]+]], ![[IGNORELISTED_GLOBAL:[0-9]+]], ![[SECTIONED_GLOBAL:[0-9]+]], ![[SPECIAL_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]} // CHECK: ![[EXTRA_GLOBAL]] = !{{{.*}} ![[EXTRA_GLOBAL_LOC:[0-9]+]], !"extra_global", i1 false, i1 false} // CHECK: ![[EXTRA_GLOBAL_LOC]] = !{!"{{.*}}extra-source.cpp", i32 1, i32 5} // CHECK: ![[GLOBAL]] = !{{{.*}} ![[GLOBAL_LOC:[0-9]+]], 
!"global", i1 false, i1 false} // CHECK: ![[GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 10, i32 5} // CHECK: ![[DYN_INIT_GLOBAL]] = !{{{.*}} ![[DYN_INIT_LOC:[0-9]+]], !"dyn_init_global", i1 true, i1 false} // CHECK: ![[DYN_INIT_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 11, i32 5} -// CHECK: ![[ATTR_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true} -// CHECK: ![[IGNORELISTED_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true} +// CHECK: ![[ATTR_GLOBAL]] = !{{{.*attributed_global.*}}, null, null, i1 false, i1 true} +// CHECK: ![[DISABLE_INSTR_GLOBAL]] = !{{{.*disable_instrumentation_global.*}}, null, null, i1 false, i1 true} +// CHECK: ![[IGNORELISTED_GLOBAL]] = !{{{.*ignorelisted_global.*}}, null, null, i1 false, i1 true} // CHECK: ![[SECTIONED_GLOBAL]] = !{{{.*}} ![[SECTIONED_GLOBAL_LOC:[0-9]+]], !"sectioned_global", i1 false, i1 false} -// CHECK: ![[SECTIONED_GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 15, i32 50} +// CHECK: ![[SECTIONED_GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 16, i32 50} // CHECK: ![[SPECIAL_GLOBAL]] = !{{{.*}} ![[SPECIAL_GLOBAL_LOC:[0-9]+]], !"__special_global", i1 false, i1 false} -// CHECK: ![[SPECIAL_GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 17, i32 5} +// CHECK: ![[SPECIAL_GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 18, i32 5} // CHECK: ![[STATIC_VAR]] = !{{{.*}} ![[STATIC_LOC:[0-9]+]], !"static_var", i1 false, i1 false} -// CHECK: ![[STATIC_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 21, i32 14} +// CHECK: ![[STATIC_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 22, i32 14} // CHECK: ![[LITERAL]] = !{{{.*}} ![[LITERAL_LOC:[0-9]+]], !"", i1 false, i1 false} -// CHECK: ![[LITERAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 22, i32 25} +// CHECK: ![[LITERAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 23, i32 25} -// IGNORELIST-SRC: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[ATTR_GLOBAL:[0-9]+]], ![[IGNORELISTED_GLOBAL:[0-9]+]], ![[SECTIONED_GLOBAL:[0-9]+]], 
![[SPECIAL_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]} +// IGNORELIST-SRC: !llvm.asan.globals = !{![[EXTRA_GLOBAL:[0-9]+]], ![[GLOBAL:[0-9]+]], ![[DYN_INIT_GLOBAL:[0-9]+]], ![[ATTR_GLOBAL:[0-9]+]], ![[DISABLE_INSTR_GLOBAL:[0-9]+]], ![[IGNORELISTED_GLOBAL:[0-9]+]], ![[SECTIONED_GLOBAL:[0-9]+]], ![[SPECIAL_GLOBAL:[0-9]+]], ![[STATIC_VAR:[0-9]+]], ![[LITERAL:[0-9]+]]} // IGNORELIST-SRC: ![[EXTRA_GLOBAL]] = !{{{.*}} ![[EXTRA_GLOBAL_LOC:[0-9]+]], !"extra_global", i1 false, i1 false} // IGNORELIST-SRC: ![[EXTRA_GLOBAL_LOC]] = !{!"{{.*}}extra-source.cpp", i32 1, i32 5} // IGNORELIST-SRC: ![[GLOBAL]] = !{{{.*}} null, null, i1 false, i1 true} // IGNORELIST-SRC: ![[DYN_INIT_GLOBAL]] = !{{{.*}} null, null, i1 true, i1 true} -// IGNORELIST-SRC: ![[ATTR_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true} -// IGNORELIST-SRC: ![[IGNORELISTED_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true} +// IGNORELIST-SRC: ![[ATTR_GLOBAL]] = !{{{.*attributed_global.*}}, null, null, i1 false, i1 true} +// IGNORELIST-SRC: ![[DISABLE_INSTR_GLOBAL]] = !{{{.*disable_instrumentation_global.*}}, null, null, i1 false, i1 true} +// IGNORELIST-SRC: ![[IGNORELISTED_GLOBAL]] = !{{{.*ignorelisted_global.*}}, null, null, i1 false, i1 true} // IGNORELIST-SRC: ![[SECTIONED_GLOBAL]] = !{{{.*}} null, null, i1 false, i1 true} // IGNORELIST-SRC: ![[SPECIAL_GLOBAL]] = !{{{.*}} null, null, i1 false, i1 true} // IGNORELIST-SRC: ![[STATIC_VAR]] = !{{{.*}} null, null, i1 false, i1 true} diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8f94172a6402d..a8d67c755799d 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -2888,6 +2888,9 @@ bool AddressSanitizer::instrumentFunction(Function &F, // Leave if the function doesn't need instrumentation. 
if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified; + if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) + return FunctionModified; + LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-disable-sanitizer-instrumentation.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-disable-sanitizer-instrumentation.ll new file mode 100644 index 0000000000000..ffd94889f97a6 --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-disable-sanitizer-instrumentation.ll @@ -0,0 +1,47 @@ +; This test checks that we are not instrumenting sanitizer code. +; RUN: opt < %s -passes='asan-pipeline' -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function with sanitize_address is instrumented. +; Function Attrs: nounwind uwtable +define void @instr_sa(i32* %a) sanitize_address { +entry: + %tmp1 = load i32, i32* %a, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %a, align 4 + ret void +} + +; CHECK-LABEL: @instr_sa +; CHECK: call void @__asan_report_load + + +; Function with disable_sanitizer_instrumentation is not instrumented. +; Function Attrs: nounwind uwtable +define void @noinstr_dsi(i32* %a) disable_sanitizer_instrumentation { +entry: + %tmp1 = load i32, i32* %a, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %a, align 4 + ret void +} + +; CHECK-LABEL: @noinstr_dsi +; CHECK-NOT: call void @__asan_report_load + + +; disable_sanitizer_instrumentation takes precedence over sanitize_address. 
+; Function Attrs: nounwind uwtable +define void @noinstr_dsi_sa(i32* %a) disable_sanitizer_instrumentation sanitize_address { +entry: + %tmp1 = load i32, i32* %a, align 4 + %tmp2 = add i32 %tmp1, 1 + store i32 %tmp2, i32* %a, align 4 + ret void +} + +; CHECK-LABEL: @noinstr_dsi_sa +; CHECK-NOT: call void @__asan_report_load + From 35baa26747b0033afac15d7989bc2100b251412c Mon Sep 17 00:00:00 2001 From: hyeongyukim Date: Fri, 18 Feb 2022 17:52:56 +0900 Subject: [PATCH 227/748] [NFC][Clang/test] add target triple to CodeGen/analyze_noundef.cpp --- clang/test/CodeGen/noundef-analysis.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGen/noundef-analysis.cpp b/clang/test/CodeGen/noundef-analysis.cpp index 8a439cde38999..1ba60cf0154be 100644 --- a/clang/test/CodeGen/noundef-analysis.cpp +++ b/clang/test/CodeGen/noundef-analysis.cpp @@ -1,5 +1,5 @@ -// RUN: %clang -cc1 -enable-noundef-analysis -emit-llvm -o - %s | FileCheck %s -check-prefix ENABLED -// RUN: %clang -cc1 -no-enable-noundef-analysis -emit-llvm -o - %s | FileCheck %s -check-prefix DISABLED +// RUN: %clang_cc1 -triple arm64-darwin -enable-noundef-analysis -emit-llvm -o - %s | FileCheck %s -check-prefix ENABLED +// RUN: %clang_cc1 -triple arm64-darwin -no-enable-noundef-analysis -emit-llvm -o - %s | FileCheck %s -check-prefix DISABLED union u1 { int val; @@ -20,12 +20,12 @@ static void examineValue(int x) { sink = x; } // ENABLED-LABEL: @main( // ENABLED: [[CALL:%.*]] = call noundef {{.*}}i32 @_Z19indirect_callee_inti(i32 noundef {{.*}}0) -// ENABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i32 {{.*}}) +// ENABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i64 {{.*}}) // ENABLED: [[CALL2:%.*]] = call noalias noundef nonnull i8* @_Znwm(i64 noundef 4) #[[ATTR4:[0-9]+]] // ENABLED: call void @_ZL12examineValuei(i32 noundef {{.*}}) // DISABLED-LABEL: @main( // DISABLED: [[CALL:%.*]] = call {{.*}}i32 @_Z19indirect_callee_inti(i32 {{.*}}0) -// 
DISABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i32 {{.*}}) +// DISABLED: [[CALL1:%.*]] = call i32 @_Z21indirect_callee_union2u1(i64 {{.*}}) // DISABLED: [[CALL2:%.*]] = call noalias nonnull i8* @_Znwm(i64 4) #[[ATTR4:[0-9]+]] // DISABLED: call void @_ZL12examineValuei(i32 {{.*}}) int main() { From a43f7d6d76984ddae4a5e5e0bebf82ee2edebabb Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Mon, 14 Feb 2022 17:46:35 +0100 Subject: [PATCH 228/748] [mlir][tensor] Extend reshape utils. This change changes the handling of trailing dimensions with unknown extent. Users of the getReassociationIndicesForReshape helper should see benefits when transforming reshape like operations into expand/collapse pairs if the higher-rank type has trailing unknown dimensions. The motivating example is a reshape from tensor<16x1x?xi32> to tensor<16xi32> that can be modeled as collapsing the three dimensions. Differential Revision: https://reviews.llvm.org/D119730 --- mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp | 37 ++++++++++------------ mlir/test/Dialect/Tensor/canonicalize.mlir | 9 +++--- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index fd509621015d2..17f449e489e38 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -35,13 +35,12 @@ mlir::getReassociationIndicesForReshape(ShapedType sourceType, int64_t prodOfCollapsedDims = 1; while (sourceDim < sourceShape.size()) { unsigned targetDim = reassociationMap.size(); + // If we have mapped all the target dimensions stop and handle the remaining + // tail of size-1 dimensions explictly. + if (targetDim == targetType.getRank()) + break; - // If all the dimensions of the targetShape are exhausted, then the - // remaining dims in the source shape must be all 1s. So for such cases, set - // 1 as the target shape.
The actual reassociation indices will be handled - // later. - int64_t currTargetShape = - (targetDim < targetType.getRank() ? targetShape[targetDim] : 1); + int64_t currTargetShape = targetShape[targetDim]; while (sourceShape[sourceDim] != ShapedType::kDynamicSize && prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape && sourceDim < sourceShape.size()) { @@ -69,25 +68,23 @@ mlir::getReassociationIndicesForReshape(ShapedType sourceType, return llvm::None; currIndices.push_back(sourceDim++); - // If the reassociation is empty but the currIndices is not, this by - // definition is folding unit-dimensions with the result being scalar type. - // So only append the `currIndices` if reassociation map is not empty. - if (targetDim == targetShape.size()) { - while (sourceDim < sourceShape.size()) - currIndices.push_back(sourceDim++); - if (!reassociationMap.empty() && !currIndices.empty()) - reassociationMap.back().append(currIndices.begin(), currIndices.end()); - // Break out of the loops. We should be done here. - break; - } reassociationMap.emplace_back(ReassociationIndices{}); std::swap(reassociationMap.back(), currIndices); prodOfCollapsedDims = 1; } - // All the dimensions in the two shapes must have been processed. - if (reassociationMap.size() != targetShape.size() || - sourceDim != sourceShape.size()) + // All the dimensions in the target must have been processed. + if (reassociationMap.size() != targetShape.size()) return llvm::None; + // Process any remaining entries in the source shape. They all need to be + // 1 or dynamic. + for (; sourceDim < sourceShape.size(); sourceDim++) { + if (sourceShape[sourceDim] != ShapedType::kDynamicSize && + sourceShape[sourceDim] != 1) + return llvm::None; + // The map is empty when the target type is a scalar. 
+ if (!reassociationMap.empty()) + reassociationMap.back().push_back(sourceDim); + } return reassociationMap; } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 4e4bbb8a12672..ce3db8d6039c2 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -879,16 +879,17 @@ func @fold_reshape_trailing_unit_dims(%arg0: tensor<12x42x1x1xf32>) // ----- -func @no_fold_reshapes(%arg0 : tensor) -> tensor { +func @fold_reshapes_unit_dims_in_middle(%arg0 : tensor) -> tensor { %0 = tensor.expand_shape %arg0 [[0], [1], [2, 3]] : tensor into tensor %1 = tensor.collapse_shape %0 [[0], [1, 2, 3]] : tensor into tensor return %1 : tensor } -// CHECK-LABEL: func @no_fold_reshapes -// CHECK: tensor.expand_shape -// CHECK: tensor.collapse_shape +// CHECK-LABEL: func @fold_reshapes_unit_dims_in_middle +// CHECK-SAME: (%[[ARG:.*]]: tensor +// CHECK: tensor.collapse_shape %[[ARG]] {{\[}}[0], [1, 2]] +// CHECK-SAME: tensor into tensor // ----- From 0bf3fec4cd95d43677c3a4d5b84a505818896817 Mon Sep 17 00:00:00 2001 From: esmeyi Date: Fri, 18 Feb 2022 04:12:32 -0500 Subject: [PATCH 229/748] Revert "[XCOFF][llvm-objdump] change the priority of symbols with" This reverts commit 2ad662172cbbd1ca53489bf8bddb0183d7692708. 
Buildbot failure #19373 --- .../llvm/MC/MCDisassembler/MCDisassembler.h | 11 ++++---- llvm/include/llvm/Object/ObjectFile.h | 2 +- .../aix-prefixed-instruction-boundary.mir | 2 +- llvm/test/CodeGen/PowerPC/aix-return55.ll | 2 +- .../PowerPC/aix-user-defined-memcpy.ll | 2 +- .../PowerPC/aix-xcoff-mergeable-const.ll | 2 +- .../CodeGen/PowerPC/aix-xcoff-reloc-symb.mir | 2 +- llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll | 2 +- .../PowerPC/aix-xcoff-textdisassembly.ll | 2 +- .../llvm-objdump/XCOFF/disassemble-all.test | 2 +- .../XCOFF/disassemble-symbol-description.test | 2 +- .../XCOFF/disassemble-symbol-priority.ll | 28 ------------------- .../XCOFF/disassemble-symbolize-operands.ll | 7 +++-- .../llvm-objdump/XCOFF/print-linenumber.test | 2 +- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 -- 15 files changed, 20 insertions(+), 51 deletions(-) delete mode 100644 llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h index db608d78fd6a0..10037cd66ef12 100644 --- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h @@ -46,9 +46,8 @@ struct SymbolInfoTy { Optional Smc, Optional Idx, bool Label) : Addr(Addr), Name(Name), XCOFFSymInfo(Smc, Idx, Label), IsXCOFF(true) {} - SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type, - bool IsXCOFF = false) - : Addr(Addr), Name(Name), Type(Type), IsXCOFF(IsXCOFF) {} + SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type) + : Addr(Addr), Name(Name), Type(Type), IsXCOFF(false) {} bool isXCOFF() const { return IsXCOFF; } private: @@ -56,11 +55,11 @@ struct SymbolInfoTy { assert(P1.IsXCOFF == P2.IsXCOFF && "P1.IsXCOFF should be equal to P2.IsXCOFF."); if (P1.IsXCOFF) - return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Type, P1.Name) < - std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Type, P2.Name); + return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Name) < + 
std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name); return std::tie(P1.Addr, P1.Name, P1.Type) < - std::tie(P2.Addr, P2.Name, P2.Type); + std::tie(P2.Addr, P2.Name, P2.Type); } }; diff --git a/llvm/include/llvm/Object/ObjectFile.h b/llvm/include/llvm/Object/ObjectFile.h index 1faa070052d5e..bb6f1321a68e8 100644 --- a/llvm/include/llvm/Object/ObjectFile.h +++ b/llvm/include/llvm/Object/ObjectFile.h @@ -170,11 +170,11 @@ class SymbolRef : public BasicSymbolRef { public: enum Type { ST_Unknown, // Type not specified - ST_Other, ST_Data, ST_Debug, ST_File, ST_Function, + ST_Other }; SymbolRef() = default; diff --git a/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir b/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir index 2947ae2c39989..9ea49bf40c897 100644 --- a/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir +++ b/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir @@ -43,7 +43,7 @@ body: | ... # DIS: Disassembly of section .text: -# DIS: 00000000 <.aix-prefixed-instruction-boundary>: +# DIS: 00000000 <.text>: # DIS-NEXT: 0: 38 60 00 02 li 3, 2 # DIS-NEXT: 4: 06 00 00 00 38 63 00 0d paddi 3, 3, 13, 0 # DIS-NEXT: c: 06 00 00 00 38 63 00 0d paddi 3, 3, 13, 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-return55.ll b/llvm/test/CodeGen/PowerPC/aix-return55.ll index 19e8322f8f8a2..c16b75bb68d8d 100644 --- a/llvm/test/CodeGen/PowerPC/aix-return55.ll +++ b/llvm/test/CodeGen/PowerPC/aix-return55.ll @@ -21,7 +21,7 @@ entry: ; CHECK: blr } -;CHECKOBJ: 00000000 <.foo>: +;CHECKOBJ: 00000000 <.text>: ;CHECKOBJ-NEXT: 0: 38 60 00 37 li 3, 55 ;CHECKOBJ-NEXT: 4: 4e 80 00 20 blr{{[[:space:]] *}} ;CHECKOBJ-NEXT: 00000008 <.rodata.str1.1>: diff --git a/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll b/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll index 097eb302e4161..b69b3760c9f4e 100644 --- a/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll +++ b/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll @@ -102,7 +102,7 @@ 
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture r ; 32-REL-NOT: Type: R_RBR (0x1A) ; 32-DIS: Disassembly of section .text: -; 32-DIS: 00000000 <.memcpy>: +; 32-DIS: 00000000 <.text>: ; 32-DIS-NEXT: 0: 38 60 00 03 li 3, 3 ; 32-DIS-NEXT: 4: 4e 80 00 20 blr ; 32-DIS-NEXT: 8: 60 00 00 00 nop diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll index c7b1d2a0771c1..255472d65c341 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll @@ -62,7 +62,7 @@ entry: ;CHECK-NEXT: .space 1 -;CHECKOBJ: 00000000 <.main>: +;CHECKOBJ: 00000000 <.text>: ;CHECKOBJ-NEXT: 0: 38 60 00 00 li 3, 0 ;CHECKOBJ-NEXT: 4: 4e 80 00 20 blr ;CHECKOBJ-NEXT: ...{{[[:space:]] *}} diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir index c64552f9852c0..f650168d5877d 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir @@ -75,7 +75,7 @@ body: | # DIS: Disassembly of section .text: # DIS-EMPTY: -# DIS-NEXT: 00000000 <.foo>: +# DIS-NEXT: 00000000 <.text>: # DIS-NEXT: 0: 80 62 00 00 lwz 3, 0(2) # DIS-NEXT: 4: 4e 80 00 20 blr # DIS-EMPTY: diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll index 1bbc12c5a3af5..6ce251bb49fd8 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll @@ -422,7 +422,7 @@ declare i32 @bar(i32) ; DIS: {{.*}}aix-xcoff-reloc.ll.tmp.o: file format aixcoff-rs6000 ; DIS: Disassembly of section .text: -; DIS: 00000000 <.foo>: +; DIS: 00000000 <.text>: ; DIS-NEXT: 0: 7c 08 02 a6 mflr 0 ; DIS-NEXT: 4: 90 01 00 08 stw 0, 8(1) ; DIS-NEXT: 8: 94 21 ff c0 stwu 1, -64(1) diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll 
b/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll index 8b73e748e1a89..c8df85da0c855 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll @@ -13,7 +13,7 @@ entry: } ; CHECK: Disassembly of section .text:{{[[:space:]] *}} -; CHECK-NEXT: 00000000 <.foo>: +; CHECK-NEXT: 00000000 <.text>: ; CHECK-NEXT: 0: 38 60 00 00 li 3, 0 ; CHECK-NEXT: 4: 4e 80 00 20 blr ; CHECK-NEXT: 8: 60 00 00 00 nop diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test index 4c96662fc854f..d94d5734a1cbd 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test @@ -18,7 +18,7 @@ CHECK: Inputs/xcoff-section-headers.o: file format aixcoff-rs6000 CHECK: Disassembly of section .text: -CHECK: 00000000 <.func>: +CHECK: 00000000 <.text>: CHECK-NEXT: 0: 80 62 00 04 lwz 3, 4(2) WITH-R-NEXT: 00000002: R_TOC a CHECK-NEXT: 4: 80 63 00 00 lwz 3, 0(3) diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test index f33421cc6c149..16f7137cf3796 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test @@ -22,7 +22,7 @@ COMMON: Inputs/xcoff-section-headers.o: file format aixcoff-rs6000 COMMON: Disassembly of section .text: -PLAIN: 00000000 <.func>: +PLAIN: 00000000 <.text>: DESC: 00000000 (idx: 16) .func: COMMON-NEXT: 0: 80 62 00 04 lwz 3, 4(2) RELOC: 00000002: R_TOC (idx: 26) a[TC] diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll deleted file mode 100644 index 6db8451ea6a13..0000000000000 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll +++ /dev/null @@ -1,28 +0,0 @@ -; 
RUN: llc -mtriple=powerpc-ibm-aix-xcoff %s -filetype=obj -o %t -; RUN: llvm-objdump %t -d --no-show-raw-insn | FileCheck %s - -; CHECK: Disassembly of section .text: -; CHECK: 00000000 <.foo3>: -; CHECK: 00000020 <.foo4>: -; CHECK: 00000040 <.foo>: -; CHECK: 00000060 <.foo2>: - -define dso_local signext i32 @foo(i32 noundef signext %a) #0 section "explicit_sec" { -entry: - ret i32 %a -} - -define dso_local signext i32 @foo2(i32 noundef signext %a) #0 section "explicit_sec" { -entry: - ret i32 %a -} - -define dso_local signext i32 @foo3(i32 noundef signext %a) #0 { -entry: - ret i32 %a -} - -define dso_local signext i32 @foo4(i32 noundef signext %a) #0 { -entry: - ret i32 %a -} diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll index 95399aa4d41d2..a6742285a148e 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll @@ -3,7 +3,8 @@ ; RUN: | FileCheck %s ;; Expect to find the branch labels. -; CHECK-LABEL: <.internal>: +; CHECK-LABEL: <.text>: +;; TODO: <.internal> should be printed instead of <.text>. 
; CHECK-NEXT: 0: mr 4, 3 ; CHECK-NEXT: 4: li 3, 0 ; CHECK-NEXT: 8: mtctr 4 @@ -18,11 +19,11 @@ ; CHECK-NEXT: 60: bf 8, 0x84 ; CHECK-NEXT: : ; CHECK-NEXT: 64: mr 3, 31 -; CHECK-NEXT: 68: bl 0x0 <.internal> +; CHECK-NEXT: 68: bl 0x0 <.text> ; CHECK-NEXT: 6c: mr 31, 3 ; CHECK-NEXT: 70: cmplwi 3, 11 ; CHECK-NEXT: 74: bt 0, 0x60 -; CHECK-NEXT: 78: bl 0x0 <.internal> +; CHECK-NEXT: 78: bl 0x0 <.text> ; CHECK-NEXT: 7c: nop ; CHECK-NEXT: 80: b 0x60 ; CHECK-NEXT: : diff --git a/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test b/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test index 8256e27c064dd..0f3acacae4389 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test @@ -17,7 +17,7 @@ # LINES32: Inputs/basic32.o: file format aixcoff-rs6000 # LINES32: Disassembly of section .text: -# LINES32: 00000000 <.main>: +# LINES32: 00000000 <.text>: # LINES32: ; .main(): # LINES32-NEXT: ; /basic.c:1 # LINES32-NEXT: 0: 38 60 00 00 li 3, 0 diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 4cb226b795255..6b238fa01d258 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -957,9 +957,6 @@ SymbolInfoTy objdump::createSymbolInfo(const ObjectFile *Obj, getXCOFFSymbolCsectSMC(XCOFFObj, Symbol); return SymbolInfoTy(Addr, Name, Smc, SymbolIndex, isLabel(XCOFFObj, Symbol)); - } else if (Obj->isXCOFF()) { - const SymbolRef::Type SymType = unwrapOrError(Symbol.getType(), FileName); - return SymbolInfoTy(Addr, Name, SymType, true); } else return SymbolInfoTy(Addr, Name, Obj->isELF() ? 
getElfSymbolType(Obj, Symbol) From 074d1e2536343b96ff1c6de57c4fecf1b7fbfa72 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 16:49:47 +0000 Subject: [PATCH 230/748] [CodeGen] Return better Changed status from PostRAHazardRecognizer Differential Revision: https://reviews.llvm.org/D119954 --- llvm/lib/CodeGen/PostRAHazardRecognizer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp index 82ed386db8272..00c91b8a59f86 100644 --- a/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp +++ b/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp @@ -76,6 +76,7 @@ bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) { return false; // Loop over all of the basic blocks + bool Changed = false; for (auto &MBB : Fn) { // We do not call HazardRec->reset() here to make sure we are handling noop // hazards at the start of basic blocks. @@ -85,6 +86,8 @@ bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) { HazardRec->EmitNoops(NumPreNoops); TII->insertNoops(MBB, MachineBasicBlock::iterator(MI), NumPreNoops); NumNoops += NumPreNoops; + if (NumPreNoops) + Changed = true; HazardRec->EmitInstruction(&MI); if (HazardRec->atIssueLimit()) { @@ -92,5 +95,5 @@ bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) { } } } - return true; + return Changed; } From acc08a2f1bd35a6046bffd7030bf3990ddd595c7 Mon Sep 17 00:00:00 2001 From: Douglas Yung Date: Fri, 18 Feb 2022 01:48:13 -0800 Subject: [PATCH 231/748] Add "REQUIRES: asserts" to test misched-predicate-virtreg.mir which uses "-debug-only". 
--- llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir index 95e58a48b424b..dbe6aa1e6e6a6 100644 --- a/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir +++ b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir @@ -1,4 +1,5 @@ # RUN: llc -mcpu=exynos-m5 -mtriple=aarch64 -enable-misched -run-pass=machine-scheduler -debug-only=machine-scheduler %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts # CHECK-LABEL: ********** MI Scheduling ********** # CHECK: SU(0): %0:fpr128 = COPY $q1 From d86dcb7ea56afff8e85998b23cbad5130f5502cd Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 16:30:25 +0000 Subject: [PATCH 232/748] [AMDGPU] Return better Changed status from SIOptimizeExecMasking Differential Revision: https://reviews.llvm.org/D120024 --- llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index b9c839fe28ba8..9a4cc25f00085 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -312,6 +312,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { // x = s__saveexec_b64 y // + bool Changed = false; for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB); MachineBasicBlock::reverse_iterator E = MBB.rend(); @@ -351,6 +352,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); CopyToExecInst->eraseFromParent(); + Changed = true; } continue; @@ -456,8 +458,9 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister, 
*TRI); } - } - return true; + Changed = true; + } + return Changed; } From 768e6faba8fa5eef04276a51836a40c5ab8013bc Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 16:41:45 +0000 Subject: [PATCH 233/748] [AMDGPU] Return better Changed status from SILowerControlFlow Differential Revision: https://reviews.llvm.org/D120025 --- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index e1018bdfde469..4bb05d9069780 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -865,6 +865,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } } + bool Changed = false; MachineFunction::iterator NextBB; for (MachineFunction::iterator BI = MF.begin(); BI != MF.end(); BI = NextBB) { @@ -886,6 +887,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_LOOP: case AMDGPU::SI_END_CF: SplitMBB = process(MI); + Changed = true; break; // FIXME: find a better place for this @@ -894,6 +896,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { lowerInitExec(MBB, MI); if (LIS) LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + Changed = true; break; default: @@ -913,5 +916,5 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LoweredIf.clear(); KillBlocks.clear(); - return true; + return Changed; } From a0c0db4627dcbc51234494bdab9cc69e882828e3 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Fri, 18 Feb 2022 01:41:29 +0300 Subject: [PATCH 234/748] [objcopy][NFC] Add rules to cmake to put files under specific folders. This patch adds rules to cmake to put files under specific folders. It allows to have files for different formats(which are located in different subdirectories) be displayed in different subfolders of VS IDE solution. 
Depends on D114429 Differential Revision: https://reviews.llvm.org/D114664 --- llvm/lib/ObjCopy/CMakeLists.txt | 43 +++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/llvm/lib/ObjCopy/CMakeLists.txt b/llvm/lib/ObjCopy/CMakeLists.txt index c272d2637bdcc..1e516394c74ae 100644 --- a/llvm/lib/ObjCopy/CMakeLists.txt +++ b/llvm/lib/ObjCopy/CMakeLists.txt @@ -1,3 +1,34 @@ +source_group("Header Files" REGULAR_EXPRESSION + .*[.]h +) +source_group("Header Files\\COFF" REGULAR_EXPRESSION + COFF/.*[.]h +) +source_group("Header Files\\ELF" REGULAR_EXPRESSION + ELF/.*[.]h +) +source_group("Header Files\\MachO" REGULAR_EXPRESSION + MachO/.*[.]h +) +source_group("Header Files\\wasm" REGULAR_EXPRESSION + wasm/.*[.]h +) +source_group("Source Files" REGULAR_EXPRESSION + .*[.]cpp +) +source_group("Source Files\\COFF" REGULAR_EXPRESSION + COFF/.*[.]cpp +) +source_group("Source Files\\ELF" REGULAR_EXPRESSION + ELF/.*[.]cpp +) +source_group("Source Files\\MachO" REGULAR_EXPRESSION + MachO/.*[.]cpp +) +source_group("Source Files\\wasm" REGULAR_EXPRESSION + wasm/.*[.]cpp +) + add_llvm_component_library(LLVMObjCopy Archive.cpp ObjCopy.cpp @@ -19,8 +50,16 @@ add_llvm_component_library(LLVMObjCopy wasm/WasmObjcopy.cpp ADDITIONAL_HEADER_DIRS - ${LLVM_MAIN_INCLUDE_DIR}/llvm/Object - + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ObjCopy + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ObjCopy/COFF + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ObjCopy/ELF + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ObjCopy/MachO + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ObjCopy/wasm + COFF + ELF + MachO + wasm + DEPENDS intrinsics_gen From fa7c8cb4d01e9f24816c43d5c44a7fb62564ebc5 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 18 Feb 2022 18:45:36 +0900 Subject: [PATCH 235/748] [mlir][bufferize] Support memrefs with non-standard layout in `finalizing-bufferize` Differential Revision: https://reviews.llvm.org/D119935 --- .../Dialect/Bufferization/IR/Bufferization.h | 22 +++ .../Bufferization/IR/BufferizationOps.cpp | 150 
+++++++++++------- .../Bufferization/Transforms/Bufferize.cpp | 25 ++- .../Transforms/finalizing-bufferize.mlir | 74 +++++++++ 4 files changed, 212 insertions(+), 59 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h b/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h index 2cbfc901f239b..d3e6a5c7f5e3b 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/Bufferization.h @@ -27,4 +27,26 @@ #define GET_OP_CLASSES #include "mlir/Dialect/Bufferization/IR/BufferizationOps.h.inc" +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +namespace mlir { +namespace bufferization { +/// Try to cast the given ranked MemRef-typed value to the given ranked MemRef +/// type. Insert a reallocation + copy if it cannot be statically guaranteed +/// that a direct cast would be valid. +/// +/// E.g., when casting from a ranked MemRef type with dynamic layout to a ranked +/// MemRef type with static layout, it is not statically known whether the cast +/// will succeed or not. Such `memref.cast` ops may fail at runtime. This +/// function never generates such casts and conservatively inserts a copy. +/// +/// This function returns `failure()` in case of unsupported casts. E.g., casts +/// with differing element types or memory spaces. 
+FailureOr castOrReallocMemRefValue(OpBuilder &b, Value value, + MemRefType type); +} // namespace bufferization +} // namespace mlir + #endif // MLIR_DIALECT_BUFFERIZATION_IR_BUFFERIZATION_H_ diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp index f1ec7bbdead24..c5a99d820bc90 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp @@ -1,4 +1,3 @@ - //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -13,6 +12,73 @@ using namespace mlir; using namespace mlir::bufferization; +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +FailureOr +mlir::bufferization::castOrReallocMemRefValue(OpBuilder &b, Value value, + MemRefType destType) { + auto srcType = value.getType().cast(); + + // Casting to the same type, nothing to do. + if (srcType == destType) + return value; + + // Element type, rank and memory space must match. + if (srcType.getElementType() != destType.getElementType()) + return failure(); + if (srcType.getMemorySpaceAsInt() != destType.getMemorySpaceAsInt()) + return failure(); + if (srcType.getRank() != destType.getRank()) + return failure(); + + // In case the affine maps are different, we may need to use a copy if we go + // from dynamic to static offset or stride (the canonicalization cannot know + // at this point that it is really cast compatible). 
+ auto isGuaranteedCastCompatible = [](MemRefType source, MemRefType target) { + int64_t sourceOffset, targetOffset; + SmallVector sourceStrides, targetStrides; + if (failed(getStridesAndOffset(source, sourceStrides, sourceOffset)) || + failed(getStridesAndOffset(target, targetStrides, targetOffset))) + return false; + auto dynamicToStatic = [](int64_t a, int64_t b) { + return a == MemRefType::getDynamicStrideOrOffset() && + b != MemRefType::getDynamicStrideOrOffset(); + }; + if (dynamicToStatic(sourceOffset, targetOffset)) + return false; + for (auto it : zip(sourceStrides, targetStrides)) + if (dynamicToStatic(std::get<0>(it), std::get<1>(it))) + return false; + return true; + }; + + // Note: If `areCastCompatible`, a cast is valid, but may fail at runtime. To + // ensure that we only generate casts that always succeed at runtime, we check + // a fix extra conditions in `isGuaranteedCastCompatible`. + if (memref::CastOp::areCastCompatible(srcType, destType) && + isGuaranteedCastCompatible(srcType, destType)) { + Value casted = b.create(value.getLoc(), destType, value); + return casted; + } + + auto loc = value.getLoc(); + SmallVector dynamicOperands; + for (int i = 0; i < destType.getRank(); ++i) { + if (destType.getShape()[i] != ShapedType::kDynamicSize) + continue; + auto index = b.createOrFold(loc, i); + Value size = b.create(loc, value, index); + dynamicOperands.push_back(size); + } + // TODO: Use alloc/memcpy callback from BufferizationOptions if called via + // BufferizableOpInterface impl of ToMemrefOp. 
+ Value copy = b.create(loc, destType, dynamicOperands); + b.create(loc, value, copy); + return copy; +} + //===----------------------------------------------------------------------===// // CloneOp //===----------------------------------------------------------------------===// @@ -191,67 +257,39 @@ static LogicalResult foldToMemrefToTensorPair(RewriterBase &rewriter, if (!memrefToTensor) return failure(); - // A memref_to_tensor + tensor_to_memref with same types can be folded without - // inserting a cast. - if (memrefToTensor.memref().getType() == toMemref.getType()) { - if (!allowSameType) - // Function can be configured to only handle cases where a cast is needed. + Type srcType = memrefToTensor.memref().getType(); + Type destType = toMemref.getType(); + + // Function can be configured to only handle cases where a cast is needed. + if (!allowSameType && srcType == destType) + return failure(); + + auto rankedSrcType = srcType.dyn_cast(); + auto rankedDestType = destType.dyn_cast(); + auto unrankedSrcType = srcType.dyn_cast(); + + // Ranked memref -> Ranked memref cast. + if (rankedSrcType && rankedDestType) { + FailureOr replacement = castOrReallocMemRefValue( + rewriter, memrefToTensor.memref(), rankedDestType); + if (failed(replacement)) return failure(); - rewriter.replaceOp(toMemref, memrefToTensor.memref()); + + rewriter.replaceOp(toMemref, *replacement); return success(); } - // If types are definitely not cast-compatible, bail. - if (!memref::CastOp::areCastCompatible(memrefToTensor.memref().getType(), - toMemref.getType())) + // Unranked memref -> Ranked memref cast: May require a copy. + // TODO: Not implemented at the moment. + if (unrankedSrcType && rankedDestType) return failure(); - // We already know that the types are potentially cast-compatible. 
However - // in case the affine maps are different, we may need to use a copy if we go - // from dynamic to static offset or stride (the canonicalization cannot know - // at this point that it is really cast compatible). - auto isGuaranteedCastCompatible = [](MemRefType source, MemRefType target) { - int64_t sourceOffset, targetOffset; - SmallVector sourceStrides, targetStrides; - if (failed(getStridesAndOffset(source, sourceStrides, sourceOffset)) || - failed(getStridesAndOffset(target, targetStrides, targetOffset))) - return false; - auto dynamicToStatic = [](int64_t a, int64_t b) { - return a == MemRefType::getDynamicStrideOrOffset() && - b != MemRefType::getDynamicStrideOrOffset(); - }; - if (dynamicToStatic(sourceOffset, targetOffset)) - return false; - for (auto it : zip(sourceStrides, targetStrides)) - if (dynamicToStatic(std::get<0>(it), std::get<1>(it))) - return false; - return true; - }; - - auto memrefToTensorType = - memrefToTensor.memref().getType().dyn_cast(); - auto toMemrefType = toMemref.getType().dyn_cast(); - if (memrefToTensorType && toMemrefType && - !isGuaranteedCastCompatible(memrefToTensorType, toMemrefType)) { - MemRefType resultType = toMemrefType; - auto loc = toMemref.getLoc(); - SmallVector dynamicOperands; - for (int i = 0; i < resultType.getRank(); ++i) { - if (resultType.getShape()[i] != ShapedType::kDynamicSize) - continue; - auto index = rewriter.createOrFold(loc, i); - Value size = rewriter.create(loc, memrefToTensor, index); - dynamicOperands.push_back(size); - } - // TODO: Use alloc/memcpy callback from BufferizationOptions if called via - // BufferizableOpInterface impl of ToMemrefOp. 
- auto copy = - rewriter.create(loc, resultType, dynamicOperands); - rewriter.create(loc, memrefToTensor.memref(), copy); - rewriter.replaceOp(toMemref, {copy}); - } else - rewriter.replaceOpWithNewOp(toMemref, toMemref.getType(), - memrefToTensor.memref()); + // Unranked memref -> unranked memref cast + // Ranked memref -> unranked memref cast: No copy needed. + assert(memref::CastOp::areCastCompatible(srcType, destType) && + "expected that types are cast compatible"); + rewriter.replaceOpWithNewOp(toMemref, destType, + memrefToTensor.memref()); return success(); } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index c7468da6132f5..01b22264e5bad 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -45,9 +45,28 @@ BufferizeTypeConverter::BufferizeTypeConverter() { addSourceMaterialization(materializeToTensor); addTargetMaterialization([](OpBuilder &builder, BaseMemRefType type, ValueRange inputs, Location loc) -> Value { - assert(inputs.size() == 1); - assert(inputs[0].getType().isa()); - return builder.create(loc, type, inputs[0]); + assert(inputs.size() == 1 && "expected exactly one input"); + + if (auto inputType = inputs[0].getType().dyn_cast()) { + // MemRef to MemRef cast. + assert(inputType != type && "expected different types"); + // Unranked to ranked and ranked to unranked casts must be explicit. + auto rankedDestType = type.dyn_cast(); + if (!rankedDestType) + return nullptr; + FailureOr replacement = + castOrReallocMemRefValue(builder, inputs[0], rankedDestType); + if (failed(replacement)) + return nullptr; + return *replacement; + } + + if (inputs[0].getType().isa()) { + // Tensor to MemRef cast. 
+ return builder.create(loc, type, inputs[0]); + } + + llvm_unreachable("only tensor/memref input types supported"); }); } diff --git a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir index fac685ae7e725..5d70e90b75402 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/finalizing-bufferize.mlir @@ -26,3 +26,77 @@ func @unable_to_convert_lone_tensor_load(%arg0: memref) { "test.sink"(%0) : (tensor) -> () return } + +// ----- + +// CHECK: #[[$map1:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> +// CHECK-LABEL: func @dyn_layout_to_no_layout_cast( +// CHECK-SAME: %[[arg:.*]]: memref) +// CHECK: %[[c0:.*]] = arith.constant 0 : index +// CHECK: %[[dim:.*]] = memref.dim %[[arg]], %[[c0]] +// CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) : memref +// CHECK: memref.copy %[[arg]], %[[alloc]] +// CHECK: return %[[alloc]] +#map1 = affine_map<(d0)[s0] -> (d0 + s0)> +func @dyn_layout_to_no_layout_cast(%m: memref) -> memref { + %0 = bufferization.to_tensor %m : memref + %1 = bufferization.to_memref %0 : memref + return %1 : memref +} + +// ----- + +// CHECK: #[[$map2:.*]] = affine_map<(d0)[s0] -> (d0 * 100 + s0)> +// CHECK-LABEL: func @fancy_layout_to_no_layout_cast( +// CHECK-SAME: %[[arg:.*]]: memref) +// CHECK: %[[c0:.*]] = arith.constant 0 : index +// CHECK: %[[dim:.*]] = memref.dim %[[arg]], %[[c0]] +// CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) : memref +// CHECK: memref.copy %[[arg]], %[[alloc]] +// CHECK: return %[[alloc]] +#map2 = affine_map<(d0)[s0] -> (d0 * 100 + s0)> +func @fancy_layout_to_no_layout_cast(%m: memref) -> memref { + %0 = bufferization.to_tensor %m : memref + %1 = bufferization.to_memref %0 : memref + return %1 : memref +} + +// ----- + +// CHECK: #[[$map3:.*]] = affine_map<(d0)[s0] -> (d0 + 25)> +// CHECK-LABEL: func @static_layout_to_no_layout_cast( +// CHECK-SAME: %[[arg:.*]]: memref) 
+// CHECK: %[[c0:.*]] = arith.constant 0 : index +// CHECK: %[[dim:.*]] = memref.dim %[[arg]], %[[c0]] +// CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) : memref +// CHECK: memref.copy %[[arg]], %[[alloc]] +// CHECK: return %[[alloc]] +#map3 = affine_map<(d0)[s0] -> (d0 + 25)> +func @static_layout_to_no_layout_cast(%m: memref) -> memref { + %0 = bufferization.to_tensor %m : memref + %1 = bufferization.to_memref %0 : memref + return %1 : memref +} + +// ----- + +// TODO: to_memref with layout maps not supported yet. This should fold to a +// memref.cast. +#map4 = affine_map<(d0)[s0] -> (d0 + s0)> +func @no_layout_to_dyn_layout_cast(%m: memref) -> memref { + %0 = bufferization.to_tensor %m : memref + // expected-error @+1 {{failed to materialize conversion for result #0 of operation 'bufferization.to_memref' that remained live after conversion}} + %1 = bufferization.to_memref %0 : memref + // expected-note @+1 {{see existing live user here}} + return %1 : memref +} + +// ----- + +func @illegal_unranked_to_rank(%m: memref<*xf32>) -> memref { + // expected-note @+1 {{prior use here}} + %0 = bufferization.to_tensor %m : memref<*xf32> + // expected-error @+1 {{expects different type than prior uses: 'tensor' vs 'tensor<*xf32>'}} + %1 = bufferization.to_memref %0 : memref + return %1 : memref +} From 69ab233a15bf63e706f14fbf53df4a755a2a37d9 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Feb 2022 15:59:15 +0000 Subject: [PATCH 236/748] [AMDGPU] Return better Changed status from SIFoldOperands Differential Revision: https://reviews.llvm.org/D120023 --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 44 ++++++++++++++++------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 33954e11d6c6c..034358319b190 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -92,7 +92,7 @@ class SIFoldOperands : public 
MachineFunctionPass { bool tryFoldCndMask(MachineInstr &MI) const; bool tryFoldZeroHighBits(MachineInstr &MI) const; - void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; const MachineOperand *isClamp(const MachineInstr &MI) const; bool tryFoldClamp(MachineInstr &MI); @@ -1217,7 +1217,7 @@ bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const { return false; } -void SIFoldOperands::foldInstOperand(MachineInstr &MI, +bool SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const { // We need mutate the operands of new mov instructions to add implicit // uses of EXEC, but adding them invalidates the use_iterator, so defer @@ -1225,6 +1225,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, SmallVector CopiesToReplace; SmallVector FoldList; MachineOperand &Dst = MI.getOperand(0); + bool Changed = false; if (OpToFold.isImm()) { for (auto &UseMI : @@ -1237,8 +1238,10 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // We may also encounter cases where one or both operands are // immediates materialized into a register, which would ordinarily not // be folded due to multiple uses or operand constraints. - if (tryConstantFoldOp(*MRI, TII, &UseMI)) + if (tryConstantFoldOp(*MRI, TII, &UseMI)) { LLVM_DEBUG(dbgs() << "Constant folded " << UseMI); + Changed = true; + } } } @@ -1297,6 +1300,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } } + if (CopiesToReplace.empty() && FoldList.empty()) + return Changed; + MachineFunction *MF = MI.getParent()->getParent(); // Make sure we add EXEC uses to any new v_mov instructions created. 
for (MachineInstr *Copy : CopiesToReplace) @@ -1328,6 +1334,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, TII->commuteInstruction(*Fold.UseMI, false); } } + return true; } // Clamp patterns are canonically selected to v_max_* instructions, so only @@ -1751,22 +1758,31 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { bool IsIEEEMode = MFI->getMode().IEEE; bool HasNSZ = MFI->hasNoSignedZerosFPMath(); + bool Changed = false; for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineOperand *CurrentKnownM0Val = nullptr; for (auto &MI : make_early_inc_range(*MBB)) { - tryFoldCndMask(MI); + Changed |= tryFoldCndMask(MI); - if (tryFoldZeroHighBits(MI)) + if (tryFoldZeroHighBits(MI)) { + Changed = true; continue; + } - if (MI.isRegSequence() && tryFoldRegSequence(MI)) + if (MI.isRegSequence() && tryFoldRegSequence(MI)) { + Changed = true; continue; + } - if (MI.isPHI() && tryFoldLCSSAPhi(MI)) + if (MI.isPHI() && tryFoldLCSSAPhi(MI)) { + Changed = true; continue; + } - if (MI.mayLoad() && tryFoldLoad(MI)) + if (MI.mayLoad() && tryFoldLoad(MI)) { + Changed = true; continue; + } if (!TII->isFoldableCopy(MI)) { // Saw an unknown clobber of m0, so we no longer know what it is. @@ -1777,7 +1793,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // instruction, and not the omod multiply. 
if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || !tryFoldOMod(MI)) - tryFoldClamp(MI); + Changed |= tryFoldClamp(MI); continue; } @@ -1788,6 +1804,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineOperand &NewM0Val = MI.getOperand(1); if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) { MI.eraseFromParent(); + Changed = true; continue; } @@ -1817,7 +1834,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!MI.getOperand(0).getReg().isVirtual()) continue; - foldInstOperand(MI, OpToFold); + Changed |= foldInstOperand(MI, OpToFold); // If we managed to fold all uses of this copy then we might as well // delete it now. @@ -1829,6 +1846,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { auto &SrcOp = InstToErase->getOperand(1); auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register(); InstToErase->eraseFromParent(); + Changed = true; InstToErase = nullptr; if (!SrcReg || SrcReg.isPhysical()) break; @@ -1837,9 +1855,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { break; } if (InstToErase && InstToErase->isRegSequence() && - MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) + MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { InstToErase->eraseFromParent(); + Changed = true; + } } } - return true; + return Changed; } From ae4bec20c4b473da18e334866f38595604238b7c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 10:29:52 +0000 Subject: [PATCH 237/748] [ARM] ARMAsmPrinter::emitAttributes - remove unnecessary nullptr test. The MMI pointer has already been dereferenced several times. 
--- llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 86 +++++++++++++-------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index fa09b2567aa90..0874bae7d4614 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -740,55 +740,53 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format, ARMBuildAttrs::FP16FormatIEEE); - if (MMI) { - if (const Module *SourceModule = MMI->getModule()) { - // ABI_PCS_wchar_t to indicate wchar_t width - // FIXME: There is no way to emit value 0 (wchar_t prohibited). - if (auto WCharWidthValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("wchar_size"))) { - int WCharWidth = WCharWidthValue->getZExtValue(); - assert((WCharWidth == 2 || WCharWidth == 4) && - "wchar_t width must be 2 or 4 bytes"); - ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth); - } + if (const Module *SourceModule = MMI->getModule()) { + // ABI_PCS_wchar_t to indicate wchar_t width + // FIXME: There is no way to emit value 0 (wchar_t prohibited). + if (auto WCharWidthValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("wchar_size"))) { + int WCharWidth = WCharWidthValue->getZExtValue(); + assert((WCharWidth == 2 || WCharWidth == 4) && + "wchar_t width must be 2 or 4 bytes"); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth); + } - // ABI_enum_size to indicate enum width - // FIXME: There is no way to emit value 0 (enums prohibited) or value 3 - // (all enums contain a value needing 32 bits to encode). - if (auto EnumWidthValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("min_enum_size"))) { - int EnumWidth = EnumWidthValue->getZExtValue(); - assert((EnumWidth == 1 || EnumWidth == 4) && - "Minimum enum width must be 1 or 4 bytes"); - int EnumBuildAttr = EnumWidth == 1 ? 
1 : 2; - ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); - } + // ABI_enum_size to indicate enum width + // FIXME: There is no way to emit value 0 (enums prohibited) or value 3 + // (all enums contain a value needing 32 bits to encode). + if (auto EnumWidthValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("min_enum_size"))) { + int EnumWidth = EnumWidthValue->getZExtValue(); + assert((EnumWidth == 1 || EnumWidth == 4) && + "Minimum enum width must be 1 or 4 bytes"); + int EnumBuildAttr = EnumWidth == 1 ? 1 : 2; + ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr); + } - auto *PACValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("sign-return-address")); - if (PACValue && PACValue->getZExtValue() == 1) { - // If "+pacbti" is used as an architecture extension, - // Tag_PAC_extension is emitted in - // ARMTargetStreamer::emitTargetAttributes(). - if (!STI.hasPACBTI()) { - ATS.emitAttribute(ARMBuildAttrs::PAC_extension, - ARMBuildAttrs::AllowPACInNOPSpace); - } - ATS.emitAttribute(ARMBuildAttrs::PACRET_use, ARMBuildAttrs::PACRETUsed); + auto *PACValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("sign-return-address")); + if (PACValue && PACValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_PAC_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). + if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::PAC_extension, + ARMBuildAttrs::AllowPACInNOPSpace); } + ATS.emitAttribute(ARMBuildAttrs::PACRET_use, ARMBuildAttrs::PACRETUsed); + } - auto *BTIValue = mdconst::extract_or_null( - SourceModule->getModuleFlag("branch-target-enforcement")); - if (BTIValue && BTIValue->getZExtValue() == 1) { - // If "+pacbti" is used as an architecture extension, - // Tag_BTI_extension is emitted in - // ARMTargetStreamer::emitTargetAttributes(). 
- if (!STI.hasPACBTI()) { - ATS.emitAttribute(ARMBuildAttrs::BTI_extension, - ARMBuildAttrs::AllowBTIInNOPSpace); - } - ATS.emitAttribute(ARMBuildAttrs::BTI_use, ARMBuildAttrs::BTIUsed); + auto *BTIValue = mdconst::extract_or_null( + SourceModule->getModuleFlag("branch-target-enforcement")); + if (BTIValue && BTIValue->getZExtValue() == 1) { + // If "+pacbti" is used as an architecture extension, + // Tag_BTI_extension is emitted in + // ARMTargetStreamer::emitTargetAttributes(). + if (!STI.hasPACBTI()) { + ATS.emitAttribute(ARMBuildAttrs::BTI_extension, + ARMBuildAttrs::AllowBTIInNOPSpace); } + ATS.emitAttribute(ARMBuildAttrs::BTI_use, ARMBuildAttrs::BTIUsed); } } From 7104f0c4ab53b0feaf204b942576afdf2106b75b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 10:31:00 +0000 Subject: [PATCH 238/748] [Hexagon] aligned load/store patterns - use cast<> instead of dyn_cast<> to avoid dereference of nullptr The pointer is always referenced inside isAlignedMemNode, so assert the cast is correct instead of returning nullptr --- llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index bdd46cfe71529..6769548d6cbc2 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -92,19 +92,19 @@ def IsVecOff : PatLeaf<(i32 imm), [{ def alignedload: PatFrag<(ops node:$a), (load $a), [{ - return isAlignedMemNode(dyn_cast(N)); + return isAlignedMemNode(cast(N)); }]>; def unalignedload: PatFrag<(ops node:$a), (load $a), [{ - return !isAlignedMemNode(dyn_cast(N)); + return !isAlignedMemNode(cast(N)); }]>; def alignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{ - return isAlignedMemNode(dyn_cast(N)); + return isAlignedMemNode(cast(N)); }]>; def unalignedstore: PatFrag<(ops node:$v, node:$a), (store $v, $a), [{ - return 
!isAlignedMemNode(dyn_cast(N)); + return !isAlignedMemNode(cast(N)); }]>; From 4086b3be4422b0f8eb09c948b39dd495aeee05d9 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 18 Feb 2022 18:46:20 +0900 Subject: [PATCH 239/748] [mlir][bufferize][NFC] Remove obsolete tensor bufferization patterns from Linalg/Bufferize.cpp Differential Revision: https://reviews.llvm.org/D119824 --- .../Dialect/Linalg/Transforms/Bufferize.cpp | 112 +-------------- mlir/test/Dialect/Linalg/bufferize.mlir | 129 ------------------ .../Dialect/Linalg/CPU/test-padtensor.mlir | 4 +- 3 files changed, 4 insertions(+), 241 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp index 9ef78ea15d1c9..42bb8dcdc8bd1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp @@ -116,30 +116,6 @@ class BufferizeInitTensorOp : public OpConversionPattern { } }; -/// Conversion pattern that replaces `linalg.tensor_reshape` with -/// `linalg.reshape`. -template -class BufferizeTensorReshapeOp : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - using ReshapeOp = typename std::conditional_t< - std::is_same::value, - memref::ExpandShapeOp, memref::CollapseShapeOp>; - - LogicalResult - matchAndRewrite(TensorReshapeOp op, Adaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - rewriter.replaceOpWithNewOp(op, - this->getTypeConverter() - ->convertType(op.getType()) - .template cast(), - adaptor.src(), - adaptor.reassociation()); - return success(); - } -}; - /// Conversion pattern that bufferizes `linalg.fill` operation. class BufferizeFillOp : public OpConversionPattern { public: @@ -191,83 +167,6 @@ class BufferizeAnyLinalgOp : public OpInterfaceConversionPattern { return success(); } }; - -/// Convert `extract_slice %t [offsets][sizes][strides] -> %st` to an -/// alloc + copy pattern. 
-/// ``` -/// %a = alloc(sizes) -/// %sv = subview %source [offsets][sizes][strides] -/// memref.copy(%sv, %a) -/// ``` -/// -/// This pattern is arguable a std pattern once memref::CopyOp becomes -/// std::CopyOp. -class ExtractSliceOpConverter - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(tensor::ExtractSliceOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Value sourceMemref = adaptor.source(); - assert(sourceMemref.getType().isa()); - - MemRefType subviewMemRefType = - getTypeConverter()->convertType(op.getType()).cast(); - // op.sizes() capture exactly the dynamic alloc operands matching the - // subviewMemRefType thanks to subview/slice canonicalization and - // verification. - Value alloc = rewriter.create( - op.getLoc(), subviewMemRefType, op.sizes()); - Value subView = rewriter.create( - op.getLoc(), sourceMemref, op.getMixedOffsets(), op.getMixedSizes(), - op.getMixedStrides()); - rewriter.create(op.getLoc(), subView, alloc); - rewriter.replaceOp(op, alloc); - return success(); - } -}; - -/// Convert `insert_slice %source into %dest [offsets][sizes][strides] -> -/// %t` to an buffer_cast + subview + copy + tensor_load pattern. -/// buffer_cast and tensor_load are inserted automatically by the -/// conversion infra: -/// ``` -/// %sv = subview %dest [offsets][sizes][strides] -/// memref.copy(%source, %sv) -/// // replace with %dest -/// ``` -/// -/// This pattern is arguable a std pattern once memref::CopyOp becomes -/// std::CopyOp. -class InsertSliceOpConverter - : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(tensor::InsertSliceOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - Value sourceMemRef = adaptor.source(); - assert(sourceMemRef.getType().isa()); - - // For now, be conservative and copy the converted input memref. 
- // In general, the converted input memref here could be aliased or could - // point into constant memory, so mutating it would lead to miscompilations. - Value destMemRef = cloneMemref(op.getLoc(), adaptor.dest(), rewriter); - assert(destMemRef.getType().isa()); - - // Take a subview to copy the small memref. - Value subview = rewriter.create( - op.getLoc(), destMemRef, op.getMixedOffsets(), op.getMixedSizes(), - op.getMixedStrides()); - // Copy the small memref. - rewriter.create(op.getLoc(), sourceMemRef, subview); - rewriter.replaceOp(op, destMemRef); - return success(); - } -}; } // namespace namespace { @@ -283,9 +182,7 @@ struct LinalgBufferizePass : public LinalgBufferizeBase { target.addLegalDialect(); - target.addIllegalOp(); + target.addIllegalOp(); // Mark all Linalg operations illegal as long as they work on tensors. auto isLegalOperation = [&](Operation *op) { @@ -314,12 +211,7 @@ void mlir::linalg::populateLinalgBufferizePatterns( patterns.add< BufferizeAnyLinalgOp, BufferizeFillOp, - BufferizeInitTensorOp, - BufferizeTensorReshapeOp, - BufferizeTensorReshapeOp, - ExtractSliceOpConverter, - InsertSliceOpConverter + BufferizeInitTensorOp >(typeConverter, patterns.getContext()); // clang-format on - patterns.add(patterns.getContext()); } diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir index 614f207bb3354..2edc104ccad36 100644 --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -162,86 +162,6 @@ func @generic_with_init_tensor(%arg0: tensor<2x3x4xvector<3x4xi4>>, // ----- -// CHECK-DAG: #[[$MAP0:[0-9a-z]*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> -// CHECK-DAG: #[[$MAP1:[0-9a-z]*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1 * 2)> - -func private @make_index() -> index - -// CHECK-LABEL: func @bufferize_slice( -// CHECK-SAME: %[[T:[0-9a-z]*]]: tensor -func @bufferize_slice(%t : tensor) -> (tensor<2x3xf32>, tensor<2x?xf32>) { - // CHECK: 
%[[M:.*]] = bufferization.to_memref %[[T]] : memref - - // CHECK: %[[IDX:.*]] = call @make_index() : () -> index - %i0 = call @make_index() : () -> index - - // CHECK-NEXT: %[[A0:.*]] = memref.alloc() : memref<2x3xf32> - // CHECK-NEXT: %[[SM0:.*]] = memref.subview %[[M]][0, 0] [2, 3] [1, 1] - // CHECK-SAME: memref to memref<2x3xf32, #[[$MAP0]]> - // CHECK-NEXT: memref.copy %[[SM0]], %[[A0]] : memref<2x3xf32, #[[$MAP0]]> to memref<2x3xf32> - // CHECK-NEXT: %[[RT0:.*]] = bufferization.to_tensor %[[A0]] : memref<2x3xf32> - %st0 = tensor.extract_slice %t[0, 0][2, 3][1, 1] : tensor to tensor<2x3xf32> - - // CHECK-NEXT: %[[A1:.*]] = memref.alloc(%[[IDX]]) : memref<2x?xf32> - // CHECK-NEXT: %[[SM1:.*]] = memref.subview %[[M]][0, %[[IDX]]] [2, %[[IDX]]] [1, 2] - // CHECK-SAME: memref to memref<2x?xf32, #[[$MAP1]]> - // CHECK-NEXT: memref.copy %[[SM1]], %[[A1]] : memref<2x?xf32, #[[$MAP1]]> to memref<2x?xf32> - // CHECK-NEXT: %[[RT1:.*]] = bufferization.to_tensor %[[A1]] : memref<2x?xf32> - %st1 = tensor.extract_slice %t[0, %i0][2, %i0][1, 2] : tensor to tensor<2x?xf32> - - // CHECK-NEXT: return %[[RT0]], %[[RT1]] - return %st0, %st1 : tensor<2x3xf32>, tensor<2x?xf32> -} - -// ----- - -// CHECK-DAG: #[[$MAP0:[0-9a-z]*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> -// CHECK-DAG: #[[$MAP1:[0-9a-z]*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1 * 2)> - -func private @make_index() -> index - -// CHECK-LABEL: func @bufferize_insert_slice( -// CHECK-SAME: %[[T:[0-9a-z]*]]: tensor -// CHECK-SAME: %[[ST0:[0-9a-z]*]]: tensor<2x3xf32> -// CHECK-SAME: %[[ST1:[0-9a-z]*]]: tensor<2x?xf32> -func @bufferize_insert_slice(%t : tensor, %st0 : tensor<2x3xf32>, %st1 : tensor<2x?xf32>) -> - (tensor, tensor) { - // CHECK-DAG: %[[M:.*]] = bufferization.to_memref %[[T]] : memref - // CHECK-DAG: %[[SM0:.*]] = bufferization.to_memref %[[ST0]] : memref<2x3xf32> - // CHECK-DAG: %[[SM1:.*]] = bufferization.to_memref %[[ST1]] : memref<2x?xf32> - - %c0 = arith.constant 0 : index - %c1 = 
arith.constant 1 : index - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index - %i0 = call @make_index() : () -> index - // CHECK: %[[IDX:.*]] = call @make_index() : () -> index - - - // CHECK-NEXT: %[[DIM0:.*]] = tensor.dim %[[T]], %[[C0]] : tensor - // CHECK-NEXT: %[[DIM1:.*]] = tensor.dim %[[T]], %[[C1]] : tensor - // CHECK-NEXT: %[[M_COPY0:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref - // CHECK-NEXT: memref.copy %[[M]], %[[M_COPY0]] : memref to memref - // CHECK-NEXT: %[[SUBVIEW0:.*]] = memref.subview %[[M_COPY0]][0, 0] [2, 3] [1, 1] - // CHECK-SAME: memref to memref<2x3xf32, #[[$MAP0]]> - // CHECK-NEXT: memref.copy %[[SM0]], %[[SUBVIEW0]] : memref<2x3xf32> to memref<2x3xf32, #[[$MAP0]]> - // CHECK-NEXT: %[[RT0:.*]] = bufferization.to_tensor %[[M_COPY0]] : memref - %t0 = tensor.insert_slice %st0 into %t[0, 0][2, 3][1, 1] : tensor<2x3xf32> into tensor - - // CHECK-NEXT: %[[M_COPY1:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref - // CHECK-NEXT: memref.copy %[[M]], %[[M_COPY1]] : memref to memref - // CHECK-NEXT: %[[SUBVIEW1:.*]] = memref.subview %[[M_COPY1]][0, %[[IDX]]] [2, %[[IDX]]] [1, 2] - // CHECK-SAME: memref to memref<2x?xf32, #[[$MAP1]]> - // CHECK-NEXT: memref.copy %[[SM1]], %[[SUBVIEW1]] : memref<2x?xf32> to memref<2x?xf32, #[[$MAP1]]> - // CHECK-NEXT: %[[RT1:.*]] = bufferization.to_tensor %[[M_COPY1]] : memref - %t1 = tensor.insert_slice %st1 into %t[0, %i0][2, %i0][1, 2] : tensor<2x?xf32> into tensor - - // CHECK: return %[[RT0]], %[[RT1]] - return %t0, %t1: tensor, tensor -} - -// ----- - // CHECK-LABEL: func @bufferize_fill( // CHECK-SAME: %[[IN:.*]]: tensor func @bufferize_fill(%arg0: tensor) -> tensor { @@ -256,55 +176,6 @@ func @bufferize_fill(%arg0: tensor) -> tensor { // ----- -// CHECK-LABEL: func @bufferize_tensor_collapse_shape( -// CHECK-SAME: %[[IN:.*]]: tensor<4x5xf32> -func @bufferize_tensor_collapse_shape(%arg0: tensor<4x5xf32>) -> tensor<20xf32> { - %out = 
tensor.collapse_shape %arg0 [[0, 1]] : - tensor<4x5xf32> into tensor<20xf32> - return %out : tensor<20xf32> -} -// CHECK: %[[MEMREF:.*]] = bufferization.to_memref %[[IN]] : memref<4x5xf32> -// CHECK: %[[RESHAPE:.*]] = memref.collapse_shape %[[MEMREF]] {{\[}}[0, 1]] -// CHECK-SAME: : memref<4x5xf32> into memref<20xf32> -// CHECK: %[[TENSOR:.*]] = bufferization.to_tensor %[[RESHAPE]] : memref<20xf32> -// CHECK: return %[[TENSOR]] - -// ----- - -// CHECK-LABEL: func @pad_tensor_dynamic_shape( -// CHECK-SAME: %[[IN:.*]]: tensor<4x?x2x?xf32>, -// CHECK-SAME: %[[OFFSET:.*]]: index) -> tensor<4x?x?x?xf32> { -func @pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %out = tensor.pad %arg0 low[%c0, %c0, %arg1, %c0] high[%c0, %c0, %c0, %arg1] { - ^bb0(%gen_arg1: index, %gen_arg2: index, %gen_arg3: index, %gen_arg4: index): - tensor.yield %cst : f32 - } : tensor<4x?x2x?xf32> to tensor<4x?x?x?xf32> - return %out : tensor<4x?x?x?xf32> -} - -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[IN_MEMREF:.*]] = bufferization.to_memref %[[IN]] : memref<4x?x2x?xf32> -// CHECK: %[[DIM1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32> -// CHECK: %[[OUT_DIM2:.*]] = arith.addi %[[OFFSET]], %[[C2]] : index -// CHECK: %[[DIM3:.*]] = tensor.dim %[[IN]], %[[C3]] : tensor<4x?x2x?xf32> -// CHECK: %[[OUT_DIM3:.*]] = arith.addi %[[DIM3]], %[[OFFSET]] : index -// CHECK: %[[FILLED:.*]] = memref.alloc(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : memref<4x?x?x?xf32> -// CHECK: linalg.fill(%[[CST]], %[[FILLED]]) : f32, memref<4x?x?x?xf32> -// CHECK: %[[OUT:.*]] = memref.alloc(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : memref<4x?x?x?xf32> -// CHECK: memref.copy %[[FILLED]], %[[OUT]] : memref<4x?x?x?xf32> to 
memref<4x?x?x?xf32> -// CHECK: %[[INTERIOR:.*]] = memref.subview %[[OUT]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : memref<4x?x?x?xf32> to memref<4x?x2x?xf32, #map> -// CHECK: memref.copy %[[IN_MEMREF]], %[[INTERIOR]] : memref<4x?x2x?xf32> to memref<4x?x2x?xf32, #map> -// CHECK: %[[OUT_TENSOR:.*]] = bufferization.to_tensor %[[OUT]] : memref<4x?x?x?xf32> -// CHECK: return %[[OUT_TENSOR]] : tensor<4x?x?x?xf32> -// CHECK: } - -// ----- - // CHECK-LABEL: func @bufferize_dot func @bufferize_dot(%in: tensor<4xf32>, %out: tensor) -> tensor { %dot = linalg.dot ins(%in, %in : tensor<4xf32>, tensor<4xf32>) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir index 32465294fa1e0..b351d70d6325f 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -linalg-bufferize \ -// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \ +// RUN: mlir-opt %s -test-linalg-transform-patterns=test-linalg-to-vector-patterns \ +// RUN: -linalg-bufferize -arith-bufferize -tensor-bufferize -func-bufferize \ // RUN: -finalizing-bufferize -buffer-deallocation \ // RUN: -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ From b47be47ac2871b6f63f4acbaea8fa5e311f1ecc5 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 18 Feb 2022 11:39:48 +0100 Subject: [PATCH 240/748] [mlir][Vector] Switch ExtractOp to the declarative assembly format This is a bit awkward since ExtractOp allows both `f32` and `vector<1xf32>` results for a scalar extraction. Allow both, but make inference return the scalar to make this as NFC as possible. 
--- .../mlir/Dialect/Vector/IR/VectorOps.td | 6 +- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 72 +++++++------------ mlir/test/Dialect/Vector/invalid.mlir | 2 +- 3 files changed, 32 insertions(+), 48 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 1e16dbbb97295..4a20ea0dc4d10 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -551,7 +551,8 @@ def Vector_ExtractElementOp : def Vector_ExtractOp : Vector_Op<"extract", [NoSideEffect, PredOpTrait<"operand and result have same element type", - TCresVTEtIsSameAsOpBase<0, 0>>]>, + TCresVTEtIsSameAsOpBase<0, 0>>, + DeclareOpInterfaceMethods]>, Arguments<(ins AnyVector:$vector, I64ArrayAttr:$position)>, Results<(outs AnyType)> { let summary = "extract operation"; @@ -577,9 +578,10 @@ def Vector_ExtractOp : VectorType getVectorType() { return vector().getType().cast(); } + static bool isCompatibleReturnTypes(TypeRange l, TypeRange r); }]; + let assemblyFormat = "$vector `` $position attr-dict `:` type($vector)"; let hasCanonicalizer = 1; - let hasCustomAssemblyFormat = 1; let hasFolder = 1; let hasVerifier = 1; } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 560746453a079..4ffb2b8c75696 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -940,21 +940,9 @@ LogicalResult vector::ExtractElementOp::verify() { // ExtractOp //===----------------------------------------------------------------------===// -static Type inferExtractOpResultType(VectorType vectorType, - ArrayAttr position) { - if (static_cast(position.size()) == vectorType.getRank()) - return vectorType.getElementType(); - return VectorType::get(vectorType.getShape().drop_front(position.size()), - vectorType.getElementType()); -} - void vector::ExtractOp::build(OpBuilder &builder, OperationState &result, Value 
source, ArrayRef position) { - result.addOperands(source); - auto positionAttr = getVectorSubscriptAttr(builder, position); - result.addTypes(inferExtractOpResultType(source.getType().cast(), - positionAttr)); - result.addAttribute(getPositionAttrStrName(), positionAttr); + build(builder, result, source, getVectorSubscriptAttr(builder, position)); } // Convenience builder which assumes the values are constant indices. @@ -967,40 +955,34 @@ void vector::ExtractOp::build(OpBuilder &builder, OperationState &result, build(builder, result, source, positionConstants); } -void vector::ExtractOp::print(OpAsmPrinter &p) { - p << " " << vector() << position(); - p.printOptionalAttrDict((*this)->getAttrs(), {"position"}); - p << " : " << vector().getType(); +LogicalResult +ExtractOp::inferReturnTypes(MLIRContext *, Optional, + ValueRange operands, DictionaryAttr attributes, + RegionRange, + SmallVectorImpl &inferredReturnTypes) { + ExtractOp::Adaptor op(operands, attributes); + auto vectorType = op.vector().getType().cast(); + if (static_cast(op.position().size()) == vectorType.getRank()) { + inferredReturnTypes.push_back(vectorType.getElementType()); + } else { + auto n = std::min(op.position().size(), vectorType.getRank() - 1); + inferredReturnTypes.push_back(VectorType::get( + vectorType.getShape().drop_front(n), vectorType.getElementType())); + } + return success(); } -ParseResult vector::ExtractOp::parse(OpAsmParser &parser, - OperationState &result) { - SMLoc attributeLoc, typeLoc; - NamedAttrList attrs; - OpAsmParser::OperandType vector; - Type type; - Attribute attr; - if (parser.parseOperand(vector) || parser.getCurrentLocation(&attributeLoc) || - parser.parseAttribute(attr, "position", attrs) || - parser.parseOptionalAttrDict(attrs) || - parser.getCurrentLocation(&typeLoc) || parser.parseColonType(type)) - return failure(); - - auto vectorType = type.dyn_cast(); - if (!vectorType) - return parser.emitError(typeLoc, "expected vector type"); - - auto positionAttr = 
attr.dyn_cast(); - if (!positionAttr || - static_cast(positionAttr.size()) > vectorType.getRank()) - return parser.emitError( - attributeLoc, - "expected position attribute of rank smaller than vector rank"); - - Type resType = inferExtractOpResultType(vectorType, positionAttr); - result.attributes = attrs; - return failure(parser.resolveOperand(vector, type, result.operands) || - parser.addTypeToList(resType, result.types)); +bool ExtractOp::isCompatibleReturnTypes(TypeRange l, TypeRange r) { + // Allow extracting 1-element vectors instead of scalars. + auto isCompatible = [](TypeRange l, TypeRange r) { + auto vectorType = l.front().dyn_cast(); + return vectorType && vectorType.getShape().equals({1}) && + vectorType.getElementType() == r.front(); + }; + if (l.size() == 1 && r.size() == 1 && + (isCompatible(l, r) || isCompatible(r, l))) + return true; + return l == r; } LogicalResult vector::ExtractOp::verify() { diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index bc75e0bbe8b2e..2e224f7f58ebe 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -104,7 +104,7 @@ func @extract_element(%arg0: vector<4x4xf32>) { // ----- func @extract_vector_type(%arg0: index) { - // expected-error@+1 {{expected vector type}} + // expected-error@+1 {{invalid kind of type specified}} %1 = vector.extract %arg0[] : index } From 09193f20a13e8c3f4196dcc0883d74396f44c3cf Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Feb 2022 09:47:08 +0000 Subject: [PATCH 241/748] Revert "Add support for floating-point option `ffp-eval-method` and for" This reverts commit 32b73bc6ab8234b670c34d5ef999300e072cc706. This breaks builds on macOS in some configurations, because __FLT_EVAL_METHOD__ is set to an unexpected value. E.g. 
https://green.lab.llvm.org/green/job/clang-stage1-RA/28282/consoleFull#129538464349ba4694-19c4-4d7e-bec5-911270d8a58c More details available in the review thread https://reviews.llvm.org/D109239 --- clang/docs/LanguageExtensions.rst | 32 ---- clang/docs/UsersManual.rst | 27 --- .../include/clang/Basic/DiagnosticLexKinds.td | 4 - .../clang/Basic/DiagnosticParseKinds.td | 3 - clang/include/clang/Basic/FPOptions.def | 1 - clang/include/clang/Basic/LangOptions.def | 1 - clang/include/clang/Basic/LangOptions.h | 18 -- clang/include/clang/Basic/TargetInfo.h | 6 +- clang/include/clang/Driver/Options.td | 5 - clang/include/clang/Lex/Preprocessor.h | 41 ----- clang/include/clang/Parse/Parser.h | 1 - clang/include/clang/Sema/Sema.h | 14 +- clang/lib/Basic/Targets/OSTargets.h | 4 +- clang/lib/Basic/Targets/X86.h | 14 +- clang/lib/Driver/ToolChains/Clang.cpp | 17 -- clang/lib/Frontend/InitPreprocessor.cpp | 1 + clang/lib/Lex/PPMacroExpansion.cpp | 12 -- clang/lib/Parse/ParsePragma.cpp | 25 +-- clang/lib/Parse/ParseStmt.cpp | 10 -- clang/lib/Sema/Sema.cpp | 21 --- clang/lib/Sema/SemaAttr.cpp | 21 --- clang/lib/Sema/SemaExpr.cpp | 34 ---- .../test/CodeGen/X86/32bit-behavior-no-eval.c | 30 ---- clang/test/CodeGen/X86/32bit-behavior.c | 109 ------------ clang/test/CodeGen/X86/fp-eval-method.c | 20 --- clang/test/CodeGen/flt_eval_macro.cpp | 79 --------- clang/test/CodeGen/fp-floatcontrol-pragma.cpp | 166 +----------------- clang/test/Preprocessor/flt_eval_macro.cpp | 59 ------- clang/test/Preprocessor/init-aarch64.c | 3 + clang/test/Preprocessor/init-arm.c | 5 + clang/test/Preprocessor/init-mips.c | 6 + clang/test/Preprocessor/init-ppc.c | 5 + clang/test/Preprocessor/init-ppc64.c | 4 + clang/test/Preprocessor/init-s390x.c | 1 + clang/test/Preprocessor/init-v7k-compat.c | 1 + clang/test/Preprocessor/init-x86.c | 15 ++ clang/test/Preprocessor/init.c | 11 ++ clang/test/Sema/fp-eval-pragma.cpp | 87 --------- clang/test/Sema/x86-eval-method.c | 18 -- 
clang/test/Sema/x86_64-eval-method.c | 13 -- 40 files changed, 72 insertions(+), 872 deletions(-) delete mode 100644 clang/test/CodeGen/X86/32bit-behavior-no-eval.c delete mode 100644 clang/test/CodeGen/X86/32bit-behavior.c delete mode 100644 clang/test/CodeGen/X86/fp-eval-method.c delete mode 100644 clang/test/CodeGen/flt_eval_macro.cpp delete mode 100644 clang/test/Preprocessor/flt_eval_macro.cpp delete mode 100644 clang/test/Sema/fp-eval-pragma.cpp delete mode 100644 clang/test/Sema/x86-eval-method.c delete mode 100644 clang/test/Sema/x86_64-eval-method.c diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 5249d3f3f7930..f45d88092eb4a 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3907,38 +3907,6 @@ A ``#pragma clang fp`` pragma may contain any number of options: ... } -``#pragma clang fp eval_method`` allows floating-point behavior to be specified -for a section of the source code. This pragma can appear at file or namespace -scope, or at the start of a compound statement (excluding comments). -The pragma is active within the scope of the compound statement. - -When ``pragma clang fp eval_method(source)`` is enabled, the section of code -governed by the pragma behaves as though the command-line option -``-ffp-eval-method=source`` is enabled. Rounds intermediate results to -source-defined precision. - -When ``pragma clang fp eval_method(double)`` is enabled, the section of code -governed by the pragma behaves as though the command-line option -``-ffp-eval-method=double`` is enabled. Rounds intermediate results to -``double`` precision. - -When ``pragma clang fp eval_method(extended)`` is enabled, the section of code -governed by the pragma behaves as though the command-line option -``-ffp-eval-method=extended`` is enabled. Rounds intermediate results to -target-dependent ``long double`` precision. 
In Win32 programming, for instance, -the long double data type maps to the double, 64-bit precision data type. - -The full syntax this pragma supports is -``#pragma clang fp eval_method(source|double|extended)``. - -.. code-block:: c++ - - for(...) { - // The compiler will use long double as the floating-point evaluation - // method. - #pragma clang fp eval_method(extended) - a = b[i] * c[i] + e; - } The ``#pragma float_control`` pragma allows precise floating-point semantics and floating-point exception behavior to be specified diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 4a776eb86775c..981909aa16eaf 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1566,22 +1566,6 @@ Note that floating-point operations performed as part of constant initialization * ``maytrap`` The compiler avoids transformations that may raise exceptions that would not have been raised by the original code. Constant folding performed by the compiler is exempt from this option. * ``strict`` The compiler ensures that all transformations strictly preserve the floating point exception semantics of the original code. -.. option:: -ffp-eval-method= - - Specify the floating-point evaluation method for intermediate results within - a single expression of the code. - - Valid values are: ``source``, ``double``, and ``extended``. - For 64-bit targets, the default value is ``source``. For 32-bit x86 targets - however, in the case of NETBSD 6.99.26 and under, the default value is - ``double``; in the case of NETBSD greater than 6.99.26, with NoSSE, the - default value is ``extended``, with SSE the default value is ``source``. - Details: - - * ``source`` The compiler uses the floating-point type declared in the source program as the evaluation method. - * ``double`` The compiler uses ``double`` as the floating-point evaluation method for all float expressions of type that is narrower than ``double``. 
- * ``extended`` The compiler uses ``long double`` as the floating-point evaluation method for all float expressions of type that is narrower than ``long double``. - .. option:: -f[no-]protect-parens: This option pertains to floating-point types, complex types with @@ -1603,17 +1587,6 @@ Note that floating-point operations performed as part of constant initialization has no effect because the optimizer is prohibited from making unsafe transformations. -.. _FLT_EVAL_METHOD: - -A note about ``__FLT_EVAL_METHOD__`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The macro ``__FLT_EVAL_METHOD__`` will expand to either the value set from the -command line option ``ffp-eval-method`` or to the value from the target info -setting. The ``__FLT_EVAL_METHOD__`` macro cannot expand to the correct -evaluation method in the presence of a ``#pragma`` which alters the evaluation -method. An error is issued if ``__FLT_EVAL_METHOD__`` is expanded inside a scope -modified by ``#pragma clang fp eval_method``. - .. _fp-constant-eval: A note about Floating Point Constant Evaluation diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 0f424b02c812a..a4436208799f9 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -321,10 +321,6 @@ def err_pragma_include_instead_system_reserved : Error< "header '%0' is an implementation detail; #include %select{'%2'|either '%2' " "or '%3'|one of %2}1 instead">; -def err_illegal_use_of_flt_eval_macro : Error< - "'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing " - "'#pragma clang fp eval_method'">; - def pp_poisoning_existing_macro : Warning<"poisoning existing macro">; def pp_out_of_date_dependency : Warning< "current file is older than dependency %0">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index bcf8186896303..e23810f402365 100644 --- 
a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1267,9 +1267,6 @@ def err_pragma_attribute_namespace_on_attribute : Error< def note_pragma_attribute_namespace_on_attribute : Note< "omit the namespace to add attributes to the most-recently" " pushed attribute group">; -def warn_no_support_for_eval_method_source_on_m32 : Warning< - "Setting the floating point evaluation method to `source` on a target" - " without SSE is not supported.">, InGroup; // OpenCL EXTENSION pragma (OpenCL 1.1 [9.1]) def warn_pragma_expected_colon : Warning< diff --git a/clang/include/clang/Basic/FPOptions.def b/clang/include/clang/Basic/FPOptions.def index 224c1827144f5..a93fa475cd5f6 100644 --- a/clang/include/clang/Basic/FPOptions.def +++ b/clang/include/clang/Basic/FPOptions.def @@ -23,5 +23,4 @@ OPTION(NoHonorInfs, bool, 1, NoHonorNaNs) OPTION(NoSignedZero, bool, 1, NoHonorInfs) OPTION(AllowReciprocal, bool, 1, NoSignedZero) OPTION(AllowApproxFunc, bool, 1, AllowReciprocal) -OPTION(FPEvalMethod, LangOptions::FPEvalMethodKind, 2, AllowApproxFunc) #undef OPTION diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 89b11fdea89b2..4651f4fff6aa0 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -301,7 +301,6 @@ BENIGN_ENUM_LANGOPT(DefaultFPContractMode, FPModeKind, 2, FPM_Off, "FP contracti COMPATIBLE_LANGOPT(ExpStrictFP, 1, false, "Enable experimental strict floating point") BENIGN_ENUM_LANGOPT(FPRoundingMode, RoundingMode, 3, RoundingMode::NearestTiesToEven, "FP Rounding Mode type") BENIGN_ENUM_LANGOPT(FPExceptionMode, FPExceptionModeKind, 2, FPE_Ignore, "FP Exception Behavior Mode type") -BENIGN_ENUM_LANGOPT(FPEvalMethod, FPEvalMethodKind, 2, FEM_UnsetOnCommandLine, "FP type used for floating point arithmetic") LANGOPT(NoBitFieldTypeAlign , 1, 0, "bit-field type alignment") LANGOPT(HexagonQdsp6Compat , 1, 0, 
"hexagon-qdsp6 backward compatibility") LANGOPT(ObjCAutoRefCount , 1, 0, "Objective-C automated reference counting") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 2e334e375950e..50c7f038fc6be 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -235,24 +235,6 @@ class LangOptions : public LangOptionsBase { FPE_Strict }; - /// Possible float expression evaluation method choices. - enum FPEvalMethodKind { - /// The evaluation method cannot be determined or is inconsistent for this - /// target. - FEM_Indeterminable = -1, - /// Use the declared type for fp arithmetic. - FEM_Source = 0, - /// Use the type double for fp arithmetic. - FEM_Double = 1, - /// Use extended type for fp arithmetic. - FEM_Extended = 2, - /// Used only for FE option processing; this is only used to indicate that - /// the user did not specify an explicit evaluation method on the command - /// line and so the target should be queried for its default evaluation - /// method instead. - FEM_UnsetOnCommandLine = 3 - }; - /// Possible exception handling behavior. enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 8e18ded7d3765..22918f7e12e84 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -726,11 +726,7 @@ class TargetInfo : public virtual TransferrableTargetInfo, } /// Return the value for the C99 FLT_EVAL_METHOD macro. - virtual LangOptions::FPEvalMethodKind getFPEvalMethod() const { - return LangOptions::FPEvalMethodKind::FEM_Source; - } - - virtual bool supportSourceEvalMethod() const { return true; } + virtual unsigned getFloatEvalMethod() const { return 0; } // getLargeArrayMinWidth/Align - Return the minimum array size that is // 'large' and its alignment. 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index dad2b536db445..76cfdbcd85f26 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1495,11 +1495,6 @@ def : Flag<["-"], "fextended-identifiers">, Group; def : Flag<["-"], "fno-extended-identifiers">, Group, Flags<[Unsupported]>; def fhosted : Flag<["-"], "fhosted">, Group; def fdenormal_fp_math_EQ : Joined<["-"], "fdenormal-fp-math=">, Group, Flags<[CC1Option]>; -def ffp_eval_method_EQ : Joined<["-"], "ffp-eval-method=">, Group, Flags<[CC1Option]>, - HelpText<"Specifies the evaluation method to use for floating-point arithmetic.">, - Values<"source,double,extended">, NormalizedValuesScope<"LangOptions">, - NormalizedValues<["FEM_Source", "FEM_Double", "FEM_Extended"]>, - MarshallingInfoEnum, "FEM_UnsetOnCommandLine">; def ffp_model_EQ : Joined<["-"], "ffp-model=">, Group, Flags<[NoXarchOption]>, HelpText<"Controls the semantics of floating-point calculations.">; def ffp_exception_behavior_EQ : Joined<["-"], "ffp-exception-behavior=">, Group, Flags<[CC1Option]>, diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index f2c84e43ddca3..2802329a60220 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -179,27 +179,12 @@ class Preprocessor { IdentifierInfo *Ident__is_target_vendor; // __is_target_vendor IdentifierInfo *Ident__is_target_os; // __is_target_os IdentifierInfo *Ident__is_target_environment; // __is_target_environment - IdentifierInfo *Ident__FLT_EVAL_METHOD__; // __FLT_EVAL_METHOD // Weak, only valid (and set) while InMacroArgs is true. Token* ArgMacro; SourceLocation DATELoc, TIMELoc; - // FEM_UnsetOnCommandLine means that an explicit evaluation method was - // not specified on the command line. The target is queried to set the - // default evaluation method. 
- LangOptions::FPEvalMethodKind CurrentFPEvalMethod = - LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine; - - // The most recent pragma location where the floating point evaluation - // method was modified. This is used to determine whether the - // 'pragma clang fp eval_method' was used whithin the current scope. - SourceLocation LastFPEvalPragmaLocation; - - LangOptions::FPEvalMethodKind TUFPEvalMethod = - LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine; - // Next __COUNTER__ value, starts at 0. unsigned CounterValue = 0; @@ -2063,32 +2048,6 @@ class Preprocessor { unsigned getCounterValue() const { return CounterValue; } void setCounterValue(unsigned V) { CounterValue = V; } - LangOptions::FPEvalMethodKind getCurrentFPEvalMethod() const { - assert(CurrentFPEvalMethod != LangOptions::FEM_UnsetOnCommandLine && - "FPEvalMethod should be set either from command line or from the " - "target info"); - return CurrentFPEvalMethod; - } - - LangOptions::FPEvalMethodKind getTUFPEvalMethod() const { - return TUFPEvalMethod; - } - - SourceLocation getLastFPEvalPragmaLocation() const { - return LastFPEvalPragmaLocation; - } - - void setCurrentFPEvalMethod(SourceLocation PragmaLoc, - LangOptions::FPEvalMethodKind Val) { - assert(Val != LangOptions::FEM_UnsetOnCommandLine && - "FPEvalMethod should never be set to FEM_UnsetOnCommandLine"); - // This is the location of the '#pragma float_control" where the - // execution state is modifed. - LastFPEvalPragmaLocation = PragmaLoc; - CurrentFPEvalMethod = Val; - TUFPEvalMethod = Val; - } - /// Retrieves the module that we're currently building, if any. 
Module *getCurrentModule(); diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index d2e588992238d..981800a7e2356 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -184,7 +184,6 @@ class Parser : public CodeCompletionHandler { std::unique_ptr PCSectionHandler; std::unique_ptr MSCommentHandler; std::unique_ptr MSDetectMismatchHandler; - std::unique_ptr FPEvalMethodHandler; std::unique_ptr FloatControlHandler; std::unique_ptr MSPointersToMembers; std::unique_ptr MSVtorDisp; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 60ee577fca06a..c1e846c55dee7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1541,16 +1541,19 @@ class Sema final { /// statements. class FPFeaturesStateRAII { public: - FPFeaturesStateRAII(Sema &S); - ~FPFeaturesStateRAII(); + FPFeaturesStateRAII(Sema &S) : S(S), OldFPFeaturesState(S.CurFPFeatures) { + OldOverrides = S.FpPragmaStack.CurrentValue; + } + ~FPFeaturesStateRAII() { + S.CurFPFeatures = OldFPFeaturesState; + S.FpPragmaStack.CurrentValue = OldOverrides; + } FPOptionsOverride getOverrides() { return OldOverrides; } private: Sema& S; FPOptions OldFPFeaturesState; FPOptionsOverride OldOverrides; - LangOptions::FPEvalMethodKind OldEvalMethod; - SourceLocation OldFPPragmaLocation; }; void addImplicitTypedef(StringRef Name, QualType T); @@ -10128,9 +10131,6 @@ class Sema final { !CurFPFeatures.getAllowApproxFunc(); } - void ActOnPragmaFPEvalMethod(SourceLocation Loc, - LangOptions::FPEvalMethodKind Value); - /// ActOnPragmaFloatControl - Call on well-formed \#pragma float_control void ActOnPragmaFloatControl(SourceLocation Loc, PragmaMsStackAction Action, PragmaFloatControlKind Value); diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index f61652d285a89..3c1830d5f8e89 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ 
-749,9 +749,7 @@ class AIXTargetInfo : public OSTargetInfo { } // AIX sets FLT_EVAL_METHOD to be 1. - LangOptions::FPEvalMethodKind getFPEvalMethod() const override { - return LangOptions::FPEvalMethodKind::FEM_Double; - } + unsigned getFloatEvalMethod() const override { return 1; } bool defaultsToAIXPowerAlignment() const override { return true; } }; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index e0bb3c344c5b6..d1b66432e38b4 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -168,15 +168,11 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { return LongDoubleFormat == &llvm::APFloat::IEEEquad() ? "g" : "e"; } - LangOptions::FPEvalMethodKind getFPEvalMethod() const override { + unsigned getFloatEvalMethod() const override { // X87 evaluates with 80 bits "long double" precision. - return SSELevel == NoSSE ? LangOptions::FPEvalMethodKind::FEM_Extended - : LangOptions::FPEvalMethodKind::FEM_Source; + return SSELevel == NoSSE ? 2 : 0; } - // EvalMethod `source` is not supported for targets with `NoSSE` feature. - bool supportSourceEvalMethod() const override { return SSELevel > NoSSE; } - ArrayRef getGCCRegNames() const override; ArrayRef getGCCRegAliases() const override { @@ -475,13 +471,13 @@ class LLVM_LIBRARY_VISIBILITY NetBSDI386TargetInfo NetBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : NetBSDTargetInfo(Triple, Opts) {} - LangOptions::FPEvalMethodKind getFPEvalMethod() const override { + unsigned getFloatEvalMethod() const override { VersionTuple OsVersion = getTriple().getOSVersion(); // New NetBSD uses the default rounding mode. if (OsVersion >= VersionTuple(6, 99, 26) || OsVersion.getMajor() == 0) - return X86_32TargetInfo::getFPEvalMethod(); + return X86_32TargetInfo::getFloatEvalMethod(); // NetBSD before 6.99.26 defaults to "double" rounding. 
- return LangOptions::FPEvalMethodKind::FEM_Double; + return 1; } }; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5877a33df1017..a16175ebebbca 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2726,8 +2726,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, StringRef FPModel = ""; // -ffp-exception-behavior options: strict, maytrap, ignore StringRef FPExceptionBehavior = ""; - // -ffp-eval-method options: double, extended, source - StringRef FPEvalMethod = ""; const llvm::DenormalMode DefaultDenormalFPMath = TC.getDefaultDenormalModeForType(Args, JA); const llvm::DenormalMode DefaultDenormalFP32Math = @@ -2923,18 +2921,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, break; } - // Validate and pass through -ffp-eval-method option. - case options::OPT_ffp_eval_method_EQ: { - StringRef Val = A->getValue(); - if (Val.equals("double") || Val.equals("extended") || - Val.equals("source")) - FPEvalMethod = Val; - else - D.Diag(diag::err_drv_unsupported_option_argument) - << A->getOption().getName() << Val; - break; - } - case options::OPT_ffinite_math_only: HonorINFs = false; HonorNaNs = false; @@ -3090,9 +3076,6 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, CmdArgs.push_back(Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior)); - if (!FPEvalMethod.empty()) - CmdArgs.push_back(Args.MakeArgString("-ffp-eval-method=" + FPEvalMethod)); - ParseMRecip(D, Args, CmdArgs); // -ffast-math enables the __FAST_MATH__ preprocessor macro, but check for the diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index ff507e2c00aaa..bf8a0b2abe22e 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1136,6 +1136,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI, } // Macros 
to control C99 numerics and + Builder.defineMacro("__FLT_EVAL_METHOD__", Twine(TI.getFloatEvalMethod())); Builder.defineMacro("__FLT_RADIX__", "2"); Builder.defineMacro("__DECIMAL_DIG__", "__LDBL_DECIMAL_DIG__"); diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp index 82fc57c8f2e88..a29ff215d7ea0 100644 --- a/clang/lib/Lex/PPMacroExpansion.cpp +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -342,7 +342,6 @@ void Preprocessor::RegisterBuiltinMacros() { Ident__TIME__ = RegisterBuiltinMacro(*this, "__TIME__"); Ident__COUNTER__ = RegisterBuiltinMacro(*this, "__COUNTER__"); Ident_Pragma = RegisterBuiltinMacro(*this, "_Pragma"); - Ident__FLT_EVAL_METHOD__ = RegisterBuiltinMacro(*this, "__FLT_EVAL_METHOD__"); // C++ Standing Document Extensions. if (getLangOpts().CPlusPlus) @@ -1575,17 +1574,6 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) { // Surround the string with " and strip the trailing newline. OS << '"' << StringRef(Result).drop_back() << '"'; Tok.setKind(tok::string_literal); - } else if (II == Ident__FLT_EVAL_METHOD__) { - // __FLT_EVAL_METHOD__ is set to the default value. - OS << getTUFPEvalMethod(); - // __FLT_EVAL_METHOD__ expands to a simple numeric value. - Tok.setKind(tok::numeric_constant); - if (getLastFPEvalPragmaLocation().isValid()) { - // The program is ill-formed. The value of __FLT_EVAL_METHOD__ is altered - // by the pragma. - Diag(Tok, diag::err_illegal_use_of_flt_eval_macro); - Diag(getLastFPEvalPragmaLocation(), diag::note_pragma_entered_here); - } } else if (II == Ident__COUNTER__) { // __COUNTER__ expands to a simple numeric value. OS << CounterValue++; diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp index 5c6aa0e47635b..27e8501278626 100644 --- a/clang/lib/Parse/ParsePragma.cpp +++ b/clang/lib/Parse/ParsePragma.cpp @@ -3028,13 +3028,12 @@ void PragmaOptimizeHandler::HandlePragma(Preprocessor &PP, namespace { /// Used as the annotation value for tok::annot_pragma_fp. 
struct TokFPAnnotValue { - enum FlagKinds { Contract, Reassociate, Exceptions, EvalMethod }; + enum FlagKinds { Contract, Reassociate, Exceptions }; enum FlagValues { On, Off, Fast }; llvm::Optional ContractValue; llvm::Optional ReassociateValue; llvm::Optional ExceptionsValue; - llvm::Optional EvalMethodValue; }; } // end anonymous namespace @@ -3061,7 +3060,6 @@ void PragmaFPHandler::HandlePragma(Preprocessor &PP, .Case("contract", TokFPAnnotValue::Contract) .Case("reassociate", TokFPAnnotValue::Reassociate) .Case("exceptions", TokFPAnnotValue::Exceptions) - .Case("eval_method", TokFPAnnotValue::EvalMethod) .Default(None); if (!FlagKind) { PP.Diag(Tok.getLocation(), diag::err_pragma_fp_invalid_option) @@ -3076,11 +3074,8 @@ void PragmaFPHandler::HandlePragma(Preprocessor &PP, return; } PP.Lex(Tok); - bool isEvalMethodDouble = - Tok.is(tok::kw_double) && FlagKind == TokFPAnnotValue::EvalMethod; - // Don't diagnose if we have an eval_metod pragma with "double" kind. - if (Tok.isNot(tok::identifier) && !isEvalMethodDouble) { + if (Tok.isNot(tok::identifier)) { PP.Diag(Tok.getLocation(), diag::err_pragma_fp_invalid_argument) << PP.getSpelling(Tok) << OptionInfo->getName() << static_cast(*FlagKind); @@ -3126,19 +3121,6 @@ void PragmaFPHandler::HandlePragma(Preprocessor &PP, << PP.getSpelling(Tok) << OptionInfo->getName() << *FlagKind; return; } - } else if (FlagKind == TokFPAnnotValue::EvalMethod) { - AnnotValue->EvalMethodValue = - llvm::StringSwitch>( - II->getName()) - .Case("source", LangOptions::FPEvalMethodKind::FEM_Source) - .Case("double", LangOptions::FPEvalMethodKind::FEM_Double) - .Case("extended", LangOptions::FPEvalMethodKind::FEM_Extended) - .Default(llvm::None); - if (!AnnotValue->EvalMethodValue) { - PP.Diag(Tok.getLocation(), diag::err_pragma_fp_invalid_argument) - << PP.getSpelling(Tok) << OptionInfo->getName() << *FlagKind; - return; - } } PP.Lex(Tok); @@ -3241,9 +3223,6 @@ void Parser::HandlePragmaFP() { if (AnnotValue->ExceptionsValue) 
Actions.ActOnPragmaFPExceptions(Tok.getLocation(), *AnnotValue->ExceptionsValue); - if (AnnotValue->EvalMethodValue) - Actions.ActOnPragmaFPEvalMethod(Tok.getLocation(), - *AnnotValue->EvalMethodValue); ConsumeAnnotationToken(); } diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index cadedf6d98dbd..ee07775b6346f 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -1153,16 +1153,6 @@ StmtResult Parser::ParseCompoundStatementBody(bool isStmtExpr) { if (R.isUsable()) Stmts.push_back(R.get()); } - // Warn the user that using option `-ffp-eval-method=source` on a - // 32-bit target and feature `sse` disabled, or using - // `pragma clang fp eval_method=source` and feature `sse` disabled, is not - // supported. - if (!PP.getTargetInfo().supportSourceEvalMethod() && - (PP.getLastFPEvalPragmaLocation().isValid() || - PP.getCurrentFPEvalMethod() == - LangOptions::FPEvalMethodKind::FEM_Source)) - Diag(Tok.getLocation(), - diag::warn_no_support_for_eval_method_source_on_m32); SourceLocation CloseLoc = Tok.getLocation(); diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index db3eda622639f..7b57c8da4e9cc 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -242,15 +242,6 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, SemaPPCallbackHandler = Callbacks.get(); PP.addPPCallbacks(std::move(Callbacks)); SemaPPCallbackHandler->set(*this); - if (getLangOpts().getFPEvalMethod() == LangOptions::FEM_UnsetOnCommandLine) - // Use setting from TargetInfo. - PP.setCurrentFPEvalMethod(SourceLocation(), - ctxt.getTargetInfo().getFPEvalMethod()); - else - // Set initial value of __FLT_EVAL_METHOD__ from the command line. - PP.setCurrentFPEvalMethod(SourceLocation(), - getLangOpts().getFPEvalMethod()); - CurFPFeatures.setFPEvalMethod(PP.getCurrentFPEvalMethod()); } // Anchor Sema's type info to this TU. 
@@ -2639,15 +2630,3 @@ const llvm::MapVector & Sema::getMismatchingDeleteExpressions() const { return DeleteExprs; } - -Sema::FPFeaturesStateRAII::FPFeaturesStateRAII(Sema &S) - : S(S), OldFPFeaturesState(S.CurFPFeatures), - OldOverrides(S.FpPragmaStack.CurrentValue), - OldEvalMethod(S.PP.getCurrentFPEvalMethod()), - OldFPPragmaLocation(S.PP.getLastFPEvalPragmaLocation()) {} - -Sema::FPFeaturesStateRAII::~FPFeaturesStateRAII() { - S.CurFPFeatures = OldFPFeaturesState; - S.FpPragmaStack.CurrentValue = OldOverrides; - S.PP.setCurrentFPEvalMethod(OldFPPragmaLocation, OldEvalMethod); -} diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp index d623060fd10cf..38e6e60af90db 100644 --- a/clang/lib/Sema/SemaAttr.cpp +++ b/clang/lib/Sema/SemaAttr.cpp @@ -470,27 +470,6 @@ void Sema::ActOnPragmaDetectMismatch(SourceLocation Loc, StringRef Name, Consumer.HandleTopLevelDecl(DeclGroupRef(PDMD)); } -void Sema::ActOnPragmaFPEvalMethod(SourceLocation Loc, - LangOptions::FPEvalMethodKind Value) { - FPOptionsOverride NewFPFeatures = CurFPFeatureOverrides(); - switch (Value) { - default: - llvm_unreachable("invalid pragma eval_method kind"); - case LangOptions::FEM_Source: - NewFPFeatures.setFPEvalMethodOverride(LangOptions::FEM_Source); - break; - case LangOptions::FEM_Double: - NewFPFeatures.setFPEvalMethodOverride(LangOptions::FEM_Double); - break; - case LangOptions::FEM_Extended: - NewFPFeatures.setFPEvalMethodOverride(LangOptions::FEM_Extended); - break; - } - FpPragmaStack.Act(Loc, PSK_Set, StringRef(), NewFPFeatures); - CurFPFeatures = NewFPFeatures.applyOverrides(getLangOpts()); - PP.setCurrentFPEvalMethod(Loc, Value); -} - void Sema::ActOnPragmaFloatControl(SourceLocation Loc, PragmaMsStackAction Action, PragmaFloatControlKind Value) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 88fc89bec629a..22b3f371afe79 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -773,40 +773,6 @@ ExprResult 
Sema::UsualUnaryConversions(Expr *E) { QualType Ty = E->getType(); assert(!Ty.isNull() && "UsualUnaryConversions - missing type"); - LangOptions::FPEvalMethodKind EvalMethod = CurFPFeatures.getFPEvalMethod(); - if (EvalMethod != LangOptions::FEM_Source && Ty->isFloatingType() && - (getLangOpts().getFPEvalMethod() != - LangOptions::FPEvalMethodKind::FEM_UnsetOnCommandLine || - PP.getLastFPEvalPragmaLocation().isValid())) { - switch (EvalMethod) { - default: - llvm_unreachable("Unrecognized float evaluation method"); - break; - case LangOptions::FEM_UnsetOnCommandLine: - llvm_unreachable("Float evaluation method should be set by now"); - break; - case LangOptions::FEM_Double: - if (Context.getFloatingTypeOrder(Context.DoubleTy, Ty) > 0) - // Widen the expression to double. - return Ty->isComplexType() - ? ImpCastExprToType(E, - Context.getComplexType(Context.DoubleTy), - CK_FloatingComplexCast) - : ImpCastExprToType(E, Context.DoubleTy, CK_FloatingCast); - break; - case LangOptions::FEM_Extended: - if (Context.getFloatingTypeOrder(Context.LongDoubleTy, Ty) > 0) - // Widen the expression to long double. - return Ty->isComplexType() - ? 
ImpCastExprToType( - E, Context.getComplexType(Context.LongDoubleTy), - CK_FloatingComplexCast) - : ImpCastExprToType(E, Context.LongDoubleTy, - CK_FloatingCast); - break; - } - } - // Half FP have to be promoted to float unless it is natively supported if (Ty->isHalfType() && !getLangOpts().NativeHalfType) return ImpCastExprToType(Res.get(), Context.FloatTy, CK_FloatingCast); diff --git a/clang/test/CodeGen/X86/32bit-behavior-no-eval.c b/clang/test/CodeGen/X86/32bit-behavior-no-eval.c deleted file mode 100644 index d040e827ce31c..0000000000000 --- a/clang/test/CodeGen/X86/32bit-behavior-no-eval.c +++ /dev/null @@ -1,30 +0,0 @@ -// SSE -// RUN: %clang_cc1 \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ -// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=CHECK %s - -// NO SSE -// RUN: %clang_cc1 \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=CHECK %s - -// NO SSE Fast Math -// RUN: %clang_cc1 \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -ffast-math -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-FM %s - -float addit(float a, float b, float c) { - // CHECK: load float, float* - // CHECK: load float, float* - // CHECK: fadd float - // CHECK: load float, float* - // CHECK: fadd float - - // CHECK-FM: load float, float* - // CHECK-FM: load float, float* - // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn float - // CHECK-FM: load float, float* - // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn float - - return a + b + c; -} diff --git a/clang/test/CodeGen/X86/32bit-behavior.c b/clang/test/CodeGen/X86/32bit-behavior.c deleted file mode 100644 index a7e0f008c9f35..0000000000000 --- a/clang/test/CodeGen/X86/32bit-behavior.c +++ /dev/null @@ -1,109 +0,0 @@ -// SSE -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ -// RUN: 
-emit-llvm -o - %s -ffp-eval-method=source \ -// RUN: | FileCheck -check-prefix=CHECK-SRC %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ -// RUN: | FileCheck -check-prefix=CHECK-DBL %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=extended \ -// RUN: | FileCheck -check-prefix=CHECK-DBL %s - -// SSE Fast Math -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=source \ -// RUN: -ffast-math | FileCheck -check-prefix=CHECK-FM-SRC %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ -// RUN: -ffast-math | FileCheck -check-prefix=CHECK-FM %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature +sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=extended \ -// RUN: -ffast-math | FileCheck -check-prefix=CHECK-FM %s - -// NO SSE -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=source \ -// RUN: | FileCheck -check-prefix=CHECK-SRC %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ -// RUN: | FileCheck -check-prefix=CHECK-DBL %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: 
-emit-llvm -o - %s -ffp-eval-method=extended \ -// RUN: | FileCheck -check-prefix=CHECK-DBL %s - -// NO SSE Fast Math -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=source \ -// RUN: -ffast-math | FileCheck -check-prefix=CHECK %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=double \ -// RUN: -ffast-math | FileCheck -check-prefix=CHECK-DBL-FM %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=extended \ -// RUN: -ffast-math | FileCheck -check-prefix=CHECK-DBL-FM %s - -float addit(float a, float b, float c) { - // CHECK-SRC: load float, float* - // CHECK-SRC: load float, float* - // CHECK-SRC: fadd float - // CHECK-SRC: load float, float* - // CHECK-SRC: fadd float - - // CHECK-FM-SRC: load float, float* - // CHECK-FM-SRC: load float, float* - // CHECK-FM-SRC: fadd reassoc nnan ninf nsz arcp afn float - // CHECK-FM-SRC: load float, float* - // CHECK-FM-SRC: fadd reassoc nnan ninf nsz arcp afn float - - // CHECK-FM: load float, float* - // CHECK-FM: fpext float {{.*}} to double - // CHECK-FM: load float, float* - // CHECK-FM: fpext float {{.*}} to double - // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn double - // CHECK-FM: load float, float* - // CHECK-FM: fadd reassoc nnan ninf nsz arcp afn double - // CHECK-FM: fptrunc double {{.*}} to float - - // CHECK-DBL: load float, float* - // CHECK-DBL: fpext float {{.*}} to double - // CHECK-DBL: load float, float* - // CHECK-DBL: fpext float {{.*}} to double - // CHECK-DBL: fadd double - // CHECK-DBL: load float, float* - // CHECK-DBL: fpext float {{.*}} to double - // CHECK-DBL: fadd double - // CHECK-DBL: fptrunc double 
{{.*}} to float - - // CHECK-DBL-FM: load float, float* - // CHECK-DBL-FM: fpext float {{.*}} to double - // CHECK-DBL-FM: load float, float* - // CHECK-DBL-FM: fpext float {{.*}} to double - // CHECK-DBL-FM: fadd reassoc nnan ninf nsz arcp afn double - // CHECK-DBL-FM: load float, float* - // CHECK-DBL-FM: fpext float {{.*}} to double - // CHECK-DBL-FM: fadd reassoc nnan ninf nsz arcp afn double - // CHECK-DBL-FM: fptrunc double {{.*}} to float - - // CHECK: ret float - return a + b + c; -} diff --git a/clang/test/CodeGen/X86/fp-eval-method.c b/clang/test/CodeGen/X86/fp-eval-method.c deleted file mode 100644 index 5bfc3701050f5..0000000000000 --- a/clang/test/CodeGen/X86/fp-eval-method.c +++ /dev/null @@ -1,20 +0,0 @@ -// RUN: %clang_cc1 -triple i386-unknown-netbsd6 -emit-llvm -o - %s \ -// RUN: | FileCheck %s -check-prefixes=CHECK - -// RUN: %clang_cc1 -triple i386-unknown-netbsd7 -emit-llvm -o - %s \ -// RUN: | FileCheck %s -check-prefixes=CHECK-EXT - -// RUN: %clang_cc1 -triple i386--linux -emit-llvm -o - %s \ -// RUN: | FileCheck %s -check-prefixes=CHECK-EXT - -float f(float x, float y) { - // CHECK: define{{.*}} float @f - // CHECK: fadd float - return 2.0f + x + y; -} - -int getEvalMethod() { - // CHECK: ret i32 1 - // CHECK-EXT: ret i32 2 - return __FLT_EVAL_METHOD__; -} diff --git a/clang/test/CodeGen/flt_eval_macro.cpp b/clang/test/CodeGen/flt_eval_macro.cpp deleted file mode 100644 index aa7455f0efe0b..0000000000000 --- a/clang/test/CodeGen/flt_eval_macro.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// RUN: %clang_cc1 -fexperimental-strict-floating-point -DEXCEPT=1 \ -// RUN: -fcxx-exceptions -triple x86_64-linux-gnu -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=CHECK-SRC %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=source \ -// RUN: | FileCheck -check-prefix=CHECK-SRC %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu 
-emit-llvm -o - %s -ffp-eval-method=double \ -// RUN: | FileCheck -check-prefixes=CHECK-DBL %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=extended \ -// RUN: | FileCheck -check-prefixes=CHECK-EXT-FLT %s - -// RUN: %clang_cc1 -triple powerpc-unknown-aix -emit-llvm -o - %s \ -// RUN: | FileCheck %s -check-prefix=CHECK-DBL-PPC - -// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple i386-linux-gnu \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=extended -mlong-double-80 \ -// RUN: | FileCheck %s -check-prefix=CHECK-EXT-FLT - -int getFEM() { - // LABEL: define {{.*}}getFEM{{.*}} - return __FLT_EVAL_METHOD__; - // CHECK-SRC: ret {{.*}} 0 - // CHECK-DBL: ret {{.*}} 1 - // CHECK-DBL-PPC: ret {{.*}} 1 - // CHECK-EXT-FLT: ret {{.*}} 2 -} - -float func() { - // LABEL: define {{.*}}@_Z4func{{.*}} - float X = 100.0f; - float Y = -45.3f; - float Z = 393.78f; - float temp; -#if __FLT_EVAL_METHOD__ == 0 - temp = X + Y + Z; -#elif __FLT_EVAL_METHOD__ == 1 - temp = X * Y * Z; -#elif __FLT_EVAL_METHOD__ == 2 - temp = X * Y - Z; -#endif - // CHECK-SRC: load float, float* - // CHECK-SRC: load float, float* - // CHECK-SRC: fadd float - // CHECK-SRC: load float, float* - // CHECK-SRC: fadd float - - // CHECK-DBL: load float, float* - // CHECK-DBL: fpext float - // CHECK-DBL: load float, float* - // CHECK-DBL: fpext float - // CHECK-DBL: fmul double - // CHECK-DBL: load float, float* - // CHECK-DBL: fpext float - // CHECK-DBL: fmul double - // CHECK-DBL: fptrunc double - - // CHECK-EXT-FLT: load float, float* - // CHECK-EXT-FLT: fpext float - // CHECK-EXT-FLT: load float, float* - // CHECK-EXT-FLT: fpext float - // CHECK-EXT-FLT: fmul x86_fp80 - // CHECK-EXT-FLT: load float, float* - // CHECK-EXT-FLT: fpext float - // CHECK-EXT-FLT: fsub x86_fp80 - // CHECK-EXT-FLT: fptrunc x86_fp80 - - // CHECK-DBL-PPC: load float, float* - // CHECK-DBL-PPC: load float, float* - // CHECK-DBL-PPC: fmul float 
- // CHECK-DBL-PPC: load float, float* - // CHECK-DBL-PPC: fmul float - - return temp; -} diff --git a/clang/test/CodeGen/fp-floatcontrol-pragma.cpp b/clang/test/CodeGen/fp-floatcontrol-pragma.cpp index 966eaf6053970..ef29d24de1dbc 100644 --- a/clang/test/CodeGen/fp-floatcontrol-pragma.cpp +++ b/clang/test/CodeGen/fp-floatcontrol-pragma.cpp @@ -1,53 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-strict-floating-point -DEXCEPT=1 \ -// RUN: -fcxx-exceptions -triple x86_64-linux-gnu -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=CHECK-NS %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s \ -// RUN: -check-prefixes=CHECK-DEFAULT,CHECK-CONST-ARGS - -// RUN: %clang_cc1 -fexperimental-strict-floating-point -DFENV_ON=1 \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=CHECK-FENV %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point -DNF128 \ -// RUN: -triple %itanium_abi_triple -O3 -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=CHECK-O3 %s - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=source \ -// RUN: | FileCheck %s -check-prefixes=CHECK-SOURCE,CHECK-CONST-ARGS - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=double \ -// RUN: | FileCheck %s -check-prefixes=CHECK-DOUBLE,CHECK-CONST-ARGS - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - %s -ffp-eval-method=extended \ -// RUN: -mlong-double-80 | FileCheck %s \ -// RUN: -check-prefixes=CHECK-EXTENDED,CHECK-CONST-ARGS - -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-linux-gnu -emit-llvm -o - %s -ffp-eval-method=source \ -// RUN: | FileCheck %s -check-prefix=CHECK-SOURCE - -// RUN: %clang_cc1 
-fexperimental-strict-floating-point -triple i386-linux-gnu \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=double | FileCheck %s \ -// RUN: -check-prefix=CHECK-DOUBLE - -// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple i386-linux-gnu \ -// RUN: -emit-llvm -o - %s -ffp-eval-method=extended -mlong-double-80 \ -// RUN: | FileCheck %s -check-prefix=CHECK-EXTENDED - -// RUN: %clang_cc1 -triple powerpc-unknown-aix -DNF128 -emit-llvm -o - %s \ -// RUN: | FileCheck %s -check-prefix=CHECK-AIX - -bool f() { - // CHECK: define {{.*}}f{{.*}} - return __FLT_EVAL_METHOD__ < 0 && - __FLT_EVAL_METHOD__ == -1; - // CHECK: ret {{.*}} true -} +// RUN: %clang_cc1 -fexperimental-strict-floating-point -DEXCEPT=1 -fcxx-exceptions -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NS %s +// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fexperimental-strict-floating-point -DFENV_ON=1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-FENV %s +// RUN: %clang_cc1 -fexperimental-strict-floating-point -triple %itanium_abi_triple -O3 -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-O3 %s // Verify float_control(precise, off) enables fast math flags on fp operations. 
float fp_precise_1(float a, float b, float c) { @@ -275,115 +229,3 @@ float try_lam(float x, unsigned n) { result = x + t; return result; } - -float mySub(float x, float y) { - // CHECK: define {{.*}}float {{.*}}mySub{{.*}} - // CHECK-NS: fsub float - // CHECK-SOURCE: fsub float - // CHECK-DOUBLE: fpext float - // CHECK-DOUBLE: fpext float - // CHECK-DOUBLE: fsub double - // CHECK-DOUBLE: fptrunc double {{.*}} to float - // CHECK-EXTENDED: fpext float - // CHECK-EXTENDED: fpext float - // CHECK-EXTENDED: fsub double - // CHECK-EXTENDED: fptrunc double {{.*}} to float - return x - y; -} - -float mySubSource(float x, float y) { -// CHECK: define {{.*}}float {{.*}}mySubSource{{.*}} -#pragma clang fp eval_method(source) - return x - y; - // CHECK: fsub float -} - -float mySubExtended(float x, float y) { -// CHECK: define {{.*}}float {{.*}}mySubExtended{{.*}} -#pragma clang fp eval_method(extended) - return x - y; - // CHECK: fpext float - // CHECK: fpext float - // CHECK: fsub x86_fp80 - // CHECK: fptrunc x86_fp80 {{.*}} to float - // CHECK-AIX: fsub double - // CHECK-AIX: fptrunc double -} - -float mySubDouble(float x, float y) { -// CHECK: define {{.*}}float {{.*}}mySubDouble{{.*}} -#pragma clang fp eval_method(double) - return x - y; - // CHECK: fpext float - // CHECK: fpext float - // CHECK: fsub double - // CHECK: fptrunc double {{.*}} to float -} - -#ifndef NF128 -__float128 mySub128(__float128 x, __float128 y) { - // CHECK: define {{.*}}mySub128{{.*}} - // Expect no fpext since fp128 is already widest - // CHECK: load fp128 - // CHECK-NEXT: load fp128 - // CHECK-NEXT: fsub fp128 - // CHECK-NEXT: ret fp128 - return x - y; -} -#endif - -void mySubfp16(__fp16 *res, __fp16 *x, __fp16 *y) { - // CHECK: define {{.*}}mySubfp16{{.*}} - *res = *x - *y; - // CHECK: load half - // CHECK-NEXT: load half - // CHECK-NEXT: fpext half{{.*}} - // CHECK-NEXT: load half - // CHECK-NEXT: load half - // CHECK-NS: fpext half{{.*}} to float - // CHECK-DEFAULT: fpext half{{.*}} to 
float - // CHECK-DOUBLE: fpext half{{.*}} to float - // CHECK-EXTENDED: fpext half{{.*}} to float - // CHECK-NEXT: fsub - // CHECK-NEXT: fptrunc {{.*}}to half - // CHECK-NS: fptrunc float {{.*}} to half - // CHECK-DOUBLE: fptrunc float {{.*}} to half - // CHECK-EXTENDED: fptrunc float {{.*}} to half -} - -float Div(float x, float y, float z) { - // CHECK: define{{.*}}float {{.*}}Div{{.*}} - // CHECK-CONST-ARGS: fdiv float - return x / (y / z); -} - -float DivExtended(float x, float y, float z) { -// CHECK: define{{.*}}float {{.*}}DivExtended{{.*}} -#pragma clang fp eval_method(extended) - // CHECK-CONST-ARGS: fdiv x86_fp80 - // CHECK-CONST-ARGS: fptrunc x86_fp80 - return x / (y / z); -} - -float DivDouble(float x, float y, float z) { -// CHECK: define{{.*}}float {{.*}}DivDouble{{.*}} -#pragma clang fp eval_method(double) - // CHECK-CONST-ARGS: fdiv double - // CHECK-CONST-ARGS: fptrunc double - return x / (y / z); -} - -float DivSource(float x, float y, float z) { -// CHECK: define{{.*}}float {{.*}}DivSource{{.*}} -#pragma clang fp eval_method(source) - // CHECK-CONST-ARGS: fdiv float - return x / (y / z); -} - -int main() { - float f = Div(4.2f, 1.0f, 3.0f); - float fextended = DivExtended(4.2f, 1.0f, 3.0f); - float fdouble = DivDouble(4.2f, 1.0f, 3.0f); - float fsource = DivSource(4.2f, 1.0f, 3.0f); - // CHECK: store float -} diff --git a/clang/test/Preprocessor/flt_eval_macro.cpp b/clang/test/Preprocessor/flt_eval_macro.cpp deleted file mode 100644 index 47f2592e261bd..0000000000000 --- a/clang/test/Preprocessor/flt_eval_macro.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// RUN: %clang_cc1 -E -dM %s -o - | FileCheck %s -strict-whitespace - -#ifdef __FLT_EVAL_METHOD__ -#if __FLT_EVAL_METHOD__ == 3 -#define __GLIBC_FLT_EVAL_METHOD 2 -#else -#define __GLIBC_FLT_EVAL_METHOD __FLT_EVAL_METHOD__ -#endif -#elif defined __x86_64__ -#define __GLIBC_FLT_EVAL_METHOD 0 -#else -#define __GLIBC_FLT_EVAL_METHOD 2 -#endif - -#if __GLIBC_FLT_EVAL_METHOD == 0 || __GLIBC_FLT_EVAL_METHOD == 
16 -#define Name "One" -#elif __GLIBC_FLT_EVAL_METHOD == 1 -#define Name "Two" -#elif __GLIBC_FLT_EVAL_METHOD == 2 -#define Name "Unset on command line" -#elif __GLIBC_FLT_EVAL_METHOD == 32 -#define Name "Four" -#elif __GLIBC_FLT_EVAL_METHOD == 33 -#define Name "Five" -#elif __GLIBC_FLT_EVAL_METHOD == 64 -#define Name "Six" -#elif __GLIBC_FLT_EVAL_METHOD == 65 -#define Name "Seven" -#elif __GLIBC_FLT_EVAL_METHOD == 128 -#define Name "Eight" -#elif __GLIBC_FLT_EVAL_METHOD == 129 -#define Name "Nine" -#else -#error "Unknown __GLIBC_FLT_EVAL_METHOD" -#endif - -int foo() { - // CHECK: #define Name "Unset on command line" - return Name; -} - -#if __FLT_EVAL_METHOD__ == 3 -#define Val "val0" -#endif - -#pragma fp eval_method(double) - -#if __FLT_EVAL_METHOD__ == 0 -#define Val "val1" -#elif __FLT_EVAL_METHOD__ == 1 -#define Val "val2" -#elif __FLT_EVAL_METHOD__ == 2 -#define Val "val3" -#endif - -int goo() { - // CHECK: #define Val "val0" - return Name; -} diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 66cab8b1f8d04..f6809d8d9b48f 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -93,6 +93,7 @@ // AARCH64-NEXT: #define __FLT_DENORM_MIN__ 1.40129846e-45F // AARCH64-NEXT: #define __FLT_DIG__ 6 // AARCH64-NEXT: #define __FLT_EPSILON__ 1.19209290e-7F +// AARCH64-NEXT: #define __FLT_EVAL_METHOD__ 0 // AARCH64-NEXT: #define __FLT_HAS_DENORM__ 1 // AARCH64-NEXT: #define __FLT_HAS_INFINITY__ 1 // AARCH64-NEXT: #define __FLT_HAS_QUIET_NAN__ 1 @@ -387,6 +388,7 @@ // AARCH64-DARWIN: #define __FLT_DENORM_MIN__ 1.40129846e-45F // AARCH64-DARWIN: #define __FLT_DIG__ 6 // AARCH64-DARWIN: #define __FLT_EPSILON__ 1.19209290e-7F +// AARCH64-DARWIN: #define __FLT_EVAL_METHOD__ 0 // AARCH64-DARWIN: #define __FLT_HAS_DENORM__ 1 // AARCH64-DARWIN: #define __FLT_HAS_INFINITY__ 1 // AARCH64-DARWIN: #define __FLT_HAS_QUIET_NAN__ 1 @@ -602,6 +604,7 @@ // AARCH64-MSVC: #define __FLT_DENORM_MIN__ 
1.40129846e-45F // AARCH64-MSVC: #define __FLT_DIG__ 6 // AARCH64-MSVC: #define __FLT_EPSILON__ 1.19209290e-7F +// AARCH64-MSVC: #define __FLT_EVAL_METHOD__ 0 // AARCH64-MSVC: #define __FLT_HAS_DENORM__ 1 // AARCH64-MSVC: #define __FLT_HAS_INFINITY__ 1 // AARCH64-MSVC: #define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-arm.c b/clang/test/Preprocessor/init-arm.c index 2d1503c18560e..32eb2c513f8b0 100644 --- a/clang/test/Preprocessor/init-arm.c +++ b/clang/test/Preprocessor/init-arm.c @@ -35,6 +35,7 @@ // ARM:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARM:#define __FLT_DIG__ 6 // ARM:#define __FLT_EPSILON__ 1.19209290e-7F +// ARM:#define __FLT_EVAL_METHOD__ 0 // ARM:#define __FLT_HAS_DENORM__ 1 // ARM:#define __FLT_HAS_INFINITY__ 1 // ARM:#define __FLT_HAS_QUIET_NAN__ 1 @@ -234,6 +235,7 @@ // ARM-BE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARM-BE:#define __FLT_DIG__ 6 // ARM-BE:#define __FLT_EPSILON__ 1.19209290e-7F +// ARM-BE:#define __FLT_EVAL_METHOD__ 0 // ARM-BE:#define __FLT_HAS_DENORM__ 1 // ARM-BE:#define __FLT_HAS_INFINITY__ 1 // ARM-BE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -426,6 +428,7 @@ // ARMEABISOFTFP:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARMEABISOFTFP:#define __FLT_DIG__ 6 // ARMEABISOFTFP:#define __FLT_EPSILON__ 1.19209290e-7F +// ARMEABISOFTFP:#define __FLT_EVAL_METHOD__ 0 // ARMEABISOFTFP:#define __FLT_HAS_DENORM__ 1 // ARMEABISOFTFP:#define __FLT_HAS_INFINITY__ 1 // ARMEABISOFTFP:#define __FLT_HAS_QUIET_NAN__ 1 @@ -620,6 +623,7 @@ // ARMEABIHARDFP:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARMEABIHARDFP:#define __FLT_DIG__ 6 // ARMEABIHARDFP:#define __FLT_EPSILON__ 1.19209290e-7F +// ARMEABIHARDFP:#define __FLT_EVAL_METHOD__ 0 // ARMEABIHARDFP:#define __FLT_HAS_DENORM__ 1 // ARMEABIHARDFP:#define __FLT_HAS_INFINITY__ 1 // ARMEABIHARDFP:#define __FLT_HAS_QUIET_NAN__ 1 @@ -817,6 +821,7 @@ // ARM-NETBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F // ARM-NETBSD:#define __FLT_DIG__ 6 // ARM-NETBSD:#define 
__FLT_EPSILON__ 1.19209290e-7F +// ARM-NETBSD:#define __FLT_EVAL_METHOD__ 0 // ARM-NETBSD:#define __FLT_HAS_DENORM__ 1 // ARM-NETBSD:#define __FLT_HAS_INFINITY__ 1 // ARM-NETBSD:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-mips.c b/clang/test/Preprocessor/init-mips.c index a07cee64e6848..d76396aa35c91 100644 --- a/clang/test/Preprocessor/init-mips.c +++ b/clang/test/Preprocessor/init-mips.c @@ -37,6 +37,7 @@ // MIPS32BE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS32BE:#define __FLT_DIG__ 6 // MIPS32BE:#define __FLT_EPSILON__ 1.19209290e-7F +// MIPS32BE:#define __FLT_EVAL_METHOD__ 0 // MIPS32BE:#define __FLT_HAS_DENORM__ 1 // MIPS32BE:#define __FLT_HAS_INFINITY__ 1 // MIPS32BE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -246,6 +247,7 @@ // MIPS32EL:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS32EL:#define __FLT_DIG__ 6 // MIPS32EL:#define __FLT_EPSILON__ 1.19209290e-7F +// MIPS32EL:#define __FLT_EVAL_METHOD__ 0 // MIPS32EL:#define __FLT_HAS_DENORM__ 1 // MIPS32EL:#define __FLT_HAS_INFINITY__ 1 // MIPS32EL:#define __FLT_HAS_QUIET_NAN__ 1 @@ -465,6 +467,7 @@ // MIPSN32BE: #define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPSN32BE: #define __FLT_DIG__ 6 // MIPSN32BE: #define __FLT_EPSILON__ 1.19209290e-7F +// MIPSN32BE: #define __FLT_EVAL_METHOD__ 0 // MIPSN32BE: #define __FLT_HAS_DENORM__ 1 // MIPSN32BE: #define __FLT_HAS_INFINITY__ 1 // MIPSN32BE: #define __FLT_HAS_QUIET_NAN__ 1 @@ -771,6 +774,7 @@ // MIPSN32EL: #define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPSN32EL: #define __FLT_DIG__ 6 // MIPSN32EL: #define __FLT_EPSILON__ 1.19209290e-7F +// MIPSN32EL: #define __FLT_EVAL_METHOD__ 0 // MIPSN32EL: #define __FLT_HAS_DENORM__ 1 // MIPSN32EL: #define __FLT_HAS_INFINITY__ 1 // MIPSN32EL: #define __FLT_HAS_QUIET_NAN__ 1 @@ -1070,6 +1074,7 @@ // MIPS64BE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS64BE:#define __FLT_DIG__ 6 // MIPS64BE:#define __FLT_EPSILON__ 1.19209290e-7F +// MIPS64BE:#define __FLT_EVAL_METHOD__ 0 // MIPS64BE:#define 
__FLT_HAS_DENORM__ 1 // MIPS64BE:#define __FLT_HAS_INFINITY__ 1 // MIPS64BE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1279,6 +1284,7 @@ // MIPS64EL:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MIPS64EL:#define __FLT_DIG__ 6 // MIPS64EL:#define __FLT_EPSILON__ 1.19209290e-7F +// MIPS64EL:#define __FLT_EVAL_METHOD__ 0 // MIPS64EL:#define __FLT_HAS_DENORM__ 1 // MIPS64EL:#define __FLT_HAS_INFINITY__ 1 // MIPS64EL:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-ppc.c b/clang/test/Preprocessor/init-ppc.c index 45c8a5e53ad4f..611b16dfb8f7e 100644 --- a/clang/test/Preprocessor/init-ppc.c +++ b/clang/test/Preprocessor/init-ppc.c @@ -30,6 +30,7 @@ // PPC603E:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC603E:#define __FLT_DIG__ 6 // PPC603E:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC603E:#define __FLT_EVAL_METHOD__ 0 // PPC603E:#define __FLT_HAS_DENORM__ 1 // PPC603E:#define __FLT_HAS_INFINITY__ 1 // PPC603E:#define __FLT_HAS_QUIET_NAN__ 1 @@ -223,6 +224,7 @@ // PPC:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC:#define __FLT_DIG__ 6 // PPC:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC:#define __FLT_EVAL_METHOD__ 0 // PPC:#define __FLT_HAS_DENORM__ 1 // PPC:#define __FLT_HAS_INFINITY__ 1 // PPC:#define __FLT_HAS_QUIET_NAN__ 1 @@ -423,6 +425,7 @@ // PPC-AIX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC-AIX:#define __FLT_DIG__ 6 // PPC-AIX:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC-AIX:#define __FLT_EVAL_METHOD__ 1 // PPC-AIX:#define __FLT_HAS_DENORM__ 1 // PPC-AIX:#define __FLT_HAS_INFINITY__ 1 // PPC-AIX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -795,6 +798,7 @@ // PPC-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC-LINUX:#define __FLT_DIG__ 6 // PPC-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC-LINUX:#define __FLT_EVAL_METHOD__ 0 // PPC-LINUX:#define __FLT_HAS_DENORM__ 1 // PPC-LINUX:#define __FLT_HAS_INFINITY__ 1 // PPC-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1002,6 +1006,7 @@ // PPC-DARWIN:#define __FLT_DENORM_MIN__ 
1.40129846e-45F // PPC-DARWIN:#define __FLT_DIG__ 6 // PPC-DARWIN:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC-DARWIN:#define __FLT_EVAL_METHOD__ 0 // PPC-DARWIN:#define __FLT_HAS_DENORM__ 1 // PPC-DARWIN:#define __FLT_HAS_INFINITY__ 1 // PPC-DARWIN:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index f0ccd1638c04d..7a9525228c3b6 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -35,6 +35,7 @@ // PPC64:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64:#define __FLT_DIG__ 6 // PPC64:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC64:#define __FLT_EVAL_METHOD__ 0 // PPC64:#define __FLT_HAS_DENORM__ 1 // PPC64:#define __FLT_HAS_INFINITY__ 1 // PPC64:#define __FLT_HAS_QUIET_NAN__ 1 @@ -239,6 +240,7 @@ // PPC64LE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64LE:#define __FLT_DIG__ 6 // PPC64LE:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC64LE:#define __FLT_EVAL_METHOD__ 0 // PPC64LE:#define __FLT_HAS_DENORM__ 1 // PPC64LE:#define __FLT_HAS_INFINITY__ 1 // PPC64LE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -701,6 +703,7 @@ // PPC64-AIX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64-AIX:#define __FLT_DIG__ 6 // PPC64-AIX:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC64-AIX:#define __FLT_EVAL_METHOD__ 1 // PPC64-AIX:#define __FLT_HAS_DENORM__ 1 // PPC64-AIX:#define __FLT_HAS_INFINITY__ 1 // PPC64-AIX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -899,6 +902,7 @@ // PPC64-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PPC64-LINUX:#define __FLT_DIG__ 6 // PPC64-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F +// PPC64-LINUX:#define __FLT_EVAL_METHOD__ 0 // PPC64-LINUX:#define __FLT_HAS_DENORM__ 1 // PPC64-LINUX:#define __FLT_HAS_INFINITY__ 1 // PPC64-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c index 6c646527f50f7..b0e45b5348ce9 100644 --- a/clang/test/Preprocessor/init-s390x.c 
+++ b/clang/test/Preprocessor/init-s390x.c @@ -23,6 +23,7 @@ // S390X:#define __FLT_DENORM_MIN__ 1.40129846e-45F // S390X:#define __FLT_DIG__ 6 // S390X:#define __FLT_EPSILON__ 1.19209290e-7F +// S390X:#define __FLT_EVAL_METHOD__ 0 // S390X:#define __FLT_HAS_DENORM__ 1 // S390X:#define __FLT_HAS_INFINITY__ 1 // S390X:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-v7k-compat.c b/clang/test/Preprocessor/init-v7k-compat.c index ff5d4bbdea53a..482c7ad6ff687 100644 --- a/clang/test/Preprocessor/init-v7k-compat.c +++ b/clang/test/Preprocessor/init-v7k-compat.c @@ -28,6 +28,7 @@ // CHECK: #define __FLT_DENORM_MIN__ 1.40129846e-45F // CHECK: #define __FLT_DIG__ 6 // CHECK: #define __FLT_EPSILON__ 1.19209290e-7F +// CHECK: #define __FLT_EVAL_METHOD__ 0 // CHECK: #define __FLT_HAS_DENORM__ 1 // CHECK: #define __FLT_HAS_INFINITY__ 1 // CHECK: #define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init-x86.c b/clang/test/Preprocessor/init-x86.c index aa2e05ec807c7..527cd39508889 100644 --- a/clang/test/Preprocessor/init-x86.c +++ b/clang/test/Preprocessor/init-x86.c @@ -24,6 +24,7 @@ // I386:#define __FLT_DENORM_MIN__ 1.40129846e-45F // I386:#define __FLT_DIG__ 6 // I386:#define __FLT_EPSILON__ 1.19209290e-7F +// I386:#define __FLT_EVAL_METHOD__ 2 // I386:#define __FLT_HAS_DENORM__ 1 // I386:#define __FLT_HAS_INFINITY__ 1 // I386:#define __FLT_HAS_QUIET_NAN__ 1 @@ -212,6 +213,7 @@ // I386-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // I386-LINUX:#define __FLT_DIG__ 6 // I386-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F +// I386-LINUX:#define __FLT_EVAL_METHOD__ 0 // I386-LINUX:#define __FLT_HAS_DENORM__ 1 // I386-LINUX:#define __FLT_HAS_INFINITY__ 1 // I386-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -414,6 +416,7 @@ // I386-NETBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F // I386-NETBSD:#define __FLT_DIG__ 6 // I386-NETBSD:#define __FLT_EPSILON__ 1.19209290e-7F +// I386-NETBSD:#define __FLT_EVAL_METHOD__ 2 // I386-NETBSD:#define 
__FLT_HAS_DENORM__ 1 // I386-NETBSD:#define __FLT_HAS_INFINITY__ 1 // I386-NETBSD:#define __FLT_HAS_QUIET_NAN__ 1 @@ -587,6 +590,13 @@ // I386-NETBSD:#define __i386__ 1 // I386-NETBSD:#define i386 1 +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD-SSE %s +// I386-NETBSD-SSE:#define __FLT_EVAL_METHOD__ 0 +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6 %s +// I386-NETBSD6:#define __FLT_EVAL_METHOD__ 1 +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6-SSE %s +// I386-NETBSD6-SSE:#define __FLT_EVAL_METHOD__ 1 + // RUN: %clang_cc1 -E -dM -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s // RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s // RUN: %clang_cc1 -E -dM -triple=i686-unknown-cygwin < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s @@ -621,6 +631,7 @@ // X86_64:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64:#define __FLT_DIG__ 6 // X86_64:#define __FLT_EPSILON__ 1.19209290e-7F +// X86_64:#define __FLT_EVAL_METHOD__ 0 // X86_64:#define __FLT_HAS_DENORM__ 1 // X86_64:#define __FLT_HAS_INFINITY__ 1 // X86_64:#define __FLT_HAS_QUIET_NAN__ 1 @@ -828,6 +839,7 @@ // X32:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X32:#define __FLT_DIG__ 6 // X32:#define __FLT_EPSILON__ 1.19209290e-7F +// X32:#define __FLT_EVAL_METHOD__ 0 // X32:#define __FLT_HAS_DENORM__ 1 // X32:#define __FLT_HAS_INFINITY__ 1 // X32:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1034,6 +1046,7 @@ // X86_64-CLOUDABI:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64-CLOUDABI:#define __FLT_DIG__ 6 // X86_64-CLOUDABI:#define __FLT_EPSILON__ 1.19209290e-7F +// 
X86_64-CLOUDABI:#define __FLT_EVAL_METHOD__ 0 // X86_64-CLOUDABI:#define __FLT_HAS_DENORM__ 1 // X86_64-CLOUDABI:#define __FLT_HAS_INFINITY__ 1 // X86_64-CLOUDABI:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1328,6 +1341,7 @@ // X86_64-LINUX:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64-LINUX:#define __FLT_DIG__ 6 // X86_64-LINUX:#define __FLT_EPSILON__ 1.19209290e-7F +// X86_64-LINUX:#define __FLT_EVAL_METHOD__ 0 // X86_64-LINUX:#define __FLT_HAS_DENORM__ 1 // X86_64-LINUX:#define __FLT_HAS_INFINITY__ 1 // X86_64-LINUX:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1540,6 +1554,7 @@ // X86_64-NETBSD:#define __FLT_DENORM_MIN__ 1.40129846e-45F // X86_64-NETBSD:#define __FLT_DIG__ 6 // X86_64-NETBSD:#define __FLT_EPSILON__ 1.19209290e-7F +// X86_64-NETBSD:#define __FLT_EVAL_METHOD__ 0 // X86_64-NETBSD:#define __FLT_HAS_DENORM__ 1 // X86_64-NETBSD:#define __FLT_HAS_INFINITY__ 1 // X86_64-NETBSD:#define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index a08e503570723..dd645bf6003ce 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -325,6 +325,7 @@ // MSP430:#define __FLT_DENORM_MIN__ 1.40129846e-45F // MSP430:#define __FLT_DIG__ 6 // MSP430:#define __FLT_EPSILON__ 1.19209290e-7F +// MSP430:#define __FLT_EVAL_METHOD__ 0 // MSP430:#define __FLT_HAS_DENORM__ 1 // MSP430:#define __FLT_HAS_INFINITY__ 1 // MSP430:#define __FLT_HAS_QUIET_NAN__ 1 @@ -512,6 +513,7 @@ // NVPTX32:#define __FLT_DENORM_MIN__ 1.40129846e-45F // NVPTX32:#define __FLT_DIG__ 6 // NVPTX32:#define __FLT_EPSILON__ 1.19209290e-7F +// NVPTX32:#define __FLT_EVAL_METHOD__ 0 // NVPTX32:#define __FLT_HAS_DENORM__ 1 // NVPTX32:#define __FLT_HAS_INFINITY__ 1 // NVPTX32:#define __FLT_HAS_QUIET_NAN__ 1 @@ -700,6 +702,7 @@ // NVPTX64:#define __FLT_DENORM_MIN__ 1.40129846e-45F // NVPTX64:#define __FLT_DIG__ 6 // NVPTX64:#define __FLT_EPSILON__ 1.19209290e-7F +// NVPTX64:#define __FLT_EVAL_METHOD__ 0 // NVPTX64:#define __FLT_HAS_DENORM__ 1 // 
NVPTX64:#define __FLT_HAS_INFINITY__ 1 // NVPTX64:#define __FLT_HAS_QUIET_NAN__ 1 @@ -903,6 +906,7 @@ // SPARC:#define __FLT_DENORM_MIN__ 1.40129846e-45F // SPARC:#define __FLT_DIG__ 6 // SPARC:#define __FLT_EPSILON__ 1.19209290e-7F +// SPARC:#define __FLT_EVAL_METHOD__ 0 // SPARC:#define __FLT_HAS_DENORM__ 1 // SPARC:#define __FLT_HAS_INFINITY__ 1 // SPARC:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1103,6 +1107,7 @@ // TCE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // TCE:#define __FLT_DIG__ 6 // TCE:#define __FLT_EPSILON__ 1.19209290e-7F +// TCE:#define __FLT_EVAL_METHOD__ 0 // TCE:#define __FLT_HAS_DENORM__ 1 // TCE:#define __FLT_HAS_INFINITY__ 1 // TCE:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1269,6 +1274,7 @@ // PS4:#define __FLT_DENORM_MIN__ 1.40129846e-45F // PS4:#define __FLT_DIG__ 6 // PS4:#define __FLT_EPSILON__ 1.19209290e-7F +// PS4:#define __FLT_EVAL_METHOD__ 0 // PS4:#define __FLT_HAS_DENORM__ 1 // PS4:#define __FLT_HAS_INFINITY__ 1 // PS4:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1570,6 +1576,7 @@ // WEBASSEMBLY-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F // WEBASSEMBLY-NEXT:#define __FLT_DIG__ 6 // WEBASSEMBLY-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F +// WEBASSEMBLY-NEXT:#define __FLT_EVAL_METHOD__ 0 // WEBASSEMBLY-NEXT:#define __FLT_HAS_DENORM__ 1 // WEBASSEMBLY-NEXT:#define __FLT_HAS_INFINITY__ 1 // WEBASSEMBLY-NEXT:#define __FLT_HAS_QUIET_NAN__ 1 @@ -1939,6 +1946,7 @@ // AVR:#define __FLT_DENORM_MIN__ 1.40129846e-45F // AVR:#define __FLT_DIG__ 6 // AVR:#define __FLT_EPSILON__ 1.19209290e-7F +// AVR:#define __FLT_EVAL_METHOD__ 0 // AVR:#define __FLT_HAS_DENORM__ 1 // AVR:#define __FLT_HAS_INFINITY__ 1 // AVR:#define __FLT_HAS_QUIET_NAN__ 1 @@ -2075,6 +2083,7 @@ // AVR:#define __WCHAR_TYPE__ int // AVR:#define __WINT_TYPE__ int + // RUN: %clang_cc1 -E -dM -ffreestanding \ // RUN: -triple i686-windows-msvc -fms-compatibility -x c++ < /dev/null \ // RUN: | FileCheck -match-full-lines -check-prefix MSVC-X32 %s @@ -2220,6 +2229,7 @@ // RISCV32: #define 
__FLT_DENORM_MIN__ 1.40129846e-45F // RISCV32: #define __FLT_DIG__ 6 // RISCV32: #define __FLT_EPSILON__ 1.19209290e-7F +// RISCV32: #define __FLT_EVAL_METHOD__ 0 // RISCV32: #define __FLT_HAS_DENORM__ 1 // RISCV32: #define __FLT_HAS_INFINITY__ 1 // RISCV32: #define __FLT_HAS_QUIET_NAN__ 1 @@ -2427,6 +2437,7 @@ // RISCV64: #define __FLT_DENORM_MIN__ 1.40129846e-45F // RISCV64: #define __FLT_DIG__ 6 // RISCV64: #define __FLT_EPSILON__ 1.19209290e-7F +// RISCV64: #define __FLT_EVAL_METHOD__ 0 // RISCV64: #define __FLT_HAS_DENORM__ 1 // RISCV64: #define __FLT_HAS_INFINITY__ 1 // RISCV64: #define __FLT_HAS_QUIET_NAN__ 1 diff --git a/clang/test/Sema/fp-eval-pragma.cpp b/clang/test/Sema/fp-eval-pragma.cpp deleted file mode 100644 index 42d88fd438e81..0000000000000 --- a/clang/test/Sema/fp-eval-pragma.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify %s -// -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify %s \ -// RUN: -ffp-eval-method=source -// -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify %s \ -// RUN: -ffp-eval-method=double - -extern "C" int printf(const char *, ...); - -void foo1() { - printf("FP: %d\n", __FLT_EVAL_METHOD__); -} - -void apply_pragma() { - // expected-note@+1{{#pragma entered here}} -#pragma clang fp eval_method(double) - // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} - printf("FP: %d\n", __FLT_EVAL_METHOD__); -} - -int foo2() { - apply_pragma(); - return 0; -} - -void foo() { - auto a = __FLT_EVAL_METHOD__; - { - // expected-note@+1{{#pragma entered here}} -#pragma clang fp eval_method(double) - // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} - auto b = 
__FLT_EVAL_METHOD__; - } - auto c = __FLT_EVAL_METHOD__; -} - -void func() { - { - { -#pragma clang fp eval_method(source) - } - int i = __FLT_EVAL_METHOD__; // ok, not in a scope changed by the pragma - } - { - // expected-note@+1{{#pragma entered here}} -#pragma clang fp eval_method(source) - // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} - int i = __FLT_EVAL_METHOD__; - } -} - -float G; - -int f(float x, float y, float z) { - G = x * y + z; - return __FLT_EVAL_METHOD__; -} - -int foo(int flag, float x, float y, float z) { - if (flag) { - // expected-note@+1{{#pragma entered here}} -#pragma clang fp eval_method(double) - G = x + y + z; - // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} - return __FLT_EVAL_METHOD__; - } else { - // expected-note@+1{{#pragma entered here}} -#pragma clang fp eval_method(extended) - G = x + y + z; - // expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} - return __FLT_EVAL_METHOD__; - } -} - -#if __FLT_EVAL_METHOD__ == 1 -#endif -#pragma clang fp eval_method(source) - -// expected-note@+1{{#pragma entered here}} -#pragma clang fp eval_method(double) -// expected-error@+1{{'__FLT_EVAL_METHOD__' cannot be expanded inside a scope containing '#pragma clang fp eval_method'}} -#if __FLT_EVAL_METHOD__ == 1 -#endif diff --git a/clang/test/Sema/x86-eval-method.c b/clang/test/Sema/x86-eval-method.c deleted file mode 100644 index f475b0d1b29bc..0000000000000 --- a/clang/test/Sema/x86-eval-method.c +++ /dev/null @@ -1,18 +0,0 @@ -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu pentium4 -target-feature -sse \ -// RUN: -emit-llvm -ffp-eval-method=source -o - -verify=warn %s -// -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple i386-pc-windows -target-cpu 
pentium4 \ -// RUN: -emit-llvm -ffp-eval-method=source -o - -verify=no-warn %s - -// no-warn-no-diagnostics - -float add1(float a, float b, float c) { - return a + b + c; -} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}} - -float add2(float a, float b, float c) { -#pragma clang fp eval_method(source) - return a + b + c; -} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}} diff --git a/clang/test/Sema/x86_64-eval-method.c b/clang/test/Sema/x86_64-eval-method.c deleted file mode 100644 index dbdc1f881b4a8..0000000000000 --- a/clang/test/Sema/x86_64-eval-method.c +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -target-feature -sse -emit-llvm \ -// RUN: -o - -verify=warn %s -// -// RUN: %clang_cc1 -fexperimental-strict-floating-point \ -// RUN: -triple x86_64-linux-gnu -emit-llvm -o - -verify=no-warn %s - -// no-warn-no-diagnostics - -float add2(float a, float b, float c) { -#pragma clang fp eval_method(source) - return a + b + c; -} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}} From 5086cff04eec4327acc22a90466854ad4d89d570 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Feb 2022 09:49:50 +0000 Subject: [PATCH 242/748] Revert "unbreak Modules/cxx20-export-import.cpp with LLVM_APPEND_VC_REV after 32b73bc6ab82" This reverts commit 1689b1092ebb2c630f8ef1d3880a9fb4808d16fa. This patch was only added to fix a failure with 32b73bc6ab8234b, which has been reverted again. 
--- clang/include/clang/Serialization/ASTBitCodes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index c94274ff34b8f..f98e173b158c1 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -41,7 +41,7 @@ namespace serialization { /// Version 4 of AST files also requires that the version control branch and /// revision match exactly, since there is no backward compatibility of /// AST files at this time. -const unsigned VERSION_MAJOR = 16; +const unsigned VERSION_MAJOR = 15; /// AST file minor version number supported by this version of /// Clang. From a59014b759050af93e0ab214dcbf0cc2dd75bb75 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 18 Feb 2022 12:01:05 +0100 Subject: [PATCH 243/748] Revert "Fix race condition when launching and attaching." It breaks TestVSCode_attach.py. This reverts commit 9febd1e573fb8b3d1de5844b7bfd33eb998f0106 and 38054556a08884aa15d3ebc720e2f43d0cb5a944. --- .../test/tools/lldb-vscode/vscode.py | 3 -- .../lldb-vscode/launch/TestVSCode_launch.py | 2 +- lldb/tools/lldb-vscode/VSCode.cpp | 52 ------------------- lldb/tools/lldb-vscode/VSCode.h | 13 ----- lldb/tools/lldb-vscode/lldb-vscode.cpp | 34 +++++------- lldb/tools/lldb-vscode/package.json | 10 +--- 6 files changed, 15 insertions(+), 99 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py index faa0b93b3f9a7..603b1545cd714 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/vscode.py @@ -228,9 +228,6 @@ def handle_recv_packet(self, packet): # 'stopped' event. We need to remember the thread stop # reasons since the 'threads' command doesn't return # that information. 
- # if not self.configuration_done_sent: - # raise ValueError("'stopped' event received before " - # "configuationDone packet was sent") self._process_stopped() tid = body['threadId'] self.thread_stop_reasons[tid] = body diff --git a/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py b/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py index 8c0000bdb1546..ff798364c9573 100644 --- a/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py +++ b/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py @@ -374,7 +374,7 @@ def test_commands(self): @skipIfRemote def test_extra_launch_commands(self): ''' - Tests the "launchCommands" with extra launching settings + Tests the "luanchCommands" with extra launching settings ''' self.build_and_create_debug_adaptor() program = self.getBuildArtifact("a.out") diff --git a/lldb/tools/lldb-vscode/VSCode.cpp b/lldb/tools/lldb-vscode/VSCode.cpp index a6fe7f840a566..3209eea4a897f 100644 --- a/lldb/tools/lldb-vscode/VSCode.cpp +++ b/lldb/tools/lldb-vscode/VSCode.cpp @@ -528,58 +528,6 @@ void VSCode::RegisterRequestCallback(std::string request, request_handlers[request] = callback; } -lldb::SBError VSCode::WaitForProcessToStop(uint32_t seconds) { - // Wait for the process hit a stopped state. When running a launch (with or - // without "launchCommands") or attach (with or without)= "attachCommands"), - // the calls might take some time to stop at the entry point since the command - // is asynchronous. So we need to sync up with the process and make sure it is - // stopped before we proceed to do anything else as we will soon be asked to - // set breakpoints and other things that require the process to be stopped. - // We must use polling because attach doesn't send a process state change - // event for the first stop, while launching does. 
Since both "attachCommands" - // and "launchCommands" could end up using any combination of LLDB commands, - // we must ensure we can also catch when the process stops, so we must poll - // the process to make sure we handle all cases. - - lldb::SBError error; - lldb::SBProcess process = target.GetProcess(); - if (!process.IsValid()) { - error.SetErrorString("invalid process"); - return error; - } - auto timeout_time = - std::chrono::high_resolution_clock::now() + std::chrono::seconds(seconds); - while (std::chrono::high_resolution_clock::now() < timeout_time) { - const auto state = process.GetState(); - switch (state) { - case lldb::eStateAttaching: - case lldb::eStateConnected: - case lldb::eStateInvalid: - case lldb::eStateLaunching: - case lldb::eStateRunning: - case lldb::eStateStepping: - case lldb::eStateSuspended: - break; - case lldb::eStateDetached: - error.SetErrorString("process detached during launch or attach"); - return error; - case lldb::eStateExited: - error.SetErrorString("process exited during launch or attach"); - return error; - case lldb::eStateUnloaded: - error.SetErrorString("process unloaded during launch or attach"); - return error; - case lldb::eStateCrashed: - case lldb::eStateStopped: - return lldb::SBError(); // Success! - } - std::this_thread::sleep_for(std::chrono::microseconds(250)); - } - error.SetErrorStringWithFormat("process failed to stop within %u seconds", - seconds); - return error; -} - void Variables::Clear() { locals.Clear(); globals.Clear(); diff --git a/lldb/tools/lldb-vscode/VSCode.h b/lldb/tools/lldb-vscode/VSCode.h index bc868760eb830..602cf758a9a17 100644 --- a/lldb/tools/lldb-vscode/VSCode.h +++ b/lldb/tools/lldb-vscode/VSCode.h @@ -243,19 +243,6 @@ struct VSCode { /// Debuggee will continue from stopped state. void WillContinue() { variables.Clear(); } - /// Poll the process to wait for it to reach the eStateStopped state. 
- /// - /// We need to ensure the process is stopped and ready to resume before we - /// continue with the launch or attach. This is needed since we no longer play - /// with the synchronous mode in the debugger for launching (with or without - /// "launchCommands") or attaching (with or without "attachCommands"). - /// - /// \param[in] seconds - /// The number of seconds to poll the process to wait until it is stopped. - /// - /// \return Error if waiting for the process fails, no error if succeeds. - lldb::SBError WaitForProcessToStop(uint32_t seconds); - private: // Send the JSON in "json_str" to the "out" stream. Correctly send the // "Content-Length:" field followed by the length, followed by the raw diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp index 734b23afc9b28..97ec4b578cf7c 100644 --- a/lldb/tools/lldb-vscode/lldb-vscode.cpp +++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp @@ -449,18 +449,10 @@ void EventThreadFunction() { case lldb::eStateSuspended: break; case lldb::eStateStopped: - // Now that we don't mess with the async setting in the debugger - // when launching or attaching we will get the first process stop - // event which we do not want to send an event for. This is because - // we either manually deliver the event in by calling the - // SendThreadStoppedEvent() from request_configuarationDone() if we - // want to stop on entry, or we resume from that function. - if (process.GetStopID() > 1) { - // Only report a stopped event if the process was not restarted. - if (!lldb::SBProcess::GetRestartedFromEvent(event)) { - SendStdOutStdErr(process); - SendThreadStoppedEvent(); - } + // Only report a stopped event if the process was not restarted. 
+ if (!lldb::SBProcess::GetRestartedFromEvent(event)) { + SendStdOutStdErr(process); + SendThreadStoppedEvent(); } break; case lldb::eStateRunning: @@ -608,7 +600,6 @@ void request_attach(const llvm::json::Object &request) { g_vsc.terminate_commands = GetStrings(arguments, "terminateCommands"); auto attachCommands = GetStrings(arguments, "attachCommands"); llvm::StringRef core_file = GetString(arguments, "coreFile"); - const uint64_t timeout_seconds = GetUnsigned(arguments, "timeout", 30); g_vsc.stop_at_entry = core_file.empty() ? GetBoolean(arguments, "stopOnEntry", false) : true; std::vector postRunCommands = @@ -649,10 +640,15 @@ void request_attach(const llvm::json::Object &request) { } if (attachCommands.empty()) { // No "attachCommands", just attach normally. + // Disable async events so the attach will be successful when we return from + // the launch call and the launch will happen synchronously + g_vsc.debugger.SetAsync(false); if (core_file.empty()) g_vsc.target.Attach(attach_info, error); else g_vsc.target.LoadCore(core_file.data(), error); + // Reenable async events + g_vsc.debugger.SetAsync(true); } else { // We have "attachCommands" that are a set of commands that are expected // to execute the commands after which a process should be created. If there @@ -662,9 +658,6 @@ void request_attach(const llvm::json::Object &request) { // selected target after these commands are run. g_vsc.target = g_vsc.debugger.GetSelectedTarget(); } - // Make sure the process is attached and stopped before proceeding. 
- if (error.Success()) - error = g_vsc.WaitForProcessToStop(timeout_seconds); if (error.Success() && core_file.empty()) { auto attached_pid = g_vsc.target.GetProcess().GetProcessID(); @@ -1659,7 +1652,6 @@ void request_launch(const llvm::json::Object &request) { GetStrings(arguments, "postRunCommands"); g_vsc.stop_at_entry = GetBoolean(arguments, "stopOnEntry", false); const llvm::StringRef debuggerRoot = GetString(arguments, "debuggerRoot"); - const uint64_t timeout_seconds = GetUnsigned(arguments, "timeout", 30); // This is a hack for loading DWARF in .o files on Mac where the .o files // in the debug map of the main executable have relative paths which require @@ -1724,17 +1716,17 @@ void request_launch(const llvm::json::Object &request) { if (llvm::Error err = request_runInTerminal(request)) error.SetErrorString(llvm::toString(std::move(err)).c_str()); } else if (launchCommands.empty()) { + // Disable async events so the launch will be successful when we return from + // the launch call and the launch will happen synchronously + g_vsc.debugger.SetAsync(false); g_vsc.target.Launch(launch_info, error); + g_vsc.debugger.SetAsync(true); } else { g_vsc.RunLLDBCommands("Running launchCommands:", launchCommands); // The custom commands might have created a new target so we should use the // selected target after these commands are run. g_vsc.target = g_vsc.debugger.GetSelectedTarget(); } - // Make sure the process is launched and stopped at the entry point before - // proceeding. 
- if (error.Success()) - error = g_vsc.WaitForProcessToStop(timeout_seconds); if (error.Fail()) { response["success"] = llvm::json::Value(false); diff --git a/lldb/tools/lldb-vscode/package.json b/lldb/tools/lldb-vscode/package.json index bedc8f16ea26e..a5c79911f6e9f 100644 --- a/lldb/tools/lldb-vscode/package.json +++ b/lldb/tools/lldb-vscode/package.json @@ -215,7 +215,7 @@ }, "launchCommands": { "type": "array", - "description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail. Launch the process with \"process launch -s\" to make the process to at the entry point since lldb-vscode will auto resume if necessary.", + "description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. A valid process must exist after these commands complete or the \"launch\" will fail.", "default": [] }, "stopCommands": { @@ -232,10 +232,6 @@ "type": "boolean", "description": "Launch the program inside an integrated terminal in the IDE. Useful for debugging interactive command line programs", "default": false - }, - "timeout": { - "type": "string", - "description": "The time in seconds to wait for a program to stop at entry point when launching. Defaults to 30 seconds." } } }, @@ -311,10 +307,6 @@ "coreFile": { "type": "string", "description": "Path to the core file to debug." - }, - "timeout": { - "type": "string", - "description": "The time in seconds to wait for a program to stop when attaching. Defaults to 30 seconds." 
} } } From 535e7b09c189dd3a7ef65bd36a02962f0c98bd5e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 11:12:33 +0000 Subject: [PATCH 244/748] [clangd] lookupSiblingsWithinContext - remove unnecessary nullptr check The DC pointer is always dereferenced after the loop --- clang-tools-extra/clangd/refactor/Rename.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp index b106664f0a446..46d884578d462 100644 --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -389,7 +389,7 @@ const NamedDecl *lookupSiblingsWithinContext(ASTContext &Ctx, DeclarationName LookupName(&II); DeclContextLookupResult LookupResult; const auto *DC = RenamedDecl.getDeclContext(); - while (DC && DC->isTransparentContext()) + while (DC->isTransparentContext()) DC = DC->getParent(); switch (DC->getDeclKind()) { // The enclosing DeclContext may not be the enclosing scope, it might have From 977b1f574fa18219fde5f709b906c79202ef1916 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Thu, 17 Feb 2022 22:01:29 +0100 Subject: [PATCH 245/748] [clang][ASTReader] Fix memory leak while reading FriendTemplateDecls Allocate on ASTContext, rather than just on heap, so that template parameter lists are freed up. 
Differential Revision: https://reviews.llvm.org/D120081 --- clang/lib/Serialization/ASTReaderDecl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 25d7e9e6a2e68..29bef2aa20897 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2103,7 +2103,7 @@ void ASTDeclReader::VisitFriendTemplateDecl(FriendTemplateDecl *D) { VisitDecl(D); unsigned NumParams = Record.readInt(); D->NumParams = NumParams; - D->Params = new TemplateParameterList*[NumParams]; + D->Params = new (Reader.getContext()) TemplateParameterList *[NumParams]; for (unsigned i = 0; i != NumParams; ++i) D->Params[i] = Record.readTemplateParameterList(); if (Record.readInt()) // HasFriendDecl From b4e0507ce018eaf42f3e1a728e7bf22d7cae5514 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Thu, 17 Feb 2022 10:27:19 +0100 Subject: [PATCH 246/748] Rename PatternRewriteSet::insert to add insert is soft deprecated, so remove all references so it's less likely to be used and can be easily removed in the future. 
Differential Revision: https://reviews.llvm.org/D120021 --- mlir/include/mlir/IR/PatternMatch.h | 2 +- .../Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp | 6 ++--- .../Dialect/Arithmetic/IR/ArithmeticOps.cpp | 23 ++++++++++--------- .../Bufferization/IR/BufferizationOps.cpp | 2 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 10 ++++---- .../Dialect/Linalg/Transforms/Detensorize.cpp | 6 ++--- .../MemRef/Transforms/ComposeSubView.cpp | 2 +- mlir/lib/Dialect/SCF/SCF.cpp | 6 ++--- .../SCF/Transforms/LoopCanonicalization.cpp | 11 ++++----- .../Transforms/FuncConversions.cpp | 2 +- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 22 +++++++++--------- .../Tosa/Transforms/TosaDecomposeConv2D.cpp | 2 +- .../Transforms/TosaDecomposeDepthwise.cpp | 2 +- .../Transforms/TosaDecomposeTransposeConv.cpp | 4 ++-- mlir/unittests/Transforms/Canonicalizer.cpp | 2 +- 15 files changed, 51 insertions(+), 51 deletions(-) diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 4f728c2e30644..8fd9fa0caaf3c 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -1061,7 +1061,7 @@ class RewritePatternSet { private: LogicalResult (*implFn)(OpType, PatternRewriter &rewriter); }; - insert(std::make_unique(std::move(implFn), getContext())); + add(std::make_unique(std::move(implFn), getContext())); return *this; } diff --git a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp index 4a3ba46233ff8..1a9254e9db896 100644 --- a/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp @@ -370,7 +370,7 @@ LLVM::LLVMStructType mlir::convertMMAToLLVMType(gpu::MMAMatrixType type) { void mlir::populateGpuWMMAToNVVMConversionPatterns( LLVMTypeConverter &converter, RewritePatternSet &patterns) { - patterns.insert(converter); + patterns.add(converter); } diff --git a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp 
b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp index fffc1b98b0087..8074f2c4751fb 100644 --- a/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp +++ b/mlir/lib/Dialect/Arithmetic/IR/ArithmeticOps.cpp @@ -210,7 +210,7 @@ OpFoldResult arith::AddIOp::fold(ArrayRef operands) { void arith::AddIOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert( + patterns.add( context); } @@ -232,9 +232,10 @@ OpFoldResult arith::SubIOp::fold(ArrayRef operands) { void arith::SubIOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns + .add( + context); } //===----------------------------------------------------------------------===// @@ -568,7 +569,7 @@ OpFoldResult arith::XOrIOp::fold(ArrayRef operands) { void arith::XOrIOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns.add(context); } //===----------------------------------------------------------------------===// @@ -913,7 +914,7 @@ bool arith::ExtSIOp::areCastCompatible(TypeRange inputs, TypeRange outputs) { void arith::ExtSIOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns.add(context); } LogicalResult arith::ExtSIOp::verify() { @@ -1007,7 +1008,7 @@ LogicalResult arith::TruncFOp::verify() { void arith::AndIOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns.add(context); } //===----------------------------------------------------------------------===// @@ -1016,7 +1017,7 @@ void arith::AndIOp::getCanonicalizationPatterns( void arith::OrIOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns.add(context); } //===----------------------------------------------------------------------===// @@ -1155,7 +1156,7 @@ OpFoldResult 
arith::IndexCastOp::fold(ArrayRef operands) { void arith::IndexCastOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns.add(context); } //===----------------------------------------------------------------------===// @@ -1204,7 +1205,7 @@ OpFoldResult arith::BitcastOp::fold(ArrayRef operands) { void arith::BitcastOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns.add(context); } //===----------------------------------------------------------------------===// @@ -1461,7 +1462,7 @@ struct SelectToExtUI : public OpRewritePattern { void arith::SelectOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } OpFoldResult arith::SelectOp::fold(ArrayRef operands) { diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp index c5a99d820bc90..9b3342a6e801e 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp @@ -170,7 +170,7 @@ struct SimplifyClones : public OpRewritePattern { void CloneOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index c148ab9bcfa7f..6b0d22c8f939e 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1702,11 +1702,11 @@ struct TiledLoopResultsFolder : public OpRewritePattern { void TiledLoopOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert, - DimOfTiledLoopInsOutsFolder, - DimOfTiledLoopResultFolder, - 
DimOfTiledLoopResultFolder>(context); + results.add, + DimOfTiledLoopInsOutsFolder, + DimOfTiledLoopResultFolder, + DimOfTiledLoopResultFolder>(context); } LogicalResult TiledLoopOp::fold(ArrayRef, diff --git a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp index 77e522ad280c0..afe82ff8bdfac 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp @@ -533,9 +533,9 @@ struct LinalgDetensorize : public LinalgDetensorizeBase { return false; }); - patterns.insert(typeConverter, context); - patterns.insert(context, typeConverter, - blockArgsToDetensor); + patterns.add(typeConverter, context); + patterns.add(context, typeConverter, + blockArgsToDetensor); // Since non-entry block arguments get detensorized, we also need to // update the control flow inside the function to reflect the correct // types. diff --git a/mlir/lib/Dialect/MemRef/Transforms/ComposeSubView.cpp b/mlir/lib/Dialect/MemRef/Transforms/ComposeSubView.cpp index 8de484c3b86e7..c2937e560ec2f 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/ComposeSubView.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/ComposeSubView.cpp @@ -129,5 +129,5 @@ struct ComposeSubViewOpPattern : public OpRewritePattern { void mlir::memref::populateComposeSubViewPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.insert(context); + patterns.add(context); } diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp index 258ead87848f5..54c93e93f37e8 100644 --- a/mlir/lib/Dialect/SCF/SCF.cpp +++ b/mlir/lib/Dialect/SCF/SCF.cpp @@ -2844,9 +2844,9 @@ struct WhileUnusedArg : public OpRewritePattern { void WhileOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } //===----------------------------------------------------------------------===// diff --git 
a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp index 4bfbd617cbd04..7e422b0b4a41c 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp @@ -196,12 +196,11 @@ void mlir::scf::populateSCFForLoopCanonicalizationPatterns( RewritePatternSet &patterns) { MLIRContext *ctx = patterns.getContext(); patterns - .insert, - AffineOpSCFCanonicalizationPattern, - DimOfIterArgFolder, - DimOfIterArgFolder, - DimOfLoopResultFolder, - DimOfLoopResultFolder>(ctx); + .add, + AffineOpSCFCanonicalizationPattern, + DimOfIterArgFolder, DimOfIterArgFolder, + DimOfLoopResultFolder, + DimOfLoopResultFolder>(ctx); } std::unique_ptr mlir::createSCFForLoopCanonicalizationPass() { diff --git a/mlir/lib/Dialect/StandardOps/Transforms/FuncConversions.cpp b/mlir/lib/Dialect/StandardOps/Transforms/FuncConversions.cpp index 0f07e547a7a39..ee614fd27a769 100644 --- a/mlir/lib/Dialect/StandardOps/Transforms/FuncConversions.cpp +++ b/mlir/lib/Dialect/StandardOps/Transforms/FuncConversions.cpp @@ -110,7 +110,7 @@ class ReturnOpTypeConversion : public OpConversionPattern { void mlir::populateBranchOpInterfaceTypeConversionPattern( RewritePatternSet &patterns, TypeConverter &typeConverter, function_ref shouldConvertBranchOperand) { - patterns.insert( + patterns.add( typeConverter, patterns.getContext(), shouldConvertBranchOperand); } diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index e4e7acbb1429e..37375184a4394 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -130,7 +130,7 @@ struct ConcatOptimization : public OpRewritePattern { void ConcatOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } struct ReshapeReshapeOptimization : public OpRewritePattern { @@ -188,8 +188,8 @@ struct 
ReshapeConstOptimization : public OpRewritePattern { void ReshapeOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); - results.insert(context); + results.add(context); + results.add(context); } struct ConstantTransposeOptimization @@ -285,8 +285,8 @@ struct NoOpOptimization : public OpRewritePattern { void TransposeOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); - results.insert(context); + results.add(context); + results.add(context); } struct AddZeroOptimization : public OpRewritePattern { @@ -323,7 +323,7 @@ struct AddZeroOptimization : public OpRewritePattern { void AddOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } struct MulOneOptimization : public OpRewritePattern { @@ -372,7 +372,7 @@ struct MulOneOptimization : public OpRewritePattern { void MulOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } struct MaterializePadValue : public OpRewritePattern { @@ -419,7 +419,7 @@ struct MaterializePadValue : public OpRewritePattern { void PadOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } struct MaxPool2dIsNoOp : public OpRewritePattern { @@ -454,7 +454,7 @@ struct MaxPool2dIsNoOp : public OpRewritePattern { void MaxPool2dOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); + results.add(context); } struct ClampIsNoOp : public OpRewritePattern { @@ -556,8 +556,8 @@ struct ClampClampOptimization : public OpRewritePattern { void ClampOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.insert(context); - results.insert(context); + results.add(context); + results.add(context); } 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp index 4c412f987899e..ac8583f7c03e2 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp @@ -111,5 +111,5 @@ struct Conv2DIsFullyConnected : public OpRewritePattern { void mlir::tosa::populateTosaDecomposeConv2D(MLIRContext *ctx, RewritePatternSet &patterns) { - patterns.insert(ctx); + patterns.add(ctx); } diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp index 685f97353d746..2ce9f24e6d9c9 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp @@ -117,5 +117,5 @@ struct DepthwiseConv2DIsMul : public OpRewritePattern { void mlir::tosa::populateTosaDecomposeDepthwise(MLIRContext *ctx, RewritePatternSet &patterns) { - patterns.insert(ctx); + patterns.add(ctx); } diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp index 330add9e248ea..d6ffa463f31bd 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp @@ -375,6 +375,6 @@ class TransposeConvStridedConverter void mlir::tosa::populateTosaDecomposeTransposeConv( MLIRContext *ctx, RewritePatternSet &patterns) { - patterns.insert(ctx); - patterns.insert(ctx); + patterns.add(ctx); + patterns.add(ctx); } diff --git a/mlir/unittests/Transforms/Canonicalizer.cpp b/mlir/unittests/Transforms/Canonicalizer.cpp index 71d7be9bef148..f3b0ad821b03b 100644 --- a/mlir/unittests/Transforms/Canonicalizer.cpp +++ b/mlir/unittests/Transforms/Canonicalizer.cpp @@ -58,7 +58,7 @@ struct TestDialect : public Dialect 
{ } void getCanonicalizationPatterns(RewritePatternSet &results) const override { - results.insert(results.getContext()); + results.add(results.getContext()); } }; From de2c0a2e6139e621d5781b3bb1e14c52ac81cf87 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 11:21:32 +0000 Subject: [PATCH 247/748] [X86] combineADC/SBB - pull out repeated getOperand calls. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 42 ++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f2509dc9e7a83..9e765a90e8bcd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52055,22 +52055,22 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue BorrowIn = N->getOperand(2); + + if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); - return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, - N->getOperand(0), N->getOperand(1), - Flags); + return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags); } // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) // iff the flag result is dead. 
- SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) && + if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) && !N->hasAnyUseOfValue(1)) - return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0), - Op0.getOperand(1), N->getOperand(2)); + return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0), + LHS.getOperand(1), BorrowIn); return SDValue(); } @@ -52078,32 +52078,32 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue CarryIn = N->getOperand(2); + // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. - if (X86::isZeroNode(N->getOperand(0)) && - X86::isZeroNode(N->getOperand(1)) && + if (X86::isZeroNode(LHS) && X86::isZeroNode(RHS) && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. 
SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); - SDValue Res1 = - DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), - N->getOperand(2)), - DAG.getConstant(1, DL, VT)); + SDValue Res1 = DAG.getNode( + ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn), + DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } - if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) { + if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); - return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, - N->getOperand(0), N->getOperand(1), - Flags); + return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags); } return SDValue(); From 6c99a3469d9c0a48fb2a9fec845d284ef39338ee Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Tue, 15 Feb 2022 13:28:34 +0100 Subject: [PATCH 248/748] [lldb] Add support for a "global" lldbinit file This patch adds introduces a new kind of an lldbinit file. Unlike the lldbinit in the home directory (useful for customizing lldb to the needs of a particular user), or the cwd lldbinit file (useful for project-specific settings), this file can be used to customize an entire lldb installation to a particular environment. The feature is enabled at build time, by setting the LLDB_GLOBAL_INIT_DIRECTORY variable to a path to a directory which should contain an "lldbinit" file. Lldb will then load the file at startup, if it exists, and if automatic init loading has not been disabled. Relative paths will be resolved (at runtime) relative to the location of the lldb library (liblldb or LLDB.framework). 
The system-wide lldbinit file will be loaded first, before any $HOME/.lldbinit and $CWD/.lldbinit files are processed, so that those can override any system-wide settings. More information can be found on the RFC thread at . Differential Revision: https://reviews.llvm.org/D119831 --- lldb/cmake/modules/LLDBConfig.cmake | 4 ++++ lldb/include/lldb/API/SBCommandInterpreter.h | 2 ++ lldb/include/lldb/Host/Config.h.cmake | 2 ++ .../lldb/Interpreter/CommandInterpreter.h | 1 + lldb/source/API/SBCommandInterpreter.cpp | 16 ++++++++++++++++ lldb/source/API/SBDebugger.cpp | 1 + lldb/source/Interpreter/CommandInterpreter.cpp | 15 +++++++++++++++ lldb/tools/driver/Driver.cpp | 9 +++++++-- 8 files changed, 48 insertions(+), 2 deletions(-) diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index e12e548ad0c41..69aaadf29ef64 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -72,6 +72,10 @@ option(LLDB_USE_SYSTEM_DEBUGSERVER "Use the system's debugserver for testing (Da option(LLDB_SKIP_STRIP "Whether to skip stripping of binaries when installing lldb." OFF) option(LLDB_SKIP_DSYM "Whether to skip generating a dSYM when installing lldb." OFF) +set(LLDB_GLOBAL_INIT_DIRECTORY "" CACHE STRING + "Path to the global lldbinit directory. Relative paths are resolved relative to the + directory containing the LLDB library.") + if (LLDB_USE_SYSTEM_DEBUGSERVER) # The custom target for the system debugserver has no install target, so we # need to remove it from the LLVM_DISTRIBUTION_COMPONENTS list. 
diff --git a/lldb/include/lldb/API/SBCommandInterpreter.h b/lldb/include/lldb/API/SBCommandInterpreter.h index 4ebcc797d5bb6..0d344fc7fbcbf 100644 --- a/lldb/include/lldb/API/SBCommandInterpreter.h +++ b/lldb/include/lldb/API/SBCommandInterpreter.h @@ -147,6 +147,8 @@ class SBCommandInterpreter { const char *help, const char *syntax, const char *auto_repeat_command); + void SourceInitFileInGlobalDirectory(lldb::SBCommandReturnObject &result); + void SourceInitFileInHomeDirectory(lldb::SBCommandReturnObject &result); void SourceInitFileInHomeDirectory(lldb::SBCommandReturnObject &result, bool is_repl); diff --git a/lldb/include/lldb/Host/Config.h.cmake b/lldb/include/lldb/Host/Config.h.cmake index ffe919aa99561..c24603f866d6b 100644 --- a/lldb/include/lldb/Host/Config.h.cmake +++ b/lldb/include/lldb/Host/Config.h.cmake @@ -53,4 +53,6 @@ #define LLDB_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" +#cmakedefine LLDB_GLOBAL_INIT_DIRECTORY R"(${LLDB_GLOBAL_INIT_DIRECTORY})" + #endif // #ifndef LLDB_HOST_CONFIG_H diff --git a/lldb/include/lldb/Interpreter/CommandInterpreter.h b/lldb/include/lldb/Interpreter/CommandInterpreter.h index af4117fcc4867..938d36ba0f3fc 100644 --- a/lldb/include/lldb/Interpreter/CommandInterpreter.h +++ b/lldb/include/lldb/Interpreter/CommandInterpreter.h @@ -253,6 +253,7 @@ class CommandInterpreter : public Broadcaster, void SourceInitFileCwd(CommandReturnObject &result); void SourceInitFileHome(CommandReturnObject &result, bool is_repl); + void SourceInitFileGlobal(CommandReturnObject &result); bool AddCommand(llvm::StringRef name, const lldb::CommandObjectSP &cmd_sp, bool can_replace); diff --git a/lldb/source/API/SBCommandInterpreter.cpp b/lldb/source/API/SBCommandInterpreter.cpp index 0a8c83f51f207..a19ad48dde042 100644 --- a/lldb/source/API/SBCommandInterpreter.cpp +++ b/lldb/source/API/SBCommandInterpreter.cpp @@ -423,6 +423,22 @@ void SBCommandInterpreter::reset( m_opaque_ptr = interpreter; } +void 
SBCommandInterpreter::SourceInitFileInGlobalDirectory( + SBCommandReturnObject &result) { + LLDB_INSTRUMENT_VA(this, result); + + result.Clear(); + if (IsValid()) { + TargetSP target_sp(m_opaque_ptr->GetDebugger().GetSelectedTarget()); + std::unique_lock lock; + if (target_sp) + lock = std::unique_lock(target_sp->GetAPIMutex()); + m_opaque_ptr->SourceInitFileGlobal(result.ref()); + } else { + result->AppendError("SBCommandInterpreter is not valid"); + } +} + void SBCommandInterpreter::SourceInitFileInHomeDirectory( SBCommandReturnObject &result) { LLDB_INSTRUMENT_VA(this, result); diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 70a7b501f93e1..1582c538fa255 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -236,6 +236,7 @@ SBDebugger SBDebugger::Create(bool source_init_files, interp.get()->SkipLLDBInitFiles(false); interp.get()->SkipAppInitFiles(false); SBCommandReturnObject result; + interp.SourceInitFileInGlobalDirectory(result); interp.SourceInitFileInHomeDirectory(result, false); } else { interp.get()->SkipLLDBInitFiles(true); diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 622f36e2a67ff..a50803df58e7c 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2380,6 +2380,21 @@ void CommandInterpreter::SourceInitFileHome(CommandReturnObject &result, SourceInitFile(FileSpec(init_file.str()), result); } +void CommandInterpreter::SourceInitFileGlobal(CommandReturnObject &result) { +#ifdef LLDB_GLOBAL_INIT_DIRECTORY + if (!m_skip_lldbinit_files) { + FileSpec init_file(LLDB_GLOBAL_INIT_DIRECTORY); + if (init_file) + init_file.MakeAbsolute(HostInfo::GetShlibDir()); + + init_file.AppendPathComponent("lldbinit"); + SourceInitFile(init_file, result); + return; + } +#endif + result.SetStatus(eReturnStatusSuccessFinishNoResult); +} + const char 
*CommandInterpreter::GetCommandPrefix() { const char *prefix = GetDebugger().GetIOHandlerCommandPrefix(); return prefix == nullptr ? "" : prefix; diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 233e0dd977d34..31407be200c0e 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -452,9 +452,14 @@ int Driver::MainLoop() { SBCommandInterpreter sb_interpreter = m_debugger.GetCommandInterpreter(); - // Before we handle any options from the command line, we parse the - // REPL init file or the default file in the user's home directory. + // Process lldbinit files before handling any options from the command line. SBCommandReturnObject result; + sb_interpreter.SourceInitFileInGlobalDirectory(result); + if (m_option_data.m_debug_mode) { + result.PutError(m_debugger.GetErrorFile()); + result.PutOutput(m_debugger.GetOutputFile()); + } + sb_interpreter.SourceInitFileInHomeDirectory(result, m_option_data.m_repl); if (m_option_data.m_debug_mode) { result.PutError(m_debugger.GetErrorFile()); From 0b13c6b88e65bedc2d702e1ae1743a081461a814 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 18 Feb 2022 06:56:32 -0500 Subject: [PATCH 249/748] [gn build] (manually) port 6c99a3469d9c --- llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn index b8645f45a4404..7e9c03c84e784 100644 --- a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn @@ -22,6 +22,7 @@ write_cmake_config("Config") { "LLDB_ENABLE_PYTHON=", "LLDB_ENABLE_FBSDVMCORE=", "LLDB_EMBED_PYTHON_HOME=", + "LLDB_GLOBAL_INIT_DIRECTORY=", "LLDB_PYTHON_HOME=", From 3ba42a564a9acfd25f30655d68c2e98bba46c7e4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Feb 2022 11:59:24 +0000 Subject: [PATCH 250/748] [MemCpyOpt] Add non-local memcpy test 
with memory phi. --- .../MemCpyOpt/nonlocal-memcpy-memcpy.ll | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll index 2dbda0c75140c..6d4cde348a550 100644 --- a/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll @@ -170,3 +170,28 @@ bb23: ; preds = %bb22, %bb13 bb25: ; preds = %bb6 unreachable } + +define void @memphi_with_unrelated_clobber(i1 %cond, i64* %arg, i8* noalias %a, i8* noalias %b, i8* noalias %c) { +; CHECK-LABEL: @memphi_with_unrelated_clobber( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[A:%.*]], i8* [[B:%.*]], i64 16, i1 false) +; CHECK-NEXT: br i1 [[COND:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: store i64 0, i64* [[ARG:%.*]], align 4 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[C:%.*]], i8* [[B]], i64 16, i1 false) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i1 false) + br i1 %cond, label %then, label %exit + +then: + store i64 0, i64* %arg + br label %exit + +exit: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %c, i8* %a, i64 16, i1 false) + ret void +} From d558540fae376361fbbf9554828c7488bc1c341d Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 18 Feb 2022 12:51:43 +0100 Subject: [PATCH 251/748] [mlir][Vector] Add return type inference for multi_reduction This subsumes the builder and verifier. 
--- .../mlir/Dialect/Vector/IR/VectorOps.td | 24 +---------- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 43 +++++++++---------- mlir/test/Dialect/Vector/invalid.mlir | 7 +++ 3 files changed, 30 insertions(+), 44 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 4a20ea0dc4d10..aec8dd5b68823 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -313,7 +313,8 @@ def Vector_ReductionOp : def Vector_MultiDimReductionOp : Vector_Op<"multi_reduction", [NoSideEffect, PredOpTrait<"source operand and result have same element type", - TCresVTEtIsSameAsOpBase<0, 0>>]>, + TCresVTEtIsSameAsOpBase<0, 0>>, + DeclareOpInterfaceMethods]>, Arguments<(ins Vector_CombiningKindAttr:$kind, AnyVector:$source, I64ArrayAttr:$reduction_dims)>, @@ -367,31 +368,10 @@ def Vector_MultiDimReductionOp : res[idx] = true; return res; } - - static SmallVector inferDestShape( - ArrayRef sourceShape, ArrayRef reducedDimsMask) { - assert(sourceShape.size() == reducedDimsMask.size() && - "sourceShape and maks of different sizes"); - SmallVector res; - for (auto it : llvm::zip(reducedDimsMask, sourceShape)) - if (!std::get<0>(it)) - res.push_back(std::get<1>(it)); - return res; - } - - static Type inferDestType( - ArrayRef sourceShape, ArrayRef reducedDimsMask, Type elementType) { - auto targetShape = inferDestShape(sourceShape, reducedDimsMask); - // TODO: update to also allow 0-d vectors when available. 
- if (targetShape.empty()) - return elementType; - return VectorType::get(targetShape, elementType); - } }]; let assemblyFormat = "$kind `,` $source attr-dict $reduction_dims `:` type($source) `to` type($dest)"; let hasFolder = 1; - let hasVerifier = 1; } def Vector_BroadcastOp : diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 4ffb2b8c75696..ddfe0d8442280 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -336,32 +336,31 @@ void vector::MultiDimReductionOp::build(OpBuilder &builder, OperationState &result, Value source, ArrayRef reductionMask, CombiningKind kind) { - result.addOperands(source); - auto sourceVectorType = source.getType().cast(); - auto targetType = MultiDimReductionOp::inferDestType( - sourceVectorType.getShape(), reductionMask, - sourceVectorType.getElementType()); - result.addTypes(targetType); - SmallVector reductionDims; for (const auto &en : llvm::enumerate(reductionMask)) if (en.value()) reductionDims.push_back(en.index()); - result.addAttribute(getReductionDimsAttrStrName(), - builder.getI64ArrayAttr(reductionDims)); - result.addAttribute(getKindAttrStrName(), - CombiningKindAttr::get(kind, builder.getContext())); -} - -LogicalResult MultiDimReductionOp::verify() { - auto reductionMask = getReductionMask(); - auto targetType = MultiDimReductionOp::inferDestType( - getSourceVectorType().getShape(), reductionMask, - getSourceVectorType().getElementType()); - // TODO: update to support 0-d vectors when available. 
- if (targetType != getDestType()) - return emitError("invalid output vector type: ") - << getDestType() << " (expected: " << targetType << ")"; + build(builder, result, kind, source, builder.getI64ArrayAttr(reductionDims)); +} + +LogicalResult MultiDimReductionOp::inferReturnTypes( + MLIRContext *, Optional, ValueRange operands, + DictionaryAttr attributes, RegionRange, + SmallVectorImpl &inferredReturnTypes) { + MultiDimReductionOp::Adaptor op(operands, attributes); + auto vectorType = op.source().getType().cast(); + SmallVector targetShape; + for (auto it : llvm::enumerate(vectorType.getShape())) + if (!llvm::any_of(op.reduction_dims().getValue(), [&](Attribute attr) { + return attr.cast().getValue() == it.index(); + })) + targetShape.push_back(it.value()); + // TODO: update to also allow 0-d vectors when available. + if (targetShape.empty()) + inferredReturnTypes.push_back(vectorType.getElementType()); + else + inferredReturnTypes.push_back( + VectorType::get(targetShape, vectorType.getElementType())); return success(); } diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 2e224f7f58ebe..c90725e5d8d7b 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1137,6 +1137,13 @@ func @reduce_unsupported_rank(%arg0: vector<4x16xf32>) -> f32 { // ----- +func @multi_reduce_invalid_type(%arg0: vector<4x16xf32>) -> f32 { + // expected-error@+1 {{'vector.multi_reduction' op inferred type(s) 'vector<4xf32>' are incompatible with return type(s) of operation 'vector<16xf32>'}} + %0 = vector.multi_reduction , %arg0 [1] : vector<4x16xf32> to vector<16xf32> +} + +// ----- + func @transpose_rank_mismatch(%arg0: vector<4x16x11xf32>) { // expected-error@+1 {{'vector.transpose' op vector result rank mismatch: 1}} %0 = vector.transpose %arg0, [2, 1, 0] : vector<4x16x11xf32> to vector<100xf32> From d46e49838e17800cb72d95db1b23c04bbca610e2 Mon Sep 17 00:00:00 2001 From: Simon Moll Date: Fri, 18 
Feb 2022 13:17:03 +0100 Subject: [PATCH 252/748] [VE] Fix vmp0 subregister mapping vmp0 is the all-ones v512i1 register and does not break down into subregisters. Reviewed By: kaz7 Differential Revision: https://reviews.llvm.org/D120054 --- llvm/lib/Target/VE/VERegisterInfo.td | 4 +++- llvm/test/CodeGen/VE/VELIntrinsics/extract.ll | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/VE/VERegisterInfo.td b/llvm/lib/Target/VE/VERegisterInfo.td index 70ff104b65b7f..cca0ad26b3e99 100644 --- a/llvm/lib/Target/VE/VERegisterInfo.td +++ b/llvm/lib/Target/VE/VERegisterInfo.td @@ -152,8 +152,10 @@ foreach I = 0-15 in def VM#I : VEMaskReg, DwarfRegNum<[!add(128,I)]>; // Aliases of VMs to use as a pair of two VM for packed instructions +def VMP0 : VEMaskReg<0, "vm0", [], ["vm0"]>; + let SubRegIndices = [sub_vm_even, sub_vm_odd], CoveredBySubRegs = 1 in -foreach I = 0-7 in +foreach I = 1-7 in def VMP#I : VEMaskReg("VM"#!shl(I,1)), !cast("VM"#!add(!shl(I,1),1))], diff --git a/llvm/test/CodeGen/VE/VELIntrinsics/extract.ll b/llvm/test/CodeGen/VE/VELIntrinsics/extract.ll index 0e69448d74219..f0f28b388a114 100644 --- a/llvm/test/CodeGen/VE/VELIntrinsics/extract.ll +++ b/llvm/test/CodeGen/VE/VELIntrinsics/extract.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve -mattr=+vpu | FileCheck %s ;;; Test extract intrinsic instructions @@ -22,7 +23,6 @@ declare <256 x i1> @llvm.ve.vl.extract.vm512u(<512 x i1>) define fastcc <256 x i1> @extract_vm512l(<512 x i1> %0) { ; CHECK-LABEL: extract_vm512l: ; CHECK: # %bb.0: -; CHECK-NEXT: andm %vm0, %vm0, %vm2 ; CHECK-NEXT: andm %vm1, %vm0, %vm3 ; CHECK-NEXT: b.l.t (, %s10) %2 = tail call <256 x i1> @llvm.ve.vl.extract.vm512l(<512 x i1> %0) From f27423027dc75cd298edfd279e3a627904639a94 Mon Sep 17 00:00:00 2001 From: Simon Moll Date: Fri, 18 Feb 2022 13:26:08 +0100 Subject: [PATCH 253/748] [VE] Enable v256 fcmp true|false tests The broadcast 
patterns for all-true|false masks are available now. Enable the true|fast fcmp predicate tests that use them. Reviewed By: kaz7 Differential Revision: https://reviews.llvm.org/D119936 --- llvm/test/CodeGen/VE/Vector/vec_fcmp.ll | 110 ++++++++++++++---------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/llvm/test/CodeGen/VE/Vector/vec_fcmp.ll b/llvm/test/CodeGen/VE/Vector/vec_fcmp.ll index 5c6f3550388c5..295ef114aada6 100644 --- a/llvm/test/CodeGen/VE/Vector/vec_fcmp.ll +++ b/llvm/test/CodeGen/VE/Vector/vec_fcmp.ll @@ -4,28 +4,39 @@ ; <256 x float> -;; TODO v256i1 zero-mask isel -;; ; Function Attrs: nounwind -;; define fastcc <256 x i1> @fcmp_false_vv_v256f32(<256 x float> %x, <256 x float> %y) { -;; %z = fcmp false <256 x float> %x, %y -;; ret <256 x i1> %z -;; } -;; -;; ; Function Attrs: nounwind -;; define fastcc <256 x i1> @fcmp_false_sv_v256f32(float %x, <256 x float> %y) { -;; %xins = insertelement <256 x float> undef, float %x, i32 0 -;; %vx = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer -;; %z = fcmp false <256 x float> %vx, %y -;; ret <256 x i1> %z -;; } -;; -;; ; Function Attrs: nounwind -;; define fastcc <256 x i1> @fcmp_false_vs_v256f32(<256 x float> %x, float %y) { -;; %yins = insertelement <256 x float> undef, float %y, i32 0 -;; %vy = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer -;; %z = fcmp false <256 x float> %x, %vy -;; ret <256 x i1> %z -;; } +; Function Attrs: nounwind +define fastcc <256 x i1> @fcmp_false_vv_v256f32(<256 x float> %x, <256 x float> %y) { +; CHECK-LABEL: fcmp_false_vv_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: xorm %vm1, %vm0, %vm0 +; CHECK-NEXT: b.l.t (, %s10) + %z = fcmp false <256 x float> %x, %y + ret <256 x i1> %z +} + +; Function Attrs: nounwind +define fastcc <256 x i1> @fcmp_false_sv_v256f32(float %x, <256 x float> %y) { +; CHECK-LABEL: fcmp_false_sv_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: xorm %vm1, %vm0, %vm0 +; 
CHECK-NEXT: b.l.t (, %s10) + %xins = insertelement <256 x float> undef, float %x, i32 0 + %vx = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer + %z = fcmp false <256 x float> %vx, %y + ret <256 x i1> %z +} + +; Function Attrs: nounwind +define fastcc <256 x i1> @fcmp_false_vs_v256f32(<256 x float> %x, float %y) { +; CHECK-LABEL: fcmp_false_vs_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: xorm %vm1, %vm0, %vm0 +; CHECK-NEXT: b.l.t (, %s10) + %yins = insertelement <256 x float> undef, float %y, i32 0 + %vy = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer + %z = fcmp false <256 x float> %x, %vy + ret <256 x i1> %z +} ; Function Attrs: nounwind define fastcc <256 x i1> @fcmp_oeq_vv_v256f32(<256 x float> %x, <256 x float> %y) { @@ -657,25 +668,36 @@ define fastcc <256 x i1> @fcmp_une_vs_v256f32(<256 x float> %x, float %y) { ret <256 x i1> %z } -;; TODO v256i1 all-one mask isel. -;; ; Function Attrs: nounwind -;; define fastcc <256 x i1> @fcmp_true_vv_v256f32(<256 x float> %x, <256 x float> %y) { -;; %z = fcmp true <256 x float> %x, %y -;; ret <256 x i1> %z -;; } -;; -;; ; Function Attrs: nounwind -;; define fastcc <256 x i1> @fcmp_true_sv_v256f32(float %x, <256 x float> %y) { -;; %xins = insertelement <256 x float> undef, float %x, i32 0 -;; %vx = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer -;; %z = fcmp true <256 x float> %vx, %y -;; ret <256 x i1> %z -;; } -;; -;; ; Function Attrs: nounwind -;; define fastcc <256 x i1> @fcmp_true_vs_v256f32(<256 x float> %x, float %y) { -;; %yins = insertelement <256 x float> undef, float %y, i32 0 -;; %vy = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer -;; %z = fcmp true <256 x float> %x, %vy -;; ret <256 x i1> %z -;; } +; Function Attrs: nounwind +define fastcc <256 x i1> @fcmp_true_vv_v256f32(<256 x float> %x, <256 x float> %y) { +; CHECK-LABEL: fcmp_true_vv_v256f32: +; CHECK: # %bb.0: 
+; CHECK-NEXT: andm %vm1, %vm0, %vm0 +; CHECK-NEXT: b.l.t (, %s10) + %z = fcmp true <256 x float> %x, %y + ret <256 x i1> %z +} + +; Function Attrs: nounwind +define fastcc <256 x i1> @fcmp_true_sv_v256f32(float %x, <256 x float> %y) { +; CHECK-LABEL: fcmp_true_sv_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: andm %vm1, %vm0, %vm0 +; CHECK-NEXT: b.l.t (, %s10) + %xins = insertelement <256 x float> undef, float %x, i32 0 + %vx = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer + %z = fcmp true <256 x float> %vx, %y + ret <256 x i1> %z +} + +; Function Attrs: nounwind +define fastcc <256 x i1> @fcmp_true_vs_v256f32(<256 x float> %x, float %y) { +; CHECK-LABEL: fcmp_true_vs_v256f32: +; CHECK: # %bb.0: +; CHECK-NEXT: andm %vm1, %vm0, %vm0 +; CHECK-NEXT: b.l.t (, %s10) + %yins = insertelement <256 x float> undef, float %y, i32 0 + %vy = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer + %z = fcmp true <256 x float> %x, %vy + ret <256 x i1> %z +} From b0a0df980927ca54a7840a1b0c9766e98c05039b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 15 Feb 2022 07:50:28 -0800 Subject: [PATCH 254/748] [SLP]Fix vectorization of the alternate cmp instruction with swapped predicates. If the alternate cmp instruction is a swapped predicate of the main cmp instruction, need to generate alternate instruction, not the one with the swapped predicate. Also, the lane with the alternate opcode should be selected only, if the corresponding operands are not compatible. 
Correctness confirmed: https://alive2.llvm.org/ce/z/94BG66 Differential Revision: https://reviews.llvm.org/D119855 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 82 ++++------- .../X86/alternate-cmp-swapped-pred.ll | 18 +-- .../SLPVectorizer/X86/cmp-as-alternate-ops.ll | 29 ++-- .../SLPVectorizer/X86/reduction-logical.ll | 129 ++++++++++++------ 4 files changed, 143 insertions(+), 115 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f7af3151e1894..024890e5845ef 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4544,10 +4544,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Value *RHS = Cmp->getOperand(1); CmpInst::Predicate CurrentPred = Cmp->getPredicate(); if (P0 == AltP0Swapped) { - if ((P0 == CurrentPred && - !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || - (AltP0 == CurrentPred && - areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS))) + if (CI != Cmp && S.AltOp != Cmp && + ((P0 == CurrentPred && + !areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)) || + (AltP0 == CurrentPred && + areCompatibleCmpOps(BaseOp0, BaseOp1, LHS, RHS)))) std::swap(LHS, RHS); } else if (P0 != CurrentPred && AltP0 != CurrentPred) { std::swap(LHS, RHS); @@ -4835,6 +4836,29 @@ buildShuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, } } +/// Checks if the specified instruction \p I is an alternate operation for the +/// given \p MainOp and \p AltOp instructions. 
+static bool isAlternateInstruction(const Instruction *I, + const Instruction *MainOp, + const Instruction *AltOp) { + if (auto *CI0 = dyn_cast(MainOp)) { + auto *AltCI0 = cast(AltOp); + auto *CI = cast(I); + CmpInst::Predicate P0 = CI0->getPredicate(); + CmpInst::Predicate AltP0 = AltCI0->getPredicate(); + assert(P0 != AltP0 && "Expected different main/alternate predicates."); + CmpInst::Predicate AltP0Swapped = CmpInst::getSwappedPredicate(AltP0); + CmpInst::Predicate CurrentPred = CI->getPredicate(); + if (P0 == AltP0Swapped) + return I == AltCI0 || + (I != MainOp && + !areCompatibleCmpOps(CI0->getOperand(0), CI0->getOperand(1), + CI->getOperand(0), CI->getOperand(1))); + return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; + } + return I->getOpcode() == AltOp->getOpcode(); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals) { ArrayRef VL = E->Scalars; @@ -5560,28 +5584,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - assert(P0 != AltP0 && - "Expected different main/alternate predicates."); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - if (P0 == AltP0Swapped) - return (P0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(0), CI->getOperand(1))) || - (AltP0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(1), CI->getOperand(0))); - return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; - } - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, 
E->getMainOp(), E->getAltOp()); }, Mask); CommonCost = @@ -7081,10 +7084,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS); auto *AltCI = cast(E->getAltOp()); CmpInst::Predicate AltPred = AltCI->getPredicate(); - unsigned AltIdx = - std::distance(E->Scalars.begin(), find(E->Scalars, AltCI)); - if (AltCI->getOperand(0) != E->getOperand(0)[AltIdx]) - AltPred = CmpInst::getSwappedPredicate(AltPred); V1 = Builder.CreateCmp(AltPred, LHS, RHS); } else { V0 = Builder.CreateCast( @@ -7110,28 +7109,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, [E](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - if (auto *CI0 = dyn_cast(E->getMainOp())) { - auto *AltCI0 = cast(E->getAltOp()); - auto *CI = cast(I); - CmpInst::Predicate P0 = CI0->getPredicate(); - CmpInst::Predicate AltP0 = AltCI0->getPredicate(); - assert(P0 != AltP0 && - "Expected different main/alternate predicates."); - CmpInst::Predicate AltP0Swapped = - CmpInst::getSwappedPredicate(AltP0); - CmpInst::Predicate CurrentPred = CI->getPredicate(); - if (P0 == AltP0Swapped) - return (P0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(0), CI->getOperand(1))) || - (AltP0 == CurrentPred && - !areCompatibleCmpOps( - CI0->getOperand(0), CI0->getOperand(1), - CI->getOperand(1), CI->getOperand(0))); - return AltP0 == CurrentPred || AltP0Swapped == CurrentPred; - } - return I->getOpcode() == E->getAltOpcode(); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); }, Mask, &OpScalars, &AltScalars); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll index eb039b12bc662..83138502a65d3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cmp-swapped-pred.ll @@ -5,14 +5,16 @@ define i16 @test(i16 %call37) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = load i16, i16* undef, align 2 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 3 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP0]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) -; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i16 [[TMP5]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> , i16 [[CALL37:%.*]], i32 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> , i16 [[CALL37]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[CALL37]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i16> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i1> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]]) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i16 [[TMP8]], 0 ; CHECK-NEXT: ret i16 [[OP_EXTRA]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll index 0d17fc440cd97..f86a48e28fdd1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll @@ -46,20 +46,21 @@ 
define { <2 x float>, <2 x float> } @test1(i32 %conv.i32.i.i.i) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV_I32_I_I_I1:%.*]] = fptosi float 0.000000e+00 to i32 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I1]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> zeroinitializer, <4 x float> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 -; CHECK-NEXT: [[RETVAL_SROA_0_0_VEC_INSERT4:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 -; CHECK-NEXT: [[RETVAL_SROA_0_4_VEC_INSERT7:%.*]] = insertelement <2 x float> [[RETVAL_SROA_0_0_VEC_INSERT4]], float [[TMP7]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 -; CHECK-NEXT: [[RETVAL_SROA_7_8_VEC_INSERT11:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP8]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 -; CHECK-NEXT: [[RETVAL_SROA_7_12_VEC_INSERT13:%.*]] = insertelement <2 x float> [[RETVAL_SROA_7_8_VEC_INSERT11]], float [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[CONV_I32_I_I_I:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[CONV_I32_I_I_I1]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x 
float> zeroinitializer, <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[RETVAL_SROA_0_0_VEC_INSERT4:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 1 +; CHECK-NEXT: [[RETVAL_SROA_0_4_VEC_INSERT7:%.*]] = insertelement <2 x float> [[RETVAL_SROA_0_0_VEC_INSERT4]], float [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; CHECK-NEXT: [[RETVAL_SROA_7_8_VEC_INSERT11:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; CHECK-NEXT: [[RETVAL_SROA_7_12_VEC_INSERT13:%.*]] = insertelement <2 x float> [[RETVAL_SROA_7_8_VEC_INSERT11]], float [[TMP10]], i64 1 ; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } zeroinitializer, <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT7]], 0 ; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[RETVAL_SROA_7_12_VEC_INSERT13]], 1 ; CHECK-NEXT: ret { <2 x float>, <2 x float> } zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 0d6ebc2043bd4..de4f29445accb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -251,28 +251,53 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; 
CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: call void @use1(i1 [[C2]]) -; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; SSE-NEXT: call void @use1(i1 [[C2]]) +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X3]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> +; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; SSE-NEXT: 
[[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) +; SSE-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i1 [[C2]], i1 false +; SSE-NEXT: [[S5:%.*]] = select i1 [[OP_EXTRA]], i1 [[D1]], i1 false +; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; SSE-NEXT: ret i1 [[S7]] +; +; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: call void @use1(i1 [[C2]]) +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -395,25 +420,47 @@ define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) { } define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { -; 
CHECK-LABEL: @logical_and_icmp_clamp_partial( -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 -; CHECK-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 -; CHECK-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 -; CHECK-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 -; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; CHECK-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false -; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false -; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false -; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false -; CHECK-NEXT: ret i1 [[S7]] +; SSE-LABEL: @logical_and_icmp_clamp_partial( +; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; SSE-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X0]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP4]], +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> [[TMP6]], <4 x i32> +; SSE-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; SSE-NEXT: 
[[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; SSE-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) +; SSE-NEXT: [[S5:%.*]] = select i1 [[TMP9]], i1 [[D1]], i1 false +; SSE-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; SSE-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; SSE-NEXT: ret i1 [[S7]] +; +; AVX-LABEL: @logical_and_icmp_clamp_partial( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S2]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 From 03ec026eac5c604c77d3185ed4a120b4cd4a27bb Mon Sep 17 00:00:00 2001 From: "Luo, Yuanke" Date: Fri, 18 Feb 2022 20:52:06 +0800 Subject: [PATCH 255/748] [X86] Add test cases for sub with select. 
--- llvm/test/CodeGen/X86/vector-bo-select.ll | 227 ++++++++++++++++++++++ 1 file changed, 227 insertions(+) diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index a3ba1601c5f3f..9f2141c48b6ab 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -891,3 +891,230 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n %r = fdiv <8 x double> %x, %s ret <8 x double> %r } + +define <4 x i32> @sub_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) { +; AVX2-LABEL: sub_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sub_v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sub_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z} +; AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq + %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> zeroinitializer + %r = sub <4 x i32> %x, %s + ret <4 x i32> %r +} + +; negative test - sub is not commutative; there is no identity constant for operand 0 + +define <8 x i32> @sub_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) { +; AVX2-LABEL: sub_v8i32_commute: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; 
AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sub_v8i32_commute: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sub_v8i32_commute: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq + %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer + %r = sub <8 x i32> %s, %x + ret <8 x i32> %r +} + +define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { +; AVX2-LABEL: sub_v16i32_swap: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 +; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 +; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sub_v16i32_swap: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} +; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq + %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y + %r = sub <16 x i32> %x, %s + ret <16 x i32> %r +} + +; negative test - sub is not commutative; there is no identity constant for operand 0 + +define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { +; AVX2-LABEL: sub_v16i32_commute_swap: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 +; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 +; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm2, %ymm4, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sub_v16i32_commute_swap: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y + %r = sub <16 x i32> %s, %x + ret <16 x i32> %r +} + +define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { +; AVX2-LABEL: 
sub_v8i32_cast_cond: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sub_v8i32_cast_cond: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: sub_v8i32_cast_cond: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: kmovw %edi, %k1 +; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} +; AVX512VL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq + %b = bitcast i8 %pb to <8 x i1> + %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer + %r = sub <8 x i32> %x, %s + ret <8 x i32> %r +} + +define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) { +; AVX2-LABEL: sub_v8i64_cast_cond: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrb %al +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: andb $1, %cl +; AVX2-NEXT: movzbl %cl, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm4 +; AVX2-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrb $2, %al +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrb $3, %al +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrb $4, %al +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm4, %xmm5 +; AVX2-NEXT: movl %edi, %eax +; 
AVX2-NEXT: shrb $5, %al +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrb $6, %al +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 +; AVX2-NEXT: shrb $7, %dil +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5 +; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 +; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: sub_v8i64_cast_cond: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b = bitcast i8 %pb to <8 x i1> + %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> zeroinitializer + %r = sub <8 x i64> %x, %s + ret <8 x i64> %r +} From 0870a4f59aef21bf7707b00ebd4dcad7ce7ef807 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 17 Feb 2022 19:09:46 -0500 Subject: [PATCH 256/748] [OpenMP] Add flag for disabling thread state in runtime The runtime uses thread state values to indicate when we use an ICV or are in nested parallelism. This is done for OpenMP correctness, but it is not needed in the majority of cases. The new flag added is `-fopenmp-assume-no-thread-state`.
Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D120106 --- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Driver/Options.td | 4 ++++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 ++ clang/lib/Driver/ToolChains/Clang.cpp | 2 ++ clang/test/OpenMP/target_globals_codegen.cpp | 12 ++++++++++++ .../libomptarget/DeviceRTL/include/Configuration.h | 5 +++++ openmp/libomptarget/DeviceRTL/src/Configuration.cpp | 6 +++++- openmp/libomptarget/DeviceRTL/src/State.cpp | 13 ++++++++++--- 8 files changed, 41 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 4651f4fff6aa0..e21998860f217 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -246,6 +246,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") +LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 76cfdbcd85f26..c377329e8f6f4 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2473,6 +2473,10 @@ def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-te Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">, Group, Flags<[CC1Option, 
NoArgumentUnused, HelpHidden]>; +def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Assert no thread in a parallel region modifies an ICV">, + MarshallingInfoFlag>; defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime", LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue, PosFlag, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index bb6847ab87319..fcaf9d4ed77b3 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1210,6 +1210,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) "__omp_rtl_assume_teams_oversubscription"); OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription, "__omp_rtl_assume_threads_oversubscription"); + OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, + "__omp_rtl_assume_no_thread_state"); } } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a16175ebebbca..32cbb7936f7ee 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5995,6 +5995,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); + if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) + CmdArgs.push_back("-fopenmp-assume-no-thread-state"); break; default: // By default, if Clang doesn't know how to generate useful OpenMP code diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp index fa7569cd4ca6b..3c5d4b8ed3984 100644 --- a/clang/test/OpenMP/target_globals_codegen.cpp +++ b/clang/test/OpenMP/target_globals_codegen.cpp @@ -6,6 +6,7 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown 
-fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // expected-no-diagnostics @@ -16,26 +17,37 @@ // CHECK: @__omp_rtl_debug_kind = weak_odr hidden constant i32 1 // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. 
// CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1 +// CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +//. +// CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1 //. // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. 
void foo() { #pragma omp target diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h index 5727f1f2bfbf6..94f11b6066a20 100644 --- a/openmp/libomptarget/DeviceRTL/include/Configuration.h +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -38,8 +38,13 @@ uint32_t getDebugKind(); /// Return the amount of dynamic shared memory that was allocated at launch. uint64_t getDynamicMemorySize(); +/// Return if debugging is enabled for the given debug kind. bool isDebugMode(DebugKind Level); +/// Indicates if this kernel may require thread-specific states, or if it was +/// explicitly disabled by the user. +bool mayUseThreadStates(); + } // namespace config } // namespace _OMP diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp index 349f93a08701c..e9cc9bb0e318e 100644 --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -20,7 +20,9 @@ using namespace _OMP; #pragma omp declare target -extern uint32_t __omp_rtl_debug_kind; // defined by CGOpenMPRuntimeGPU +// defined by CGOpenMPRuntimeGPU +extern uint32_t __omp_rtl_debug_kind; +extern uint32_t __omp_rtl_assume_no_thread_state; // TODO: We want to change the name as soon as the old runtime is gone. 
// This variable should be visibile to the plugin so we override the default @@ -48,4 +50,6 @@ bool config::isDebugMode(config::DebugKind Kind) { return config::getDebugKind() & Kind; } +bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } + #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index a04f5cccb1738..a530c5e0b2471 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -285,7 +285,8 @@ ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) { - if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0)) + if (OMP_LIKELY(!config::mayUseThreadStates() || + TeamState.ICVState.LevelVar == 0)) return TeamState.ICVState.*Var; uint32_t TId = mapping::getThreadIdInBlock(); if (!ThreadStates[TId]) { @@ -299,13 +300,13 @@ uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) { uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) { uint32_t TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(ThreadStates[TId])) + if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) return ThreadStates[TId]->ICVState.*Var; return TeamState.ICVState.*Var; } uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) { uint64_t TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(ThreadStates[TId])) + if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) return ThreadStates[TId]->ICVState.*Var; return TeamState.ICVState.*Var; } @@ -380,6 +381,9 @@ void state::init(bool IsSPMD) { } void state::enterDataEnvironment(IdentTy *Ident) { + ASSERT(config::mayUseThreadStates() && + "Thread state modified while explicitly disabled!"); + unsigned TId = mapping::getThreadIdInBlock(); ThreadStateTy *NewThreadState = 
static_cast(__kmpc_alloc_shared(sizeof(ThreadStateTy))); @@ -388,6 +392,9 @@ void state::enterDataEnvironment(IdentTy *Ident) { } void state::exitDataEnvironment() { + ASSERT(config::mayUseThreadStates() && + "Thread state modified while explicitly disabled!"); + unsigned TId = mapping::getThreadIdInBlock(); resetStateForThread(TId); } From 0136a4401f90b1effd07b3ac8575fdb155f9984d Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 17 Feb 2022 15:48:12 -0500 Subject: [PATCH 257/748] [OpenMP] Add an option to limit shared memory usage in OpenMPOpt One of the optimizations performed in OpenMPOpt pushes globalized variables to static shared memory. This is preferable to keeping the runtime call in all cases, however if too many variables are pushed to shared memory the kernel will crash. Since this is an optimization and not something the user specified explicitly, there should be an option to limit this optimization in those cases. This patch introduces the `-openmp-opt-shared-limit=` option to limit the amount of bytes that will be placed in shared memory from HeapToShared.
Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D120079 --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 18 +++++++++++++++++- .../Transforms/OpenMP/replace_globalization.ll | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 392b919c5a120..57a854f2e239e 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -129,6 +129,11 @@ static cl::opt cl::desc("Maximal number of attributor iterations."), cl::init(256)); +static cl::opt + SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, + cl::desc("Maximum amount of shared memory to use."), + cl::init(std::numeric_limits::max())); + STATISTIC(NumOpenMPRuntimeCallsDeduplicated, "Number of OpenMP runtime calls deduplicated"); STATISTIC(NumOpenMPParallelRegionsDeleted, @@ -3000,6 +3005,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared { auto *AllocSize = cast(CB->getArgOperand(0)); + if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) { + LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB + << " with shared memory." + << " Shared memory usage is limited to " + << SharedMemoryLimit << " bytes\n"); + continue; + } + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB << " with " << AllocSize->getZExtValue() << " bytes of shared memory\n"); @@ -3034,7 +3047,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared { A.deleteAfterManifest(*CB); A.deleteAfterManifest(*FreeCalls.front()); - NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); + SharedMemoryUsed += AllocSize->getZExtValue(); + NumBytesMovedToSharedMemory = SharedMemoryUsed; Changed = ChangeStatus::CHANGED; } @@ -3070,6 +3084,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared { SmallSetVector MallocCalls; /// Collection of potentially removed free calls in a function. 
SmallPtrSet PotentialRemovedFreeCalls; + /// The total amount of shared memory that has been used for HeapToShared. + unsigned SharedMemoryUsed = 0; }; struct AAKernelInfo : public StateWrapper { diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll index aaf48e937ca05..efec38855c9a8 100644 --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs ; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s ; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS +; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output -openmp-opt-shared-limit=4 < %s 2>&1 | FileCheck %s -check-prefix=CHECK-LIMIT target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64" @@ -8,6 +9,8 @@ target triple = "nvptx64" ; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory ; CHECK-REMARKS-NOT: 6 bytes +; CHECK-LIMIT: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory +; CHECK-LIMIT: remark: replace_globalization.c:5:7: Found thread data sharing on the GPU. 
Expect degraded performance due to data globalization ; UTC_ARGS: --enable %struct.ident_t = type { i32, i32, i32, i32, i8* } From 1f0aadfa62a56ca5a71bec0911538fe360bfc28d Mon Sep 17 00:00:00 2001 From: Sebastian Neubauer Date: Thu, 17 Feb 2022 13:33:07 +0100 Subject: [PATCH 258/748] [AMDGPU] Fix kill flag on overlapping sgpr copy Same as on vgpr copies, we cannot kill the source register if it overlaps with the destination register. Otherwise, the kill of the source register will also count as a kill for the destination register. Differential Revision: https://reviews.llvm.org/D120042 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +- .../CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir | 49 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9899c36352b84..7e5c9e990d4be 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -930,7 +930,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); return; } - expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); + const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); + expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, + Forward); return; } diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir new file mode 100644 index 0000000000000..b97a9237a0b97 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir @@ -0,0 +1,49 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=postrapseudos -o - %s | FileCheck %s + +# Don't set a kill of the super register on the last instruction with +# an 
overlapping copy. This would kill part of the values in the +# result copies. + +--- +name: overlapping_copy_kill_undef_reg_after_copy +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + + ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy + ; CHECK: liveins: $sgpr30_sgpr31, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc + ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 + renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 + +... 
+ +--- +name: nonoverlapping_copy_kill +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr3_sgpr4_sgpr5 + + ; CHECK-LABEL: name: nonoverlapping_copy_kill + ; CHECK: liveins: $sgpr30_sgpr31, $sgpr3_sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $sgpr3, implicit $sgpr3_sgpr4_sgpr5, implicit-def $sgpr0_sgpr1_sgpr2 + ; CHECK-NEXT: $sgpr1 = S_MOV_B32 $sgpr4, implicit $sgpr3_sgpr4_sgpr5 + ; CHECK-NEXT: $sgpr2 = S_MOV_B32 $sgpr5, implicit killed $sgpr3_sgpr4_sgpr5 + ; CHECK-NEXT: renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc + ; CHECK-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 + renamable $sgpr0_sgpr1_sgpr2 = COPY killed renamable $sgpr3_sgpr4_sgpr5 + renamable $sgpr1 = S_ADD_I32 0, $sgpr1, implicit-def $scc + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 + +... From b4670438b3ba37b2d4cca004f9a5275e3ea6365c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 16 Feb 2022 14:35:46 +0000 Subject: [PATCH 259/748] [ConstraintSystem] Pass ArrayRef instead of full small vector (NFC). This makes the called functions independent of the container type. --- llvm/include/llvm/Analysis/ConstraintSystem.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Analysis/ConstraintSystem.h b/llvm/include/llvm/Analysis/ConstraintSystem.h index 2ca1d5ca78a7e..d0f80c87e03b7 100644 --- a/llvm/include/llvm/Analysis/ConstraintSystem.h +++ b/llvm/include/llvm/Analysis/ConstraintSystem.h @@ -36,7 +36,7 @@ class ConstraintSystem { bool mayHaveSolutionImpl(); public: - bool addVariableRow(const SmallVector &R) { + bool addVariableRow(ArrayRef R) { assert(Constraints.empty() || R.size() == Constraints.back().size()); // If all variable coefficients are 0, the constraint does not provide any // usable information. 
@@ -48,11 +48,11 @@ class ConstraintSystem { GCD = APIntOps::GreatestCommonDivisor({32, (uint32_t)A}, {32, GCD}) .getZExtValue(); } - Constraints.push_back(R); + Constraints.emplace_back(R.begin(), R.end()); return true; } - bool addVariableRowFill(const SmallVector &R) { + bool addVariableRowFill(ArrayRef R) { for (auto &CR : Constraints) { while (CR.size() != R.size()) CR.push_back(0); From 6527b2a4d5fadd0743e13797968c49ec30112dfe Mon Sep 17 00:00:00 2001 From: Sebastian Neubauer Date: Fri, 18 Feb 2022 14:57:21 +0100 Subject: [PATCH 260/748] [AMDGPU][NFC] Fix typos Fix some typos in the amdgpu backend. Differential Revision: https://reviews.llvm.org/D119235 --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 2 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +- .../AMDGPU/AMDGPULowerKernelArguments.cpp | 2 +- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 2 +- .../AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2 +- .../AMDGPUOpenCLEnqueuedBlockLowering.cpp | 2 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 2 +- .../Target/AMDGPU/AMDILCFGStructurizer.cpp | 3 +- llvm/lib/Target/AMDGPU/AMDKernelCodeT.h | 2 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 ++-- llvm/lib/Target/AMDGPU/BUFInstructions.td | 4 +- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 2 +- .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 2 +- .../lib/Target/AMDGPU/R600ClauseMergePass.cpp | 2 +- .../Target/AMDGPU/R600EmitClauseMarkers.cpp | 4 +- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 6 +-- llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 2 +- llvm/lib/Target/AMDGPU/R600TargetMachine.h | 2 +- .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 2 +- 
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 34 ++++++++-------- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 40 +++++++++---------- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 2 +- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 4 +- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 16 ++++---- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 6 +-- .../AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 2 +- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 12 +++--- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +- .../Target/AMDGPU/SIShrinkInstructions.cpp | 4 +- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 4 +- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2 +- .../Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 4 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 +- 45 files changed, 108 insertions(+), 109 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 1920684d8f1fd..94d7844e8a328 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -877,7 +877,7 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { return getMul64(Builder, LHS, RHS).second; } -/// Figure out how many bits are really needed for this ddivision. \p AtLeast is +/// Figure out how many bits are really needed for this division. \p AtLeast is /// an optimization hint to bypass the second ComputeNumSignBits call if we the /// first one is insufficient. Returns -1 on failure. 
int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 786fc54c466cb..f2b39d68b8572 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -123,7 +123,7 @@ def gi_smrd_buffer_imm32 : // Separate load nodes are defined to glue m0 initialization in // SelectionDAG. The GISel selector can just insert m0 initialization -// directly before before selecting a glue-less load, so hide this +// directly before selecting a glue-less load, so hide this // distinction. def : GINodeEquiv { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 8236e6672247b..dc105dad27ce8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2840,7 +2840,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { } } } - // If "AllUsesAcceptSReg == false" so far we haven't suceeded + // If "AllUsesAcceptSReg == false" so far we haven't succeeded // commuting current user. This means have at least one use // that strictly require VGPR. Thus, we will not attempt to commute // other user instructions. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b7d0f0580cda0..533b32e94dcf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1627,7 +1627,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } // The legalizer preprocessed the intrinsic arguments. 
If we aren't using - // NSA, these should have beeen packed into a single value in the first + // NSA, these should have been packed into a single value in the first // address register const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index a1a69030df8d4..0404193d3ae6e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1510,7 +1510,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampMaxNumElements(1, S16, 2) // TODO: Make 4? .clampMaxNumElements(0, S16, 64); - // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse + // TODO: Don't fully scalarize v2s16 pieces? Or combine out those // pre-legalize. if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) @@ -4377,7 +4377,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, /// /// We don't want to directly select image instructions just yet, but also want /// to exposes all register repacking to the legalizer/combiners. We also don't -/// want a selected instrution entering RegBankSelect. In order to avoid +/// want a selected instruction entering RegBankSelect. In order to avoid /// defining a multitude of intermediate image instructions, directly hack on /// the intrinsic's arguments. In cases like a16 addresses, this requires /// padding now unnecessary arguments with $noreg. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index c34c12ab9fecb..4519d2a1c7bdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -73,7 +73,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); Align MaxAlign; - // FIXME: Alignment is broken broken with explicit arg offset.; + // FIXME: Alignment is broken with explicit arg offset.; const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index d8133ca052bf0..75cfd124cd070 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -14,7 +14,7 @@ // known address. AMDGPUMachineFunction allocates the LDS global. // // Local variables with constant annotation or non-undef initializer are passed -// through unchanged for simplication or error diagnostics in later passes. +// through unchanged for simplification or error diagnostics in later passes. // // To reduce the memory overhead variables that are only used by kernels are // excluded from this transform. 
The analysis to determine whether a variable diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 4e2f98d2a5dbc..d837f8cb2f60d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1295,7 +1295,7 @@ static void fixRegionTerminator(RegionMRT *Region) { } } -// If a region region is just a sequence of regions (and the exit +// If a region is just a sequence of regions (and the exit // block in the case of the top level region), we can simply skip // linearizing it, because it is already linear bool regionIsSequence(RegionMRT *Region) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 5a5a5d213a1a2..3ddfab1b670ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -71,7 +71,7 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { return new AMDGPUOpenCLEnqueuedBlockLowering(); } -/// Collect direct or indrect callers of \p F and save them +/// Collect direct or indirect callers of \p F and save them /// to \p Callers. static void collectCallers(Function *F, DenseSet &Callers) { for (auto U : F->users()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 99b7ffb338845..85bcb3c7d0982 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -917,7 +917,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { // usage order. // // FIXME: It is also possible that if we're allowed to use all of the memory - // could could end up using more than the maximum due to alignment padding. + // could end up using more than the maximum due to alignment padding. 
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f2b5beaa40790..8f14d0a451dfd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1789,7 +1789,7 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, } /// Utility function for pushing dynamic vector indexes with a constant offset -/// into waterwall loops. +/// into waterfall loops. static void reinsertVectorIndexAdd(MachineIRBuilder &B, MachineInstr &IdxUseInstr, unsigned OpIdx, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index dd3676f3b707a..5d80ec6eb5673 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codegen targets. 
// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 1736c078eb83b..0e3f734fa1e7e 100644 --- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1487,8 +1487,7 @@ int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, MachineBasicBlock * AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, MachineBasicBlock *PredMBB) { - assert(PredMBB->isSuccessor(MBB) && - "succBlk is not a prececessor of curBlk"); + assert(PredMBB->isSuccessor(MBB) && "succBlk is not a predecessor of curBlk"); MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); diff --git a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h index 654153ea51513..8e5f966b7c6c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -142,7 +142,7 @@ enum amd_code_property_mask_t { /// is provided to the finalizer when it is invoked and is recorded /// here. The hardware will interleave the memory requests of each /// lane of a wavefront by this element size to ensure each - /// work-item gets a distinct memory memory location. Therefore, the + /// work-item gets a distinct memory location. Therefore, the /// finalizer ensures that all load and store operations done to /// private memory do not exceed this size. 
For example, if the /// element size is 4 (32-bits or dword) and a 64-bit value must be diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d348a4c7e9091..ff99d1f57b919 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1930,7 +1930,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also - // require that the literal may be losslesly converted to f16. + // require that the literal may be losslessly converted to f16. MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : (type == MVT::v2i16)? MVT::i16 : (type == MVT::v2f32)? MVT::f32 : type; @@ -2960,7 +2960,7 @@ AMDGPUAsmParser::isModifier() { // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001 // Negative fp literals with preceding "-" are -// handled likewise for unifomtity +// handled likewise for uniformity // bool AMDGPUAsmParser::parseSP3NegModifier() { @@ -6342,7 +6342,7 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, using namespace llvm::AMDGPU::SendMsg; // Validation strictness depends on whether message is specified - // in a symbolc or in a numeric form. In the latter case + // in a symbolic or in a numeric form. In the latter case // only encoding possibility is checked. 
bool Strict = Msg.IsSymbolic; @@ -8384,7 +8384,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() { #define GET_MNEMONIC_CHECKER #include "AMDGPUGenAsmMatcher.inc" -// This fuction should be defined after auto-generated include so that we have +// This function should be defined after auto-generated include so that we have // MatchClassKind enum defined unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) { diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index a535c8cc09184..ccbafd02739c7 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -136,7 +136,7 @@ class MTBUF_Real : bits<3> nfmt = format{6-4}; // GFX90A+ only: instruction uses AccVGPR for data - // Bit superceedes tfe. + // Bit supersedes tfe. bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } @@ -370,7 +370,7 @@ class MUBUF_Real : bits<8> soffset; // GFX90A+ only: instruction uses AccVGPR for data - // Bit superceedes tfe. + // Bit supersedes tfe. 
bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index c0592f6f3c7af..a1eb80b1b762b 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -20,7 +20,7 @@ using namespace llvm; //===----------------------------------------------------------------------===// -// Hazard Recoginizer Implementation +// Hazard Recognizer Implementation //===----------------------------------------------------------------------===// static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, @@ -534,7 +534,7 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { // In order to handle these situations correctly we need to make sure that // when a clause has more than one instruction, no instruction in the clause // writes to a register that is read by another instruction in the clause - // (including itself). If we encounter this situaion, we need to break the + // (including itself). If we encounter this situation, we need to break the // clause by inserting a non SMEM instruction. for (MachineInstr *MI : EmittedInstrs) { diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 9f98f9ada802e..c39e47363d76f 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -1,4 +1,4 @@ -//===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===// +//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,7 +8,7 @@ // /// \file /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential -/// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA +/// in NSA image instructions. 
Later SIShrinkInstructions pass will replace NSA /// with sequential versions where possible. /// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 257561cb8430f..c41548d19c8e7 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -10,7 +10,7 @@ /// This file defines the GCNRegPressure class, which tracks registry pressure /// by bookkeeping number of SGPR/VGPRs used, weights for large SGPR/VGPRs. It /// also implements a compare function, which compares different register -/// pressures, and declares one with max occupance as winner. +/// pressures, and declares one with max occupancy as winner. /// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index e2d9e03260923..ef73ae237d615 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -695,7 +695,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, OS.emitBytes(VendorName); OS.emitInt8(0); // NULL terminate VendorName OS.emitBytes(ArchName); - OS.emitInt8(0); // NULL terminte ArchName + OS.emitInt8(0); // NULL terminate ArchName }); } diff --git a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 715fd69fc7ae5..54ef6993cef9b 100644 --- a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. +/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative manner. 
/// This pass is merging consecutive CFAlus where applicable. /// It needs to be called after IfCvt for best results. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index b9ca7f928d563..699df681f1fd5 100644 --- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -327,9 +327,9 @@ char R600EmitClauseMarkers::ID = 0; } // end anonymous namespace INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) + "R600 Emit Clause Markers", false, false) INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) + "R600 Emit Clause Markers", false, false) FunctionPass *llvm::createR600EmitClauseMarkers() { return new R600EmitClauseMarkers(); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index bd757e9e3d704..06b8ec2dceb43 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -995,7 +995,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const /// LLVM generates byte-addressed pointers. For indirect addressing, we need to /// convert these pointers to a register index. Each register holds /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the -/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// \p StackWidth, which tells us how many of the 4 sub-registers will be used /// for indirect addressing. 
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, @@ -1100,7 +1100,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, DAG.getConstant(3, DL, MVT::i32)); - // TODO: Contrary to the name of the functiom, + // TODO: Contrary to the name of the function, // it also handles sub i32 non-truncating stores (like i1) SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Store->getValue()); @@ -1610,7 +1610,7 @@ static SDValue CompactSwizzlableVector( if (NewBldVec[i].isUndef()) // We mask write here to teach later passes that the ith element of this // vector is undef. Thus we can use it to reduce 128 bits reg usage, - // break false dependencies and additionnaly make assembly easier to read. + // break false dependencies and additionally make assembly easier to read. RemapSwizzle[i] = 7; // SEL_MASK_WRITE if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { if (C->isZero()) { diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index fbe2a1cd9fbac..59e2747875909 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -207,7 +207,7 @@ class R600PacketizerList : public VLIWPacketizerList { return !ARDef || !ARUse; } - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // isLegalToPruneDependencies - Is it legal to prune dependency between SUI // and SUJ. 
bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { return false; diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h index 0ccbca3c68b14..d9f7bc118f90d 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.h +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codegen targets. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index 80ee7a00252a1..d7ca7f36284bf 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -241,7 +241,7 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, } // Check register def/use conflicts, occupancy limits and collect def/use maps. -// Return true if instruction can be bundled with previous. It it cannot +// Return true if instruction can be bundled with previous. If it cannot // def/use maps are not updated. bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 0169b752e9983..d12ff4a142128 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1200,7 +1200,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } - // Stack slot coloring may assign different objets to the same stack slot. + // Stack slot coloring may assign different objects to the same stack slot. // If not, then the VGPR to AGPR spill slot is dead. 
for (unsigned FI : SpillFIs.set_bits()) if (!NonVGPRSpillFIs.test(FI)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0a02f64d2b034..cc4fc4800f0dc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1581,11 +1581,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( if (Subtarget->hasUnalignedBufferAccessEnabled() && !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS)) { - // If we have an uniform constant load, it still requires using a slow + // If we have a uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so - // 2-byte alignment is worse than 1 unless doing a 2-byte accesss. + // 2-byte alignment is worse than 1 unless doing a 2-byte access. *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? Alignment >= Align(4) : Alignment != Align(2); @@ -4565,7 +4565,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, // Otherwise f32 mad is always full rate and returns the same result as // the separate operations so should be preferred over fma. - // However does not support denomals. + // However does not support denormals. if (hasFP32Denormals(MF)) return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); @@ -8425,7 +8425,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); - // If there is a possibilty that flat instruction access scratch memory + // If there is a possibility that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. 
if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) @@ -8513,7 +8513,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (NumElements > 2) return SplitVectorLoad(Op, DAG); - // SI has a hardware bug in the LDS / GDS boounds checking: if the base + // SI has a hardware bug in the LDS / GDS bounds checking: if the base // address is negative, then the instruction is incorrectly treated as // out-of-bounds even if base + offsets is in bounds. Split vectorized // loads here to avoid emitting ds_read2_b32. We may re-combine the @@ -8975,7 +8975,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); - // If there is a possibilty that flat instruction access scratch memory + // If there is a possibility that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) @@ -9024,7 +9024,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (NumElements > 2) return SplitVectorStore(Op, DAG); - // SI has a hardware bug in the LDS / GDS boounds checking: if the base + // SI has a hardware bug in the LDS / GDS bounds checking: if the base // address is negative, then the instruction is incorrectly treated as // out-of-bounds even if base + offsets is in bounds. Split vectorized // stores here to avoid emitting ds_write2_b32. We may re-combine the @@ -10064,7 +10064,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine( } } - // If one half is undef, and one is constant, perfer a splat vector rather + // If one half is undef, and one is constant, prefer a splat vector rather // than the normal qNaN. If it's a register, prefer 0.0 since that's // cheaper to use and may be free with a packed operation. 
if (NewElts[0].isUndef()) { @@ -10786,7 +10786,7 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N, SDValue RHS = N->getOperand(1); // These should really be instruction patterns, but writing patterns with - // source modiifiers is a pain. + // source modifiers is a pain. // fadd (fadd (a, a), b) -> mad 2.0, a, b if (LHS.getOpcode() == ISD::FADD) { @@ -10883,8 +10883,8 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, return SDValue(); // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, - // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract - // is sufficient to allow generaing fdot2. + // regardless of the denorm mode setting. Therefore, + // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. const TargetOptions &Options = DAG.getTarget().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || (N->getFlags().hasAllowContract() && @@ -11585,7 +11585,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const { if (DstSize < InitIdx) return; - // Create a register for the intialization value. + // Create a register for the initialization value. Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); unsigned NewDst = 0; // Final initialized value will be in here @@ -11631,7 +11631,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, TII->legalizeOperandsVOP3(MRI, MI); // Prefer VGPRs over AGPRs in mAI instructions where possible. - // This saves a chain-copy of registers and better ballance register + // This saves a chain-copy of registers and better balance register // use between vgpr and agpr as agpr tuples tend to be big. 
if (MI.getDesc().OpInfo) { unsigned Opc = MI.getOpcode(); @@ -12476,8 +12476,8 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { : AtomicExpansionKind::CmpXChg; } - // DS FP atomics do repect the denormal mode, but the rounding mode is fixed - // to round-to-nearest-even. + // DS FP atomics do respect the denormal mode, but the rounding mode is + // fixed to round-to-nearest-even. // The only exception is DS_ADD_F64 which never flushes regardless of mode. if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) { if (!Ty->isDoubleTy()) @@ -12523,7 +12523,7 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // always uniform. static bool hasCFUser(const Value *V, SmallPtrSet &Visited, unsigned WaveSize) { - // FIXME: We asssume we never cast the mask results of a control flow + // FIXME: We assume we never cast the mask results of a control flow // intrinsic. // Early exit if the type won't be consistent as a compile time hack. IntegerType *IT = dyn_cast(V->getType()); @@ -12627,7 +12627,7 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const { if (!N0.hasOneUse()) return false; - // Take care of the oportunity to keep N0 uniform + // Take care of the opportunity to keep N0 uniform if (N0->isDivergent() || !N1->isDivergent()) return true; // Check if we have a good chance to form the memory access pattern with the diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 8508a3bfc5c20..d6ea4c0c06b88 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1040,7 +1040,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { // The function is going to insert a wait on everything in its prolog. // This still needs to be careful if the call target is a load (e.g. a GOT - // load). 
We also need to check WAW depenancy with saved PC. + // load). We also need to check WAW dependency with saved PC. Wait = AMDGPU::Waitcnt(); int CallAddrOpIdx = diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7e5c9e990d4be..8dea17bcde1f8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -203,7 +203,7 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (Offset0Idx == -1 || Offset1Idx == -1) return false; - // XXX - be careful of datalesss loads + // XXX - be careful of dataless loads // getNamedOperandIdx returns the index for MachineInstrs. Since they // include the output in the operand list, but SDNodes don't, we need to // subtract the index by one. @@ -486,7 +486,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef BaseOps1, return false; } - // In order to avoid regester pressure, on an average, the number of DWORDS + // In order to avoid register pressure, on an average, the number of DWORDS // loaded together by all clustered mem ops should not exceed 8. This is an // empirical value based on certain observations and performance related // experiments. @@ -2875,7 +2875,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, default: return false; case AMDGPU::S_MOV_B64: - // TODO: We could fold 64-bit immediates, but this get compilicated + // TODO: We could fold 64-bit immediates, but this get complicated // when there are sub-registers. return false; @@ -2955,7 +2955,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); // Multiplied part is the constant: Use v_madmk_{f16, f32}. - // We should only expect these to be on src0 due to canonicalizations. + // We should only expect these to be on src0 due to canonicalization. 
if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; @@ -4065,9 +4065,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; + const int OpIndices[] = {DstIdx, Src0Idx, Src1Idx, Src2Idx}; - for (int OpIdx: OpIndicies) { + for (int OpIdx : OpIndices) { if (OpIdx == -1) continue; const MachineOperand &MO = MI.getOperand(OpIdx); @@ -4230,7 +4230,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) { - // Implicit uses may safely overlap true overands + // Implicit uses may safely overlap true operands if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { return !RI.regsOverlap(SGPRUsed, SGPR); })) { @@ -4707,7 +4707,7 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, bool IsAllocatable = false; if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { // vdst and vdata should be both VGPR or AGPR, same for the DS instructions - // with two data operands. Request register class constainted to VGPR only + // with two data operands. Request register class constrained to VGPR only // of both operands present as Machine Copy Propagation can not check this // constraint and possibly other passes too. // @@ -5266,7 +5266,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { const MCInstrDesc &NewDesc = get(NewOpc); Inst.setDesc(NewDesc); - // Callers expect interator to be valid after this call, so modify the + // Callers expect iterator to be valid after this call, so modify the // instruction in place. 
if (OldVAddrIdx == NewVAddrIdx) { MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); @@ -5275,7 +5275,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { MRI.moveOperands(&NewVAddr, &SAddr, 1); Inst.RemoveOperand(OldSAddrIdx); // Update the use list with the pointer we have just moved from vaddr to - // saddr poisition. Otherwise new vaddr will be missing from the use list. + // saddr position. Otherwise new vaddr will be missing from the use list. MRI.removeRegOperandFromUseList(&NewVAddr); MRI.addRegOperandToUseList(&NewVAddr); } else { @@ -5432,7 +5432,7 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, else Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); - // Combine the comparision results with AND. + // Combine the comparison results with AND. if (CondReg == AMDGPU::NoRegister) // First. CondReg = NewCondReg; else { // If not the first, we create an AND. @@ -5796,7 +5796,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), RI.getRegClass(RsrcRC))) { // The operands are legal. - // FIXME: We may need to legalize operands besided srsrc. + // FIXME: We may need to legalize operands besides srsrc. return CreatedBB; } @@ -5870,7 +5870,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); - // Atomics rith return have have an additional tied operand and are + // Atomics with return have an additional tied operand and are // missing some of the special bits. MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); MachineInstr *Addr64; @@ -6501,7 +6501,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can // invert either source and then perform the XOR. 
If either source is a // scalar register, then we can leave the inversion on the scalar unit to - // acheive a better distrubution of scalar and vector instructions. + // achieve a better distribution of scalar and vector instructions. bool Src0IsSGPR = Src0.isReg() && RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); bool Src1IsSGPR = Src1.isReg() && @@ -6723,7 +6723,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, legalizeOperands(*LoHalf, MDT); legalizeOperands(*HiHalf, MDT); - // Move all users of this moved vlaue. + // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } @@ -6787,7 +6787,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, Worklist.insert(&LoHalf); Worklist.insert(&HiHalf); - // Move all users of this moved vlaue. + // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } @@ -6865,7 +6865,7 @@ void SIInstrInfo::splitScalar64BitBCNT( MRI.replaceRegWith(Dest.getReg(), ResultReg); - // We don't need to legalize operands here. src0 for etiher instruction can be + // We don't need to legalize operands here. src0 for either instruction can be // an SGPR, and the second input is unused or determined here. addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } @@ -7079,7 +7079,7 @@ void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op, assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse()); MachineInstr *SCCUseInst = Op.getParent(); - // Look for a preceeding instruction that either defines VCC or SCC. If VCC + // Look for a preceding instruction that either defines VCC or SCC. If VCC // then there is nothing to do because the defining instruction has been // converted to a VALU already. If SCC then that instruction needs to be // converted to a VALU. 
@@ -8194,7 +8194,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, this](int64_t ExpectedValue, unsigned SrcSize, - bool IsReversable, bool IsSigned) -> bool { + bool IsReversible, bool IsSigned) -> bool { // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n @@ -8252,7 +8252,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, bool IsReversedCC = false; if (CmpValue != ExpectedValue) { - if (!IsReversable) + if (!IsReversible) return false; IsReversedCC = CmpValue == (ExpectedValue ^ Mask); if (!IsReversedCC) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 73544048e79cc..58606843ac9de 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1966,7 +1966,7 @@ class getAsmDPP8 { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), - " vcc", // use vcc token as dst for VOPC instructioins + " vcc", // use vcc token as dst for VOPC instructions "$vdst"), ""); string src0 = "$src0_modifiers"; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index c41087bbb9da8..fdc8f30c01b07 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1822,7 +1822,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 // as the new-base(anchor) because of the maximum distance which can - // accomodate more intermediate bases presumeably. + // accommodate more intermediate bases presumably. // // Step3: move (&a + 8192) above load1. 
Compute and promote offsets from // (&a + 8192) for load1, load2, load4. diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 4bb05d9069780..66518fbbe4fbf 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -540,7 +540,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, return; // Make sure we do not modify exec between def and use. - // A copy with implcitly defined exec inserted earlier is an exclusion, it + // A copy with implicitly defined exec inserted earlier is an exclusion, it // does not really modify exec. for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) if (I->modifiesRegister(AMDGPU::EXEC, TRI) && @@ -580,7 +580,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { } void SILowerControlFlow::optimizeEndCf() { - // If the only instruction immediately following this END_CF is an another + // If the only instruction immediately following this END_CF is another // END_CF in the only successor we can avoid emitting exec mask restore here. if (!EnableOptimizeEndCf) return; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 1bb17a549cbf4..5d24d66c24caf 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -331,7 +331,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI)); - // Add this register as live-in to all blocks to avoid machine verifer + // Add this register as live-in to all blocks to avoid machine verifier // complaining about use of an undefined physical register. 
for (MachineBasicBlock &BB : MF) BB.addLiveIn(LaneVGPR); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 81db66a98ddf8..5fc12faa62454 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -64,7 +64,7 @@ using namespace llvm; // First the instructions are put into blocks. // We want the blocks help control register usage and hide high latencies // later. To help control register usage, we typically want all local -// computations, when for example you create a result that can be comsummed +// computations, when for example you create a result that can be consumed // right away, to be contained in a block. Block inputs and outputs would // typically be important results that are needed in several locations of // the shader. Since we do want blocks to help hide high latencies, we want @@ -90,8 +90,8 @@ using namespace llvm; // Increasing the number of active wavefronts helps hide the former, but it // doesn't solve the latter, thus why even if wavefront count is high, we have // to try have as many instructions hiding high latencies as possible. -// The OpenCL doc says for example latency of 400 cycles for a global mem access, -// which is hidden by 10 instructions if the wavefront count is 10. +// The OpenCL doc says for example latency of 400 cycles for a global mem +// access, which is hidden by 10 instructions if the wavefront count is 10. // Some figures taken from AMD docs: // Both texture and constant L1 caches are 4-way associative with 64 bytes @@ -353,7 +353,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // able to correctly handle 5 vs 6, 2 vs 3. 
// (Note: This is not sufficient for RPTracker to not do mistakes for case 4) // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 - // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 + // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7 // The use of findDefBetween removes the case 4. for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { Register Reg = RegMaskPair.RegUnit; @@ -402,7 +402,7 @@ void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, nodeScheduled(SU); } - // TODO: compute InternalAdditionnalPressure. + // TODO: compute InternalAdditionalPressure. InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size()); // Check everything is right. @@ -696,7 +696,7 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() { bool HasSubGraph; std::vector SubGraph; // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary + // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary // in the parent graph of SU. #ifndef NDEBUG SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], @@ -1131,7 +1131,7 @@ void SIScheduleBlockCreator::colorExports() { bool HasSubGraph; std::vector SubGraph; // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary + // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary // in the parent graph of SU. #ifndef NDEBUG SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], @@ -1148,7 +1148,7 @@ void SIScheduleBlockCreator::colorExports() { for (unsigned k : SubGraph) { if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) // Other instructions than EXP would be required in the group. - // Abort the groupping. + // Abort the grouping. 
return; } } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index fff4f6729c99a..d1ce9680c0328 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -63,7 +63,7 @@ enum class SIAtomicScope { }; /// The distinct address spaces supported by the AMDGPU target for -/// atomic memory operation. Can be ORed toether. +/// atomic memory operation. Can be ORed together. enum class SIAtomicAddrSpace { NONE = 0u, GLOBAL = 1u << 0, @@ -943,7 +943,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: // The LDS keeps all memory operations in order for - // the same wavesfront. + // the same wavefront. break; default: llvm_unreachable("Unsupported synchronization scope"); @@ -1547,7 +1547,7 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: // The LDS keeps all memory operations in order for - // the same wavesfront. + // the same wavefront. break; default: llvm_unreachable("Unsupported synchronization scope"); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 5f89f38266833..12e6969be34b6 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -402,7 +402,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { } // If the only user of a logical operation is move to exec, fold it now - // to prevent forming of saveexec. I.e: + // to prevent forming of saveexec. 
I.e.: // // %0:sreg_64 = COPY $exec // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64 diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index da41a5e2478a0..e768a2f3e1a5d 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -316,7 +316,7 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, } if (Abs || Neg) { assert(!Sext && - "Float and integer src modifiers can't be set simulteniously"); + "Float and integer src modifiers can't be set simultaneously"); Mods |= Abs ? SISrcMods::ABS : 0u; Mods ^= Neg ? SISrcMods::NEG : 0u; } else if (Sext) { @@ -1131,16 +1131,16 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, bool Converted = false; for (auto &Operand : SDWAOperands) { LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); - // There should be no intesection between SDWA operands and potential MIs + // There should be no intersection between SDWA operands and potential MIs // e.g.: // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0 // v_add_u32 v3, v4, v2 // - // In that example it is possible that we would fold 2nd instruction into 3rd - // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was - // already destroyed). So if SDWAOperand is also a potential MI then do not - // apply it. + // In that example it is possible that we would fold 2nd instruction into + // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that + // was already destroyed). So if SDWAOperand is also a potential MI then do + // not apply it. 
if (PotentialMatches.count(Operand->getParentInst()) == 0) Converted |= Operand->convertToSDWA(*SDWAInst, TII); } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index b0e45dd3e3e3a..41b9b0a939e7d 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -133,7 +133,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { return Changed; MaskValue = M->getOperand(1).getImm(); // First if sreg is only used in the AND instruction fold the immediate - // into into the AND. + // into the AND. if (!ReadsSreg && Op2.isKill()) { A->getOperand(2).ChangeToImmediate(MaskValue); M->eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 71b8b779ba76d..4d6557c23b720 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -97,7 +97,7 @@ class RegSeqNames { dag trunc_rc = (trunc RC, diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index a15686c637cd8..07b1c42a9b2fc 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -458,11 +458,11 @@ static void dropInstructionKeepingImpDefs(MachineInstr &MI, // Returns next valid instruction pointer if was able to create v_swap_b32. // // This shall not be done too early not to prevent possible folding which may -// remove matched moves, and this should prefereably be done before RA to +// remove matched moves, and this should preferably be done before RA to // release saved registers and also possibly after RA which can insert copies // too. // -// This is really just a generic peephole that is not a canocical shrinking, +// This is really just a generic peephole that is not a canonical shrinking, // although requirements match the pass placement and it reduces code size too. 
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, const SIInstrInfo *TII) { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 46efb3c605c69..720fc213f77cd 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -969,7 +969,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, MachineInstr *WQMMaskMI = nullptr; Register LiveMaskWQM; if (IsDemote) { - // Demotes deactive quads with only helper lanes + // Demote - deactivate quads with only helper lanes LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); WQMMaskMI = BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg); @@ -977,7 +977,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB, .addReg(Exec) .addReg(LiveMaskWQM); } else { - // Kills deactivate lanes + // Kill - deactivate lanes no longer in live mask if (Op.isImm()) { unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 7df0eab964e62..05cd3deac4acd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -935,7 +935,7 @@ inline bool isLegal64BitDPPControl(unsigned DC) { /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); -// Track defaults for fields in the MODE registser. +// Track defaults for fields in the MODE register. struct SIModeRegisterDefaults { /// Floating point opcodes that support exception flag gathering quiet and /// propagate signaling NaN inputs per IEEE 754-2008. 
Min_dx10 and max_dx10 diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h index 292500a8b77e8..65ed02ca62de8 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h @@ -35,8 +35,8 @@ std::vector findVariablesToLower(Module &M, /// Replace all uses of constant \p C with instructions in \p F. void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); -/// Given a \p Def clobbering a load from \p Ptr accroding to the MSSA check -/// if this is actually a memory update or an artifical clobber to facilitate +/// Given a \p Def clobbering a load from \p Ptr according to the MSSA check +/// if this is actually a memory update or an artificial clobber to facilitate /// ordering constraints. bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA); diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 9b998404faa9e..8a3548cd89f21 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -390,7 +390,7 @@ class VOPProfileMAI Date: Fri, 18 Feb 2022 05:54:33 -0800 Subject: [PATCH 261/748] Lambdas are not necessarily locals. This resolves DR48250. 
Differential Revision: https://reviews.llvm.org/D99134 --- clang/lib/Sema/SemaTemplateInstantiateDecl.cpp | 4 +++- .../SemaCXX/lambdas-implicit-explicit-template.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 2e8ddc8242fb6..237886c906a5b 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -6034,7 +6034,9 @@ NamedDecl *Sema::FindInstantiatedDecl(SourceLocation Loc, NamedDecl *D, (ParentDependsOnArgs && (ParentDC->isFunctionOrMethod() || isa(ParentDC) || isa(ParentDC))) || - (isa(D) && cast(D)->isLambda())) { + (isa(D) && cast(D)->isLambda() && + cast(D)->getTemplateDepth() > + TemplateArgs.getNumRetainedOuterLevels())) { // D is a local of some kind. Look into the map of local // declarations to their instantiations. if (CurrentInstantiationScope) { diff --git a/clang/test/SemaCXX/lambdas-implicit-explicit-template.cpp b/clang/test/SemaCXX/lambdas-implicit-explicit-template.cpp index 13fe12abe9e9d..a5410d2aed597 100644 --- a/clang/test/SemaCXX/lambdas-implicit-explicit-template.cpp +++ b/clang/test/SemaCXX/lambdas-implicit-explicit-template.cpp @@ -39,3 +39,13 @@ void c2() { const auto lambda = [&](auto arg1) {}; [&](auto arg2) { lambda.operator()(arg2); }(0); } + +auto d = [](auto) {}; + +template +void d1(T x) { d.operator()(x); } + +void d2() { d1(0); } + +template int e1 = [](auto){ return T(); }.operator()(T()); +int e2 = e1; From 3ce2ee28f042c2a00d09c228c76f2692778bd607 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 18 Feb 2022 14:40:11 +0100 Subject: [PATCH 262/748] [mlir][ODS] Infer return types if the operands are variadic but the results are not Clean up code that worked around this limitation. 
Differential Revision: https://reviews.llvm.org/D120119 --- .../include/mlir/Dialect/Shape/IR/ShapeOps.td | 61 ++----------------- .../mlir/Dialect/Vector/IR/VectorOps.td | 2 - mlir/lib/Dialect/Shape/IR/Shape.cpp | 5 -- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 5 -- mlir/lib/TableGen/Operator.cpp | 5 +- 5 files changed, 6 insertions(+), 72 deletions(-) diff --git a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td index 19b08642c8a66..31a69522c86c3 100644 --- a/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td +++ b/mlir/include/mlir/Dialect/Shape/IR/ShapeOps.td @@ -192,8 +192,7 @@ def Shape_DivOp : Shape_Op<"div", [NoSideEffect, }]; } -def Shape_ShapeEqOp : Shape_Op<"shape_eq", - [NoSideEffect, Commutative, InferTypeOpInterface]> { +def Shape_ShapeEqOp : Shape_Op<"shape_eq", [NoSideEffect, Commutative]> { let summary = "Returns whether the input shapes or extent tensors are equal"; let description = [{ Takes one or more shape or extent tensor operands and determines whether @@ -211,17 +210,6 @@ def Shape_ShapeEqOp : Shape_Op<"shape_eq", OpBuilder<(ins "::mlir::Value":$lhs, "::mlir::Value":$rhs), [{ build($_builder, $_state, ::llvm::makeArrayRef({lhs, rhs})); }]>, ]; - let extraClassDeclaration = [{ - // TODO: This should really be automatic. Figure out how to not need this defined. 
- static ::mlir::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, - ::llvm::Optional<::mlir::Location> location, ::mlir::ValueRange operands, - ::mlir::DictionaryAttr attributes, ::mlir::RegionRange regions, - ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes) { - inferredReturnTypes.push_back(::mlir::IntegerType::get(context, - /*width=*/1)); - return success(); - }; - }]; let assemblyFormat = "$shapes attr-dict `:` type($shapes)"; let hasFolder = 1; @@ -262,8 +250,7 @@ def Shape_FromExtentTensorOp : Shape_Op<"from_extent_tensor", [NoSideEffect]> { let assemblyFormat = "$input attr-dict `:` type($input)"; } -def Shape_IsBroadcastableOp : Shape_Op<"is_broadcastable", - [Commutative, InferTypeOpInterface]> { +def Shape_IsBroadcastableOp : Shape_Op<"is_broadcastable", [Commutative]> { let summary = "Determines if 2+ shapes can be successfully broadcasted"; let description = [{ Given multiple input shapes or extent tensors, return a predicate specifying @@ -289,17 +276,6 @@ def Shape_IsBroadcastableOp : Shape_Op<"is_broadcastable", OpBuilder<(ins "::mlir::Value":$lhs, "::mlir::Value":$rhs), [{ build($_builder, $_state, ::llvm::makeArrayRef({lhs, rhs})); }]>, ]; - let extraClassDeclaration = [{ - // TODO: This should really be automatic. Figure out how to not need this defined. 
- static ::mlir::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, - ::llvm::Optional<::mlir::Location> location, ::mlir::ValueRange operands, - ::mlir::DictionaryAttr attributes, ::mlir::RegionRange regions, - ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes) { - inferredReturnTypes.push_back(::mlir::IntegerType::get(context, - /*width=*/1)); - return success(); - }; - }]; let hasFolder = 1; let hasCanonicalizer = 1; @@ -850,12 +826,6 @@ def Shape_AssumingAllOp : Shape_Op<"assuming_all", [Commutative, NoSideEffect]> let arguments = (ins Variadic:$inputs); let results = (outs Shape_WitnessType:$result); - // Only needed while tablegen is unable to generate this for ops with variadic - // arguments. - let builders = [ - OpBuilder<(ins "ValueRange":$inputs)>, - ]; - let assemblyFormat = "$inputs attr-dict"; let hasFolder = 1; @@ -917,8 +887,7 @@ def Shape_AssumingYieldOp : Shape_Op<"assuming_yield", let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; } -def Shape_CstrBroadcastableOp : Shape_Op<"cstr_broadcastable", - [Commutative, InferTypeOpInterface]> { +def Shape_CstrBroadcastableOp : Shape_Op<"cstr_broadcastable", [Commutative]> { let summary = "Determines if 2+ shapes can be successfully broadcasted"; let description = [{ Given input shapes or extent tensors, return a witness specifying if they @@ -944,23 +913,12 @@ def Shape_CstrBroadcastableOp : Shape_Op<"cstr_broadcastable", [{ build($_builder, $_state, ::llvm::makeArrayRef({lhs, rhs})); }]>, ]; - let extraClassDeclaration = [{ - // TODO: This should really be automatic. Figure out how to not need this defined. 
- static ::mlir::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, - ::llvm::Optional<::mlir::Location> location, ::mlir::ValueRange operands, - ::mlir::DictionaryAttr attributes, ::mlir::RegionRange regions, - ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes) { - inferredReturnTypes.push_back(::mlir::shape::WitnessType::get(context)); - return success(); - }; - }]; - let hasCanonicalizer = 1; let hasFolder = 1; let hasVerifier = 1; } -def Shape_CstrEqOp : Shape_Op<"cstr_eq", [Commutative, InferTypeOpInterface]> { +def Shape_CstrEqOp : Shape_Op<"cstr_eq", [Commutative]> { let summary = "Determines if all input shapes are equal"; let description = [{ Given 1 or more input shapes, determine if all shapes are the exact same. @@ -978,17 +936,6 @@ def Shape_CstrEqOp : Shape_Op<"cstr_eq", [Commutative, InferTypeOpInterface]> { let assemblyFormat = "$shapes attr-dict `:` type($shapes)"; - let extraClassDeclaration = [{ - // TODO: This should really be automatic. Figure out how to not need this defined. 
- static ::mlir::LogicalResult inferReturnTypes(::mlir::MLIRContext *context, - ::llvm::Optional<::mlir::Location> location, ::mlir::ValueRange operands, - ::mlir::DictionaryAttr attributes, ::mlir::RegionRange regions, - ::llvm::SmallVectorImpl<::mlir::Type>&inferredReturnTypes) { - inferredReturnTypes.push_back(::mlir::shape::WitnessType::get(context)); - return success(); - }; - }]; - let hasCanonicalizer = 1; let hasFolder = 1; } diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index aec8dd5b68823..2e7f06903824f 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -803,8 +803,6 @@ def Vector_InsertMapOp : into vector<64x4x32xf32> ``` }]; - let builders = [OpBuilder<(ins "Value":$vector, "Value":$dest, - "ValueRange":$ids)>]; let extraClassDeclaration = [{ VectorType getSourceVectorType() { return vector().getType().cast(); diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 5c851f579ef85..0f633312eaddc 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -663,11 +663,6 @@ LogicalResult AssumingAllOp::verify() { return success(); } -void AssumingAllOp::build(OpBuilder &b, OperationState &state, - ValueRange inputs) { - build(b, state, b.getType(), inputs); -} - //===----------------------------------------------------------------------===// // BroadcastOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index ddfe0d8442280..f6547e46d5418 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -1900,11 +1900,6 @@ OpFoldResult vector::InsertOp::fold(ArrayRef operands) { // InsertMapOp //===----------------------------------------------------------------------===// -void 
InsertMapOp::build(OpBuilder &builder, OperationState &result, - Value vector, Value dest, ValueRange ids) { - InsertMapOp::build(builder, result, dest.getType(), vector, dest, ids); -} - LogicalResult InsertMapOp::verify() { if (getSourceVectorType().getRank() != getResultType().getRank()) return emitOpError("expected source and destination vectors of same rank"); diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index a71ae4d642b13..2a0d49fcfccf6 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -327,9 +327,8 @@ void Operator::populateTypeInferenceInfo( if (getNumResults() == 0) return; - // Skip for ops with variadic operands/results. - // TODO: This can be relaxed. - if (isVariadic()) + // Skip ops with variadic or optional results. + if (getNumVariableLengthResults() > 0) return; // Skip cases currently being custom generated. From 00ab91b70d21f72af59e4e198c6dc819452405af Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Feb 2022 14:33:58 +0000 Subject: [PATCH 263/748] [ConstraintElimination] Remove ConstraintListTy (NFCI). This patch simplifies constraint handling by removing the ConstraintListTy wrapper struct and moving the Preconditions directly into ConstraintTy. This reduces the amount of memory needed for managing constraints. The only use case for ConstraintListTy was adding 2 constraints to model ICMP_EQ conditions. But this can be handled by adding an IsEq flag. When adding an equality constraint, we need to add the constraint and the inverted constraint. 
--- .../Scalar/ConstraintElimination.cpp | 126 +++++++++--------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 6ba38ca1703e2..5ce05569fa9e0 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -89,53 +89,32 @@ struct PreconditionTy { struct ConstraintTy { SmallVector Coefficients; + SmallVector Preconditions; - bool IsSigned; + bool IsSigned = false; + bool IsEq = false; + + ConstraintTy() = default; ConstraintTy(SmallVector Coefficients, bool IsSigned) : Coefficients(Coefficients), IsSigned(IsSigned) {} unsigned size() const { return Coefficients.size(); } -}; - -/// Struct to manage a list of constraints with pre-conditions that must be -/// satisfied before using the constraints. -struct ConstraintListTy { - SmallVector Constraints; - SmallVector Preconditions; - - ConstraintListTy() = default; - - ConstraintListTy(ArrayRef Constraints, - ArrayRef Preconditions) - : Constraints(Constraints.begin(), Constraints.end()), - Preconditions(Preconditions.begin(), Preconditions.end()) {} - - void mergeIn(const ConstraintListTy &Other) { - append_range(Constraints, Other.Constraints); - // TODO: Do smarter merges here, e.g. exclude duplicates. - append_range(Preconditions, Other.Preconditions); - } - - unsigned size() const { return Constraints.size(); } - unsigned empty() const { return Constraints.empty(); } + unsigned empty() const { return Coefficients.empty(); } /// Returns true if any constraint has a non-zero coefficient for any of the /// newly added indices. Zero coefficients for new indices are removed. If it /// returns true, no new variable need to be added to the system. 
bool needsNewIndices(const DenseMap &NewIndices) { - assert(size() == 1); for (unsigned I = 0; I < NewIndices.size(); ++I) { - int64_t Last = get(0).Coefficients.pop_back_val(); + int64_t Last = Coefficients.pop_back_val(); if (Last != 0) return true; } return false; } - ConstraintTy &get(unsigned I) { return Constraints[I]; } - /// Returns true if all preconditions for this list of constraints are /// satisfied given \p CS and the corresponding \p Value2Index mapping. bool isValid(const ConstraintInfo &Info) const; @@ -249,10 +228,11 @@ decompose(Value *V, SmallVector &Preconditions, /// Turn a condition \p CmpI into a vector of constraints, using indices from \p /// Value2Index. Additional indices for newly discovered values are added to \p /// NewIndices. -static ConstraintListTy +static ConstraintTy getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, const DenseMap &Value2Index, DenseMap &NewIndices) { + bool IsEq = false; // Try to convert Pred to one of ULE/SLT/SLE/SLT. switch (Pred) { case CmpInst::ICMP_UGT: @@ -267,12 +247,8 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, if (match(Op1, m_Zero())) { Pred = CmpInst::ICMP_ULE; } else { - auto A = - getConstraint(CmpInst::ICMP_UGE, Op0, Op1, Value2Index, NewIndices); - auto B = - getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, NewIndices); - A.mergeIn(B); - return A; + IsEq = true; + Pred = CmpInst::ICMP_ULE; } break; case CmpInst::ICMP_NE: @@ -330,7 +306,11 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, // Build result constraint, by first adding all coefficients from A and then // subtracting all coefficients from B. 
- SmallVector R(Value2Index.size() + NewIndices.size() + 1, 0); + ConstraintTy Res( + SmallVector(Value2Index.size() + NewIndices.size() + 1, 0), + IsSigned); + Res.IsEq = IsEq; + auto &R = Res.Coefficients; for (const auto &KV : VariablesA) R[GetOrAddIndex(KV.second)] += KV.first; @@ -339,27 +319,30 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1, R[0] = Offset1 + Offset2 + (Pred == (IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT) ? -1 : 0); - return {{{R, IsSigned}}, Preconditions}; + Res.Preconditions = std::move(Preconditions); + return Res; } -static ConstraintListTy getConstraint(CmpInst *Cmp, ConstraintInfo &Info, - DenseMap &NewIndices) { +static ConstraintTy getConstraint(CmpInst *Cmp, ConstraintInfo &Info, + DenseMap &NewIndices) { return getConstraint( Cmp->getPredicate(), Cmp->getOperand(0), Cmp->getOperand(1), Info.getValue2Index(CmpInst::isSigned(Cmp->getPredicate())), NewIndices); } -bool ConstraintListTy::isValid(const ConstraintInfo &Info) const { - return all_of(Preconditions, [&Info](const PreconditionTy &C) { - DenseMap NewIndices; - auto R = getConstraint(C.Pred, C.Op0, C.Op1, - Info.getValue2Index(CmpInst::isSigned(C.Pred)), - NewIndices); - // TODO: properly check NewIndices. - return NewIndices.empty() && R.Preconditions.empty() && R.size() == 1 && - Info.getCS(CmpInst::isSigned(C.Pred)) - .isConditionImplied(R.get(0).Coefficients); - }); +bool ConstraintTy::isValid(const ConstraintInfo &Info) const { + return Coefficients.size() > 0 && + all_of(Preconditions, [&Info](const PreconditionTy &C) { + DenseMap NewIndices; + auto R = getConstraint( + C.Pred, C.Op0, C.Op1, + Info.getValue2Index(CmpInst::isSigned(C.Pred)), NewIndices); + // TODO: properly check NewIndices. 
+ return NewIndices.empty() && R.Preconditions.empty() && !R.IsEq && + R.size() >= 2 && + Info.getCS(CmpInst::isSigned(C.Pred)) + .isConditionImplied(R.Coefficients); + }); } namespace { @@ -553,11 +536,12 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { DenseMap NewIndices; auto R = getConstraint(Cmp, Info, NewIndices); - if (!R.isValidSingle(Info) || R.needsNewIndices(NewIndices)) + if (R.IsEq || R.size() < 2 || R.needsNewIndices(NewIndices) || + !R.isValid(Info)) continue; - auto &CSToUse = Info.getCS(R.get(0).IsSigned); - if (CSToUse.isConditionImplied(R.get(0).Coefficients)) { + auto &CSToUse = Info.getCS(R.IsSigned); + if (CSToUse.isConditionImplied(R.Coefficients)) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; @@ -578,7 +562,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { Changed = true; } if (CSToUse.isConditionImplied( - ConstraintSystem::negate(R.get(0).Coefficients))) { + ConstraintSystem::negate(R.Coefficients))) { if (!DebugCounter::shouldExecute(EliminatedCounter)) continue; @@ -626,23 +610,37 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n"); bool Added = false; - for (auto &E : R.Constraints) { - auto &CSToUse = Info.getCS(E.IsSigned); - if (E.Coefficients.empty()) - continue; + assert(CmpInst::isSigned(CB.Condition->getPredicate()) == R.IsSigned && + "condition and constraint signs must match"); + auto &CSToUse = Info.getCS(R.IsSigned); + if (R.Coefficients.empty()) + continue; + + Added |= CSToUse.addVariableRowFill(R.Coefficients); + + // If R has been added to the system, queue it for removal once it goes + // out-of-scope. 
+ if (Added) { + for (auto &KV : NewIndices) + Info.getValue2Index(R.IsSigned).insert(KV); LLVM_DEBUG({ dbgs() << " constraint: "; - dumpWithNames(E, Info.getValue2Index(E.IsSigned)); + dumpWithNames(R, Info.getValue2Index(R.IsSigned)); }); - Added |= CSToUse.addVariableRowFill(E.Coefficients); + DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not, + R.IsSigned); + + if (R.IsEq) { + // Also add the inverted constraint for equality constraints. + for (auto &Coeff : R.Coefficients) + Coeff *= -1; + CSToUse.addVariableRowFill(R.Coefficients); - // If R has been added to the system, queue it for removal once it goes - // out-of-scope. - if (Added) DFSInStack.emplace_back(CB.NumIn, CB.NumOut, CB.Condition, CB.Not, - E.IsSigned); + R.IsSigned); + } } } From c24e197aaad5333717a27f6e4346f97da84821ed Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 14:21:07 +0000 Subject: [PATCH 264/748] [clangd] getHover - pass FormatStyle argument by const reference Reported by coverity --- clang-tools-extra/clangd/Hover.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index dda5ad36e9b89..8b22018411404 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -963,7 +963,7 @@ void maybeAddCalleeArgInfo(const SelectionTree::Node *N, HoverInfo &HI, } // namespace llvm::Optional getHover(ParsedAST &AST, Position Pos, - format::FormatStyle Style, + const format::FormatStyle &Style, const SymbolIndex *Index) { PrintingPolicy PP = getPrintingPolicy(AST.getASTContext().getPrintingPolicy()); From be3b40c059355115a8041f5bd866ad8d99950611 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Fri, 18 Feb 2022 15:49:53 +0100 Subject: [PATCH 265/748] [flang] Lower basic binary operation for scalars Lower simple binary operation (+, -, *, /) for scalars. This patch is part of the upstreaming effort from fir-dev branch. 
Depends on D120058 Reviewed By: PeteSteinfeld Differential Revision: https://reviews.llvm.org/D120063 Co-authored-by: Jean Perier Co-authored-by: Eric Schweitz --- flang/lib/Lower/ConvertExpr.cpp | 17 ++- flang/test/Lower/assignment.f90 | 190 ++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+), 1 deletion(-) diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 466a74fe031eb..07e5fb8fa1a57 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -206,12 +206,27 @@ class ScalarExprLowering { return builder.create(getLoc(), genunbox(op.left())); } + template + mlir::Value createBinaryOp(const ExtValue &left, const ExtValue &right) { + assert(fir::isUnboxedValue(left) && fir::isUnboxedValue(right)); + mlir::Value lhs = fir::getBase(left); + mlir::Value rhs = fir::getBase(right); + assert(lhs.getType() == rhs.getType() && "types must be the same"); + return builder.create(getLoc(), lhs, rhs); + } + + template + mlir::Value createBinaryOp(const A &ex) { + ExtValue left = genval(ex.left()); + return createBinaryOp(left, genval(ex.right())); + } + #undef GENBIN #define GENBIN(GenBinEvOp, GenBinTyCat, GenBinFirOp) \ template \ ExtValue genval(const Fortran::evaluate::GenBinEvOp> &x) { \ - TODO(getLoc(), "genval GenBinEvOp"); \ + return createBinaryOp(x); \ } GENBIN(Add, Integer, mlir::arith::AddIOp) diff --git a/flang/test/Lower/assignment.f90 b/flang/test/Lower/assignment.f90 index 32c2086de7de2..ce9689a708a8f 100644 --- a/flang/test/Lower/assignment.f90 +++ b/flang/test/Lower/assignment.f90 @@ -63,5 +63,195 @@ real function negr(a) ! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref> ! CHECK: %[[NEG:.*]] = fir.negc %[[A_VAL]] : !fir.complex<4> ! CHECK: fir.store %[[NEG]] to %[[FCTRES]] : !fir.ref> + +integer function addi(a, b) + integer :: a, b + addi = a + b +end + +! CHECK-LABEL: func @_QPaddi( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! 
CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca i32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[ADD:.*]] = arith.addi %[[A_VAL]], %[[B_VAL]] : i32 +! CHECK: fir.store %[[ADD]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : i32 + +integer function subi(a, b) + integer :: a, b + subi = a - b +end + +! CHECK-LABEL: func @_QPsubi( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca i32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[SUB:.*]] = arith.subi %[[A_VAL]], %[[B_VAL]] : i32 +! CHECK: fir.store %[[SUB]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : i32 + +integer function muli(a, b) + integer :: a, b + muli = a * b +end + +! CHECK-LABEL: func @_QPmuli( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca i32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[MUL:.*]] = arith.muli %[[A_VAL]], %[[B_VAL]] : i32 +! CHECK: fir.store %[[MUL]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : i32 + +integer function divi(a, b) + integer :: a, b + divi = a / b +end + +! CHECK-LABEL: func @_QPdivi( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca i32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[DIV:.*]] = arith.divsi %[[A_VAL]], %[[B_VAL]] : i32 +! 
CHECK: fir.store %[[DIV]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : i32 + +real function addf(a, b) + real :: a, b + addf = a + b +end + +! CHECK-LABEL: func @_QPaddf( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca f32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[ADD:.*]] = arith.addf %[[A_VAL]], %[[B_VAL]] : f32 +! CHECK: fir.store %[[ADD]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : f32 + +real function subf(a, b) + real :: a, b + subf = a - b +end + +! CHECK-LABEL: func @_QPsubf( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca f32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[SUB:.*]] = arith.subf %[[A_VAL]], %[[B_VAL]] : f32 +! CHECK: fir.store %[[SUB]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : f32 + +real function mulf(a, b) + real :: a, b + mulf = a * b +end + +! CHECK-LABEL: func @_QPmulf( +! CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca f32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[MUL:.*]] = arith.mulf %[[A_VAL]], %[[B_VAL]] : f32 +! CHECK: fir.store %[[MUL]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : f32 + +real function divf(a, b) + real :: a, b + divf = a / b +end + +! CHECK-LABEL: func @_QPdivf( +! 
CHECK-SAME: %[[A:.*]]: !fir.ref {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca f32 +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref +! CHECK: %[[DIV:.*]] = arith.divf %[[A_VAL]], %[[B_VAL]] : f32 +! CHECK: fir.store %[[DIV]] to %[[FCTRES]] : !fir.ref +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref +! CHECK: return %[[RET]] : f32 + +complex function addc(a, b) + complex :: a, b + addc = a + b +end + +! CHECK-LABEL: func @_QPaddc( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca !fir.complex<4> +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref> +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref> +! CHECK: %[[ADD:.*]] = fir.addc %[[A_VAL]], %[[B_VAL]] : !fir.complex<4> +! CHECK: fir.store %[[ADD]] to %[[FCTRES]] : !fir.ref> +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref> +! CHECK: return %[[RET]] : !fir.complex<4> + +complex function subc(a, b) + complex :: a, b + subc = a - b +end + +! CHECK-LABEL: func @_QPsubc( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca !fir.complex<4> +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref> +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref> +! CHECK: %[[SUB:.*]] = fir.subc %[[A_VAL]], %[[B_VAL]] : !fir.complex<4> +! CHECK: fir.store %[[SUB]] to %[[FCTRES]] : !fir.ref> +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref> +! CHECK: return %[[RET]] : !fir.complex<4> + +complex function mulc(a, b) + complex :: a, b + mulc = a * b +end + +! CHECK-LABEL: func @_QPmulc( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca !fir.complex<4> +! 
CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref> +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref> +! CHECK: %[[MUL:.*]] = fir.mulc %[[A_VAL]], %[[B_VAL]] : !fir.complex<4> +! CHECK: fir.store %[[MUL]] to %[[FCTRES]] : !fir.ref> +! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref> +! CHECK: return %[[RET]] : !fir.complex<4> + +complex function divc(a, b) + complex :: a, b + divc = a / b +end + +! CHECK-LABEL: func @_QPdivc( +! CHECK-SAME: %[[A:.*]]: !fir.ref> {fir.bindc_name = "a"}, +! CHECK-SAME: %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"} +! CHECK: %[[FCTRES:.*]] = fir.alloca !fir.complex<4> +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref> +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref> +! CHECK: %[[DIV:.*]] = fir.divc %[[A_VAL]], %[[B_VAL]] : !fir.complex<4> +! CHECK: fir.store %[[DIV]] to %[[FCTRES]] : !fir.ref> ! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref> ! CHECK: return %[[RET]] : !fir.complex<4> From 707157f24834e814243c90cf1f5f50c75f3abcb9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 14:59:57 +0000 Subject: [PATCH 266/748] Revert rGc24e197aaad5333717a27f6e4346f97da84821ed "[clangd] getHover - pass FormatStyle argument by const reference" There are a number of buildbot build failures on non MSVC compilers --- clang-tools-extra/clangd/Hover.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index 8b22018411404..dda5ad36e9b89 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -963,7 +963,7 @@ void maybeAddCalleeArgInfo(const SelectionTree::Node *N, HoverInfo &HI, } // namespace llvm::Optional getHover(ParsedAST &AST, Position Pos, - const format::FormatStyle &Style, + format::FormatStyle Style, const SymbolIndex *Index) { PrintingPolicy PP = getPrintingPolicy(AST.getASTContext().getPrintingPolicy()); From 57baa14d74425f5f772a7999fc9ba2feb9ecf7c0 Mon Sep 17 00:00:00 2001 From: 
Jay Foad Date: Fri, 18 Feb 2022 14:04:19 +0000 Subject: [PATCH 267/748] [AMDGPU] Rename AMDGPUCFGStructurizer to R600MachineCFGStructurizer Previously the name of the class (AMDGPUCFGStructurizer) did not match the name of the file (AMDILCFGStructurizer). Standardize on the name R600MachineCFGStructurizer by analogy with AMDGPUMachineCFGStructurizer. Differential Revision: https://reviews.llvm.org/D120128 --- llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 +- llvm/lib/Target/AMDGPU/R600.h | 2 +- ...zer.cpp => R600MachineCFGStructurizer.cpp} | 146 +++++++++--------- llvm/lib/Target/AMDGPU/R600TargetMachine.cpp | 2 +- 4 files changed, 76 insertions(+), 76 deletions(-) rename llvm/lib/Target/AMDGPU/{AMDILCFGStructurizer.cpp => R600MachineCFGStructurizer.cpp} (90%) diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index ca5208355db96..45e95d14b320e 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -96,7 +96,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetTransformInfo.cpp AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp - AMDILCFGStructurizer.cpp + R600MachineCFGStructurizer.cpp GCNDPPCombine.cpp GCNHazardRecognizer.cpp GCNILPSched.cpp diff --git a/llvm/lib/Target/AMDGPU/R600.h b/llvm/lib/Target/AMDGPU/R600.h index 2b483ae63da95..5dfbf8f1ef952 100644 --- a/llvm/lib/Target/AMDGPU/R600.h +++ b/llvm/lib/Target/AMDGPU/R600.h @@ -26,7 +26,7 @@ FunctionPass *createR600EmitClauseMarkers(); FunctionPass *createR600ClauseMergePass(); FunctionPass *createR600Packetizer(); FunctionPass *createR600ControlFlowFinalizer(); -FunctionPass *createAMDGPUCFGStructurizerPass(); +FunctionPass *createR600MachineCFGStructurizerPass(); FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); ModulePass *createR600OpenCLImageTypeLoweringPass(); diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp similarity index 90% 
rename from llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp rename to llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index 0e3f734fa1e7e..07fd4ff48044d 100644 --- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -1,4 +1,4 @@ -//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===// +//===- R600MachineCFGStructurizer.cpp - CFG Structurizer ------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -42,7 +42,7 @@ STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); namespace llvm { -void initializeAMDGPUCFGStructurizerPass(PassRegistry &); +void initializeR600MachineCFGStructurizerPass(PassRegistry &); } // end namespace llvm @@ -89,7 +89,7 @@ class BlockInformation { // //===----------------------------------------------------------------------===// -class AMDGPUCFGStructurizer : public MachineFunctionPass { +class R600MachineCFGStructurizer : public MachineFunctionPass { public: using MBBVector = SmallVector; using MBBInfoMap = std::map; @@ -103,8 +103,8 @@ class AMDGPUCFGStructurizer : public MachineFunctionPass { static char ID; - AMDGPUCFGStructurizer() : MachineFunctionPass(ID) { - initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); + R600MachineCFGStructurizer() : MachineFunctionPass(ID) { + initializeR600MachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); } StringRef getPassName() const override { @@ -317,16 +317,16 @@ class AMDGPUCFGStructurizer : public MachineFunctionPass { } // end anonymous namespace -char AMDGPUCFGStructurizer::ID = 0; +char R600MachineCFGStructurizer::ID = 0; -int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { +int R600MachineCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) 
return INVALIDSCCNUM; return (*It).second->SccNum; } -MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) +MachineBasicBlock *R600MachineCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) const { LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); if (It == LLInfoMap.end()) @@ -334,7 +334,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) return (*It).second; } -bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { MachineLoop *LoopRep = MLI->getLoopFor(MBB); if (!LoopRep) return false; @@ -342,14 +342,14 @@ bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { return MBB->isSuccessor(LoopHeader); } -bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) return false; return (*It).second->IsRetired; } -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { MachineLoop *LoopRep = MLI->getLoopFor(MBB); while (LoopRep && LoopRep->getHeader() == MBB) { MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); @@ -362,7 +362,7 @@ bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { return false; } -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( +R600MachineCFGStructurizer::PathToKind R600MachineCFGStructurizer::singlePathTo( MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, bool AllowSideEntry) const { assert(DstMBB); @@ -380,7 +380,7 @@ AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( return Not_SinglePath; } -int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, +int 
R600MachineCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, MBBVector::const_iterator E) const { int Count = 0; while (It != E) { @@ -391,7 +391,7 @@ int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, return Count; } -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { +bool R600MachineCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { unsigned BlockSizeThreshold = 30; unsigned CloneInstrThreshold = 100; bool MultiplePreds = MBB && (MBB->pred_size() > 1); @@ -403,7 +403,7 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); } -void AMDGPUCFGStructurizer::reversePredicateSetter( +void R600MachineCFGStructurizer::reversePredicateSetter( MachineBasicBlock::iterator I, MachineBasicBlock &MBB) { assert(I.isValid() && "Expected valid iterator"); for (;; --I) { @@ -430,7 +430,7 @@ void AMDGPUCFGStructurizer::reversePredicateSetter( } } -void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, +void R600MachineCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, const DebugLoc &DL) { MachineInstr *MI = MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); @@ -439,7 +439,7 @@ void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, SHOWNEWINSTR(MI); } -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, +MachineInstr *R600MachineCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, const DebugLoc &DL) { MachineInstr *MI = @@ -452,7 +452,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, return MI; } -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( +MachineInstr *R600MachineCFGStructurizer::insertInstrBefore( MachineBasicBlock::iterator I, int NewOpcode) { MachineInstr *OldMI = &(*I); MachineBasicBlock *MBB = OldMI->getParent(); @@ -464,7 +464,7 @@ MachineInstr 
*AMDGPUCFGStructurizer::insertInstrBefore( return NewMBB; } -void AMDGPUCFGStructurizer::insertCondBranchBefore( +void R600MachineCFGStructurizer::insertCondBranchBefore( MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) { MachineInstr *OldMI = &(*I); MachineBasicBlock *MBB = OldMI->getParent(); @@ -477,7 +477,7 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( //erase later oldInstr->eraseFromParent(); } -void AMDGPUCFGStructurizer::insertCondBranchBefore( +void R600MachineCFGStructurizer::insertCondBranchBefore( MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode, int RegNum, const DebugLoc &DL) { MachineFunction *MF = blk->getParent(); @@ -488,7 +488,7 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( SHOWNEWINSTR(NewInstr); } -int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return R600::IF_PREDICATE_SET; @@ -499,7 +499,7 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { return -1; } -int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return R600::IF_PREDICATE_SET; @@ -510,7 +510,7 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { return -1; } -int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32; @@ -519,7 +519,7 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { return -1; } -int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { +int R600MachineCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { switch(OldOpcode) { case R600::JUMP_COND: case R600::JUMP: return 
R600::CONTINUE_LOGICALZ_i32; @@ -528,17 +528,17 @@ int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { return -1; } -MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { +MachineBasicBlock *R600MachineCFGStructurizer::getTrueBranch(MachineInstr *MI) { return MI->getOperand(0).getMBB(); } -void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, +void R600MachineCFGStructurizer::setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB) { MI->getOperand(0).setMBB(MBB); } MachineBasicBlock * -AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, +R600MachineCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, MachineInstr *MI) { assert(MBB->succ_size() == 2); MachineBasicBlock *TrueBranch = getTrueBranch(MI); @@ -548,7 +548,7 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, return (*It == TrueBranch) ? *Next : *It; } -bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { +bool R600MachineCFGStructurizer::isCondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { case R600::JUMP_COND: case R600::BRANCH_COND_i32: @@ -559,7 +559,7 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { return false; } -bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { +bool R600MachineCFGStructurizer::isUncondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { case R600::JUMP: case R600::BRANCH: @@ -570,7 +570,7 @@ bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { return false; } -DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { +DebugLoc R600MachineCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { //get DebugLoc from the first MachineBasicBlock instruction with debug info DebugLoc DL; for (MachineInstr &MI : *MBB) @@ -579,7 +579,7 @@ DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { return DL; } -MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( +MachineInstr 
*R600MachineCFGStructurizer::getNormalBlockBranchInstr( MachineBasicBlock *MBB) { MachineBasicBlock::reverse_iterator It = MBB->rbegin(); MachineInstr *MI = &*It; @@ -588,7 +588,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( return nullptr; } -MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( +MachineInstr *R600MachineCFGStructurizer::getLoopendBlockBranchInstr( MachineBasicBlock *MBB) { for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); It != E; ++It) { @@ -604,7 +604,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( return nullptr; } -MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { +MachineInstr *R600MachineCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { MachineBasicBlock::reverse_iterator It = MBB->rbegin(); if (It != MBB->rend()) { MachineInstr *instr = &(*It); @@ -614,7 +614,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { return nullptr; } -bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { +bool R600MachineCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { MachineInstr *MI = getReturnInstr(MBB); bool IsReturn = MBB->succ_empty(); if (MI) @@ -625,13 +625,13 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { return IsReturn; } -void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, +void R600MachineCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { for (MachineBasicBlock *Succ : SrcMBB->successors()) DstMBB->addSuccessor(Succ); // *iter's predecessor is also taken care of } -MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { +MachineBasicBlock *R600MachineCFGStructurizer::clone(MachineBasicBlock *MBB) { MachineFunction *Func = MBB->getParent(); MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); Func->push_back(NewMBB); //insert to function @@ -640,7 +640,7 @@ 
MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { return NewMBB; } -void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( +void R600MachineCFGStructurizer::replaceInstrUseOfBlockWith( MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk) { MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); @@ -649,7 +649,7 @@ void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( setTrueBranch(BranchMI, NewBlk); } -void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { +void R600MachineCFGStructurizer::wrapup(MachineBasicBlock *MBB) { assert((!MBB->getParent()->getJumpTableInfo() || MBB->getParent()->getJumpTableInfo()->isEmpty()) && "found a jump table"); @@ -677,12 +677,12 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { // blocks in the jump table with the entryBlk //} } -bool AMDGPUCFGStructurizer::prepare() { +bool R600MachineCFGStructurizer::prepare() { bool Changed = false; //FIXME: if not reducible flow graph, make it so ??? - LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::prepare\n";); orderBlocks(FuncRep); @@ -719,9 +719,9 @@ bool AMDGPUCFGStructurizer::prepare() { return Changed; } -bool AMDGPUCFGStructurizer::run() { +bool R600MachineCFGStructurizer::run() { //Assume reducible CFG... - LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + LLVM_DEBUG(dbgs() << "R600MachineCFGStructurizer::run\n"); #ifdef STRESSTEST //Use the worse block ordering to test the algorithm. 
@@ -842,7 +842,7 @@ bool AMDGPUCFGStructurizer::run() { return true; } -void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { +void R600MachineCFGStructurizer::orderBlocks(MachineFunction *MF) { int SccNum = 0; for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); ++It, ++SccNum) { @@ -861,7 +861,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { } } -int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { int NumMatch = 0; int CurMatch; @@ -876,7 +876,7 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { return NumMatch; } -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { int NumMatch = 0; NumMatch += loopendPatternMatch(); NumMatch += serialPatternMatch(MBB); @@ -884,7 +884,7 @@ int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { return NumMatch; } -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { if (MBB->succ_size() != 1) return 0; @@ -897,7 +897,7 @@ int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { return 1; } -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { +int R600MachineCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { //two edges if (MBB->succ_size() != 2) return 0; @@ -975,7 +975,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { return 1 + Cloned + NumMatch; } -int AMDGPUCFGStructurizer::loopendPatternMatch() { +int R600MachineCFGStructurizer::loopendPatternMatch() { std::deque NestedLoops; for (auto &It: *MLI) for (MachineLoop *ML : depth_first(It)) @@ -1000,7 +1000,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() { return Num; } -int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { +int 
R600MachineCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { MachineBasicBlock *LoopHeader = LoopRep->getHeader(); MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); @@ -1041,7 +1041,7 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { return 1; } -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( +bool R600MachineCFGStructurizer::isSameloopDetachedContbreak( MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { if (Src1MBB->succ_empty()) { MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); @@ -1058,7 +1058,7 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( return false; } -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, +int R600MachineCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); if (Num == 0) { @@ -1069,7 +1069,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, return Num; } -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, +int R600MachineCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { int Num = 0; MachineBasicBlock *DownBlk; @@ -1107,7 +1107,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, } #ifndef NDEBUG -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( +void R600MachineCFGStructurizer::showImproveSimpleJumpintoIf( MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { dbgs() << "head = BB" << HeadMBB->getNumber() @@ -1150,7 +1150,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( } #endif -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, +int R600MachineCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, 
MachineBasicBlock **LandMBBPtr) { bool MigrateTrue = false; @@ -1322,7 +1322,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, return NumNewBlk; } -void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, +void R600MachineCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB" << SrcMBB->getNumber() << "\n";); @@ -1336,7 +1336,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, retireBlock(SrcMBB); } -void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, +void R600MachineCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { assert (TrueMBB); @@ -1392,7 +1392,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, MBB->addSuccessor(LandMBB); } -void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, +void R600MachineCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, MachineBasicBlock *LandMBB) { LLVM_DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() << " land = BB" << LandMBB->getNumber() << "\n";); @@ -1402,7 +1402,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, DstBlk->replaceSuccessor(DstBlk, LandMBB); } -void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, +void R600MachineCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock *LandMBB) { LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() << " land = BB" @@ -1423,7 +1423,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, ExitingMBB->removeSuccessor(LandMBB, true); } -void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, +void 
R600MachineCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, MachineBasicBlock *ContMBB) { LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB" << ContingMBB->getNumber() << ", cont = BB" @@ -1466,7 +1466,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, } } -int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, +int R600MachineCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { int Cloned = 0; assert(PreMBB->isSuccessor(SrcMBB)); @@ -1485,7 +1485,7 @@ int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, } MachineBasicBlock * -AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, +R600MachineCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, MachineBasicBlock *PredMBB) { assert(PredMBB->isSuccessor(MBB) && "succBlk is not a predecessor of curBlk"); @@ -1509,7 +1509,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, return CloneMBB; } -void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, +void R600MachineCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator SpliceEnd; //look for the input branchinstr, not the AMDGPU branchinstr @@ -1534,7 +1534,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, } MachineBasicBlock * -AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { +R600MachineCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { MachineBasicBlock *LoopHeader = LoopRep->getHeader(); MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); @@ -1554,7 +1554,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { return nullptr; } -void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { +void 
R600MachineCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { MachineInstr *BranchMI; // I saw two unconditional branch in one basic block in example @@ -1566,7 +1566,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { } } -void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( +void R600MachineCFGStructurizer::removeRedundantConditionalBranch( MachineBasicBlock *MBB) { if (MBB->succ_size() != 2) return; @@ -1583,7 +1583,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MBB->removeSuccessor(MBB1, true); } -void AMDGPUCFGStructurizer::addDummyExitBlock( +void R600MachineCFGStructurizer::addDummyExitBlock( SmallVectorImpl &RetMBB) { MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function @@ -1599,12 +1599,12 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); } -void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { +void R600MachineCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { while (MBB->succ_size()) MBB->removeSuccessor(*MBB->succ_begin()); } -void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, +void R600MachineCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, int SccNum) { BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; if (!srcBlkInfo) @@ -1612,7 +1612,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, srcBlkInfo->SccNum = SccNum; } -void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { +void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";); BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; @@ -1624,14 +1624,14 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { assert(MBB->succ_empty() && MBB->pred_empty() && "can't retire block yet"); } -INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, 
"amdgpustructurizer", +INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", +INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) -FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { - return new AMDGPUCFGStructurizer(); +FunctionPass *llvm::createR600MachineCFGStructurizerPass() { + return new R600MachineCFGStructurizer(); } diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp index 39dad45425fce..da8643fa6c407 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp @@ -131,7 +131,7 @@ void R600PassConfig::addPreSched2() { } void R600PassConfig::addPreEmitPass() { - addPass(createAMDGPUCFGStructurizerPass()); + addPass(createR600MachineCFGStructurizerPass()); addPass(createR600ExpandSpecialInstrsPass()); addPass(&FinalizeMachineBundlesID); addPass(createR600Packetizer()); From 3c84e68ddc4a5c6819fae7cacd41be6b864f7c47 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 18 Feb 2022 15:11:13 +0000 Subject: [PATCH 268/748] [gn build] Port 57baa14d7442 --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 5a946689c8ba7..1ddbfaf76e8c3 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -176,7 +176,6 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUTargetTransformInfo.cpp", "AMDGPUUnifyDivergentExitNodes.cpp", "AMDGPUUnifyMetadata.cpp", - "AMDILCFGStructurizer.cpp", 
"GCNDPPCombine.cpp", "GCNHazardRecognizer.cpp", "GCNILPSched.cpp", @@ -196,6 +195,7 @@ static_library("LLVMAMDGPUCodeGen") { "R600ISelLowering.cpp", "R600InstrInfo.cpp", "R600MCInstLower.cpp", + "R600MachineCFGStructurizer.cpp", "R600MachineFunctionInfo.cpp", "R600MachineScheduler.cpp", "R600OpenCLImageTypeLoweringPass.cpp", From ccebf8ac8c61cbd46223abbeb4f29f4e1f7b490c Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Feb 2022 10:23:41 -0500 Subject: [PATCH 269/748] [Clang][OpenMP] Add support for compare capture in parser This patch adds the support for `atomic compare capture` in parser and part of sema. We don't create an AST node for this because the spec doesn't say `compare` and `capture` clauses should be used tightly, so we cannot look one more token ahead in the parser. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D116261 --- clang/lib/CodeGen/CGStmtOpenMP.cpp | 43 ++- clang/lib/Sema/SemaOpenMP.cpp | 44 ++- clang/test/OpenMP/atomic_ast_print.cpp | 422 +++++++++++++++++++++++++ clang/test/OpenMP/atomic_messages.cpp | 4 + 4 files changed, 488 insertions(+), 25 deletions(-) diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index b491642871ced..8ea4968f4b11b 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -24,6 +24,7 @@ #include "clang/AST/StmtVisitor.h" #include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/PrettyStackTrace.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" @@ -6020,7 +6021,7 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, llvm::AtomicOrdering AO, bool IsPostfixUpdate, const Expr *X, const Expr *V, const Expr *E, const Expr *UE, bool IsXLHSInRHSPart, - SourceLocation Loc) { + bool IsCompareCapture, SourceLocation Loc) { switch (Kind) { case OMPC_read: emitOMPAtomicReadExpr(CGF, AO, X, V, 
Loc); @@ -6037,10 +6038,19 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, IsXLHSInRHSPart, Loc); break; case OMPC_compare: { - // Emit an error here. - unsigned DiagID = CGF.CGM.getDiags().getCustomDiagID( - DiagnosticsEngine::Error, "'atomic compare' is not supported for now"); - CGF.CGM.getDiags().Report(DiagID); + if (IsCompareCapture) { + // Emit an error here. + unsigned DiagID = CGF.CGM.getDiags().getCustomDiagID( + DiagnosticsEngine::Error, + "'atomic compare capture' is not supported for now"); + CGF.CGM.getDiags().Report(DiagID); + } else { + // Emit an error here. + unsigned DiagID = CGF.CGM.getDiags().getCustomDiagID( + DiagnosticsEngine::Error, + "'atomic compare' is not supported for now"); + CGF.CGM.getDiags().Report(DiagID); + } break; } case OMPC_if: @@ -6153,18 +6163,23 @@ void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) { AO = llvm::AtomicOrdering::Monotonic; MemOrderingSpecified = true; } + llvm::SmallSet KindsEncountered; OpenMPClauseKind Kind = OMPC_unknown; for (const OMPClause *C : S.clauses()) { // Find first clause (skip seq_cst|acq_rel|aqcuire|release|relaxed clause, // if it is first). 
- if (C->getClauseKind() != OMPC_seq_cst && - C->getClauseKind() != OMPC_acq_rel && - C->getClauseKind() != OMPC_acquire && - C->getClauseKind() != OMPC_release && - C->getClauseKind() != OMPC_relaxed && C->getClauseKind() != OMPC_hint) { - Kind = C->getClauseKind(); - break; - } + OpenMPClauseKind K = C->getClauseKind(); + if (K == OMPC_seq_cst || K == OMPC_acq_rel || K == OMPC_acquire || + K == OMPC_release || K == OMPC_relaxed || K == OMPC_hint) + continue; + Kind = K; + KindsEncountered.insert(K); + } + bool IsCompareCapture = false; + if (KindsEncountered.contains(OMPC_compare) && + KindsEncountered.contains(OMPC_capture)) { + IsCompareCapture = true; + Kind = OMPC_compare; } if (!MemOrderingSpecified) { llvm::AtomicOrdering DefaultOrder = @@ -6188,7 +6203,7 @@ void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) { EmitStopPoint(S.getAssociatedStmt()); emitOMPAtomicExpr(*this, Kind, AO, S.isPostfixUpdate(), S.getX(), S.getV(), S.getExpr(), S.getUpdateExpr(), S.isXLHSInRHSPart(), - S.getBeginLoc()); + IsCompareCapture, S.getBeginLoc()); } static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 64647f59fcb5f..686cef249f3a5 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -35,6 +35,7 @@ #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/PointerEmbeddedInt.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Frontend/OpenMP/OMPAssume.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" @@ -11322,14 +11323,18 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, SourceLocation AtomicKindLoc; OpenMPClauseKind MemOrderKind = OMPC_unknown; SourceLocation MemOrderLoc; + bool MutexClauseEncountered = false; + llvm::SmallSet EncounteredAtomicKinds; for (const OMPClause *C : Clauses) { switch (C->getClauseKind()) { case OMPC_read: case OMPC_write: case 
OMPC_update: + MutexClauseEncountered = true; + LLVM_FALLTHROUGH; case OMPC_capture: case OMPC_compare: { - if (AtomicKind != OMPC_unknown) { + if (AtomicKind != OMPC_unknown && MutexClauseEncountered) { Diag(C->getBeginLoc(), diag::err_omp_atomic_several_clauses) << SourceRange(C->getBeginLoc(), C->getEndLoc()); Diag(AtomicKindLoc, diag::note_omp_previous_mem_order_clause) @@ -11337,6 +11342,12 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, } else { AtomicKind = C->getClauseKind(); AtomicKindLoc = C->getBeginLoc(); + if (!EncounteredAtomicKinds.insert(C->getClauseKind()).second) { + Diag(C->getBeginLoc(), diag::err_omp_atomic_several_clauses) + << SourceRange(C->getBeginLoc(), C->getEndLoc()); + Diag(AtomicKindLoc, diag::note_omp_previous_mem_order_clause) + << getOpenMPClauseName(AtomicKind); + } } break; } @@ -11364,6 +11375,12 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, llvm_unreachable("unknown clause is encountered"); } } + bool IsCompareCapture = false; + if (EncounteredAtomicKinds.contains(OMPC_compare) && + EncounteredAtomicKinds.contains(OMPC_capture)) { + IsCompareCapture = true; + AtomicKind = OMPC_compare; + } // OpenMP 5.0, 2.17.7 atomic Construct, Restrictions // If atomic-clause is read then memory-order-clause must not be acq_rel or // release. @@ -11782,17 +11799,22 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, if (CurContext->isDependentContext()) UE = V = E = X = nullptr; } else if (AtomicKind == OMPC_compare) { - OpenMPAtomicCompareChecker::ErrorInfoTy ErrorInfo; - OpenMPAtomicCompareChecker Checker(*this); - if (!Checker.checkStmt(Body, ErrorInfo)) { - Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare) - << ErrorInfo.ErrorRange; - Diag(ErrorInfo.NoteLoc, diag::note_omp_atomic_compare) - << ErrorInfo.Error << ErrorInfo.NoteRange; - return StmtError(); + if (IsCompareCapture) { + // TODO: We don't set X, D, E, etc. here because in code gen we will emit + // error directly. 
+ } else { + OpenMPAtomicCompareChecker::ErrorInfoTy ErrorInfo; + OpenMPAtomicCompareChecker Checker(*this); + if (!Checker.checkStmt(Body, ErrorInfo)) { + Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare) + << ErrorInfo.ErrorRange; + Diag(ErrorInfo.NoteLoc, diag::note_omp_atomic_compare) + << ErrorInfo.Error << ErrorInfo.NoteRange; + return StmtError(); + } + // TODO: We don't set X, D, E, etc. here because in code gen we will emit + // error directly. } - // TODO: We don't set X, D, E, etc. here because in code gen we will emit - // error directly. } setFunctionHasBranchProtectedScope(); diff --git a/clang/test/OpenMP/atomic_ast_print.cpp b/clang/test/OpenMP/atomic_ast_print.cpp index 9484917dde83b..7502fdc339c2a 100644 --- a/clang/test/OpenMP/atomic_ast_print.cpp +++ b/clang/test/OpenMP/atomic_ast_print.cpp @@ -20,6 +20,7 @@ template T foo(T argc) { + T v = T(); T c = T(); T b = T(); T a = T(); @@ -45,6 +46,12 @@ T foo(T argc) { { a = a < b ? b : a; } #pragma omp atomic compare { a = a == b ? c : a; } +#pragma omp atomic compare capture + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture + { v = a == b; if (v) a = c; } #endif #pragma omp atomic seq_cst a++; @@ -68,6 +75,12 @@ T foo(T argc) { { a = a < b ? b : a; } #pragma omp atomic compare seq_cst { a = a == b ? c : a; } +#pragma omp atomic compare capture seq_cst + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare seq_cst capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture seq_cst + { v = a == b; if (v) a = c; } #endif #pragma omp atomic a++; @@ -91,6 +104,12 @@ T foo(T argc) { { a = a < b ? b : a; } #pragma omp atomic compare acq_rel { a = a == b ? c : a; } +#pragma omp atomic compare capture acq_rel + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare acq_rel capture + { v = a; a = a < b ? 
b : a; } +#pragma omp atomic compare capture acq_rel + { v = a == b; if (v) a = c; } #endif #pragma omp atomic a++; @@ -114,6 +133,12 @@ T foo(T argc) { { a = a < b ? b : a; } #pragma omp atomic compare acquire { a = a == b ? c : a; } +#pragma omp atomic compare capture acquire + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare acquire capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture acquire + { v = a == b; if (v) a = c; } #endif #pragma omp atomic release a++; @@ -137,6 +162,12 @@ T foo(T argc) { { a = a < b ? b : a; } #pragma omp atomic compare release { a = a == b ? c : a; } +#pragma omp atomic compare capture release + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare release capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture release + { v = a == b; if (v) a = c; } #endif #pragma omp atomic relaxed a++; @@ -160,6 +191,12 @@ T foo(T argc) { { a = a < b ? b : a; } #pragma omp atomic compare relaxed { a = a == b ? c : a; } +#pragma omp atomic compare capture relaxed + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare relaxed capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture relaxed + { v = a == b; if (v) a = c; } #endif #pragma omp atomic hint(6) a++; @@ -183,6 +220,12 @@ T foo(T argc) { { a = a < b ? b : a; } #pragma omp atomic compare hint(6) { a = a == b ? c : a; } +#pragma omp atomic compare capture hint(6) + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare hint(6) capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture hint(6) + { v = a == b; if (v) a = c; } #endif return T(); } @@ -215,6 +258,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? 
b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic seq_cst // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read seq_cst @@ -242,6 +301,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare seq_cst capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -269,6 +344,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare acq_rel capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read acquire @@ -296,6 +387,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acquire +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare acquire capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acquire +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic release // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -323,6 +430,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture release +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare release capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture release +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic relaxed // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -350,6 +473,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? 
c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture relaxed +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare relaxed capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture relaxed +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic hint(6) // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read hint(6) @@ -377,6 +516,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare hint(6) capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK: int a = int(); // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; @@ -405,6 +560,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic seq_cst // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read seq_cst @@ -432,6 +603,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare seq_cst capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -459,6 +646,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare acq_rel capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read acquire @@ -486,6 +689,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? 
c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acquire +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare acquire capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture acquire +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic release // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -513,6 +732,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture release +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare release capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture release +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic relaxed // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -540,6 +775,22 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture relaxed +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare relaxed capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture relaxed +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic hint(6) // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read hint(6) @@ -567,8 +818,25 @@ T foo(T argc) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare hint(6) capture +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a; +// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: } +// CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) +// CHECK-51-NEXT: { +// CHECK-51-NEXT: v = a == b; +// CHECK-51-NEXT: if (v) +// CHECK-51-NEXT: a = c; +// CHECK-51-NEXT: } int main(int argc, char **argv) { + int v = 0; int c = 0; int b = 0; int a = 0; @@ -595,6 +863,12 @@ int main(int argc, char **argv) { { a = a < b ? b : a; } #pragma omp atomic compare { a = a == b ? c : a; } +#pragma omp atomic compare capture + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture + { v = a == b; if (v) a = c; } #endif #pragma omp atomic seq_cst a++; @@ -618,6 +892,12 @@ int main(int argc, char **argv) { { a = a < b ? b : a; } #pragma omp atomic compare seq_cst { a = a == b ? c : a; } +#pragma omp atomic compare capture seq_cst + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare seq_cst capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture seq_cst + { v = a == b; if (v) a = c; } #endif #pragma omp atomic a++; @@ -641,6 +921,12 @@ int main(int argc, char **argv) { { a = a < b ? b : a; } #pragma omp atomic compare acq_rel { a = a == b ? 
c : a; } +#pragma omp atomic compare capture acq_rel + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare acq_rel capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture acq_rel + { v = a == b; if (v) a = c; } #endif #pragma omp atomic a++; @@ -664,6 +950,12 @@ int main(int argc, char **argv) { { a = a < b ? b : a; } #pragma omp atomic compare acquire { a = a == b ? c : a; } +#pragma omp atomic compare capture acquire + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare acquire capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture acquire + { v = a == b; if (v) a = c; } #endif #pragma omp atomic release a++; @@ -687,6 +979,12 @@ int main(int argc, char **argv) { { a = a < b ? b : a; } #pragma omp atomic compare release { a = a == b ? c : a; } +#pragma omp atomic compare capture release + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare release capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture release + { v = a == b; if (v) a = c; } #endif #pragma omp atomic relaxed a++; @@ -710,6 +1008,12 @@ int main(int argc, char **argv) { { a = a < b ? b : a; } #pragma omp atomic compare relaxed { a = a == b ? c : a; } +#pragma omp atomic compare capture relaxed + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare relaxed capture + { v = a; a = a < b ? b : a; } +#pragma omp atomic compare capture relaxed + { v = a == b; if (v) a = c; } #endif #pragma omp atomic hint(6) a++; @@ -733,6 +1037,12 @@ int main(int argc, char **argv) { { a = a < b ? b : a; } #pragma omp atomic compare hint(6) { a = a == b ? c : a; } +#pragma omp atomic compare capture hint(6) + { v = a; a = a > b ? b : a; } +#pragma omp atomic compare hint(6) capture + { v = a; a = a < b ? 
b : a; } +#pragma omp atomic compare capture hint(6) + { v = a == b; if (v) a = c; } #endif // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; @@ -761,6 +1071,22 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a == b; + // CHECK-51-NEXT: if (v) + // CHECK-51-NEXT: a = c; + // CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic seq_cst // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read seq_cst @@ -788,6 +1114,22 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare seq_cst capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a == b; + // CHECK-51-NEXT: if (v) + // CHECK-51-NEXT: a = c; + // CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -815,6 +1157,22 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a > b ? 
b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare acq_rel capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a == b; + // CHECK-51-NEXT: if (v) + // CHECK-51-NEXT: a = c; + // CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read acquire @@ -842,6 +1200,22 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture acquire + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare acquire capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture acquire + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a == b; + // CHECK-51-NEXT: if (v) + // CHECK-51-NEXT: a = c; + // CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic release // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -869,6 +1243,22 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture release + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare release capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a < b ? 
b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture release + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a == b; + // CHECK-51-NEXT: if (v) + // CHECK-51-NEXT: a = c; + // CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic relaxed // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read @@ -896,6 +1286,22 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare relaxed capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a == b; + // CHECK-51-NEXT: if (v) + // CHECK-51-NEXT: a = c; + // CHECK-51-NEXT: } // CHECK-NEXT: #pragma omp atomic hint(6) // CHECK-NEXT: a++; // CHECK-NEXT: #pragma omp atomic read hint(6) @@ -923,6 +1329,22 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: { // CHECK-51-NEXT: a = a == b ? c : a; // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare hint(6) capture + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a; + // CHECK-51-NEXT: a = a < b ? 
 b : a; + // CHECK-51-NEXT: } + // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) + // CHECK-51-NEXT: { + // CHECK-51-NEXT: v = a == b; + // CHECK-51-NEXT: if (v) + // CHECK-51-NEXT: a = c; + // CHECK-51-NEXT: } // expect-note@+1 {{in instantiation of function template specialization 'foo' requested here}} return foo(a); } diff --git a/clang/test/OpenMP/atomic_messages.cpp b/clang/test/OpenMP/atomic_messages.cpp index 3e4ffa841ecd8..700c86da1a588 100644 --- a/clang/test/OpenMP/atomic_messages.cpp +++ b/clang/test/OpenMP/atomic_messages.cpp @@ -954,6 +954,10 @@ int mixed() { // expected-note@+1 {{'read' clause used here}} #pragma omp atomic read compare a = b; +// expected-error@+2 {{directive '#pragma omp atomic' cannot contain more than one 'compare' clause}} +// expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'capture' clause}} +#pragma omp atomic compare compare capture capture + a = b; #endif // expected-note@+1 {{in instantiation of function template specialization 'mixed' requested here}} return mixed(); From 07f93a1e390cb89b41a9dc76f9f97a65e566e4e3 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 17 Feb 2022 15:29:43 +0000 Subject: [PATCH 270/748] [libc][automemcpy] Discard aggregated samples from JSON The benchmark framework synthesizes fake "aggregate" Samples representing mean, median and cv. We're only interested in "iteration" samples.
Differential Revision: https://reviews.llvm.org/D120062 --- .../include/automemcpy/ResultAnalyzer.h | 6 ++++ .../automemcpy/lib/ResultAnalyzer.cpp | 2 ++ .../automemcpy/lib/ResultAnalyzerMain.cpp | 6 +++- .../unittests/ResultAnalyzerTest.cpp | 30 ++++++++++++------- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h index 2991df0aceba7..d4bf272582767 100644 --- a/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h +++ b/libc/benchmarks/automemcpy/include/automemcpy/ResultAnalyzer.h @@ -79,9 +79,15 @@ struct SampleId { Distribution.Name) }; +// The type of Samples as reported by the Google Benchmark's JSON result file. +// We are only interested in the "iteration" samples, the "aggregate" ones +// represent derived metrics such as 'mean' or 'median'. +enum class SampleType { UNKNOWN, ITERATION, AGGREGATE }; + // A SampleId with an associated measured throughput. 
struct Sample { SampleId Id; + SampleType Type = SampleType::UNKNOWN; double BytesPerSecond = 0; }; diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp index 6bfde0d2cb4be..b134f6c83a0df 100644 --- a/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp +++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzer.cpp @@ -107,6 +107,8 @@ static void processPerDistributionData(PerDistributionData &Data) { std::vector getThroughputs(ArrayRef Samples) { std::unordered_map Functions; for (const auto &S : Samples) { + if (S.Type != SampleType::ITERATION) + break; auto &Function = Functions[S.Id.Function]; auto &Data = Function.PerDistributionData[S.Id.Distribution.Name]; Data.BytesPerSecondSamples.push_back(S.BytesPerSecond); diff --git a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp index 422bc575b6b72..f3fb825621761 100644 --- a/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp +++ b/libc/benchmarks/automemcpy/lib/ResultAnalyzerMain.cpp @@ -61,13 +61,17 @@ static StringRef getInternalizedString(StringRef VolatileStr) { // Helper function for the LLVM JSON API. 
bool fromJSON(const json::Value &V, Sample &Out, json::Path P) { std::string Label; + std::string RunType; json::ObjectMapper O(V, P); if (O && O.map("bytes_per_second", Out.BytesPerSecond) && - O.map("label", Label)) { + O.map("run_type", RunType) && O.map("label", Label)) { const auto LabelPair = StringRef(Label).split(','); Out.Id.Function.Name = getInternalizedString(LabelPair.first); Out.Id.Function.Type = getFunctionDescriptor(LabelPair.first).Type; Out.Id.Distribution.Name = getInternalizedString(LabelPair.second); + Out.Type = StringSwitch(RunType) + .Case("aggregate", SampleType::AGGREGATE) + .Case("iteration", SampleType::ITERATION); return true; } return false; diff --git a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp index 10d0f98272b4b..7b67f70eb89cd 100644 --- a/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp +++ b/libc/benchmarks/automemcpy/unittests/ResultAnalyzerTest.cpp @@ -24,7 +24,8 @@ TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsOneSample) { static constexpr DistributionId DistA = {{"A"}}; static constexpr SampleId Id = {Foo1, DistA}; static constexpr Sample kSamples[] = { - Sample{Id, 4}, + Sample{Id, SampleType::ITERATION, 4}, + Sample{Id, SampleType::AGGREGATE, -1}, // Aggegates gets discarded }; const std::vector Data = getThroughputs(kSamples); @@ -42,8 +43,9 @@ TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsManySamplesSameBucket) { static constexpr FunctionId Foo1 = {"memcpy1", FunctionType::MEMCPY}; static constexpr DistributionId DistA = {{"A"}}; static constexpr SampleId Id = {Foo1, DistA}; - static constexpr Sample kSamples[] = {Sample{Id, 4}, Sample{Id, 5}, - Sample{Id, 5}}; + static constexpr Sample kSamples[] = {Sample{Id, SampleType::ITERATION, 4}, + Sample{Id, SampleType::ITERATION, 5}, + Sample{Id, SampleType::ITERATION, 5}}; const std::vector Data = getThroughputs(kSamples); EXPECT_THAT(Data, SizeIs(1)); @@ -63,8 +65,10 @@ 
TEST(AutomemcpyJsonResultsAnalyzer, getThroughputsServeralFunctionAndDist) { static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY}; static constexpr DistributionId DistB = {{"B"}}; static constexpr Sample kSamples[] = { - Sample{{Foo1, DistA}, 1}, Sample{{Foo1, DistB}, 2}, - Sample{{Foo2, DistA}, 3}, Sample{{Foo2, DistB}, 4}}; + Sample{{Foo1, DistA}, SampleType::ITERATION, 1}, + Sample{{Foo1, DistB}, SampleType::ITERATION, 2}, + Sample{{Foo2, DistA}, SampleType::ITERATION, 3}, + Sample{{Foo2, DistB}, SampleType::ITERATION, 4}}; // Data is aggregated per function. const std::vector Data = getThroughputs(kSamples); EXPECT_THAT(Data, SizeIs(2)); // 2 functions Foo1 and Foo2. @@ -78,9 +82,10 @@ TEST(AutomemcpyJsonResultsAnalyzer, getScore) { static constexpr FunctionId Foo2 = {"memcpy2", FunctionType::MEMCPY}; static constexpr FunctionId Foo3 = {"memcpy3", FunctionType::MEMCPY}; static constexpr DistributionId Dist = {{"A"}}; - static constexpr Sample kSamples[] = {Sample{{Foo1, Dist}, 1}, - Sample{{Foo2, Dist}, 2}, - Sample{{Foo3, Dist}, 3}}; + static constexpr Sample kSamples[] = { + Sample{{Foo1, Dist}, SampleType::ITERATION, 1}, + Sample{{Foo2, Dist}, SampleType::ITERATION, 2}, + Sample{{Foo3, Dist}, SampleType::ITERATION, 3}}; // Data is aggregated per function. 
std::vector Data = getThroughputs(kSamples); @@ -113,9 +118,12 @@ TEST(AutomemcpyJsonResultsAnalyzer, castVotes) { static constexpr DistributionId DistA = {{"A"}}; static constexpr DistributionId DistB = {{"B"}}; static constexpr Sample kSamples[] = { - Sample{{Foo1, DistA}, 0}, Sample{{Foo1, DistB}, 30}, - Sample{{Foo2, DistA}, 1}, Sample{{Foo2, DistB}, 100}, - Sample{{Foo3, DistA}, 7}, Sample{{Foo3, DistB}, 100}, + Sample{{Foo1, DistA}, SampleType::ITERATION, 0}, + Sample{{Foo1, DistB}, SampleType::ITERATION, 30}, + Sample{{Foo2, DistA}, SampleType::ITERATION, 1}, + Sample{{Foo2, DistB}, SampleType::ITERATION, 100}, + Sample{{Foo3, DistA}, SampleType::ITERATION, 7}, + Sample{{Foo3, DistB}, SampleType::ITERATION, 100}, }; // DistA Thoughput ranges from 0 to 7. From 68b7b357fdfc00ed8807a887eba363d67c9dc3f5 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Feb 2022 10:24:23 -0500 Subject: [PATCH 271/748] [Clang][OpenMP][Sema] Remove support for floating point values in atomic compare This is a follow-up patch of D119378. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D119392 --- clang/lib/Sema/SemaOpenMP.cpp | 73 ++++++++++++----------------- clang/test/OpenMP/atomic_messages.c | 7 +++ 2 files changed, 36 insertions(+), 44 deletions(-) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 686cef249f3a5..ec0d095e89950 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11015,8 +11015,6 @@ class OpenMPAtomicCompareChecker { Expr *C = nullptr; /// True if the cond expr is in the form of 'x ordop expr'. bool IsXBinopExpr = true; - /// The atomic compare operator. - OMPAtomicCompareOp Op; /// Check if it is a valid conditional update statement (cond-update-stmt). 
bool checkCondUpdateStmt(IfStmt *S, ErrorInfoTy &ErrorInfo); @@ -11073,23 +11071,7 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, } switch (Cond->getOpcode()) { - case BO_EQ: - Op = OMPAtomicCompareOp::EQ; - break; - case BO_LT: - Op = OMPAtomicCompareOp::MIN; - break; - case BO_GT: - Op = OMPAtomicCompareOp::MAX; - break; - default: - ErrorInfo.Error = ErrorTy::InvalidBinaryOp; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); - return false; - } - - if (Cond->getOpcode() == BO_EQ) { + case BO_EQ: { C = Cond; D = BO->getRHS(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { @@ -11102,7 +11084,10 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); return false; } - } else { + break; + } + case BO_LT: + case BO_GT: { E = BO->getRHS(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS()) && checkIfTwoExprsAreSame(ContextRef, E, Cond->getRHS())) { @@ -11117,6 +11102,13 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); return false; } + break; + } + default: + ErrorInfo.Error = ErrorTy::InvalidBinaryOp; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + return false; } return true; @@ -11167,23 +11159,7 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, } switch (Cond->getOpcode()) { - case BO_EQ: - Op = OMPAtomicCompareOp::EQ; - break; - case BO_LT: - Op = OMPAtomicCompareOp::MIN; - break; - case BO_GT: - Op = OMPAtomicCompareOp::MAX; - break; - default: - ErrorInfo.Error = ErrorTy::InvalidBinaryOp; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); - return false; - } - - if (Cond->getOpcode() == BO_EQ) 
{ + case BO_EQ: { C = Cond; D = CO->getTrueExpr(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { @@ -11196,7 +11172,10 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); return false; } - } else { + break; + } + case BO_LT: + case BO_GT: { E = CO->getTrueExpr(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS()) && checkIfTwoExprsAreSame(ContextRef, E, Cond->getRHS())) { @@ -11211,6 +11190,13 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); return false; } + break; + } + default: + ErrorInfo.Error = ErrorTy::InvalidBinaryOp; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + return false; } return true; @@ -11220,8 +11206,7 @@ bool OpenMPAtomicCompareChecker::checkType(ErrorInfoTy &ErrorInfo) const { // 'x' and 'e' cannot be nullptr assert(X && E && "X and E cannot be nullptr"); - auto CheckValue = [&ErrorInfo](const Expr *E, OMPAtomicCompareOp Op, - bool ShouldBeLValue) { + auto CheckValue = [&ErrorInfo](const Expr *E, bool ShouldBeLValue) { if (ShouldBeLValue && !E->isLValue()) { ErrorInfo.Error = ErrorTy::XNotLValue; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); @@ -11238,7 +11223,7 @@ bool OpenMPAtomicCompareChecker::checkType(ErrorInfoTy &ErrorInfo) const { return false; } - if (Op != OMPAtomicCompareOp::EQ && !QTy->isIntegerType()) { + if (!QTy->isIntegerType()) { ErrorInfo.Error = ErrorTy::NotInteger; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); ErrorInfo.ErrorRange = ErrorInfo.NoteRange = E->getSourceRange(); @@ -11249,13 +11234,13 @@ bool OpenMPAtomicCompareChecker::checkType(ErrorInfoTy &ErrorInfo) const { return true; }; - if (!CheckValue(X, Op, true)) + if (!CheckValue(X, true)) return false; - if (!CheckValue(E, Op, false)) + if (!CheckValue(E, false)) return 
false; - if (D && !CheckValue(D, Op, false)) + if (D && !CheckValue(D, false)) return false; return true; diff --git a/clang/test/OpenMP/atomic_messages.c b/clang/test/OpenMP/atomic_messages.c index d9e2b1a352ec6..22f7be91662b5 100644 --- a/clang/test/OpenMP/atomic_messages.c +++ b/clang/test/OpenMP/atomic_messages.c @@ -483,5 +483,12 @@ void compare(void) { if (fx > fe) fx = fe; } +// omp51-error@+5 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+4 {{expect integer value}} +#pragma omp atomic compare + { + if (fx == fe) + fx = fe; + } } #endif From 91cf639ac069a797b1fac4134cf121bc9db6dff6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 15:36:55 +0000 Subject: [PATCH 272/748] Fix Wdocumentation unknown parameter warning --- clang/include/clang/Driver/Driver.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 93e1eca6a9817..6f24f649ea544 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -614,9 +614,9 @@ class Driver { /// /// \param[in] HostTC is the host ToolChain paired with the device /// - /// \param[in] Action (e.g. OFK_Cuda/OFK_OpenMP/OFK_SYCL) is an Offloading - /// action that is optionally passed to a ToolChain (used by CUDA, to specify - /// if it's used in conjunction with OpenMP) + /// \param[in] TargetDeviceOffloadKind (e.g. 
OFK_Cuda/OFK_OpenMP/OFK_SYCL) is + /// an Offloading action that is optionally passed to a ToolChain (used by + /// CUDA, to specify if it's used in conjunction with OpenMP) /// /// Will cache ToolChains for the life of the driver object, and create them /// on-demand. From 16655a58f28d8c2117b49efe060b8763fae5c364 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 15:37:23 +0000 Subject: [PATCH 273/748] Fix Wdocumentation unknown parameter warning --- llvm/tools/llvm-remark-size-diff/RemarkSizeDiff.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-remark-size-diff/RemarkSizeDiff.cpp b/llvm/tools/llvm-remark-size-diff/RemarkSizeDiff.cpp index e18bccdd26cea..8faa573bad261 100644 --- a/llvm/tools/llvm-remark-size-diff/RemarkSizeDiff.cpp +++ b/llvm/tools/llvm-remark-size-diff/RemarkSizeDiff.cpp @@ -351,8 +351,8 @@ static bool tryReadFileAndProcessRemarks( /// remarks file. /// \param[in] FuncNameToSizeInfoB - Size info collected from /// the second remarks file. -/// \param[out] D - Filled with the diff between \p FuncNameToSizeInfoA and -/// \p FuncNameToSizeInfoB. +/// \param[out] DiffsByFilesPresent - Filled with the diff between \p +/// FuncNameToSizeInfoA and \p FuncNameToSizeInfoB. static void computeDiff(const StringMap &FuncNameToSizeInfoA, const StringMap &FuncNameToSizeInfoB, From 6877ec49f78724fffe53805739cb7d4761c18e37 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 15:37:49 +0000 Subject: [PATCH 274/748] Fix Wdocumentation unknown parameter warning --- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 17aec5d5145fb..d90d4e807e0df 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1366,7 +1366,7 @@ class OpenMPIRBuilder { /// comparison. 
If forms that use 'ordop', it should be /// \p nullptr. /// \param AO Atomic ordering of the generated atomic instructions. - /// \param OP Atomic compare operation. It can only be ==, <, or >. + /// \param Op Atomic compare operation. It can only be ==, <, or >. /// \param IsXBinopExpr True if the conditional statement is in the form where /// x is on LHS. It only matters for < or >. /// From f6510e6d6fcc361594edffaba495b6d7c478e0a9 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 07:39:25 -0800 Subject: [PATCH 275/748] [instsimplify] Factor out a helper for alloca bounds checking [NFC] At the moment, this just groups comments with a reasonably named predicate, but I plan to add other cases to this in the near future. --- llvm/lib/Analysis/InstructionSimplify.cpp | 65 ++++++++++++----------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 35e93143f96a1..5fa6b69c1014f 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2507,6 +2507,36 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, return nullptr; } +/// Return true if V1 and V2 are each the base of some distict storage region +/// [V, object_size(V)] which do not overlap. Note that zero sized regions +/// *are* possible, and that zero sized regions do not overlap with any other. +static bool HaveNonOverlappingStorage(const Value *V1, const Value *V2) { + // Global variables always exist, so they always exist during the lifetime + // of each other and all allocas. Two different allocas usually have + // different addresses... + // + // However, if there's an @llvm.stackrestore dynamically in between two + // allocas, they may have the same address. It's tempting to reduce the + // scope of the problem by only looking at *static* allocas here. 
That would + // cover the majority of allocas while significantly reducing the likelihood + // of having an @llvm.stackrestore pop up in the middle. However, it's not + // actually impossible for an @llvm.stackrestore to pop up in the middle of + // an entry block. Also, if we have a block that's not attached to a + // function, we can't tell if it's "static" under the current definition. + // Theoretically, this problem could be fixed by creating a new kind of + // instruction kind specifically for static allocas. Such a new instruction + // could be required to be at the top of the entry block, thus preventing it + // from being subject to a @llvm.stackrestore. Instcombine could even + // convert regular allocas into these special allocas. It'd be nifty. + // However, until then, this problem remains open. + // + // So, we'll assume that two non-empty allocas have different addresses + // for now. + // + return isa(V1) && + (isa(V2) || isa(V2)); +} + // A significant optimization not implemented here is assuming that alloca // addresses are not equal to incoming argument values. They don't *alias*, // as we say, but that doesn't mean they aren't equal, so we take a @@ -2599,36 +2629,11 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, // Various optimizations for (in)equality comparisons. if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { // Different non-empty allocations that exist at the same time have - // different addresses (if the program can tell). Global variables always - // exist, so they always exist during the lifetime of each other and all - // allocas. Two different allocas usually have different addresses... - // - // However, if there's an @llvm.stackrestore dynamically in between two - // allocas, they may have the same address. It's tempting to reduce the - // scope of the problem by only looking at *static* allocas here. 
That would - // cover the majority of allocas while significantly reducing the likelihood - // of having an @llvm.stackrestore pop up in the middle. However, it's not - // actually impossible for an @llvm.stackrestore to pop up in the middle of - // an entry block. Also, if we have a block that's not attached to a - // function, we can't tell if it's "static" under the current definition. - // Theoretically, this problem could be fixed by creating a new kind of - // instruction kind specifically for static allocas. Such a new instruction - // could be required to be at the top of the entry block, thus preventing it - // from being subject to a @llvm.stackrestore. Instcombine could even - // convert regular allocas into these special allocas. It'd be nifty. - // However, until then, this problem remains open. - // - // So, we'll assume that two non-empty allocas have different addresses - // for now. - // - // With all that, if the offsets are within the bounds of their allocations - // (and not one-past-the-end! so we can't use inbounds!), and their - // allocations aren't the same, the pointers are not equal. - // - // Note that it's not necessary to check for LHS being a global variable - // address, due to canonicalization and constant folding. - if (isa(LHS) && - (isa(RHS) || isa(RHS))) { + // different addresses (if the program can tell). If the offsets are + // within the bounds of their allocations (and not one-past-the-end! + // so we can't use inbounds!), and their allocations aren't the same, + // the pointers are not equal. 
+ if (HaveNonOverlappingStorage(LHS, RHS)) { uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; Opts.EvalMode = ObjectSizeOpts::Mode::Min; From 1cf790bd0440e828c8486bdcad467a30bacb418a Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 07:50:57 -0800 Subject: [PATCH 276/748] [instsimplify] Add pointer compare tests for byval args and globals --- llvm/test/Transforms/InstSimplify/compare.ll | 71 ++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 25a6c19d2291c..b82134d58db9b 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -2737,5 +2737,76 @@ define i1 @scalar_vectors_are_non_empty() { ret i1 %res } +; TODO: Never equal +define i1 @byval_args_inequal(i32* byval(i32) %a, i32* byval(i32) %b) { +; CHECK-LABEL: @byval_args_inequal( +; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i1 [[RES]] +; + %res = icmp ne i32* %a, %b + ret i1 %res +} + +; Arguments can be adjacent on the stack +define i1 @neg_args_adjacent(i32* byval(i32) %a, i32* byval(i32) %b) { +; CHECK-LABEL: @neg_args_adjacent( +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 1 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A_OFF]], [[B:%.*]] +; CHECK-NEXT: ret i1 [[RES]] +; + %a.off = getelementptr i32, i32* %a, i32 1 + %res = icmp ne i32* %a.off, %b + ret i1 %res +} + +; TODO: Never equal +define i1 @test_byval_alloca_inequal(i32* byval(i32) %a) { +; CHECK-LABEL: @test_byval_alloca_inequal( +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A:%.*]], [[B]] +; CHECK-NEXT: ret i1 [[RES]] +; + %b = alloca i32 + %res = icmp ne i32* %a, %b + ret i1 %res +} + +; Byval argument can be immediately before alloca, and crossing +; over is allowed. 
+define i1 @neg_byval_alloca_adjacent(i32* byval(i32) %a) { +; CHECK-LABEL: @neg_byval_alloca_adjacent( +; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[A_OFF:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 1 +; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A_OFF]], [[B]] +; CHECK-NEXT: ret i1 [[RES]] +; + %b = alloca i32 + %a.off = getelementptr i32, i32* %a, i32 1 + %res = icmp ne i32* %a.off, %b + ret i1 %res +} + +@A = global i32 0 +@B = global i32 0 +@A.alias = alias i32, i32* @A + +define i1 @globals_inequal() { +; CHECK-LABEL: @globals_inequal( +; CHECK-NEXT: ret i1 true +; + %res = icmp ne i32* @A, @B + ret i1 %res +} + +define i1 @neg_global_alias() { +; CHECK-LABEL: @neg_global_alias( +; CHECK-NEXT: ret i1 icmp ne (i32* @A, i32* @A.alias) +; + %res = icmp ne i32* @A, @A.alias + ret i1 %res +} + +; TODO: Add coverage for global aliases, link once, etc.. + attributes #0 = { null_pointer_is_valid } From 670aeece51ae0e4029d078fe3a5f06b50d42f15c Mon Sep 17 00:00:00 2001 From: "William S. Moses" Date: Mon, 14 Feb 2022 13:48:04 -0500 Subject: [PATCH 277/748] [MLIR][OpenMP][SCF] Mark parallel regions as allocation scopes MLIR has the notion of allocation scopes which specify that stack allocations (e.g. memref.alloca, llvm.alloca) should be freed or equivalently aren't available at the end of the corresponding region. Currently neither OpenMP parallel nor SCF parallel regions have the notion of such a scope. This clearly makes sense for an OpenMP parallel as this is implemented in with a new function which outlines the region, and clearly any allocations in that newly outlined function have a lifetime that ends at the return of the function, by definition. 
While SCF.parallel doesn't have a guaranteed runtime which it is implemented with, this similarly makes sense for SCF.parallel since otherwise an allocation within an SCF.parallel will needlessly continue to allocate stack memory that isn't cleaned up until the function (or other allocation scope op) which contains the SCF.parallel returns. This means that it is impossible to represent thread or iteration-local memory without causing a stack blow-up. In the case that this stack-blow-up behavior is intended, this can be equivalently represented with an allocation outside of the SCF.parallel with a size equal to the number of iterations. Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D119743 --- mlir/include/mlir/Dialect/Affine/IR/AffineOps.td | 4 ++-- mlir/include/mlir/Dialect/GPU/GPUOps.td | 2 +- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 3 ++- mlir/include/mlir/Dialect/SCF/SCFOps.td | 5 +++-- mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir | 4 ++-- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 5e1b910c8b68e..914973df55945 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -107,7 +107,7 @@ def AffineApplyOp : Affine_Op<"apply", [NoSideEffect]> { } def AffineForOp : Affine_Op<"for", - [ImplicitAffineTerminator, RecursiveSideEffects, + [AutomaticAllocationScope, ImplicitAffineTerminator, RecursiveSideEffects, DeclareOpInterfaceMethods]> { let summary = "for operation"; let description = [{ @@ -608,7 +608,7 @@ def AffineMaxOp : AffineMinMaxOpBase<"max", [NoSideEffect]> { } def AffineParallelOp : Affine_Op<"parallel", - [ImplicitAffineTerminator, RecursiveSideEffects, + [AutomaticAllocationScope, ImplicitAffineTerminator, RecursiveSideEffects, DeclareOpInterfaceMethods, MemRefsNormalizable]> { let summary = "multi-index parallel band operation"; let 
description = [{ diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index 5d25892175b90..ecddae6b10d05 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -439,7 +439,7 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func", let hasVerifier = 1; } -def GPU_LaunchOp : GPU_Op<"launch">, +def GPU_LaunchOp : GPU_Op<"launch", [AutomaticAllocationScope]>, Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, Optional:$dynamicSharedMemorySize)>, diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index ddeb698fb2a25..51ae0dc5bea2b 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -80,7 +80,8 @@ def ClauseDefault : I32EnumAttr< def ClauseDefaultAttr : EnumAttr; -def ParallelOp : OpenMP_Op<"parallel", [AttrSizedOperandSegments, +def ParallelOp : OpenMP_Op<"parallel", [AutomaticAllocationScope, + AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { let summary = "parallel construct"; let description = [{ diff --git a/mlir/include/mlir/Dialect/SCF/SCFOps.td b/mlir/include/mlir/Dialect/SCF/SCFOps.td index a2218d13ab0fc..122b13f449002 100644 --- a/mlir/include/mlir/Dialect/SCF/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/SCFOps.td @@ -110,7 +110,7 @@ def ExecuteRegionOp : SCF_Op<"execute_region"> { } def ForOp : SCF_Op<"for", - [DeclareOpInterfaceMethods, + [AutomaticAllocationScope, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, SingleBlockImplicitTerminator<"scf::YieldOp">, RecursiveSideEffects]> { @@ -404,7 +404,8 @@ def IfOp : SCF_Op<"if", } def ParallelOp : SCF_Op<"parallel", - [AttrSizedOperandSegments, + [AutomaticAllocationScope, + AttrSizedOperandSegments, DeclareOpInterfaceMethods, RecursiveSideEffects, SingleBlockImplicitTerminator<"scf::YieldOp">]> { diff --git 
a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir index 15b70caa930f0..ab6e7a0fa0494 100644 --- a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir @@ -83,7 +83,6 @@ func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %d // CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { func @materialize_read(%M: index, %N: index, %O: index, %P: index) { %f0 = arith.constant 0.0: f32 - // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index @@ -94,6 +93,7 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) { // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} { // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 { + // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref> // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { // CHECK: scf.if // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]]) @@ -149,7 +149,6 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) { // CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) { func @materialize_write(%M: index, %N: index, %O: index, %P: index) { - // CHECK-DAG: %[[ALLOC:.*]] = memref.alloca() : memref> // CHECK-DAG: %{{.*}} = arith.constant dense<1.000000e+00> : vector<5x4x3xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index @@ -161,6 +160,7 @@ func @materialize_write(%M: index, %N: index, %O: index, %P: index) { // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 { // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to 
%{{.*}} step 5 { + // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref> // CHECK: memref.store %{{.*}}, %[[ALLOC]][] : memref> // CHECK: %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref> to memref<5xvector<4x3xf32>> // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { From 7b731f4d0bfbbac74210f59b782624662a3c7546 Mon Sep 17 00:00:00 2001 From: Carlo Bertolli Date: Fri, 18 Feb 2022 09:55:49 -0600 Subject: [PATCH 278/748] [OpenMP][libomptarget] Delay restore of shadow pointers in structs to after H2D memory copies are completed When using asynchronous plugin calls, shadow pointer restore could happen before the D2H copy for the entire struct has completed, effectively leaving a device pointer in a host struct. This patch fixes the problem by delaying restore's to after a synchronization happens (target regions) and by calling early synchronization (target update). Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D119968 --- openmp/libomptarget/src/omptarget.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 304091e4f2f1d..015e69af90589 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -802,6 +802,10 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, // If we copied the struct to the host, we need to restore the pointer. if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { void **ShadowHstPtrAddr = (void **)Itr->first; + // Wait for device-to-host memcopies for whole struct to complete, + // before restoring the correct host pointer. 
+ if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; *ShadowHstPtrAddr = Itr->second.HstPtrVal; DP("Restoring original host pointer value " DPxMOD " for host " "pointer " DPxMOD "\n", @@ -885,6 +889,10 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase, auto CB = [&](ShadowPtrListTy::iterator &Itr) { void **ShadowHstPtrAddr = (void **)Itr->first; + // Wait for device-to-host memcopies for whole struct to complete, + // before restoring the correct host pointer. + if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; *ShadowHstPtrAddr = Itr->second.HstPtrVal; DP("Restoring original host pointer value " DPxMOD " for host pointer " DPxMOD "\n", From f510045d820b6bdc2a20832fd1a35aff47f964f8 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 14 Jan 2022 11:03:21 +0000 Subject: [PATCH 279/748] [CodeGen] Remove unneeded regex escaping in FileCheck patterns. NFC. Take advantage of D117117 to simplify all {{\[}} to [ and {{\]}} to ]. 
Differential Revision: https://reviews.llvm.org/D117298 --- .../CodeGen/AArch64/GlobalISel/swiftself.ll | 2 +- .../AArch64/aarch64-interleaved-ld-combine.ll | 4 +- llvm/test/CodeGen/AArch64/arm64-abi_align.ll | 2 +- .../AArch64/arm64-alloc-no-stack-realign.ll | 4 +- .../AArch64/arm64-code-model-large-darwin.ll | 2 +- .../test/CodeGen/AArch64/arm64-collect-loh.ll | 74 ++++----- .../AArch64/arm64-dagcombiner-load-slicing.ll | 12 +- .../CodeGen/AArch64/arm64-fast-isel-call.ll | 2 +- .../CodeGen/AArch64/arm64-fast-isel-gv.ll | 16 +- .../AArch64/arm64-fast-isel-intrinsic.ll | 68 ++++---- .../AArch64/arm64-fast-isel-materialize.ll | 4 +- .../CodeGen/AArch64/arm64-promote-const.ll | 16 +- .../AArch64/arm64-swizzle-tbl-i16-layout.ll | 2 +- .../test/CodeGen/AArch64/arm64-vector-ldst.ll | 62 +++---- .../CodeGen/AArch64/arm64-virtual_base.ll | 2 +- llvm/test/CodeGen/AArch64/cmpxchg-O0.ll | 24 +-- llvm/test/CodeGen/AArch64/dllimport.ll | 2 +- llvm/test/CodeGen/AArch64/fast-isel-atomic.ll | 16 +- .../AArch64/fast-isel-branch-cond-mask.ll | 2 +- .../test/CodeGen/AArch64/fast-isel-cmpxchg.ll | 12 +- .../AArch64/fast-isel-runtime-libcall.ll | 16 +- .../AArch64/misched-fusion-addr-tune.ll | 8 +- .../CodeGen/AArch64/misched-fusion-addr.ll | 26 +-- .../CodeGen/AArch64/stack-guard-reassign.ll | 2 +- llvm/test/CodeGen/AArch64/stack-guard-sve.ll | 22 +-- .../CodeGen/AArch64/stack-protector-target.ll | 8 +- .../AArch64/stack-tagging-unchecked-ld-st.ll | 2 +- .../test/CodeGen/AArch64/stack_guard_remat.ll | 12 +- llvm/test/CodeGen/AArch64/stgp.ll | 4 +- llvm/test/CodeGen/AArch64/swiftself.ll | 4 +- .../CodeGen/AArch64/tagged-globals-pic.ll | 20 +-- .../CodeGen/AArch64/tagged-globals-static.ll | 8 +- llvm/test/CodeGen/AArch64/win-tls.ll | 20 +-- .../GlobalISel/llvm.amdgcn.dispatch.id.ll | 2 +- llvm/test/CodeGen/AMDGPU/acc-ldst.ll | 8 +- llvm/test/CodeGen/AMDGPU/add.i16.ll | 4 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 6 +- llvm/test/CodeGen/AMDGPU/add_i128.ll | 2 +- 
llvm/test/CodeGen/AMDGPU/addrspacecast.ll | 62 +++---- llvm/test/CodeGen/AMDGPU/alignbit-pat.ll | 6 +- llvm/test/CodeGen/AMDGPU/always-uniform.ll | 4 +- llvm/test/CodeGen/AMDGPU/amdpal.ll | 16 +- .../AMDGPU/amdpal_scratch_mergedshader.ll | 2 +- llvm/test/CodeGen/AMDGPU/and.ll | 18 +- .../CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 12 +- .../AMDGPU/atomic_optimizations_buffer.ll | 16 +- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 16 +- .../atomic_optimizations_struct_buffer.ll | 16 +- llvm/test/CodeGen/AMDGPU/bfe-combine.ll | 8 +- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 8 +- .../AMDGPU/bitreverse-inline-immediates.ll | 18 +- .../AMDGPU/branch-relaxation-debug-info.mir | 2 +- llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 36 ++-- llvm/test/CodeGen/AMDGPU/build_vector.ll | 4 +- .../CodeGen/AMDGPU/call-argument-types.ll | 40 ++--- .../callee-special-input-sgprs-fixed-abi.ll | 4 +- .../CodeGen/AMDGPU/captured-frame-index.ll | 2 +- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 2 +- .../CodeGen/AMDGPU/coalesce-vgpr-alignment.ll | 2 +- llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll | 4 +- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 2 +- .../AMDGPU/constant-address-space-32bit.ll | 2 +- .../AMDGPU/constant-fold-mi-operands.ll | 6 +- .../AMDGPU/control-flow-fastregalloc.ll | 30 ++-- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 18 +- llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll | 16 +- llvm/test/CodeGen/AMDGPU/ds_read2st64.ll | 18 +- .../CodeGen/AMDGPU/early-if-convert-cost.ll | 8 +- .../CodeGen/AMDGPU/extract_vector_dynelt.ll | 12 +- .../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 6 +- .../CodeGen/AMDGPU/extract_vector_elt-i16.ll | 6 +- .../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 2 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 8 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 32 ++-- llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 2 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 4 +- 
llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 16 +- llvm/test/CodeGen/AMDGPU/fdiv.f64.ll | 4 +- .../CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 14 +- llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 8 +- .../test/CodeGen/AMDGPU/flat-address-space.ll | 4 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 16 +- llvm/test/CodeGen/AMDGPU/fmin_legacy.ll | 4 +- llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 16 +- llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 28 ++-- llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll | 6 +- llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll | 6 +- llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 4 +- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/fract.f64.ll | 30 ++-- llvm/test/CodeGen/AMDGPU/function-args.ll | 24 +-- .../CodeGen/AMDGPU/function-call-relocs.ll | 20 +-- llvm/test/CodeGen/AMDGPU/global-constant.ll | 6 +- .../test/CodeGen/AMDGPU/global-extload-i16.ll | 2 +- .../CodeGen/AMDGPU/global-variable-relocs.ll | 40 ++--- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 12 +- .../test/CodeGen/AMDGPU/global_atomics_i64.ll | 28 ++-- llvm/test/CodeGen/AMDGPU/global_smrd.ll | 6 +- .../AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 2 +- .../CodeGen/AMDGPU/image-sample-waterfall.ll | 16 +- .../AMDGPU/indirect-addressing-si-gfx9.ll | 2 +- .../AMDGPU/indirect-addressing-si-pregfx9.ll | 2 +- .../CodeGen/AMDGPU/indirect-addressing-si.ll | 8 +- llvm/test/CodeGen/AMDGPU/inline-asm.ll | 4 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 14 +- .../AMDGPU/invariant-load-no-alias-store.ll | 4 +- .../AMDGPU/kernel-argument-dag-lowering.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 52 +++--- .../CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 52 +++--- .../llvm.amdgcn.buffer.load.format.d16.ll | 10 +- .../CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 2 +- .../llvm.amdgcn.buffer.store.format.d16.ll | 8 +- 
.../CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll | 2 +- .../AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll | 2 +- .../AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll | 10 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll | 4 +- .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 10 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 6 +- .../llvm.amdgcn.raw.buffer.load.format.d16.ll | 10 +- ...llvm.amdgcn.raw.buffer.store.format.d16.ll | 14 +- .../llvm.amdgcn.raw.tbuffer.load.d16.ll | 18 +- .../llvm.amdgcn.raw.tbuffer.store.d16.ll | 18 +- .../CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll | 4 +- ...vm.amdgcn.struct.buffer.load.format.d16.ll | 10 +- ...m.amdgcn.struct.buffer.store.format.d16.ll | 14 +- .../llvm.amdgcn.struct.tbuffer.load.d16.ll | 14 +- .../llvm.amdgcn.struct.tbuffer.store.d16.ll | 20 +-- .../AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll | 10 +- .../AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll | 14 +- .../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 4 +- llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll | 8 +- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 8 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 12 +- llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 8 +- llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 4 +- llvm/test/CodeGen/AMDGPU/load-local-i8.ll | 8 +- llvm/test/CodeGen/AMDGPU/local-atomics64.ll | 12 +- llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll | 10 +- llvm/test/CodeGen/AMDGPU/merge-stores.ll | 8 +- llvm/test/CodeGen/AMDGPU/missing-store.ll | 2 +- .../move-addr64-rsrc-dead-subreg-writes.ll | 6 +- .../CodeGen/AMDGPU/mubuf-legalize-operands.ll | 76 ++++----- .../AMDGPU/not-scalarize-volatile-load.ll | 4 +- llvm/test/CodeGen/AMDGPU/operand-folding.ll | 4 +- llvm/test/CodeGen/AMDGPU/or.ll | 20 +-- 
llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 8 +- .../AMDGPU/private-access-no-objects.ll | 24 +-- .../CodeGen/AMDGPU/private-element-size.ll | 4 +- .../AMDGPU/promote-alloca-vector-to-vector.ll | 4 +- llvm/test/CodeGen/AMDGPU/read_register.ll | 4 +- llvm/test/CodeGen/AMDGPU/readcyclecounter.ll | 4 +- llvm/test/CodeGen/AMDGPU/rel32.ll | 2 +- llvm/test/CodeGen/AMDGPU/returnaddress.ll | 4 +- llvm/test/CodeGen/AMDGPU/s_movk_i32.ll | 26 +-- llvm/test/CodeGen/AMDGPU/salu-to-valu.ll | 4 +- llvm/test/CodeGen/AMDGPU/select-opt.ll | 8 +- llvm/test/CodeGen/AMDGPU/select-vectors.ll | 2 +- llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 62 +++---- .../CodeGen/AMDGPU/sgpr-copy-local-cse.ll | 2 +- llvm/test/CodeGen/AMDGPU/sgpr-copy.ll | 4 +- .../test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll | 50 +++--- llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll | 24 +-- .../test/CodeGen/AMDGPU/shl-add-to-add-shl.ll | 4 +- llvm/test/CodeGen/AMDGPU/shl_add_constant.ll | 4 +- llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll | 6 +- .../test/CodeGen/AMDGPU/shl_add_ptr_global.ll | 12 +- .../AMDGPU/si-triv-disjoint-mem-access.ll | 16 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 6 +- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 4 +- llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll | 2 +- llvm/test/CodeGen/AMDGPU/sopk-compares.ll | 4 +- llvm/test/CodeGen/AMDGPU/sub.i16.ll | 4 +- llvm/test/CodeGen/AMDGPU/sub.ll | 2 +- llvm/test/CodeGen/AMDGPU/trunc.ll | 18 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 4 +- .../CodeGen/AMDGPU/unaligned-load-store.ll | 4 +- llvm/test/CodeGen/AMDGPU/uniform-cfg.ll | 2 +- .../CodeGen/AMDGPU/use-sgpr-multiple-times.ll | 32 ++-- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 10 +- llvm/test/CodeGen/AMDGPU/vectorize-loads.ll | 2 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 4 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 12 +- .../CodeGen/AMDGPU/widen-vselect-and-mask.ll | 4 +- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 36 ++-- llvm/test/CodeGen/AMDGPU/xor.ll | 14 +- llvm/test/CodeGen/AMDGPU/zero_extend.ll 
| 2 +- .../CodeGen/AMDGPU/zext-i64-bit-operand.ll | 8 +- .../ARM/2012-10-04-AAPCS-byval-align8.ll | 4 +- llvm/test/CodeGen/ARM/Windows/tls.ll | 56 +++---- llvm/test/CodeGen/ARM/cmpxchg-O0-be.ll | 8 +- llvm/test/CodeGen/ARM/cmpxchg-O0.ll | 18 +- .../CodeGen/ARM/constantpool-promote-ldrh.ll | 2 +- llvm/test/CodeGen/ARM/fast-isel-call.ll | 6 +- llvm/test/CodeGen/ARM/fast-isel-intrinsic.ll | 144 ++++++++-------- .../ARM/fast-isel-ldr-str-thumb-neg-index.ll | 18 +- .../CodeGen/ARM/fast-isel-ldrh-strh-arm.ll | 4 +- llvm/test/CodeGen/ARM/fast-isel-vararg.ll | 6 +- llvm/test/CodeGen/ARM/fast-isel.ll | 8 +- llvm/test/CodeGen/ARM/fp16-load-store.ll | 8 +- .../CodeGen/ARM/i64_volatile_load_store.ll | 96 +++++------ llvm/test/CodeGen/ARM/indirectbr.ll | 2 +- llvm/test/CodeGen/ARM/jump-table-tbh.ll | 2 +- llvm/test/CodeGen/ARM/ldrd.ll | 6 +- llvm/test/CodeGen/ARM/memcpy-ldm-stm.ll | 18 +- llvm/test/CodeGen/ARM/setjmp_longjmp.ll | 22 +-- llvm/test/CodeGen/ARM/stack-guard-tls.ll | 8 +- llvm/test/CodeGen/ARM/stack_guard_remat.ll | 16 +- .../CodeGen/ARM/struct_byval_arm_t1_t2.ll | 154 +++++++++--------- llvm/test/CodeGen/ARM/swiftself.ll | 2 +- llvm/test/CodeGen/ARM/thumb-big-stack.ll | 2 +- llvm/test/CodeGen/ARM/thumb_indirect_calls.ll | 2 +- llvm/test/CodeGen/ARM/vector-DAGCombine.ll | 2 +- llvm/test/CodeGen/ARM/vld3.ll | 4 +- llvm/test/CodeGen/ARM/win32-ssp.ll | 8 +- llvm/test/CodeGen/Thumb/stack_guard_remat.ll | 12 +- .../Thumb2/2011-06-07-TwoAddrEarlyClobber.ll | 2 +- llvm/test/CodeGen/Thumb2/stack_guard_remat.ll | 8 +- llvm/test/CodeGen/XCore/epilogue_prologue.ll | 4 +- llvm/test/CodeGen/XCore/scavenging.ll | 12 +- llvm/test/CodeGen/XCore/varargs.ll | 8 +- .../NVPTX/split-gep-and-gvn.ll | 36 ++-- .../NVPTX/reassociate-geps-and-slsr.ll | 8 +- 233 files changed, 1522 insertions(+), 1522 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll b/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll index 2a6bf8734d4ea..65ddcd9d8b00c 100644 --- 
a/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll @@ -51,7 +51,7 @@ declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself) ; CHECK-DAG: ldr x20, [x20] ; CHECK-DAG: mov [[CSREG:x[1-9].*]], x8 ; CHECK: bl {{_?}}thisreturn_attribute -; CHECK: str x0, {{\[}}[[CSREG]] +; CHECK: str x0, [[[CSREG]] ; CHECK: ret define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret(i8*), i8** noalias nocapture readonly swiftself) { entry: diff --git a/llvm/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll b/llvm/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll index 38ccc5788fd7d..6ced29d9757d1 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-interleaved-ld-combine.ll @@ -71,7 +71,7 @@ entry: ; AS-DAG: add [[ADD:x[0-9]+]], [[LSL]], #64 ; AS-DAG: and [[AND:x[0-9]+]], [[ADD]], #0xfffffffffffffff0 ; AS-DAG: add [[ADR:x[0-9]+]], x0, [[AND]] -; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}} +; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, [[[ADR]]] ; AS-DAG: str q[[V0]] ; AS-DAG: str q[[V1]] ; AS-DAG: str q[[V2]] @@ -133,7 +133,7 @@ entry: ; AS-DAG: add [[ADD:x[0-9]+]], x0, #4 ; AS-DAG: and [[AND:x[0-9]+]], [[LSL]], #0xfffffffffffffff0 ; AS-DAG: add [[ADR:x[0-9]+]], [[ADD]], [[AND]] -; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}} +; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, [[[ADR]]] ; AS-DAG: str q[[V0]] ; AS-DAG: str q[[V1]] ; AS-DAG: str q[[V2]] diff --git a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll index 5224eca76619f..66bd2ef5ef394 100644 --- a/llvm/test/CodeGen/AArch64/arm64-abi_align.ll +++ b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll @@ -512,7 
+512,7 @@ entry: ; FAST: ldr x7, [{{x[0-9]+}}] ; FAST: mov x[[R0:[0-9]+]], sp ; FAST: mov w[[R1:[0-9]+]], #8 -; FAST: str w[[R1]], {{\[}}x[[R0]]{{\]}} +; FAST: str w[[R1]], [x[[R0]]] %0 = load i64, i64* bitcast (%struct.s41* @g41 to i64*), align 16 %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i64 %0, i32 8) #5 diff --git a/llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll b/llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll index 1ea61a8ac71bd..83ac21c165f97 100644 --- a/llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll +++ b/llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll @@ -10,8 +10,8 @@ entry: ; CHECK: test ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp] -; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32] -; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [[[BASE:x[0-9]+]], #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [[[BASE]]] %retval = alloca <16 x float>, align 16 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16 store <16 x float> %0, <16 x float>* %retval diff --git a/llvm/test/CodeGen/AArch64/arm64-code-model-large-darwin.ll b/llvm/test/CodeGen/AArch64/arm64-code-model-large-darwin.ll index 8ad93af8e72c2..8f6c8954e6fed 100644 --- a/llvm/test/CodeGen/AArch64/arm64-code-model-large-darwin.ll +++ b/llvm/test/CodeGen/AArch64/arm64-code-model-large-darwin.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=arm64-apple-darwin19 -code-model=large -O2 -o - %s | FileCheck %s ; CHECK: adrp [[REG1:x[0-9]+]], _bar@GOTPAGE -; CHECK: ldr [[REG1]], {{\[}}[[REG1]], _bar@GOTPAGEOFF] +; CHECK: ldr [[REG1]], [[[REG1]], _bar@GOTPAGEOFF] ; CHECK: blr [[REG1]] declare void @bar() diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll index 649a9d5a9709a..0b2acdf102c2c 100644 --- 
a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -61,7 +61,7 @@ if.end4: ; preds = %if.then2, %if.then, ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -77,7 +77,7 @@ define i32 @getC() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldrsw x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -95,7 +95,7 @@ define i64 @getSExtC() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], [x[[LDRGOT_REG]]] ; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 ; CHECK-NEXT: str [[ADD]], [x[[LDRGOT_REG]]] @@ -115,7 +115,7 @@ entry: ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: str w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -139,7 +139,7 @@ entry: ; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: add 
[[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr w0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ldr w0, [[[ADDGOT_REG]], #16] ; CHECK-NEXT: ret ; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define i32 @getInternalCPlus4() { @@ -156,7 +156,7 @@ define i32 @getInternalCPlus4() { ; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsw x0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ldrsw x0, [[[ADDGOT_REG]], #16] ; CHECK-NEXT: ret ; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExtInternalCPlus4() { @@ -174,9 +174,9 @@ define i64 @getSExtInternalCPlus4() { ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE ; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF -; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], [[[ADDGOT_REG]], #16] ; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 -; CHECK-NEXT: str [[ADD]], {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: str [[ADD]], [[[ADDGOT_REG]], #16] ; CHECK-NEXT: ret ; CHECK: .loh AdrpAdd [[ADRP_LABEL]], [[ADDGOT_LABEL]] define void @getSeveralInternalCPlus4(i32 %t) { @@ -196,7 +196,7 @@ entry: ; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str w0, {{\[}}[[ADDGOT_REG]], #16] +; CHECK-NEXT: str w0, [[[ADDGOT_REG]], #16] ; CHECK-NEXT: ret ; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define void @setInternalCPlus4(i32 %t) { @@ -212,7 +212,7 @@ entry: ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr w0, 
{{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ldr w0, [[[ADRP_REG]], _InternalC@PAGEOFF] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]] define i32 @getInternalC() { @@ -226,7 +226,7 @@ define i32 @getInternalC() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsw x0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ldrsw x0, [[[ADRP_REG]], _InternalC@PAGEOFF] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]] define i64 @getSExtInternalC() { @@ -241,9 +241,9 @@ define i64 @getSExtInternalC() { ; there is not much we can do about it. ; CHECK-LABEL: _getSeveralInternalC ; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE -; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], [[[ADRP_REG]], _InternalC@PAGEOFF] ; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 -; CHECK-NEXT: str [[ADD]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: str [[ADD]], [[[ADRP_REG]], _InternalC@PAGEOFF] ; CHECK-NEXT: ret define void @getSeveralInternalC(i32 %t) { entry: @@ -259,7 +259,7 @@ entry: ; Indeed, strs do not support litterals. 
; CHECK-LABEL: _setInternalC ; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE -; CHECK-NEXT: str w0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF] +; CHECK-NEXT: str w0, [[[ADRP_REG]], _InternalC@PAGEOFF] ; CHECK-NEXT: ret define void @setInternalC(i32 %t) { entry: @@ -277,7 +277,7 @@ entry: ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: ldrb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] @@ -290,7 +290,7 @@ define i8 @getD() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: strb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -306,7 +306,7 @@ define void @setD(i8 %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldrsb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -323,7 +323,7 @@ define i32 @getSExtD() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldrsb x0, 
[x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -342,7 +342,7 @@ define i64 @getSExt64D() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: ldrh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] @@ -357,7 +357,7 @@ define i16 @getE() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldrsh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -372,7 +372,7 @@ define i32 @getSExtE() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: strh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -388,7 +388,7 @@ define void @setE(i16 %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldrsh x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -407,7 +407,7 @@ define i64 @getSExt64E() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; 
CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _F@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -421,7 +421,7 @@ define i64 @getF() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _F@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: str x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -439,7 +439,7 @@ define void @setF(i64 %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _G@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr s0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -453,7 +453,7 @@ define float @getG() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _G@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: str s0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -471,7 +471,7 @@ define void @setG(float %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _H@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr h0, [x[[LDRGOT_REG]]] 
; CHECK-NEXT: ret @@ -485,7 +485,7 @@ define half @getH() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _H@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: str h0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -503,7 +503,7 @@ define void @setH(half %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _I@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -517,7 +517,7 @@ define double @getI() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _I@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: str d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -535,7 +535,7 @@ define void @setI(double %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _J@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -549,7 +549,7 @@ define <2 x i32> @getJ() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr 
{{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _J@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: str d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -567,7 +567,7 @@ define void @setJ(<2 x i32> %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _K@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr q0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -581,7 +581,7 @@ define <4 x i32> @getK() { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _K@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: str q0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -599,7 +599,7 @@ define void @setK(<4 x i32> %t) { ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr b0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret @@ -614,7 +614,7 @@ define <1 x i8> @getL() { ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE ; CHECK-NEXT: ; kill ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L@GOTPAGEOFF] ; Ultimately we should generate str b0, but right now, we match the vector ; variant which 
does not allow to fold the immediate into the store. ; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]] @@ -637,7 +637,7 @@ define void @setL(<1 x i8> %t) { ; CHECK: [[LOH_LABEL0:Lloh[0-9]+]]: ; CHECK: adrp [[ADRP_REG:x[0-9]+]], [[CONSTPOOL:lCPI[0-9]+_[0-9]+]]@PAGE ; CHECK: [[LOH_LABEL1:Lloh[0-9]+]]: -; CHECK: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] +; CHECK: ldr q[[IDX:[0-9]+]], [[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF] ; The tuple comes from the next instruction. ; CHECK: ext.16b v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, #1 ; CHECK: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll index 09483ea09bd3f..72d94ae13b0fc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: @test ; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 -; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]] -; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], [[[BASE]]] +; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], [[[BASE]], #64] ; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] ; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] ; CHECK: ret @@ -39,8 +39,8 @@ entry: ; CHECK-LABEL: @test_int ; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 -; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]] -; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], [[[BASE]]] +; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], [[[BASE]], #64] ; CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] ; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] ; CHECK: ret @@ -71,8 +71,8 @@ entry: ; CHECK-LABEL: @test_long ; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4 -; CHECK: ldp [[CPLX1_I:x[0-9]+]], 
[[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]] -; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128] +; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], [[[BASE]]] +; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], [[[BASE]], #128] ; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] ; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] ; CHECK: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-call.ll index 9b9eb8d29bed4..fc4e52157845c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-call.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-call.ll @@ -13,7 +13,7 @@ entry: ; CHECK: bl _call0 ; LARGE-LABEL: foo0 ; LARGE: adrp [[REG0:x[0-9]+]], _call0@GOTPAGE -; LARGE: ldr [[REG1:x[0-9]+]], {{\[}}[[REG0]], _call0@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG1:x[0-9]+]], [[[REG0]], _call0@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG1]] call void @call0() ret void diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-gv.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-gv.ll index 7c8941c8e5178..8338475399243 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-gv.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-gv.ll @@ -7,8 +7,8 @@ define void @Initrand() nounwind { entry: ; CHECK: @Initrand ; CHECK: adrp [[REG:x[0-9]+]], _seed@GOTPAGE -; CHECK: ldr [[REG2:x[0-9]+]], {{\[}}[[REG]], _seed@GOTPAGEOFF{{\]}} -; CHECK: str {{x[0-9]+}}, {{\[}}[[REG2]]{{\]}} +; CHECK: ldr [[REG2:x[0-9]+]], [[[REG]], _seed@GOTPAGEOFF] +; CHECK: str {{x[0-9]+}}, [[[REG2]]] store i64 74755, i64* @seed, align 8 ret void } @@ -17,19 +17,19 @@ define i32 @Rand() nounwind { entry: ; CHECK: @Rand ; CHECK: adrp [[REG1:x[0-9]+]], _seed@GOTPAGE -; CHECK: ldr [[REG2:x[0-9]+]], {{\[}}[[REG1]], _seed@GOTPAGEOFF{{\]}} -; CHECK: ldr [[REG5:x[0-9]+]], {{\[}}[[REG2]]{{\]}} +; CHECK: ldr [[REG2:x[0-9]+]], [[[REG1]], _seed@GOTPAGEOFF] +; CHECK: ldr [[REG5:x[0-9]+]], [[[REG2]]] ; CHECK: mov [[REG4:x[0-9]+]], #1309 ; CHECK: mul [[REG6:x[0-9]+]], 
[[REG5]], [[REG4]] ; CHECK: mov [[REG3:x[0-9]+]], #13849 ; CHECK: add [[REG7:x[0-9]+]], [[REG6]], [[REG3]] ; CHECK: and [[REG8:x[0-9]+]], [[REG7]], #0xffff ; CHECK: adrp [[REG1:x[0-9]+]], _seed@GOTPAGE -; CHECK: ldr [[REG1]], {{\[}}[[REG1]], _seed@GOTPAGEOFF{{\]}} -; CHECK: str [[REG8]], {{\[}}[[REG1]]{{\]}} +; CHECK: ldr [[REG1]], [[[REG1]], _seed@GOTPAGEOFF] +; CHECK: str [[REG8]], [[[REG1]]] ; CHECK: adrp [[REG1:x[0-9]+]], _seed@GOTPAGE -; CHECK: ldr [[REG1]], {{\[}}[[REG1]], _seed@GOTPAGEOFF{{\]}} -; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG1]]{{\]}} +; CHECK: ldr [[REG1]], [[[REG1]], _seed@GOTPAGEOFF] +; CHECK: ldr {{x[0-9]+}}, [[[REG1]]] %0 = load i64, i64* @seed, align 8 %mul = mul nsw i64 %0, 1309 %add = add nsw i64 %mul, 13849 diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll index 21269ba8fe04c..d8faf08f60d43 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll @@ -51,12 +51,12 @@ define void @t4() { ; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF] ; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE ; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF -; ARM64: ldr x10, {{\[}}[[REG2]]{{\]}} -; ARM64: str x10, {{\[}}[[REG0]]{{\]}} -; ARM64: ldr x10, {{\[}}[[REG2]], #8] -; ARM64: str x10, {{\[}}[[REG0]], #8] -; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #16] -; ARM64: strb [[REG3]], {{\[}}[[REG0]], #16] +; ARM64: ldr x10, [[[REG2]]] +; ARM64: str x10, [[[REG0]]] +; ARM64: ldr x10, [[[REG2]], #8] +; ARM64: str x10, [[[REG0]], #8] +; ARM64: ldrb [[REG3:w[0-9]+]], [[[REG2]], #16] +; ARM64: strb [[REG3]], [[[REG0]], #16] ; ARM64: ret call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 17, i1 false) ret void @@ -68,12 +68,12 @@ define void @t5() { ; ARM64: ldr 
[[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF] ; ARM64: adrp [[REG3:x[0-9]+]], _message@PAGE ; ARM64: add [[REG1:x[0-9]+]], [[REG3]], _message@PAGEOFF -; ARM64: ldr x10, {{\[}}[[REG1]]] -; ARM64: str x10, {{\[}}[[REG0]]] -; ARM64: ldr x10, {{\[}}[[REG1]], #8] -; ARM64: str x10, {{\[}}[[REG0]], #8] -; ARM64: ldrb [[REG4:w[0-9]+]], {{\[}}[[REG1]], #16] -; ARM64: strb [[REG4]], {{\[}}[[REG0]], #16] +; ARM64: ldr x10, [[[REG1]]] +; ARM64: str x10, [[[REG0]]] +; ARM64: ldr x10, [[[REG1]], #8] +; ARM64: str x10, [[[REG0]], #8] +; ARM64: ldrb [[REG4:w[0-9]+]], [[[REG1]], #16] +; ARM64: strb [[REG4]], [[[REG0]], #16] ; ARM64: ret call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 8 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 17, i1 false) ret void @@ -85,12 +85,12 @@ define void @t6() { ; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF] ; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE ; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF -; ARM64: ldr w10, {{\[}}[[REG2]]] -; ARM64: str w10, {{\[}}[[REG0]]] -; ARM64: ldr w10, {{\[}}[[REG2]], #4] -; ARM64: str w10, {{\[}}[[REG0]], #4] -; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #8] -; ARM64: strb [[REG3]], {{\[}}[[REG0]], #8] +; ARM64: ldr w10, [[[REG2]]] +; ARM64: str w10, [[[REG0]]] +; ARM64: ldr w10, [[[REG2]], #4] +; ARM64: str w10, [[[REG0]], #4] +; ARM64: ldrb [[REG3:w[0-9]+]], [[[REG2]], #8] +; ARM64: strb [[REG3]], [[[REG0]], #8] ; ARM64: ret call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 4 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 9, i1 false) ret void @@ -102,14 +102,14 @@ define void @t7() { ; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF] ; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE ; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF -; ARM64: ldrh w10, {{\[}}[[REG2]]] -; ARM64: 
strh w10, {{\[}}[[REG0]]] -; ARM64: ldrh w10, {{\[}}[[REG2]], #2] -; ARM64: strh w10, {{\[}}[[REG0]], #2] -; ARM64: ldrh w10, {{\[}}[[REG2]], #4] -; ARM64: strh w10, {{\[}}[[REG0]], #4] -; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #6] -; ARM64: strb [[REG3]], {{\[}}[[REG0]], #6] +; ARM64: ldrh w10, [[[REG2]]] +; ARM64: strh w10, [[[REG0]]] +; ARM64: ldrh w10, [[[REG2]], #2] +; ARM64: strh w10, [[[REG0]], #2] +; ARM64: ldrh w10, [[[REG2]], #4] +; ARM64: strh w10, [[[REG0]], #4] +; ARM64: ldrb [[REG3:w[0-9]+]], [[[REG2]], #6] +; ARM64: strb [[REG3]], [[[REG0]], #6] ; ARM64: ret call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 2 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 7, i1 false) ret void @@ -121,14 +121,14 @@ define void @t8() { ; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF] ; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE ; ARM64: add [[REG2:x[0-9]+]], [[REG1:x[0-9]+]], _message@PAGEOFF -; ARM64: ldrb w10, {{\[}}[[REG2]]] -; ARM64: strb w10, {{\[}}[[REG0]]] -; ARM64: ldrb w10, {{\[}}[[REG2]], #1] -; ARM64: strb w10, {{\[}}[[REG0]], #1] -; ARM64: ldrb w10, {{\[}}[[REG2]], #2] -; ARM64: strb w10, {{\[}}[[REG0]], #2] -; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #3] -; ARM64: strb [[REG3]], {{\[}}[[REG0]], #3] +; ARM64: ldrb w10, [[[REG2]]] +; ARM64: strb w10, [[[REG0]]] +; ARM64: ldrb w10, [[[REG2]], #1] +; ARM64: strb w10, [[[REG0]], #1] +; ARM64: ldrb w10, [[[REG2]], #2] +; ARM64: strb w10, [[[REG0]], #2] +; ARM64: ldrb [[REG3:w[0-9]+]], [[[REG2]], #3] +; ARM64: strb [[REG3]], [[[REG0]], #3] ; ARM64: ret call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 1 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 4, i1 false) ret void diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll 
b/llvm/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll index a09aae2962cf5..54b8418010904 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll @@ -39,13 +39,13 @@ define double @fmov_double2() { define float @cp_float() { ; CHECK-LABEL: cp_float ; CHECK: adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE -; CHECK-NEXT: ldr s0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}} +; CHECK-NEXT: ldr s0, [[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF] ret float 0x400921FB60000000 } define double @cp_double() { ; CHECK-LABEL: cp_double ; CHECK: adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE -; CHECK-NEXT: ldr d0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}} +; CHECK-NEXT: ldr d0, [[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF] ret double 0x400921FB54442D18 } diff --git a/llvm/test/CodeGen/AArch64/arm64-promote-const.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const.ll index 06c3cb974ac6b..431227a0273a2 100644 --- a/llvm/test/CodeGen/AArch64/arm64-promote-const.ll +++ b/llvm/test/CodeGen/AArch64/arm64-promote-const.ll @@ -14,8 +14,8 @@ define %struct.uint8x16x4_t @test1() { ; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], __PromotedConst@PAGE ; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], __PromotedConst@PAGEOFF ; Destination registers are defined by the ABI -; PROMOTED-NEXT: ldp q0, q1, {{\[}}[[BASEADDR]]] -; PROMOTED-NEXT: ldp q2, q3, {{\[}}[[BASEADDR]], #32] +; PROMOTED-NEXT: ldp q0, q1, [[[BASEADDR]]] +; PROMOTED-NEXT: ldp q2, q3, [[[BASEADDR]], #32] ; PROMOTED-NEXT: ret ; REGULAR-LABEL: test1: @@ -23,13 +23,13 @@ define %struct.uint8x16x4_t @test1() { ; the structure ; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE ; Destination registers are defined by the ABI -; REGULAR: ldr q0, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF] +; REGULAR: ldr q0, [[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF] ; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE -; REGULAR: ldr q1, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF] +; REGULAR: 
ldr q1, [[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF] ; REGULAR: adrp [[PAGEADDR2:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE -; REGULAR: ldr q2, {{\[}}[[PAGEADDR2]], [[CSTLABEL2]]@PAGEOFF] +; REGULAR: ldr q2, [[[PAGEADDR2]], [[CSTLABEL2]]@PAGEOFF] ; REGULAR: adrp [[PAGEADDR3:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE -; REGULAR: ldr q3, {{\[}}[[PAGEADDR3]], [[CSTLABEL3]]@PAGEOFF] +; REGULAR: ldr q3, [[[PAGEADDR3]], [[CSTLABEL3]]@PAGEOFF] ; REGULAR-NEXT: ret entry: ret %struct.uint8x16x4_t { [4 x <16 x i8>] [<16 x i8> , <16 x i8> , <16 x i8> , <16 x i8> ] } @@ -41,7 +41,7 @@ entry: ; PROMOTED-LABEL: test2: ; In stress mode, constant vector are promoted ; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1:__PromotedConst.[0-9]+]]@PAGE -; PROMOTED: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTV1]]@PAGEOFF] +; PROMOTED: ldr q[[REGNUM:[0-9]+]], [[[PAGEADDR]], [[CSTV1]]@PAGEOFF] ; Destination register is defined by ABI ; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]] ; PROMOTED-NEXT: mla.16b v0, v0, v[[REGNUM]] @@ -52,7 +52,7 @@ entry: ; The difference is that the address (and thus the space in memory) is not ; shared between constants ; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE -; REGULAR: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF] +; REGULAR: ldr q[[REGNUM:[0-9]+]], [[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF] ; Destination register is defined by ABI ; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]] ; REGULAR-NEXT: mla.16b v0, v0, v[[REGNUM]] diff --git a/llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll b/llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll index 2eedde5576441..54c6ee1ed9b91 100644 --- a/llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll +++ b/llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll @@ -27,7 +27,7 @@ define <8 x i16> @foo(<8 x i16> %a) nounwind readnone { ; CHECK: .p2align 2 ; CHECK:_foo: ; @foo ; CHECK: adrp [[BASE:x[0-9]+]], lCPI0_0@PAGE -; CHECK: ldr q[[REG:[0-9]+]], {{\[}}[[BASE]], lCPI0_0@PAGEOFF] +; CHECK: ldr 
q[[REG:[0-9]+]], [[[BASE]], lCPI0_0@PAGEOFF] ; CHECK: tbl.16b v0, { v0 }, v[[REG]] ; CHECK: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll index 6e530cb258b7b..8b4fccb2781ba 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll @@ -50,7 +50,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4 ; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %array, i64 %offset %tmp = load <2 x i64>, <2 x i64>* %arrayidx, align 16 %tmp1 = load <2 x i64>*, <2 x i64>** @globalArray64x2, align 8 @@ -64,7 +64,7 @@ entry: ; CHECK-LABEL: fct2_64x2: ; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], #80] +; CHECK: str [[DEST]], [[[BASE]], #80] %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %array, i64 3 %tmp = load <2 x i64>, <2 x i64>* %arrayidx, align 16 %tmp1 = load <2 x i64>*, <2 x i64>** @globalArray64x2, align 8 @@ -79,7 +79,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4 ; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %array, i64 %offset %tmp = load <4 x i32>, <4 x i32>* %arrayidx, align 16 %tmp1 = load <4 x i32>*, <4 x i32>** @globalArray32x4, align 8 @@ -93,7 +93,7 @@ entry: ; CHECK-LABEL: fct2_32x4: ; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], #80] +; CHECK: str [[DEST]], [[[BASE]], #80] %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %array, i64 3 %tmp = load <4 x i32>, <4 x i32>* 
%arrayidx, align 16 %tmp1 = load <4 x i32>*, <4 x i32>** @globalArray32x4, align 8 @@ -108,7 +108,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4 ; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <8 x i16>, <8 x i16>* %array, i64 %offset %tmp = load <8 x i16>, <8 x i16>* %arrayidx, align 16 %tmp1 = load <8 x i16>*, <8 x i16>** @globalArray16x8, align 8 @@ -122,7 +122,7 @@ entry: ; CHECK-LABEL: fct2_16x8: ; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], #80] +; CHECK: str [[DEST]], [[[BASE]], #80] %arrayidx = getelementptr inbounds <8 x i16>, <8 x i16>* %array, i64 3 %tmp = load <8 x i16>, <8 x i16>* %arrayidx, align 16 %tmp1 = load <8 x i16>*, <8 x i16>** @globalArray16x8, align 8 @@ -137,7 +137,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4 ; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <16 x i8>, <16 x i8>* %array, i64 %offset %tmp = load <16 x i8>, <16 x i8>* %arrayidx, align 16 %tmp1 = load <16 x i8>*, <16 x i8>** @globalArray8x16, align 8 @@ -151,7 +151,7 @@ entry: ; CHECK-LABEL: fct2_8x16: ; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], #80] +; CHECK: str [[DEST]], [[[BASE]], #80] %arrayidx = getelementptr inbounds <16 x i8>, <16 x i8>* %array, i64 3 %tmp = load <16 x i8>, <16 x i8>* %arrayidx, align 16 %tmp1 = load <16 x i8>*, <16 x i8>** @globalArray8x16, align 8 @@ -166,7 +166,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3 ; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], 
{{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <1 x i64>, <1 x i64>* %array, i64 %offset %tmp = load <1 x i64>, <1 x i64>* %arrayidx, align 8 %tmp1 = load <1 x i64>*, <1 x i64>** @globalArray64x1, align 8 @@ -180,7 +180,7 @@ entry: ; CHECK-LABEL: fct2_64x1: ; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], #40] +; CHECK: str [[DEST]], [[[BASE]], #40] %arrayidx = getelementptr inbounds <1 x i64>, <1 x i64>* %array, i64 3 %tmp = load <1 x i64>, <1 x i64>* %arrayidx, align 8 %tmp1 = load <1 x i64>*, <1 x i64>** @globalArray64x1, align 8 @@ -195,7 +195,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3 ; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <2 x i32>, <2 x i32>* %array, i64 %offset %tmp = load <2 x i32>, <2 x i32>* %arrayidx, align 8 %tmp1 = load <2 x i32>*, <2 x i32>** @globalArray32x2, align 8 @@ -209,7 +209,7 @@ entry: ; CHECK-LABEL: fct2_32x2: ; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], #40] +; CHECK: str [[DEST]], [[[BASE]], #40] %arrayidx = getelementptr inbounds <2 x i32>, <2 x i32>* %array, i64 3 %tmp = load <2 x i32>, <2 x i32>* %arrayidx, align 8 %tmp1 = load <2 x i32>*, <2 x i32>** @globalArray32x2, align 8 @@ -224,7 +224,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3 ; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <4 x i16>, <4 x i16>* %array, i64 %offset %tmp = load <4 x i16>, <4 x i16>* %arrayidx, align 8 %tmp1 = load <4 x i16>*, <4 x i16>** @globalArray16x4, 
align 8 @@ -238,7 +238,7 @@ entry: ; CHECK-LABEL: fct2_16x4: ; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], #40] +; CHECK: str [[DEST]], [[[BASE]], #40] %arrayidx = getelementptr inbounds <4 x i16>, <4 x i16>* %array, i64 3 %tmp = load <4 x i16>, <4 x i16>* %arrayidx, align 8 %tmp1 = load <4 x i16>*, <4 x i16>** @globalArray16x4, align 8 @@ -253,7 +253,7 @@ entry: ; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3 ; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]] ; CHECK: ldr [[BASE:x[0-9]+]], -; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]] +; CHECK: str [[DEST]], [[[BASE]], [[SHIFTEDOFFSET]]] %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8>* %array, i64 %offset %tmp = load <8 x i8>, <8 x i8>* %arrayidx, align 8 %tmp1 = load <8 x i8>*, <8 x i8>** @globalArray8x8, align 8 @@ -348,8 +348,8 @@ entry: define void @fct8(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct8: -; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur [[DESTREG:d[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <1 x i64>* %0 = load <1 x i64>, <1 x i64>* %q, align 8 @@ -362,8 +362,8 @@ entry: define void @fct9(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct9: -; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur [[DESTREG:d[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <2 x i32>* %0 = load <2 x i32>, <2 x i32>* %q, align 8 @@ -376,8 +376,8 @@ entry: define void @fct10(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct10: -; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur 
[[DESTREG:d[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <4 x i16>* %0 = load <4 x i16>, <4 x i16>* %q, align 8 @@ -390,8 +390,8 @@ entry: define void @fct11(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct11: -; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur [[DESTREG:d[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <8 x i8>* %0 = load <8 x i8>, <8 x i8>* %q, align 8 @@ -404,8 +404,8 @@ entry: define void @fct12(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct12: -; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur [[DESTREG:q[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <2 x i64>* %0 = load <2 x i64>, <2 x i64>* %q, align 16 @@ -418,8 +418,8 @@ entry: define void @fct13(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct13: -; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur [[DESTREG:q[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <4 x i32>* %0 = load <4 x i32>, <4 x i32>* %q, align 16 @@ -432,8 +432,8 @@ entry: define void @fct14(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct14: -; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur [[DESTREG:q[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <8 x i16>* %0 = load <8 x 
i16>, <8 x i16>* %q, align 16 @@ -446,8 +446,8 @@ entry: define void @fct15(i8* %str) nounwind ssp { entry: ; CHECK-LABEL: fct15: -; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3] -; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4] +; CHECK: ldur [[DESTREG:q[0-9]+]], [[[BASEREG:x[0-9]+]], #3] +; CHECK: stur [[DESTREG]], [[[BASEREG]], #4] %p = getelementptr inbounds i8, i8* %str, i64 3 %q = bitcast i8* %p to <16 x i8>* %0 = load <16 x i8>, <16 x i8>* %q, align 16 diff --git a/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll b/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll index ab6c3186bb12a..26470bf0c1187 100644 --- a/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll +++ b/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll @@ -35,7 +35,7 @@ define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) { ; CHECK: Precompute_Patch_Values ; CHECK: ldr [[VAL2:q[0-9]+]], [x0, #272] ; CHECK-NEXT: ldr [[VAL:x[0-9]+]], [x0, #288] -; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216] +; CHECK-NEXT: stur [[VAL2]], [sp, #216] ; CHECK-NEXT: str [[VAL]], [sp, #232] entry: %Control_Points = alloca [16 x [3 x double]], align 8 diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll index 7270b0207eccf..fc498fb07079e 100644 --- a/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll +++ b/llvm/test/CodeGen/AArch64/cmpxchg-O0.ll @@ -6,10 +6,10 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_8: ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldaxrb [[OLD:w[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldaxrb [[OLD:w[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD]], w1, uxtb ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxrb [[STATUS:w[0-9]+]], w2, {{\[}}[[ADDR]]{{\]}} +; CHECK: stlxrb [[STATUS:w[0-9]+]], w2, [[[ADDR]]] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{w[0-9]+}}, [[OLD]], w1, uxtb @@ -23,10 +23,10 @@ define { i16, i1 } 
@test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind ; CHECK-LABEL: test_cmpxchg_16: ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldaxrh [[OLD:w[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldaxrh [[OLD:w[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD]], w1, uxth ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxrh [[STATUS:w[3-9]]], w2, {{\[}}[[ADDR]]{{\]}} +; CHECK: stlxrh [[STATUS:w[3-9]]], w2, [[[ADDR]]] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 @@ -40,10 +40,10 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind ; CHECK-LABEL: test_cmpxchg_32: ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldaxr [[OLD:w[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldaxr [[OLD:w[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD]], w1 ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxr [[STATUS:w[0-9]+]], w2, {{\[}}[[ADDR]]{{\]}} +; CHECK: stlxr [[STATUS:w[0-9]+]], w2, [[[ADDR]]] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 @@ -57,10 +57,10 @@ define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind ; CHECK-LABEL: test_cmpxchg_64: ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldaxr [[OLD:x[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldaxr [[OLD:x[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD]], x1 ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxr [[STATUS:w[0-9]+]], x2, {{\[}}[[ADDR]]{{\]}} +; CHECK: stlxr [[STATUS:w[0-9]+]], x2, [[[ADDR]]] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{x[0-9]+}}, [[OLD]], x1 @@ -74,13 +74,13 @@ define { i128, i1 } @test_cmpxchg_128(i128* %addr, i128 %desired, i128 %new) nou ; CHECK-LABEL: test_cmpxchg_128: ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldaxp 
[[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD_LO]], x2 ; CHECK: cset [[CMP_TMP:w[0-9]+]], ne ; CHECK: cmp [[OLD_HI]], x3 ; CHECK: cinc [[CMP:w[0-9]+]], [[CMP_TMP]], ne ; CHECK: cbnz [[CMP]], [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxp [[STATUS:w[0-9]+]], x4, x5, {{\[}}[[ADDR]]{{\]}} +; CHECK: stlxp [[STATUS:w[0-9]+]], x4, x5, [[[ADDR]]] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: %res = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst monotonic @@ -99,13 +99,13 @@ define {i128, i1} @test_cmpxchg_128_unsplit(i128* %addr) { ; CHECK: ldp [[DESIRED_LO:x[0-9]+]], [[DESIRED_HI:x[0-9]+]], [x[[VAR128]]] ; CHECK: ldp [[NEW_LO:x[0-9]+]], [[NEW_HI:x[0-9]+]], [x[[VAR128]]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD_LO]], [[DESIRED_LO]] ; CHECK: cset [[CMP_TMP:w[0-9]+]], ne ; CHECK: cmp [[OLD_HI]], [[DESIRED_HI]] ; CHECK: cinc [[CMP:w[0-9]+]], [[CMP_TMP]], ne ; CHECK: cbnz [[CMP]], [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxp [[STATUS:w[0-9]+]], [[NEW_LO]], [[NEW_HI]], {{\[}}[[ADDR]]{{\]}} +; CHECK: stlxp [[STATUS:w[0-9]+]], [[NEW_LO]], [[NEW_HI]], [[[ADDR]]] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: diff --git a/llvm/test/CodeGen/AArch64/dllimport.ll b/llvm/test/CodeGen/AArch64/dllimport.ll index ed90c805c53b2..45de8d3be3787 100644 --- a/llvm/test/CodeGen/AArch64/dllimport.ll +++ b/llvm/test/CodeGen/AArch64/dllimport.ll @@ -37,7 +37,7 @@ define i32* @get_var_pointer() { ; CHECK-LABEL: get_var_pointer ; CHECK: adrp [[REG1:x[0-9]+]], __imp_var -; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG1]], :lo12:__imp_var] +; CHECK: ldr {{x[0-9]+}}, [[[REG1]], :lo12:__imp_var] ; CHECK: ret define i32 @call_external() { diff --git a/llvm/test/CodeGen/AArch64/fast-isel-atomic.ll b/llvm/test/CodeGen/AArch64/fast-isel-atomic.ll index 240e82805726f..7a841fd9da5ad 100644 --- 
a/llvm/test/CodeGen/AArch64/fast-isel-atomic.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-atomic.ll @@ -92,7 +92,7 @@ define void @atomic_store_release_8(i8* %p, i8 %val) #0 { ; CHECK-LABEL: atomic_store_release_8_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #1 -; CHECK-NEXT: stlrb w1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlrb w1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_release_8_off(i8* %p, i8 %val) #0 { %tmp0 = getelementptr i8, i8* %p, i32 1 @@ -112,7 +112,7 @@ define void @atomic_store_release_16(i16* %p, i16 %val) #0 { ; CHECK-LABEL: atomic_store_release_16_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #2 -; CHECK-NEXT: stlrh w1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlrh w1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_release_16_off(i16* %p, i16 %val) #0 { %tmp0 = getelementptr i16, i16* %p, i32 1 @@ -132,7 +132,7 @@ define void @atomic_store_release_32(i32* %p, i32 %val) #0 { ; CHECK-LABEL: atomic_store_release_32_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #4 -; CHECK-NEXT: stlr w1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlr w1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_release_32_off(i32* %p, i32 %val) #0 { %tmp0 = getelementptr i32, i32* %p, i32 1 @@ -152,7 +152,7 @@ define void @atomic_store_release_64(i64* %p, i64 %val) #0 { ; CHECK-LABEL: atomic_store_release_64_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #8 -; CHECK-NEXT: stlr x1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlr x1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_release_64_off(i64* %p, i64 %val) #0 { %tmp0 = getelementptr i64, i64* %p, i32 1 @@ -173,7 +173,7 @@ define void @atomic_store_seq_cst_8(i8* %p, i8 %val) #0 { ; CHECK-LABEL: atomic_store_seq_cst_8_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #1 -; CHECK-NEXT: stlrb w1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlrb w1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_seq_cst_8_off(i8* %p, i8 
%val) #0 { %tmp0 = getelementptr i8, i8* %p, i32 1 @@ -193,7 +193,7 @@ define void @atomic_store_seq_cst_16(i16* %p, i16 %val) #0 { ; CHECK-LABEL: atomic_store_seq_cst_16_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #2 -; CHECK-NEXT: stlrh w1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlrh w1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_seq_cst_16_off(i16* %p, i16 %val) #0 { %tmp0 = getelementptr i16, i16* %p, i32 1 @@ -213,7 +213,7 @@ define void @atomic_store_seq_cst_32(i32* %p, i32 %val) #0 { ; CHECK-LABEL: atomic_store_seq_cst_32_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #4 -; CHECK-NEXT: stlr w1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlr w1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_seq_cst_32_off(i32* %p, i32 %val) #0 { %tmp0 = getelementptr i32, i32* %p, i32 1 @@ -233,7 +233,7 @@ define void @atomic_store_seq_cst_64(i64* %p, i64 %val) #0 { ; CHECK-LABEL: atomic_store_seq_cst_64_off: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: add [[REG0:x[0-9]+]], x0, #8 -; CHECK-NEXT: stlr x1, {{\[}}[[REG0]]] +; CHECK-NEXT: stlr x1, [[[REG0]]] ; CHECK-NEXT: ret define void @atomic_store_seq_cst_64_off(i64* %p, i64 %val) #0 { %tmp0 = getelementptr i64, i64* %p, i32 1 diff --git a/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll b/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll index 0cafd883f6947..63b2937a8a583 100644 --- a/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll @@ -3,7 +3,7 @@ define void @test(i64 %a, i64 %b, i2* %c) { ; CHECK-LABEL: test ; CHECK: and [[REG1:w[0-9]+]], {{w[0-9]+}}, #0x3 -; CHECK-NEXT: strb [[REG1]], {{\[}}x2{{\]}} +; CHECK-NEXT: strb [[REG1]], [x2] ; CHECK-NEXT: tbz {{w[0-9]+}}, #0, %1 = trunc i64 %a to i2 %2 = trunc i64 %b to i1 diff --git a/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll b/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll index 82e3c2d4d61a8..46909e0b25918 100644 --- 
a/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-cmpxchg.ll @@ -3,11 +3,11 @@ ; CHECK-LABEL: cmpxchg_monotonic_32: ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: [[RETRY:.LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr w0, {{\[}}[[ADDR]]{{\]}} +; CHECK-NEXT: ldaxr w0, [[[ADDR]]] ; CHECK-NEXT: cmp w0, w1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w2, {{\[}}[[ADDR]]{{\]}} +; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w2, [[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp w0, w1 @@ -28,11 +28,11 @@ define i32 @cmpxchg_monotonic_32(i32* %p, i32 %cmp, i32 %new, i32* %ps) #0 { ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: ldr [[NEW:w[0-9]+]], [x2] ; CHECK-NEXT: [[RETRY:.LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr w0, {{\[}}[[ADDR]]{{\]}} +; CHECK-NEXT: ldaxr w0, [[[ADDR]]] ; CHECK-NEXT: cmp w0, w1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], {{\[}}[[ADDR]]{{\]}} +; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp w0, w1 @@ -52,11 +52,11 @@ define i32 @cmpxchg_acq_rel_32_load(i32* %p, i32 %cmp, i32* %pnew, i32* %ps) #0 ; CHECK-LABEL: cmpxchg_seq_cst_64: ; CHECK: mov [[ADDR:x[0-9]+]], x0 ; CHECK: [[RETRY:.LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr x0, {{\[}}[[ADDR]]{{\]}} +; CHECK-NEXT: ldaxr x0, [[[ADDR]]] ; CHECK-NEXT: cmp x0, x1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: stlxr [[STATUS]], x2, {{\[}}[[ADDR]]{{\]}} +; CHECK-NEXT: stlxr [[STATUS]], x2, [[[ADDR]]] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp x0, x1 diff --git a/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll b/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll index 34d7983ff5fac..16bab1565e7fa 100644 --- a/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll 
+++ b/llvm/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll @@ -6,7 +6,7 @@ define float @frem_f32(float %a, float %b) { ; SMALL: bl _fmodf ; LARGE-LABEL: frem_f32 ; LARGE: adrp [[REG:x[0-9]+]], _fmodf@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _fmodf@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _fmodf@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = frem float %a, %b ret float %1 @@ -17,7 +17,7 @@ define double @frem_f64(double %a, double %b) { ; SMALL: bl _fmod ; LARGE-LABEL: frem_f64 ; LARGE: adrp [[REG:x[0-9]+]], _fmod@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _fmod@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _fmod@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = frem double %a, %b ret double %1 @@ -28,7 +28,7 @@ define float @sin_f32(float %a) { ; SMALL: bl _sinf ; LARGE-LABEL: sin_f32 ; LARGE: adrp [[REG:x[0-9]+]], _sinf@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _sinf@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _sinf@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = call float @llvm.sin.f32(float %a) ret float %1 @@ -39,7 +39,7 @@ define double @sin_f64(double %a) { ; SMALL: bl _sin ; LARGE-LABEL: sin_f64 ; LARGE: adrp [[REG:x[0-9]+]], _sin@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _sin@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _sin@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = call double @llvm.sin.f64(double %a) ret double %1 @@ -50,7 +50,7 @@ define float @cos_f32(float %a) { ; SMALL: bl _cosf ; LARGE-LABEL: cos_f32 ; LARGE: adrp [[REG:x[0-9]+]], _cosf@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _cosf@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _cosf@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = call float @llvm.cos.f32(float %a) ret float %1 @@ -61,7 +61,7 @@ define double @cos_f64(double %a) { ; SMALL: bl _cos ; LARGE-LABEL: cos_f64 ; LARGE: adrp [[REG:x[0-9]+]], _cos@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _cos@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _cos@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = call double 
@llvm.cos.f64(double %a) ret double %1 @@ -72,7 +72,7 @@ define float @pow_f32(float %a, float %b) { ; SMALL: bl _powf ; LARGE-LABEL: pow_f32 ; LARGE: adrp [[REG:x[0-9]+]], _powf@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _powf@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _powf@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = call float @llvm.pow.f32(float %a, float %b) ret float %1 @@ -83,7 +83,7 @@ define double @pow_f64(double %a, double %b) { ; SMALL: bl _pow ; LARGE-LABEL: pow_f64 ; LARGE: adrp [[REG:x[0-9]+]], _pow@GOTPAGE -; LARGE: ldr [[REG]], {{\[}}[[REG]], _pow@GOTPAGEOFF{{\]}} +; LARGE: ldr [[REG]], [[[REG]], _pow@GOTPAGEOFF] ; LARGE-NEXT: blr [[REG]] %1 = call double @llvm.pow.f64(double %a, double %b) ret double %1 diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addr-tune.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addr-tune.ll index 6161475531a66..db28719ddfbfe 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-addr-tune.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-addr-tune.ll @@ -15,9 +15,9 @@ define dso_local void @ldst_double() { ; CHECK-LABEL: ldst_double: ; CHECK: adrp [[RD:x[0-9]+]], var_double -; CHECK-NEXT: ldr {{d[0-9]+}}, {{\[}}[[RD]], {{#?}}:lo12:var_double{{\]}} +; CHECK-NEXT: ldr {{d[0-9]+}}, [[[RD]], {{#?}}:lo12:var_double] ; CHECK: adrp [[RQ:x[0-9]+]], var_double2 -; CHECK-NEXT: str {{q[0-9]+}}, {{\[}}[[RQ]], {{#?}}:lo12:var_double2{{\]}} +; CHECK-NEXT: str {{q[0-9]+}}, [[[RQ]], {{#?}}:lo12:var_double2] } define dso_local void @ldst_double_tune_a53() #0 { @@ -31,10 +31,10 @@ define dso_local void @ldst_double_tune_a53() #0 { ; CHECK-LABEL: ldst_double_tune_a53: ; CHECK: adrp [[RD:x[0-9]+]], var_double -; CHECK-NEXT: ldr {{d[0-9]+}}, {{\[}}[[RD]], {{#?}}:lo12:var_double{{\]}} +; CHECK-NEXT: ldr {{d[0-9]+}}, [[[RD]], {{#?}}:lo12:var_double] ; CHECK-NEXT: adrp [[RQ:x[0-9]+]], var_double2 ; CHECK: fcvt -; CHECK: str {{q[0-9]+}}, {{\[}}[[RQ]], {{#?}}:lo12:var_double2{{\]}} +; CHECK: str {{q[0-9]+}}, [[[RQ]], 
{{#?}}:lo12:var_double2] } attributes #0 = { "tune-cpu"="cortex-a53" } diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addr.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addr.ll index 489cc849b9081..158a64acf5df2 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-addr.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-addr.ll @@ -26,9 +26,9 @@ define dso_local void @ldst_8bit() { ; CHECK-LABEL: ldst_8bit: ; CHECK: adrp [[RB:x[0-9]+]], var_8bit -; CHECK-NEXT: ldrb {{w[0-9]+}}, {{\[}}[[RB]], {{#?}}:lo12:var_8bit{{\]}} +; CHECK-NEXT: ldrb {{w[0-9]+}}, [[[RB]], {{#?}}:lo12:var_8bit] ; CHECK: adrp [[RH:x[0-9]+]], var_16bit -; CHECK-NEXT: strh {{w[0-9]+}}, {{\[}}[[RH]], {{#?}}:lo12:var_16bit{{\]}} +; CHECK-NEXT: strh {{w[0-9]+}}, [[[RH]], {{#?}}:lo12:var_16bit] } define dso_local void @ldst_16bit() { @@ -41,9 +41,9 @@ define dso_local void @ldst_16bit() { ; CHECK-LABEL: ldst_16bit: ; CHECK: adrp [[RH:x[0-9]+]], var_16bit -; CHECK-NEXT: ldrh {{w[0-9]+}}, {{\[}}[[RH]], {{#?}}:lo12:var_16bit{{\]}} +; CHECK-NEXT: ldrh {{w[0-9]+}}, [[[RH]], {{#?}}:lo12:var_16bit] ; CHECK: adrp [[RW:x[0-9]+]], var_32bit -; CHECK-NEXT: str {{w[0-9]+}}, {{\[}}[[RW]], {{#?}}:lo12:var_32bit{{\]}} +; CHECK-NEXT: str {{w[0-9]+}}, [[[RW]], {{#?}}:lo12:var_32bit] } define dso_local void @ldst_32bit() { @@ -55,9 +55,9 @@ define dso_local void @ldst_32bit() { ; CHECK-LABEL: ldst_32bit: ; CHECK: adrp [[RW:x[0-9]+]], var_32bit -; CHECK-NEXT: ldr {{w[0-9]+}}, {{\[}}[[RW]], {{#?}}:lo12:var_32bit{{\]}} +; CHECK-NEXT: ldr {{w[0-9]+}}, [[[RW]], {{#?}}:lo12:var_32bit] ; CHECK: adrp [[RL:x[0-9]+]], var_64bit -; CHECK-NEXT: str {{x[0-9]+}}, {{\[}}[[RL]], {{#?}}:lo12:var_64bit{{\]}} +; CHECK-NEXT: str {{x[0-9]+}}, [[[RL]], {{#?}}:lo12:var_64bit] } define dso_local void @ldst_64bit() { @@ -69,7 +69,7 @@ define dso_local void @ldst_64bit() { ; CHECK-LABEL: ldst_64bit: ; CHECK: adrp [[RL:x[0-9]+]], var_64bit -; CHECK-NEXT: ldr {{x[0-9]+}}, {{\[}}[[RL]], {{#?}}:lo12:var_64bit{{\]}} +; CHECK-NEXT: ldr 
{{x[0-9]+}}, [[[RL]], {{#?}}:lo12:var_64bit] ; CHECK: adrp [[RQ:x[0-9]+]], var_128bit ; CHECK-NEXT: add {{x[0-9]+}}, [[RQ]], {{#?}}:lo12:var_128bit } @@ -82,9 +82,9 @@ define dso_local void @ldst_half() { ; CHECK-LABEL: ldst_half: ; CHECK: adrp [[RH:x[0-9]+]], var_half -; CHECK-NEXT: ldr {{h[0-9]+}}, {{\[}}[[RH]], {{#?}}:lo12:var_half{{\]}} +; CHECK-NEXT: ldr {{h[0-9]+}}, [[[RH]], {{#?}}:lo12:var_half] ; CHECK: adrp [[RF:x[0-9]+]], var_float -; CHECK-NEXT: str {{s[0-9]+}}, {{\[}}[[RF]], {{#?}}:lo12:var_float{{\]}} +; CHECK-NEXT: str {{s[0-9]+}}, [[[RF]], {{#?}}:lo12:var_float] } define dso_local void @ldst_float() { @@ -95,9 +95,9 @@ define dso_local void @ldst_float() { ; CHECK-LABEL: ldst_float: ; CHECK: adrp [[RF:x[0-9]+]], var_float -; CHECK-NEXT: ldr {{s[0-9]+}}, {{\[}}[[RF]], {{#?}}:lo12:var_float{{\]}} +; CHECK-NEXT: ldr {{s[0-9]+}}, [[[RF]], {{#?}}:lo12:var_float] ; CHECK: adrp [[RD:x[0-9]+]], var_double -; CHECK-NEXT: str {{d[0-9]+}}, {{\[}}[[RD]], {{#?}}:lo12:var_double{{\]}} +; CHECK-NEXT: str {{d[0-9]+}}, [[[RD]], {{#?}}:lo12:var_double] } define dso_local void @ldst_double() { @@ -111,7 +111,7 @@ define dso_local void @ldst_double() { ; CHECK-LABEL: ldst_double: ; CHECK: adrp [[RD:x[0-9]+]], var_double -; CHECK-NEXT: ldr {{d[0-9]+}}, {{\[}}[[RD]], {{#?}}:lo12:var_double{{\]}} +; CHECK-NEXT: ldr {{d[0-9]+}}, [[[RD]], {{#?}}:lo12:var_double] ; CHECK: adrp [[RQ:x[0-9]+]], var_double2 -; CHECK-NEXT: str {{q[0-9]+}}, {{\[}}[[RQ]], {{#?}}:lo12:var_double2{{\]}} +; CHECK-NEXT: str {{q[0-9]+}}, [[[RQ]], {{#?}}:lo12:var_double2] } diff --git a/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll b/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll index 71d3d27a1b649..d56de879089f3 100644 --- a/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-reassign.ll @@ -4,5 +4,5 @@ ; frame, covering the locals. 
; CHECK-LABEL: fn: ; CHECK: adrp [[REG:x[0-9]+]], __stack_chk_guard -; CHECK-NEXT: ldr [[REG]], {{\[}}[[REG]], :lo12:__stack_chk_guard] +; CHECK-NEXT: ldr [[REG]], [[[REG]], :lo12:__stack_chk_guard] ; CHECK-NEXT: stur [[REG]], [x29, #-8] diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll index 32669e411e8cf..a6696f4c96ccd 100644 --- a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll @@ -54,7 +54,7 @@ entry: ; CHECK: addvl sp, sp, #-2 ; CHECK-DAG: addvl [[ADDR:x[0-9]+]], x29, #-1 ; CHECK-DAG: ldr [[VAL:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] -; CHECK-DAG: str [[VAL]], {{\[}}[[ADDR]]] +; CHECK-DAG: str [[VAL]], [[[ADDR]]] ; CHECK-DAG: addvl x0, x29, #-2 ; CHECK: bl ptr_fn define void @call_ptr_strong() #1 { @@ -90,7 +90,7 @@ entry: ; CHECK: addvl sp, sp, #-3 ; CHECK-DAG: addvl [[ADDR:x[0-9]+]], x29, #-1 ; CHECK-DAG: ldr [[VAL:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] -; CHECK-DAG: str [[VAL]], {{\[}}[[ADDR]]] +; CHECK-DAG: str [[VAL]], [[[ADDR]]] ; CHECK-DAG: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, [x29, #-2, mul vl] ; CHECK: bl val_fn ; CHECK: addvl x0, x29, #-3 @@ -116,7 +116,7 @@ entry: ; CHECK: addvl sp, sp, #-1 ; CHECK-NOT: __stack_chk_guard ; CHECK: addvl [[REG:x[0-9]+]], x29, #-11 -; CHECK: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, {{\[}}[[REG]], #-8, mul vl] +; CHECK: st1w { {{z[0-9]+.s}} }, {{p[0-9]+}}, [[[REG]], #-8, mul vl] define void @callee_save( %x) #0 { entry: %x.addr = alloca , align 16 @@ -132,9 +132,9 @@ entry: ; CHECK: addvl sp, sp, #-2 ; CHECK-DAG: addvl [[ADDR:x[0-9]+]], x29, #-19 ; CHECK-DAG: ldr [[VAL:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] -; CHECK-DAG: str [[VAL]], {{\[}}[[ADDR]]] +; CHECK-DAG: str [[VAL]], [[[ADDR]]] ; CHECK-DAG: addvl [[ADDR2:x[0-9]+]], x29, #-12 -; CHECK-DAG: st1w { z0.s }, p0, {{\[}}[[ADDR2]], #-8, mul vl] +; CHECK-DAG: st1w { z0.s }, p0, [[[ADDR2]], #-8, mul vl] define void @callee_save_strong( %x) #1 
{ entry: %x.addr = alloca , align 16 @@ -155,16 +155,16 @@ entry: ; Stack guard is placed below the SVE stack area ; CHECK-DAG: ldr [[STACK_GUARD:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] ; CHECK-DAG: addvl [[STACK_GUARD_POS:x[0-9]+]], x29, #-2 -; CHECK-DAG: stur [[STACK_GUARD]], {{\[}}[[STACK_GUARD_POS]], #-8] +; CHECK-DAG: stur [[STACK_GUARD]], [[[STACK_GUARD_POS]], #-8] ; char_arr is below the stack guard ; CHECK-DAG: sub [[CHAR_ARR_1:x[0-9]+]], x29, #16 ; CHECK-DAG: addvl [[CHAR_ARR_2:x[0-9]+]], [[CHAR_ARR_1]], #-2 -; CHECK-DAG: strb wzr, {{\[}}[[CHAR_ARR_2]]] +; CHECK-DAG: strb wzr, [[[CHAR_ARR_2]]] ; large1 is accessed via a virtual base register ; CHECK-DAG: add [[LARGE1:x[0-9]+]], sp, #8, lsl #12 -; CHECK-DAG: stp x0, x0, {{\[}}[[LARGE1]]] +; CHECK-DAG: stp x0, x0, [[[LARGE1]]] ; large2 is at the bottom of the stack ; CHECK-DAG: stp x0, x0, [sp] @@ -205,15 +205,15 @@ entry: ; Stack guard is placed at the top of the SVE stack area ; CHECK-DAG: ldr [[STACK_GUARD:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] ; CHECK-DAG: addvl [[STACK_GUARD_POS:x[0-9]+]], x29, #-1 -; CHECK-DAG: str [[STACK_GUARD]], {{\[}}[[STACK_GUARD_POS]]] +; CHECK-DAG: str [[STACK_GUARD]], [[[STACK_GUARD_POS]]] ; char_arr is below the SVE stack area ; CHECK-DAG: addvl [[CHAR_ARR:x[0-9]+]], x29, #-3 -; CHECK-DAG: sturb wzr, {{\[}}[[CHAR_ARR]], #-8] +; CHECK-DAG: sturb wzr, [[[CHAR_ARR]], #-8] ; large1 is accessed via a virtual base register ; CHECK-DAG: add [[LARGE1:x[0-9]+]], sp, #8, lsl #12 -; CHECK-DAG: stp x0, x0, {{\[}}[[LARGE1]], #8] +; CHECK-DAG: stp x0, x0, [[[LARGE1]], #8] ; large2 is at the bottom of the stack ; CHECK-DAG: stp x0, x0, [sp, #8] diff --git a/llvm/test/CodeGen/AArch64/stack-protector-target.ll b/llvm/test/CodeGen/AArch64/stack-protector-target.ll index 0c5905da81fca..9d420e8db3e9f 100644 --- a/llvm/test/CodeGen/AArch64/stack-protector-target.ll +++ b/llvm/test/CodeGen/AArch64/stack-protector-target.ll @@ -15,17 +15,17 @@ entry: declare void 
@_Z7CapturePi(i32*) ; ANDROID-AARCH64: mrs [[A:.*]], TPIDR_EL0 -; ANDROID-AARCH64: ldr [[B:.*]], {{\[}}[[A]], #40] +; ANDROID-AARCH64: ldr [[B:.*]], [[[A]], #40] ; ANDROID-AARCH64: str [[B]], [sp, -; ANDROID-AARCH64: ldr [[C:.*]], {{\[}}[[A]], #40] +; ANDROID-AARCH64: ldr [[C:.*]], [[[A]], #40] ; ANDROID-AARCH64: ldr [[D:.*]], [sp, ; ANDROID-AARCH64: cmp [[C]], [[D]] ; FUCHSIA-AARCH64-USER: mrs [[A:.*]], TPIDR_EL0 ; FUCHSIA-AARCH64-KERNEL: mrs [[A:.*]], TPIDR_EL1 -; FUCHSIA-AARCH64-COMMON: ldur [[B:.*]], {{\[}}[[A]], #-16] +; FUCHSIA-AARCH64-COMMON: ldur [[B:.*]], [[[A]], #-16] ; FUCHSIA-AARCH64-COMMON: str [[B]], [sp, -; FUCHSIA-AARCH64-COMMON: ldur [[C:.*]], {{\[}}[[A]], #-16] +; FUCHSIA-AARCH64-COMMON: ldur [[C:.*]], [[[A]], #-16] ; FUCHSIA-AARCH64-COMMON: ldr [[D:.*]], [sp, ; FUCHSIA-AARCH64-COMMON: cmp [[C]], [[D]] diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll index ed6ccc8b49413..d791ba3431197 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll @@ -212,7 +212,7 @@ entry: ; ALWAYS-DAG: ldg [[PA:x.*]], [x{{.*}}] ; ALWAYS-DAG: ldrb [[B:w.*]], [sp] -; ALWAYS-DAG: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} +; ALWAYS-DAG: ldrb [[A:w.*]], [[[PA]]] ; COMMON: ret diff --git a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll index 8a6ac9ba3e70c..3206cc7d98212 100644 --- a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll +++ b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll @@ -12,24 +12,24 @@ ; DARWIN: foo2 ; DARWIN: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE -; DARWIN: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}} -; DARWIN: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}} +; DARWIN: ldr [[R1:x[0-9]+]], [[[R0]], ___stack_chk_guard@GOTPAGEOFF] +; DARWIN: ldr {{x[0-9]+}}, [[[R1]]] ; PIC-LINUX: foo2 ; PIC-LINUX: adrp [[R0:x[0-9]+]], 
:got:__stack_chk_guard -; PIC-LINUX: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], :got_lo12:__stack_chk_guard{{\]}} -; PIC-LINUX: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}} +; PIC-LINUX: ldr [[R1:x[0-9]+]], [[[R0]], :got_lo12:__stack_chk_guard] +; PIC-LINUX: ldr {{x[0-9]+}}, [[[R1]]] ; STATIC-LARGE: foo2 ; STATIC-LARGE: movz [[R0:x[0-9]+]], #:abs_g0_nc:__stack_chk_guard ; STATIC-LARGE: movk [[R0]], #:abs_g1_nc:__stack_chk_guard ; STATIC-LARGE: movk [[R0]], #:abs_g2_nc:__stack_chk_guard ; STATIC-LARGE: movk [[R0]], #:abs_g3:__stack_chk_guard -; STATIC-LARGE: ldr {{x[0-9]+}}, {{\[}}[[R0]]{{\]}} +; STATIC-LARGE: ldr {{x[0-9]+}}, [[[R0]]] ; STATIC-SMALL: foo2 ; STATIC-SMALL: adrp [[R0:x[0-9]+]], __stack_chk_guard -; STATIC-SMALL: ldr {{x[0-9]+}}, {{\[}}[[R0]], :lo12:__stack_chk_guard{{\]}} +; STATIC-SMALL: ldr {{x[0-9]+}}, [[[R0]], :lo12:__stack_chk_guard] ; FALLBACK-NOT: remark:{{.*}}llvm.lifetime.end ; FALLBACK-NOT: remark:{{.*}}llvm.lifetime.start diff --git a/llvm/test/CodeGen/AArch64/stgp.ll b/llvm/test/CodeGen/AArch64/stgp.ll index d82b45134f552..efccd3a041d41 100644 --- a/llvm/test/CodeGen/AArch64/stgp.ll +++ b/llvm/test/CodeGen/AArch64/stgp.ll @@ -13,7 +13,7 @@ define void @stgp1004(i64 %a, i64 %b, i8* %p) { entry: ; CHECK-LABEL: stgp1004: ; CHECK: add [[R:x[0-9]+]], x2, #1004 -; CHECK: stgp x0, x1, {{\[}}[[R]]{{\]}} +; CHECK: stgp x0, x1, [[[R]]] ; CHECK: ret %q = getelementptr i8, i8* %p, i32 1004 call void @llvm.aarch64.stgp(i8* %q, i64 %a, i64 %b) @@ -34,7 +34,7 @@ define void @stgp1024(i64 %a, i64 %b, i8* %p) { entry: ; CHECK-LABEL: stgp1024: ; CHECK: add [[R:x[0-9]+]], x2, #1024 -; CHECK: stgp x0, x1, {{\[}}[[R]]{{\]}} +; CHECK: stgp x0, x1, [[[R]]] ; CHECK: ret %q = getelementptr i8, i8* %p, i32 1024 call void @llvm.aarch64.stgp(i8* %q, i64 %a, i64 %b) diff --git a/llvm/test/CodeGen/AArch64/swiftself.ll b/llvm/test/CodeGen/AArch64/swiftself.ll index b9c8ab2172dcb..d645b0a9bf437 100644 --- a/llvm/test/CodeGen/AArch64/swiftself.ll +++ 
b/llvm/test/CodeGen/AArch64/swiftself.ll @@ -77,14 +77,14 @@ declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself) ; OPTAARCH64-DAG: ldr x20, [x20] ; OPTAARCH64-DAG: mov [[CSREG:x[1-9].*]], x8 ; OPTAARCH64: bl {{_?}}thisreturn_attribute -; OPTAARCH64: str x0, {{\[}}[[CSREG]] +; OPTAARCH64: str x0, [[[CSREG]] ; OPTAARCH64: ret ; OPTARM64_32-LABEL: swiftself_nothisreturn: ; OPTARM64_32-DAG: ldr w20, [x20] ; OPTARM64_32-DAG: mov [[CSREG:x[1-9].*]], x8 ; OPTARM64_32: bl {{_?}}thisreturn_attribute -; OPTARM64_32: str w0, {{\[}}[[CSREG]] +; OPTARM64_32: str w0, [[[CSREG]] ; OPTARM64_32: ret define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret(i8*), i8** noalias nocapture readonly swiftself) { entry: diff --git a/llvm/test/CodeGen/AArch64/tagged-globals-pic.ll b/llvm/test/CodeGen/AArch64/tagged-globals-pic.ll index 0f0df91352bd7..2fc8cf546e3b4 100644 --- a/llvm/test/CodeGen/AArch64/tagged-globals-pic.ll +++ b/llvm/test/CodeGen/AArch64/tagged-globals-pic.ll @@ -24,7 +24,7 @@ declare void @func() define i32* @global_addr() #0 { ; CHECK-PIC: global_addr: ; CHECK-PIC: adrp [[REG:x[0-9]+]], :got:global - ; CHECK-PIC: ldr x0, {{\[}}[[REG]], :got_lo12:global] + ; CHECK-PIC: ldr x0, [[[REG]], :got_lo12:global] ; CHECK-PIC: ret ret i32* @global @@ -33,20 +33,20 @@ define i32* @global_addr() #0 { define i32 @global_load() #0 { ; CHECK-SELECTIONDAGISEL: global_load: ; CHECK-SELECTIONDAGISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global - ; CHECK-SELECTIONDAGISEL: ldr w0, {{\[}}[[REG]], :lo12:global{{\]}} + ; CHECK-SELECTIONDAGISEL: ldr w0, [[[REG]], :lo12:global] ; CHECK-SELECTIONDAGISEL: ret ; CHECK-GLOBALISEL: global_load: ; CHECK-GLOBALISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global ; CHECK-GLOBALISEL: movk [[REG]], #:prel_g3:global+4294967296 ; CHECK-GLOBALISEL: add [[REG]], [[REG]], :lo12:global - ; CHECK-GLOBALISEL: ldr w0, {{\[}}[[REG]]{{\]}} + ; CHECK-GLOBALISEL: ldr w0, [[[REG]]] ; CHECK-GLOBALISEL: ret ; CHECK-PIC: global_load: ; CHECK-PIC: 
adrp [[REG:x[0-9]+]], :got:global - ; CHECK-PIC: ldr [[REG]], {{\[}}[[REG]], :got_lo12:global] - ; CHECK-PIC: ldr w0, {{\[}}[[REG]]{{\]}} + ; CHECK-PIC: ldr [[REG]], [[[REG]], :got_lo12:global] + ; CHECK-PIC: ldr w0, [[[REG]]] ; CHECK-PIC: ret %load = load i32, i32* @global @@ -56,20 +56,20 @@ define i32 @global_load() #0 { define void @global_store() #0 { ; CHECK-SELECTIONDAGISEL: global_store: ; CHECK-SELECTIONDAGISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global - ; CHECK-SELECTIONDAGISEL: str wzr, {{\[}}[[REG]], :lo12:global{{\]}} + ; CHECK-SELECTIONDAGISEL: str wzr, [[[REG]], :lo12:global] ; CHECK-SELECTIONDAGISEL: ret ; CHECK-GLOBALISEL: global_store: ; CHECK-GLOBALISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global ; CHECK-GLOBALISEL: movk [[REG]], #:prel_g3:global+4294967296 ; CHECK-GLOBALISEL: add [[REG]], [[REG]], :lo12:global - ; CHECK-GLOBALISEL: str wzr, {{\[}}[[REG]]{{\]}} + ; CHECK-GLOBALISEL: str wzr, [[[REG]]] ; CHECK-GLOBALISEL: ret ; CHECK-PIC: global_store: ; CHECK-PIC: adrp [[REG:x[0-9]+]], :got:global - ; CHECK-PIC: ldr [[REG]], {{\[}}[[REG]], :got_lo12:global] - ; CHECK-PIC: str wzr, {{\[}}[[REG]]{{\]}} + ; CHECK-PIC: ldr [[REG]], [[[REG]], :got_lo12:global] + ; CHECK-PIC: str wzr, [[[REG]]] ; CHECK-PIC: ret store i32 0, i32* @global @@ -79,7 +79,7 @@ define void @global_store() #0 { define void ()* @func_addr() #0 { ; CHECK-PIC: func_addr: ; CHECK-PIC: adrp [[REG:x[0-9]+]], :got:func - ; CHECK-PIC: ldr x0, {{\[}}[[REG]], :got_lo12:func] + ; CHECK-PIC: ldr x0, [[[REG]], :got_lo12:func] ; CHECK-PIC: ret ret void ()* @func diff --git a/llvm/test/CodeGen/AArch64/tagged-globals-static.ll b/llvm/test/CodeGen/AArch64/tagged-globals-static.ll index ed5597c0c3cad..4f2719ee7543e 100644 --- a/llvm/test/CodeGen/AArch64/tagged-globals-static.ll +++ b/llvm/test/CodeGen/AArch64/tagged-globals-static.ll @@ -26,14 +26,14 @@ define i32* @global_addr() #0 { define i32 @global_load() #0 { ; CHECK-SELECTIONDAGISEL: global_load: ; CHECK-SELECTIONDAGISEL: adrp 
[[REG:x[0-9]+]], :pg_hi21_nc:global - ; CHECK-SELECTIONDAGISEL: ldr w0, {{\[}}[[REG]], :lo12:global{{\]}} + ; CHECK-SELECTIONDAGISEL: ldr w0, [[[REG]], :lo12:global] ; CHECK-SELECTIONDAGISEL: ret ; CHECK-GLOBALISEL: global_load: ; CHECK-GLOBALISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global ; CHECK-GLOBALISEL: movk [[REG]], #:prel_g3:global+4294967296 ; CHECK-GLOBALISEL: add [[REG]], [[REG]], :lo12:global - ; CHECK-GLOBALISEL: ldr w0, {{\[}}[[REG]]{{\]}} + ; CHECK-GLOBALISEL: ldr w0, [[[REG]]] ; CHECK-GLOBALISEL: ret %load = load i32, i32* @global @@ -43,14 +43,14 @@ define i32 @global_load() #0 { define void @global_store() #0 { ; CHECK-SELECTIONDAGISEL: global_store: ; CHECK-SELECTIONDAGISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global - ; CHECK-SELECTIONDAGISEL: str wzr, {{\[}}[[REG]], :lo12:global{{\]}} + ; CHECK-SELECTIONDAGISEL: str wzr, [[[REG]], :lo12:global] ; CHECK-SELECTIONDAGISEL: ret ; CHECK-GLOBALISEL: global_store: ; CHECK-GLOBALISEL: adrp [[REG:x[0-9]+]], :pg_hi21_nc:global ; CHECK-GLOBALISEL: movk [[REG]], #:prel_g3:global+4294967296 ; CHECK-GLOBALISEL: add [[REG]], [[REG]], :lo12:global - ; CHECK-GLOBALISEL: str wzr, {{\[}}[[REG]]{{\]}} + ; CHECK-GLOBALISEL: str wzr, [[[REG]]] ; CHECK-GLOBALISEL: ret store i32 0, i32* @global diff --git a/llvm/test/CodeGen/AArch64/win-tls.ll b/llvm/test/CodeGen/AArch64/win-tls.ll index 99a3760137a8f..cec39a04e29a8 100644 --- a/llvm/test/CodeGen/AArch64/win-tls.ll +++ b/llvm/test/CodeGen/AArch64/win-tls.ll @@ -31,34 +31,34 @@ define i64 @getVar64() { ; CHECK-LABEL: getVar ; CHECK: adrp [[TLS_INDEX_ADDR:x[0-9]+]], _tls_index ; CHECK: ldr [[TLS_POINTER:x[0-9]+]], [x18, #88] -; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index] +; CHECK: ldr w[[TLS_INDEX:[0-9]+]], [[[TLS_INDEX_ADDR]], :lo12:_tls_index] -; CHECK: ldr [[TLS:x[0-9]+]], {{\[}}[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3] +; CHECK: ldr [[TLS:x[0-9]+]], [[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3] ; CHECK: add [[TLS]], [[TLS]], 
:secrel_hi12:tlsVar -; CHECK: ldr w0, {{\[}}[[TLS]], :secrel_lo12:tlsVar{{\]}} +; CHECK: ldr w0, [[[TLS]], :secrel_lo12:tlsVar] ; CHECK-LABEL: getPtr ; CHECK: adrp [[TLS_INDEX_ADDR:x[0-9]+]], _tls_index ; CHECK: ldr [[TLS_POINTER:x[0-9]+]], [x18, #88] -; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index] +; CHECK: ldr w[[TLS_INDEX:[0-9]+]], [[[TLS_INDEX_ADDR]], :lo12:_tls_index] -; CHECK: ldr [[TLS:x[0-9]+]], {{\[}}[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3] +; CHECK: ldr [[TLS:x[0-9]+]], [[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3] ; CHECK: add [[TLS]], [[TLS]], :secrel_hi12:tlsVar ; CHECK: add x0, [[TLS]], :secrel_lo12:tlsVar ; CHECK-LABEL: setVar ; CHECK: adrp [[TLS_INDEX_ADDR:x[0-9]+]], _tls_index ; CHECK: ldr [[TLS_POINTER:x[0-9]+]], [x18, #88] -; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index] +; CHECK: ldr w[[TLS_INDEX:[0-9]+]], [[[TLS_INDEX_ADDR]], :lo12:_tls_index] -; CHECK: ldr [[TLS:x[0-9]+]], {{\[}}[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3] +; CHECK: ldr [[TLS:x[0-9]+]], [[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3] ; CHECK: add [[TLS]], [[TLS]], :secrel_hi12:tlsVar -; CHECK: str w0, {{\[}}[[TLS]], :secrel_lo12:tlsVar{{\]}} +; CHECK: str w0, [[[TLS]], :secrel_lo12:tlsVar] ; CHECK-LABEL: getVar8 ; CHECK: add [[TLS:x[0-9]+]], [[TLS]], :secrel_hi12:tlsVar8 -; CHECK: ldrb w0, {{\[}}[[TLS]], :secrel_lo12:tlsVar8{{\]}} +; CHECK: ldrb w0, [[[TLS]], :secrel_lo12:tlsVar8] ; CHECK-LABEL: getVar64 ; CHECK: add [[TLS:x[0-9]+]], [[TLS]], :secrel_hi12:tlsVar64 -; CHECK: ldr x0, {{\[}}[[TLS]], :secrel_lo12:tlsVar64{{\]}} +; CHECK: ldr x0, [[[TLS]], :secrel_lo12:tlsVar64] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll index 828237605063d..0121d27138a72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll @@ -8,7 +8,7 @@ declare i64 
@llvm.amdgcn.dispatch.id() #1 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll index 370d55380b49b..7cacdefeb3a10 100644 --- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -26,7 +26,7 @@ bb: ; GCN-LABEL: {{^}}test_load1_mfma_store1: ; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-NOT: v_accvgpr_read -; GCN: v_mfma_f32_32x32x1f32 a{{\[}}[[N:[0-9]+]]: +; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]: ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 @@ -194,7 +194,7 @@ bb: ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]: +; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]: ; GCN: v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}} ; GCN: global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc ; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, @@ -217,7 +217,7 @@ bb: ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store: ; GCN: global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc ; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]: +; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]: ; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}} ; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}} ; GCN: global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc @@ -244,7 +244,7 @@ bb: ; GCN-LABEL: 
{{^}}test_load_mfma_ds2_store: ; GCN-DAG: ds_read_b128 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}} ; GCN-NOT: v_accvgpr_write -; GCN-DAG: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]] +; GCN-DAG: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]] ; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-NOT: v_accvgpr_read ; GCN: ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128 diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll index 619cfb81e1e0d..e3b239d910806 100644 --- a/llvm/test/CodeGen/AMDGPU/add.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll @@ -87,7 +87,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] -; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} +; VI: buffer_store_dwordx2 v[[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid @@ -128,7 +128,7 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; VI-NEXT: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll 
b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 3de1aa1dd656d..5b35f592dd272 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -173,7 +173,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace( ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] ; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] ; GFX9PLUS-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] -; GFX9PLUS: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} +; GFX9PLUS: buffer_store_dwordx2 v[[[ELT0]]:[[ELT1]]] ; VI: flat_load_dword v[[A:[0-9]+]] ; VI: flat_load_dword v[[B:[0-9]+]] @@ -184,7 +184,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace( ; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and ; VI-NOT: shl -; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} +; VI: buffer_store_dwordx2 v[[[ADD_LO]]:[[ADD_HI]]] define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid @@ -238,7 +238,7 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] ; GFX9PLUS-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16 ; GFX9PLUS-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] -; GFX9PLUS: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} +; GFX9PLUS: buffer_store_dwordx2 v[[[ELT0]]:[[ELT1]]] ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_add_u16_e32 diff --git a/llvm/test/CodeGen/AMDGPU/add_i128.ll b/llvm/test/CodeGen/AMDGPU/add_i128.ll index d33965d4dda7a..aa36095389bd5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/add_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i128.ll @@ -5,7 +5,7 @@ ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc -; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]], +; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]], define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index b9095de7d1160..5debd68b30c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -29,7 +29,7 @@ ; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] +; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] ; At most 2 digits. Make sure src_shared_base is not counted as a high ; number SGPR. 
@@ -60,7 +60,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc ; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc -; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] +; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32* store volatile i32 7, i32* %stof @@ -98,7 +98,7 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 { ; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] +; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] ; CI: NumSgprs: {{[0-9][0-9]+}} ; GFX9: NumSgprs: {{[0-9]+}} @@ -112,11 +112,11 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* % ; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast: ; HSA: enable_sgpr_queue_ptr = 0 -; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}} +; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] +; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]] define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { %stof = addrspacecast i32 addrspace(1)* %ptr to i32* store volatile i32 7, i32* %stof @@ -125,10 +125,10 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %p ; no-op ; HSA-LABEl: {{^}}use_constant_to_flat_addrspacecast: -; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}} +; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; 
HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] -; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} +; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]] define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 { %stof = addrspacecast i32 addrspace(4)* %ptr to i32* %ld = load volatile i32, i32* %stof @@ -136,13 +136,13 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* } ; HSA-LABEl: {{^}}use_constant_to_global_addrspacecast: -; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}} +; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]] ; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] -; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} +; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]] ; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO:v[0-9]+]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}} +; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO:v[0-9]+]], s[[[PTRLO]]:[[PTRHI]]] define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { %stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* %ld = load volatile i32, i32 addrspace(1)* %stof @@ -154,12 +154,12 @@ define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4) ; HSA: enable_sgpr_dispatch_ptr = 0 ; HSA: enable_sgpr_queue_ptr = 0 -; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}} -; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} +; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]] +; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}} ; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] ; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} -; GFX9-DAG: s_cmp_lg_u64 
s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, 0 +; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0 ; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1 ; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]] ; HSA: ds_write_b32 [[CASTPTR]], v[[K]] @@ -174,12 +174,12 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 { ; HSA: enable_sgpr_dispatch_ptr = 0 ; HSA: enable_sgpr_queue_ptr = 0 -; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}} -; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}} +; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]] +; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}} ; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] ; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} -; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, 0 +; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0 ; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1 ; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]] ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} @@ -192,14 +192,14 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 { ; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast: ; HSA: enable_sgpr_queue_ptr = 0 -; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 +; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]], s[4:5], 0x0 ; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0 -; CI: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] +; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]] ; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GFX9: global_store_dword [[ZERO]], [[ZERO]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]$}} +; GFX9: global_store_dword 
[[ZERO]], [[ZERO]], s[[[PTRLO]]:[[PTRHI]]{{\]$}} define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 { %ftos = addrspacecast i32* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos @@ -209,8 +209,8 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 { ; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast: ; HSA: enable_sgpr_queue_ptr = 0 -; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 -; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0 +; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]], s[4:5], 0x0 +; HSA: s_load_dword s{{[0-9]+}}, s[[[PTRLO]]:[[PTRHI]]], 0x0 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 { %ftos = addrspacecast i32* %ptr to i32 addrspace(4)* load volatile i32, i32 addrspace(4)* %ftos @@ -228,7 +228,7 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] +; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* null to i32* store volatile i32 7, i32* %cast @@ -249,7 +249,7 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 { ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] +; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32* store volatile i32 7, i32* %cast @@ -278,7 +278,7 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { ; 
HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] +; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(5)* null to i32* store volatile i32 7, i32* %cast @@ -303,7 +303,7 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] +; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32* store volatile i32 7, i32* %cast @@ -376,10 +376,10 @@ define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i3 ; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast ; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}} ; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}} -; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}} +; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}} ; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}} ; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]] -; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}} +; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}} define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(i8 addrspace(4)* addrspace(4)* %ptr.ptr, i32 %offset) #0 { %ptr = load volatile i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %ptr.ptr %addrspacecast = addrspacecast i8 addrspace(4)* %ptr to i8 addrspace(6)* @@ -392,10 +392,10 @@ define amdgpu_kernel void 
@use_constant_to_constant32_addrspacecast(i8 addrspace ; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast ; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}} ; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}} -; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}} +; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}} ; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}} ; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]] -; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}} +; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}} define amdgpu_kernel void @use_global_to_constant32_addrspacecast(i8 addrspace(1)* addrspace(4)* %ptr.ptr, i32 %offset) #0 { %ptr = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* %ptr.ptr %addrspacecast = addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(6)* @@ -409,7 +409,7 @@ define amdgpu_kernel void @use_global_to_constant32_addrspacecast(i8 addrspace(1 ; GCN: s_load_dword [[PTR:s[0-9]+]], ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], [[PTR]] -; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: flat_load_dword v{{[0-9]+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @use_constant32bit_to_flat_addrspacecast_0(i32 addrspace(6)* %ptr) #0 { %stof = addrspacecast i32 addrspace(6)* %ptr to i32* %load = load volatile i32, i32* %stof @@ -420,7 +420,7 @@ define amdgpu_kernel void @use_constant32bit_to_flat_addrspacecast_0(i32 addrspa ; GCN: s_load_dword [[PTR:s[0-9]+]], ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0xffff8000 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], [[PTR]] -; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: flat_load_dword v{{[0-9]+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @use_constant32bit_to_flat_addrspacecast_1(i32 addrspace(6)* %ptr) #3 { %stof = addrspacecast i32 addrspace(6)* %ptr to i32* %load = load volatile i32, 
i32* %stof diff --git a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll index 3f07188063cde..88ae414eb05f0 100644 --- a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll +++ b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}alignbit_shr_pat: ; GCN-DAG: s_load_dword s[[SHR:[0-9]+]] -; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]] define amdgpu_kernel void @alignbit_shr_pat(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { @@ -18,7 +18,7 @@ bb: ; GCN-LABEL: {{^}}alignbit_shr_pat_v: ; GCN-DAG: load_dword v[[SHR:[0-9]+]], -; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]] define amdgpu_kernel void @alignbit_shr_pat_v(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) { @@ -69,7 +69,7 @@ bb: } ; GCN-LABEL: {{^}}alignbit_shr_pat_const30: -; GCN: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], 30 define amdgpu_kernel void @alignbit_shr_pat_const30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) { diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index 4ba57fba81bc0..7760eee8fc324 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -4,10 +4,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.readfirstlane(i32) ; GCN-LABEL: readfirstlane_uniform -; GCN: s_load_dwordx2 s{{\[}}[[IN_ADDR:[0-9]+]]:1{{\]}}, s[4:5], 0x0 +; GCN: s_load_dwordx2 s[[[IN_ADDR:[0-9]+]]:1], s[4:5], 0x0 ; GCN: v_readfirstlane_b32 s[[SCALAR:[0-9]+]], v0 ; GCN: 
s_add_u32 s[[LOAD_ADDR:[0-9]+]], s[[IN_ADDR]], s[[SCALAR]] -; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[LOAD_ADDR]] +; GCN: s_load_dword s{{[0-9]+}}, s[[[LOAD_ADDR]] define amdgpu_kernel void @readfirstlane_uniform(float addrspace(1)* noalias nocapture readonly, float addrspace(1)* noalias nocapture readonly) { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll index b6c7fcec9f948..918015f735ef1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll @@ -13,10 +13,10 @@ entry: ; where the high half of the address comes from s_getpc. ; PAL-LABEL: {{^}}scratch: -; PAL: s_getpc_b64 s{{\[}}[[GITPTR:[0-9]+]]: +; PAL: s_getpc_b64 s[[[GITPTR:[0-9]+]]: ; PAL: s_mov_b32 s[[GITPTR]], s0 -; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]: -; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]: +; PAL: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]: +; PAL: buffer_store{{.*}}, s[[[SCRATCHDESC]]: define amdgpu_kernel void @scratch(<2 x i32> %in, i32 %idx, i32 addrspace(5)* %out) { entry: @@ -39,8 +39,8 @@ entry: ; PAL-LABEL: {{^}}scratch2: ; PAL: s_movk_i32 s{{[0-9]+}}, 0x1234 ; PAL: s_mov_b32 s[[GITPTR:[0-9]+]], s0 -; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]: -; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]: +; PAL: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]: +; PAL: buffer_store{{.*}}, s[[[SCRATCHDESC]]: define amdgpu_kernel void @scratch2(<2 x i32> %in, i32 %idx, i32 addrspace(5)* %out) #0 { entry: @@ -61,9 +61,9 @@ entry: ; PAL-LABEL: {{^}}scratch2_cs: ; PAL: s_movk_i32 s{{[0-9]+}}, 0x1234 ; PAL: s_mov_b32 s[[GITPTR:[0-9]+]], s0 -; CI: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:{{[0-9]+\]}}, 0x4 -; VI: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:{{[0-9]+\]}}, 0x10 -; PAL: 
buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]: +; CI: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]:{{[0-9]+\]}}, 0x4 +; VI: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]:{{[0-9]+\]}}, 0x10 +; PAL: buffer_store{{.*}}, s[[[SCRATCHDESC]]: define amdgpu_cs void @scratch2_cs(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32> %coord, <2 x i32> %in, i32 %extra, i32 %idx) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll index 691078739a2d3..0948bb590d409 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll @@ -5,7 +5,7 @@ ; than s0. ; GCN-LABEL: {{^}}_amdgpu_hs_main: -; GCN: s_getpc_b64 s{{\[}}[[GITPTR:[0-9]+]]: +; GCN: s_getpc_b64 s[[[GITPTR:[0-9]+]]: ; PREGFX9: s_mov_b32 s[[GITPTR]], s0 ; GFX9: s_mov_b32 s[[GITPTR]], s8 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 546065ab5a4e8..8cad7d033ac8e 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -202,7 +202,7 @@ define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; FUNC-LABEL: {{^}}s_and_multi_use_constant_i64: ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}} ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}} -; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}} +; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[KLO]]:[[KHI]]] define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %and0 = and i64 %a, 549756338176 %and1 = and i64 %b, 549756338176 @@ -275,8 +275,8 @@ define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrsp } ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64: -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} -; SI-DAG: buffer_load_dwordx2 
v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} +; SI-DAG: buffer_load_dwordx2 v[[[LO0:[0-9]+]]:[[HI0:[0-9]+]]] +; SI-DAG: buffer_load_dwordx2 v[[[LO1:[0-9]+]]:[[HI1:[0-9]+]]] ; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} ; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]] @@ -296,15 +296,15 @@ define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, } ; FUNC-LABEL: {{^}}v_and_multi_use_inline_imm_i64: -; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} +; SI: buffer_load_dwordx2 v[[[LO0:[0-9]+]]:[[HI0:[0-9]+]]] ; SI-NOT: and -; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} +; SI: buffer_load_dwordx2 v[[[LO1:[0-9]+]]:[[HI1:[0-9]+]]] ; SI-NOT: and ; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]] ; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]] ; SI-NOT: and -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]] +; SI: buffer_store_dwordx2 v[[[RESLO0]] +; SI: buffer_store_dwordx2 v[[[RESLO1]] define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr %b = load volatile i64, i64 addrspace(1)* %aptr @@ -347,11 +347,11 @@ define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addr ; FIXME: Should be able to reduce load width ; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64: -; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: {{buffer|flat}}_load_dwordx2 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; SI-NOT: and ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] ; SI-NOT: and -; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[VAL_LO]]:[[VAL_HI]]] define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, i64 
addrspace(1)* %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index b4218ac875c36..dd32f9a35b019 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -29,15 +29,15 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* % ; SICIVI-DAG: s_mov_b32 m0 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xd ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GFX89-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 ; GCN: [[RESULT]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { @@ -90,15 +90,15 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* ; SICIVI-DAG: s_mov_b32 m0 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI-DAG: 
s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GFX89-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index cdd4db7f8dbc6..4993bd5b449d2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -13,13 +13,13 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 imm ; GCN-LABEL: add_i32_constant: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 
s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[value:[0-9]+]], s[[popcount]], 5 ; GCN: v_mov_b32_e32 v[[data:[0-9]+]], s[[value]] ; GCN: buffer_atomic_add v[[data]] @@ -32,13 +32,13 @@ entry: ; GCN-LABEL: add_i32_uniform: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -116,13 +116,13 @@ entry: ; GCN-LABEL: sub_i32_constant: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[value:[0-9]+]], 
s[[popcount]], 5 ; GCN: v_mov_b32_e32 v[[data:[0-9]+]], s[[value]] ; GCN: buffer_atomic_sub v[[data]] @@ -135,13 +135,13 @@ entry: ; GCN-LABEL: sub_i32_uniform: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index a73bf61340e0d..aa4dad50a11d8 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -12,13 +12,13 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) ; GCN-LABEL: add_i32_constant: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], 
s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[popcount]], s[[popcount]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]] ; GCN: buffer_atomic_add v[[value]] @@ -31,13 +31,13 @@ entry: ; GCN-LABEL: add_i32_uniform: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -84,13 +84,13 @@ entry: ; GCN-LABEL: sub_i32_constant: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], 
s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[popcount]], s[[popcount]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]] ; GCN: buffer_atomic_sub v[[value]] @@ -103,13 +103,13 @@ entry: ; GCN-LABEL: sub_i32_uniform: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 43f52bdf192bf..6b2f261bbdab4 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -12,13 +12,13 @@ declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, ; GCN-LABEL: add_i32_constant: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: 
s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[popcount]], s[[popcount]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]] ; GCN: buffer_atomic_add v[[value]] @@ -31,13 +31,13 @@ entry: ; GCN-LABEL: add_i32_uniform: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -97,13 +97,13 @@ entry: ; GCN-LABEL: sub_i32_constant: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], 
s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[popcount]], s[[popcount]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]] ; GCN: buffer_atomic_sub v[[value]] @@ -116,13 +116,13 @@ entry: ; GCN-LABEL: sub_i32_uniform: ; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN64: s_mov_b64 s[[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]], exec ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 ; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc ; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s[[[exec_lo]]:[[exec_hi]]] ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll index cc0ae3ab4303e..04d67abf0559e 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll @@ -11,7 +11,7 @@ ; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]] ; VI: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] ; VI-SDWA: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] -; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +; GCN: load_dword v{{[0-9]+}}, v[[[ADDRLO]]: define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx = add i32 %x, %id @@ -28,13 +28,13 @@ define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x 
; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]] ; VI-SDWA: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 15 ; VI-SDWA: v_lshlrev_b32_sdwa v[[ADDRBASE1:[0-9]+]], v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-SDWA: v_lshlrev_b64 v{{\[}}[[ADDRBASE:[0-9]+]]:{{[^\]+}}], 2, v{{\[}}[[ADDRBASE1]]:{{[^\]+}}] +; VI-SDWA: v_lshlrev_b64 v[[[ADDRBASE:[0-9]+]]:{{[^\]+}}], 2, v[[[ADDRBASE1]]:{{[^\]+}}] ; VI-SDWA: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] ; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}} ; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]] -; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2 +; CI: v_lshl_b64 v[[[ADDRLO:[0-9]+]]:{{[^\]+}}], v[[[AND]]:{{[^\]+}}], 2 ; VI: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] -; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +; GCN: load_dword v{{[0-9]+}}, v[[[ADDRLO]]: define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %x) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx = add i32 %x, %id diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index 80580967a7881..0b4fba311484e 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -45,7 +45,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, } ; GCN-LABEL: {{^}}s_ubfe_sub_i32: -; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] ; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]] ; GCN: s_lshr_b32 s{{[0-9]+}}, [[TMP]], [[SUB]] @@ -60,7 +60,7 @@ define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 } ; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32: -; GCN: s_load_dwordx2 
s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] ; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] ; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] @@ -119,7 +119,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, } ; GCN-LABEL: {{^}}s_sbfe_sub_i32: -; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] ; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]] ; GCN: s_ashr_i32 s{{[0-9]+}}, [[TMP]], [[SUB]] @@ -134,7 +134,7 @@ define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 } ; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32: -; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} +; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] ; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] ; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll index 3616ec1f45d31..e4e32708af257 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll @@ -15,7 +15,7 @@ define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_0_i64: ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) { store i64 0, i64 addrspace(1)* %out ret void @@ -32,7 +32,7 @@ define 
amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_neg1_i64: ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) { store i64 -1, i64 addrspace(1)* %out ret void @@ -49,7 +49,7 @@ define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_signbit_i64: ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) { store i64 -9223372036854775808, i64 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_neg16_i64: ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { store i64 1152921504606846975, i64 addrspace(1)* %out ret void @@ -83,7 +83,7 @@ define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_neg17_i64: ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { store i64 -576460752303423489, i64 addrspace(1)* %out ret void @@ -100,7 +100,7 @@ define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* 
%out) { ; GCN-LABEL: {{^}}materialize_rev_64_i64: ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) { store i64 144115188075855872, i64 addrspace(1)* %out ret void @@ -117,7 +117,7 @@ define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_65_i64: ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) { store i64 -9079256848778919936, i64 addrspace(1)* %out ret void @@ -134,7 +134,7 @@ define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_3_i64: ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) { store i64 -4611686018427387904, i64 addrspace(1)* %out ret void @@ -151,7 +151,7 @@ define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_1.0_i64: ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LOK]]:[[HIK]]] define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) { store i64 508, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir index 
1d2574d9fcfac..b2b3d48b92895 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir @@ -7,7 +7,7 @@ # GCN: .LBB0_5: ; %bb # GCN-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- [DW_OP_plus_uconst 12, DW_OP_stack_value] # GCN-NEXT: .loc 1 0 42 is_stmt 0 ; /tmp/test_debug_value.cl:0:42 -# GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +# GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] # GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} # GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (.LBB0_4-[[POST_GETPC]])&4294967295 # GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.LBB0_4-[[POST_GETPC]])>>32 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index c1cb51e133c2c..a791676e618b1 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -61,11 +61,11 @@ bb3: ; GCN-NEXT: s_cbranch_scc0 [[LONGBB:.LBB[0-9]+_[0-9]+]] ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb0 -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[ENDBB]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: [[LONGBB]]: ; GCN-NEXT: ;;#ASMSTART @@ -106,11 +106,11 @@ bb3: ; GCN: s_cbranch_vccz [[LONGBB:.LBB[0-9]+_[0-9]+]] ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb0 -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], 
([[ENDBB:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[ENDBB]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: [[LONGBB]]: ; GCN: v_nop_e64 @@ -195,11 +195,11 @@ bb3: ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb2 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1 -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (.L[[LOOPBB]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.L[[LOOPBB]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: [[ENDBB]]: ; GCN-NEXT: s_endpgm @@ -230,11 +230,11 @@ bb3: ; GCN: s_cbranch_scc{{[0-1]}} [[BB1:.LBB[0-9]+_[0-9]+]] ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %bb0 -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], ([[BB4:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], ([[BB4]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC0_LO]]:[[PC0_HI]]] ; GCN: [[BB1]]: ; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17 @@ -290,11 +290,11 @@ bb4: ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %loop ; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1 -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (.L[[LOOP]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], 
s[[PC_HI]], (.L[[LOOP]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: .Lfunc_end{{[0-9]+}}: define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { entry: @@ -326,11 +326,11 @@ loop: ; GCN: s_cbranch_vccz [[BB2:.LBB[0-9]_[0-9]+]] ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], ([[BB3:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], ([[BB3:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC1_LO]]:[[PC1_HI]]] ; GCN-NEXT: [[BB2]]: ; %bb2 ; GCN-NEXT: ;;#ASMSTART @@ -385,11 +385,11 @@ bb3: ; GCN-NEXT: s_cbranch_execnz [[IF:.LBB[0-9]+_[0-9]+]] ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %entry -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[BB2:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[BB2:.LBB[0-9]_[0-9]+]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: [[IF]]: ; %if ; GCN: s_cmp_lg_u32 @@ -448,11 +448,11 @@ endif: ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %loop ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1 -; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], 
(.L[[LOOP_BODY]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.L[[LOOP_BODY]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm @@ -488,11 +488,11 @@ ret: ; GCN: s_cbranch_scc{{[0-1]}} [[LONG_BR_0:.LBB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_BR_DEST0:.LBB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LONG_BR_DEST0]]-[[POST_GETPC]])>>32 -; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: [[LONG_BR_0]]: ; GCN: [[LONG_BR_DEST0]]: diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 6afca9c2ca01d..e19c8d527f226 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -9,7 +9,7 @@ ; R600-NOT: MOV ; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 -; GFX678: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} +; GFX678: buffer_store_dwordx2 v[[[X]]:[[Y]]] ; GFX10: global_store_dwordx2 v2, v[0:1], s[0:1] define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) { entry: @@ -27,7 +27,7 @@ entry: ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 ; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 ; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 -; GFX678: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} +; GFX678: buffer_store_dwordx4 v[[[X]]:[[W]]] ; GFX10: global_store_dwordx4 v4, v[0:3], s[0:1] define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll 
b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 70e92479af172..09c4633431dc3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -64,13 +64,13 @@ declare hidden void @external_void_func_v16i8(<16 x i8>) #0 ; MESA-DAG: s_mov_b64 s[0:1], s[36:37] -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+12 ; GCN-DAG: v_mov_b32_e32 v0, 1{{$}} ; MESA-DAG: s_mov_b64 s[2:3], s[38:39] -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { call void @external_void_func_i1(i1 true) @@ -84,11 +84,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] ; MESA-DAG: s_mov_b32 s32, 0{{$}} -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12 ; GCN-NEXT: v_bfe_i32 v0, [[VAR]], 0, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { %var = load volatile i1, i1 addrspace(1)* undef @@ -105,11 +105,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; MESA: buffer_load_ubyte [[VAL:v[0-9]+]] ; MESA-DAG: s_mov_b32 s32, 0{{$}} -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN: s_getpc_b64 
s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 ; GCN-NEXT: v_and_b32_e32 v0, 1, [[VAL]] -; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { %var = load volatile i1, i1 addrspace(1)* undef @@ -119,14 +119,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+12 ; GCN-DAG: v_mov_b32_e32 v0, 0x7b ; GCN-DAG: s_mov_b32 s32, 0{{$}} -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { call void @external_void_func_i8(i8 123) @@ -137,14 +137,14 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: ; GCN-DAG: buffer_load_sbyte [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12 ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define 
amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { %var = load volatile i8, i8 addrspace(1)* undef @@ -155,14 +155,14 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: ; GCN-DAG: buffer_load_ubyte [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12 ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { %var = load volatile i8, i8 addrspace(1)* undef @@ -184,14 +184,14 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: ; GCN-DAG: buffer_load_sshort [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12 ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { %var = load volatile i16, i16 addrspace(1)* undef @@ -201,14 +201,14 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext: -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; 
GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+12 ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { %var = load volatile i16, i16 addrspace(1)* undef @@ -218,13 +218,13 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+12 ; GCN-DAG: v_mov_b32_e32 v0, 42 ; GCN-DAG: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { call void @external_void_func_i32(i32 42) @@ -234,10 +234,10 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm: ; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} ; GCN-DAG: v_mov_b32_e32 v1, 0{{$}} -; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+12 -; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] ; GCN-NEXT: s_endpgm define amdgpu_kernel void 
@test_call_external_void_func_i64_imm() #0 { call void @external_void_func_i64(i64 123) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 351aabf257389..6a8d26c441a16 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -42,8 +42,8 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 { ; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]] ; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] -; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}} -; CIVI: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]] +; CIVI: {{flat|global}}_store_dword v[[[LO]]:[[HI]]] define hidden void @use_queue_ptr_addrspacecast() #1 { %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32* store volatile i32 0, i32* %asc diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll index 2529cebbf1f47..cd0a39856abcf 100644 --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -181,7 +181,7 @@ define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5 ; on the leftover AssertZext's ValueType operand. 
; GCN-LABEL: {{^}}cannot_select_assertzext_valuetype: -; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+12 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index 4f3d6442da444..487ab48655ec3 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -139,7 +139,7 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* % } ; GCN-LABEL: {{^}}v_clamp_add_src_v2f32: -; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}} +; GCN: {{buffer|flat|global}}_load_dwordx2 v[[[A:[0-9]+]]:[[B:[0-9]+]]] ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}} ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}} define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll index 04d6b0d85b957..5b7c8e4550b96 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll @@ -7,7 +7,7 @@ ; GCN: global_load_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v{{[0-9]+}}, s[{{[0-9:]+}}] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} -; GCN-NEXT: global_store_dwordx2 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+\]}}, s[{{[0-9:]+}}] +; GCN-NEXT: global_store_dwordx2 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+\]}}, s[{{[0-9:]+}}] define amdgpu_kernel void @test_odd_int4(<4 x i32> addrspace(1)* %arg, <2 x i32> addrspace(1)* %arg1) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll b/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll index 
77ab7c3a948c3..5ea9b8318ff46 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-ftrunc.ll @@ -24,10 +24,10 @@ define amdgpu_kernel void @combine_ftrunc_frint_f32(float addrspace(1)* %p) { ; GCN-LABEL: {{^}}combine_ftrunc_frint_v2f32: ; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 s{{\[}}[[SRC1:[0-9]+]]:[[SRC2:[0-9]+]]{{\]}} +; GCN: s_load_dwordx2 s[[[SRC1:[0-9]+]]:[[SRC2:[0-9]+]]] ; GCN-DAG: v_rndne_f32_e32 v[[RND1:[0-9]+]], s[[SRC1]] ; GCN-DAG: v_rndne_f32_e32 v[[RND2:[0-9]+]], s[[SRC2]] -; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RND1]]:[[RND2]]{{\]}} +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v[[[RND1]]:[[RND2]]] define amdgpu_kernel void @combine_ftrunc_frint_v2f32(<2 x float> addrspace(1)* %p) { %v = load <2 x float>, <2 x float> addrspace(1)* %p, align 8 %round = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %v) diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index c6d1e1659ef72..d5958e5b3d74c 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -251,7 +251,7 @@ define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrsp ; GCN-LABEL: {{^}}commute_ule_64_i64: ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} -; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +; GCN: v_cmp_gt_u64_e32 vcc, s[[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 65f0fcb1dc8bc..22ac0fa31743e 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -306,7 
+306,7 @@ define amdgpu_vs float @load_addr_no_fold(i32 addrspace(6)* inreg noalias %p0) # ; GCN-LABEL: {{^}}vgpr_arg_src: ; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0 ; GCN: s_mov_b32 s[[ZERO:[0-9]+]] -; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[READLANE]]:[[ZERO]]{{\]}} +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]] define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) { main_body: %tmp9 = load <4 x i32>, <4 x i32> addrspace(6)* %arg diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 587bea4194278..0ca0c6896fff1 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -90,7 +90,7 @@ define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} -; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GCN-NEXT: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]] define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { %vreg = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg) @@ -102,14 +102,14 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; The neg1 appears after folding the not 0 ; GCN-LABEL: {{^}}fold_mi_or_neg1: ; GCN: buffer_load_dwordx2 -; GCN: buffer_load_dwordx2 v{{\[}}[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]] ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], 
v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]] define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) { %vreg0 = load volatile i64, i64 addrspace(1)* undef %vreg1 = load volatile i64, i64 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index e840cef08c5ae..7891cded195d5 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -22,7 +22,7 @@ ; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}} ; Spill saved exec -; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec +; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] @@ -30,8 +30,8 @@ ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 ; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]] -; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} +; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], [[CMP0]] +; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]] ; GCN: s_cbranch_execz [[ENDIF:.LBB[0-9]+_[0-9]+]] @@ -57,7 +57,7 @@ ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 -; GCN: s_or_b64 exec, exec, 
s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} +; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] ; Restore val ; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload @@ -92,7 +92,7 @@ endif: ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec +; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -103,8 +103,8 @@ endif: ; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]] -; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} +; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] +; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]] ; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]] @@ -129,7 +129,7 @@ endif: ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 -; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} +; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] @@ -168,9 +168,9 @@ end: ; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0 ; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]] -; GCN: s_mov_b64 
s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec -; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]] -; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} +; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec +; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] +; GCN: s_xor_b64 s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]] ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -195,14 +195,14 @@ end: ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1 -; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} +; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]] ; Regular spill value restored after exec modification ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; Followed by spill ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_and_b64 s{{\[}}[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]{{\]}}, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}} +; GCN: s_and_b64 s[[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]], exec, 
s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]] ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]] @@ -212,7 +212,7 @@ end: ; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1 ; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}} +; GCN: s_xor_b64 exec, exec, s[[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]] ; GCN-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9]+_[0-9]+]] @@ -239,7 +239,7 @@ end: ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 -; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} +; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] ; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 6b72391ce8c4f..7e188a9f76cb1 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -27,7 +27,7 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32] } ; FUNC-LABEL: {{^}}v_ctpop_i64: -; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v[[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] @@ -44,13 +44,13 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs } ; 
FUNC-LABEL: {{^}}v_ctpop_i64_user: -; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: {{buffer|flat}}_load_dwordx2 v[[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} -; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]] ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -123,13 +123,13 @@ define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}ctpop_i64_in_br: -; SI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd -; VI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 -; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} +; SI-DAG: s_load_dwordx2 s[[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0xd +; VI-DAG: s_load_dwordx2 s[[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x34 +; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]] ; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]] -; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]] ; GCN: s_endpgm define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { entry: @@ -179,8 +179,8 @@ define amdgpu_kernel void 
@s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) ; FIXME: Should not have extra add ; FUNC-LABEL: {{^}}v_ctpop_i128: -; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}} +; SI: buffer_load_dwordx4 v[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +; VI: flat_load_dwordx4 v[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], v{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 ; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll index a9e3aa79f178f..fef46fd55bb9e 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -36,8 +36,8 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* } ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; CI-DAG: ds_read2_b32 v[[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]], v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read2_b32 v[[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]], v{{[0-9]+}} offset0:2 offset1:3{{$}} ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]] ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[REG_W]] ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]] @@ -62,7 +62,7 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1) } ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read2_b32 
v[[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]], v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}} ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]] ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]] @@ -149,9 +149,9 @@ define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1 ; Do scalar loads into the super register we need. ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read2_b32 v[[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]], v{{[0-9]+}} offset1:1{{$}} ; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} -; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}} +; CI: buffer_store_dwordx2 v[[[REG_ELT0]]:[[REG_ELT1]]] ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -171,10 +171,10 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x ; Do scalar loads into the super register we need. 
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; CI-DAG: ds_read2_b32 v[[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]], v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read2_b32 v[[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]], v{{[0-9]+}} offset0:2 offset1:3{{$}} ; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}} -; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}} +; CI: buffer_store_dwordx4 v[[[REG_ELT0]]:[[REG_ELT3]]] ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll index 069a888547e56..e24864ef650d2 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -9,7 +9,7 @@ ; CI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; GCN: ds_read2st64_b32 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset1:1 ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] @@ -31,7 +31,7 @@ define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 ; CI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; GCN: ds_read2st64_b32 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:1 offset1:2 ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] @@ -54,7 +54,7 @@ define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* 
%out, fl ; CI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 +; GCN: ds_read2st64_b32 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:1 offset1:255 ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] @@ -139,9 +139,9 @@ define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) ; CI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; GCN: ds_read2st64_b64 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset1:1 ; GCN: s_waitcnt lgkmcnt(0) -; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]] ; CI: buffer_store_dwordx2 [[RESULT]] ; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { @@ -161,9 +161,9 @@ define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) # ; CI: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; GCN: ds_read2st64_b64 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:1 offset1:2 ; GCN: s_waitcnt lgkmcnt(0) -; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]] ; CI: buffer_store_dwordx2 [[RESULT]] ; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} @@ -208,9 +208,9 @@ define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, d ; CI: s_mov_b32 m0 ; GFX9-NOT: m0 -; 
GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 +; GCN: ds_read2st64_b64 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], v{{[0-9]+}} offset0:4 offset1:127 ; GCN: s_waitcnt lgkmcnt(0) -; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]] ; CI: buffer_store_dwordx2 [[RESULT]] ; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll index cd3aeb48faaef..81807d9d362fa 100644 --- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -5,12 +5,12 @@ ; heuristics. Should not need -stress-early-ifcvt ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64: -; GCN: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; GCN: v_cmp_neq_f64_e32 vcc, 1.0, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} -; GCN: v_add_f64 v{{\[}}[[ADD_LO:[0-9]+]]:[[ADD_HI:[0-9]+]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] +; GCN: v_cmp_neq_f64_e32 vcc, 1.0, v[[[VAL_LO]]:[[VAL_HI]]] +; GCN: v_add_f64 v[[[ADD_LO:[0-9]+]]:[[ADD_HI:[0-9]+]]], v[[[VAL_LO]]:[[VAL_HI]]], v[[[VAL_LO]]:[[VAL_HI]]] ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc -; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]] define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { entry: %v = load double, double addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll 
b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index ad255818c9fe1..a6509373cc6e2 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -82,7 +82,7 @@ entry: ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200 ; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 4 -; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]] +; GCN: s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]] ; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]] ; GCN: store_short v[{{[0-9:]+}}], v[[VRL]] define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) { @@ -220,7 +220,7 @@ entry: ; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <8 x double> , i32 %sel @@ -236,7 +236,7 @@ entry: ; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <7 x double> , i32 %sel @@ -280,7 +280,7 @@ entry: ; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double15_extelt(double 
addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <15 x double> , i32 %sel @@ -296,7 +296,7 @@ entry: ; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] ; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <16 x double> , i32 %sel @@ -353,7 +353,7 @@ entry: ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605 ; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 3 -; GCN: s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]] +; GCN: s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]] ; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]] ; GCN: store_byte v[{{[0-9:]+}}], v[[VRL]] define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) { diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 6fad3653e475e..ae438405bddd5 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -116,13 +116,13 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(half addrspace(1)* %out, <4 ; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_vgpr: ; GCN-DAG: {{flat|global|buffer}}_load_dword [[IDX:v[0-9]+]], -; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; GFX89: v_lshrrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX89: v_lshrrev_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], [[SCALED_IDX]], v[[[LO]]:[[HI]]] ; GFX89: 
{{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[SHIFT_LO]] -; SI: v_lshr_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[SCALED_IDX]] +; SI: v_lshr_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], v[[[LO]]:[[HI]]], [[SCALED_IDX]] ; SI: buffer_store_short v[[SHIFT_LO]] define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index 133c7e3b07875..7080524fe4fea 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -82,7 +82,7 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x ; SI: buffer_store_short ; SI: buffer_store_short -; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x2c ; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[LOAD0]] ; GFX89-DAG: buffer_store_short [[VLOAD0]], off ; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[LOAD1]] @@ -101,8 +101,8 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x ; SI: s_load_dwordx2 s ; SI: s_load_dwordx2 s -; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x24 -; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x4c +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x24 +; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x4c ; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54 ; GCN-NOT: {{buffer|flat|global}} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 036779d3ef243..541631710dff5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -183,7 +183,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]] +; VI: s_lshr_b64 s[[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]] ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]] ; VI: buffer_store_byte [[V_EXTRACT]] define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index f3208c29ca623..d9ae5421516ed 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -39,8 +39,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half } ; GCN-LABEL: {{^}}s_fabs_v4f16: -; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2 -; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 +; CI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2 +; GFX89: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x8 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]] diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 94a4e09370af5..b6ad0c49dc26a 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -70,8 +70,8 @@ define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float } ; GCN-LABEL: {{^}}fabs_fn_fold: -; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dwordx2 
s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] @@ -83,8 +83,8 @@ define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, fl } ; FUNC-LABEL: {{^}}fabs_fold: -; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index 357de0e0eb496..6ee2485d3dea7 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -284,7 +284,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, double addrspace(1)* %out @@ -294,7 +294,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %o ; GCN-LABEL: 
{{^}}test_fold_canonicalize_n0_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, double addrspace(1)* %out @@ -304,7 +304,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, double addrspace(1)* %out @@ -314,7 +314,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, double addrspace(1)* %out @@ -324,7 +324,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, 
v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 16.0) store double %canonicalized, double addrspace(1)* %out @@ -334,7 +334,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out @@ -344,7 +344,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(dou ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out @@ -354,7 +354,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: 
{{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out @@ -364,7 +364,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(dou ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out @@ -374,7 +374,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double %canonicalized, double addrspace(1)* %out @@ -384,7 +384,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 
v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, double addrspace(1)* %out @@ -394,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double add ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, double addrspace(1)* %out @@ -404,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double add ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, double addrspace(1)* %out @@ -414,7 +414,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define 
amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, double addrspace(1)* %out @@ -424,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) store double %canonicalized, double addrspace(1)* %out @@ -434,7 +434,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 66cd1e81cdee4..faa2d115db22d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -614,7 +614,7 @@ entry: ; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]] ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} +; GCN: 
buffer_store_dwordx2 v[[[R_I32_0]]:[[R_I32_1]]] ; GCN: s_endpgm define amdgpu_kernel void @fcmp_v2f16_nlt( <2 x i32> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index aa17d6b8bc881..3df4edef5f945 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -63,12 +63,12 @@ entry: ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64: ; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v[[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]] +; GCN-DAG: v_cvt_f64_f32_e32 v[[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]], v[[MAG_EXT]] ; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[MAG_EXT_LO]]:[[OUT_HI]]] ; GCN: s_endpgm define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64( double addrspace(1)* %arg_out, @@ -113,14 +113,14 @@ entry: } ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16: -; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v[[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]] ; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]] ; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]] ; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]] ; GFX89: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]] -; GCN: 
buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[MAG_LO]]:[[OUT_HI]]] ; GCN: s_endpgm define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16( double addrspace(1)* %arg_out, @@ -168,7 +168,7 @@ entry: ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64: ; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]] -; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v[[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]] ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]] ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 196da93078523..0c94f9af0604e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -8,8 +8,8 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read ; Try to identify arg based on higher address. 
; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0xb -; VI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0x2c +; SI: s_load_dwordx2 s[[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]], {{.*}} 0xb +; VI: s_load_dwordx2 s[[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]], {{.*}} 0x2c ; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]] ; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]] diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index b681e4a0da4a1..292eb1fa0b31e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -6,16 +6,16 @@ declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind r declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone ; FUNC-LABEL: {{^}}test_copysign_f64: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x1d -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c -; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x74 +; SI-DAG: s_load_dwordx2 s[[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s[[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d +; VI-DAG: s_load_dwordx2 s[[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; VI-DAG: s_load_dwordx2 s[[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x74 ; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2 ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] ; GCN-DAG: 
v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[VMAG_LO]]:[[VRESULT_HI]]] ; GCN: s_endpgm define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind { %result = call double @llvm.copysign.f64(double %mag, double %sign) @@ -24,15 +24,15 @@ define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32 } ; FUNC-LABEL: {{^}}test_copysign_f64_f32: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c +; SI-DAG: s_load_dwordx2 s[[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; VI-DAG: s_load_dwordx2 s[[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] ; GCN-DAG: v_mov_b32_e32 v[[VSIGN:[0-9]+]], s[[SSIGN]] ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[VMAG_LO]]:[[VRESULT_HI]]] define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind { %c = fpext float %sign to double %result = call double @llvm.copysign.f64(double %mag, double %c) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll index 2bf383a48bd0a..c48cb9766eb24 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -137,7 +137,7 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 { ; GCN-LABEL: 
{{^}}div_fast_k_x_pat_f64: ; GCN-DAG: v_mov_b32_e32 v[[K_LO:[0-9]+]], 0x9999999a ; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999 -; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef @@ -149,7 +149,7 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}div_fast_neg_k_x_pat_f64: ; GCN-DAG: v_mov_b32_e32 v[[K_LO:[0-9]+]], 0x9999999a ; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999 -; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 15b751791d0aa..919c62fbb96c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -84,7 +84,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg } ; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp: -; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] @@ -112,7 +112,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg ; GCN-FLUSH: v_rcp_f32_e32 ; 
GCN-FLUSH: v_rcp_f32_e32 ; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]] -; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} +; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v[[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) { %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 %div = fdiv <4 x float> , %load, !fpmath !0 @@ -121,7 +121,7 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) { } ; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp: -; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] @@ -157,7 +157,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* % } ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp: -; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] @@ -185,7 +185,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* % ; GCN-FLUSH: v_rcp_f32_e64 ; GCN-FLUSH: v_rcp_f32_e64 ; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]] -; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} +; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v[[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { %load = load <4 x float>, <4 x 
float> addrspace(1)* %arg, align 16 %neg = fneg <4 x float> %load @@ -195,7 +195,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* % } ; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp: -; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DAG: s_load_dwordx4 s[[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] @@ -223,7 +223,7 @@ define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* % ; GCN-FLUSH: v_rcp_f32_e32 ; GCN-FLUSH: v_rcp_f32_e32 ; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]] -; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} +; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v[[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 %neg = fneg <4 x float> %load diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll index a1e27425528a9..9da6f472a2c38 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -28,11 +28,11 @@ work: ; GCN: s_not_b64 exec, exec %tmp1189 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 4, i32 1) -; GCN: s_or_saveexec_b64 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -1 +; GCN: s_or_saveexec_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], -1 ; GCN: v_lshlrev_b32_e32 v[[tmp1191:[0-9]+]], 2, v[[tmp1189]] %tmp1191 = mul i32 %tmp1189, 4 -; GCN: s_mov_b64 exec, s{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_mov_b64 exec, s[[[LO]]:[[HI]]] %tmp1196 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp1191) %tmp34 = icmp eq i32 %arg, 0 @@ -65,11 +65,11 @@ work: ; GCN: s_not_b64 exec, 
exec %tmp1189 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 4, i32 1) -; GCN: s_or_saveexec_b64 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -1 +; GCN: s_or_saveexec_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], -1 ; GCN: v_lshlrev_b32_e32 v[[tmp1191:[0-9]+]], 2, v[[tmp1189]] %tmp1191 = mul i32 %tmp1189, 4 -; GCN: s_mov_b64 exec, s{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_mov_b64 exec, s[[[LO]]:[[HI]]] %tmp1196 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp1191) %tmp34 = icmp eq i32 %arg, 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 8f2cca5ad0daf..63339529ad0fb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -5,13 +5,13 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,GFX10 %s ; CHECK-LABEL: {{^}}store_flat_i32: -; CHECK-DAG: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]], +; CHECK-DAG: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]], ; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]], ; CHECK: s_waitcnt lgkmcnt(0) ; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]] ; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]] +; CHECK: flat_store_dword v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]] define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32* store volatile i32 %x, i32* %fptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 3e9b8603432f8..29c8a0b1c695c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -909,8 +909,8 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: -; GCN: flat_atomic_cmpswap_x2 
v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: +; GCN: flat_atomic_cmpswap_x2 v[[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RET]]: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64* %out, i64* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64* %out, i64 4 @@ -931,8 +931,8 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset: -; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: +; GCN: flat_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RET]]: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64* %out, i64 %index @@ -952,8 +952,8 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret: -; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} -; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: +; GCN: flat_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} +; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[[[RET]]: define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64* %out, i64* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64* %out, i64 %old, i64 %in seq_cst seq_cst @@ -972,8 +972,8 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64: -; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} -; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: +; GCN: flat_atomic_cmpswap_x2 
v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} +; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[[[RET]]: define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64* %out, i64* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64* %out, i64 %index diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll index 9435b8278f300..6adf1fc5f4883 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -31,7 +31,7 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace( } ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s[[[A:[0-9]+]]:[[B:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] @@ -56,7 +56,7 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, ; Nsz also needed ; FIXME: Should separate tests ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src: -; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN: s_load_dwordx2 s[[[A:[0-9]+]]:[[B:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index c75c500cdfac3..98ab1b2e7694e 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -166,15 +166,15 @@ entry: } ; GCN-LABEL: {{^}}fmul_v4f16: -; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} -; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX9: buffer_load_dwordx2 v[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]] +; GFX9: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] ; 
GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] ; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}} +; GFX9: buffer_store_dwordx2 v[[[MUL_LO]]:[[MUL_HI]]] -; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}} +; VI: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] +; VI: buffer_load_dwordx2 v[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]] ; VI: v_mul_f16_sdwa ; VI: v_mul_f16_e32 ; VI: v_mul_f16_sdwa @@ -194,13 +194,13 @@ entry: } ; GCN-LABEL: {{^}}fmul_v4f16_imm_a: -; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} +; GFX89-DAG: buffer_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] ; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200 ; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800 ; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], [[K0]] ; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], [[K1]] -; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}} +; GFX9: buffer_store_dwordx2 v[[[MUL_LO]]:[[MUL_HI]]] ; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400 @@ -212,7 +212,7 @@ entry: ; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MUL_LO_LO]], v[[MUL_LO_HI]] ; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MUL_HI_LO]], v[[MUL_HI_HI]] -; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}} +; VI: buffer_store_dwordx2 v[[[OR0]]:[[OR1]]] define amdgpu_kernel void @fmul_v4f16_imm_a( <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %b) { diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 000ef3f25db9b..7ca22b5b83120 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -692,12 +692,12 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 
0x6dc9c882 ; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]] -; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; SI: v_max_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]] -; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494 +; VI: v_min_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[A]], 0.15915494 ; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]] -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RESULT_LO]]:[[RESULT_HI]]] define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -716,7 +716,7 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, d ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882 ; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]] -; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]] ; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]] ; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494 @@ -1534,10 +1534,10 @@ define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double add ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]] +; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]] ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]] +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[CVT_LO]]:[[CVT_HI]]] define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1553,10 +1553,10 @@ define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double add ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]] +; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]] ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]] -; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]{{\]}} +; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v[[[CVT_LO]]:[[CVT_HI]]], 4.0 +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1640,11 +1640,11 @@ define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* % } ; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32: -; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}} +; GCN: {{buffer|flat}}_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]] +; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v[[[A_LO]]:[[A_HI]]] ; GCN-DAG: 
v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[A_LO]]:[[NEG_A_HI]]] define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1662,7 +1662,7 @@ define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrs ; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32: ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]] -; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}} +; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s[ ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index 43af8c6957749..6c6fcfa23ccbd 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -55,12 +55,12 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 } ; GCN-LABEL: {{^}}fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13 -; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c +; SI-DAG: s_load_dwordx2 s[[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x13 +; VI-DAG: s_load_dwordx2 s[[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x4c ; GCN-DAG: s_bitset1_b32 s[[HI_X]], 31 ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] ; GCN-DAG: v_mov_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} +; GCN: buffer_store_dwordx2 
v[[[LO_V]]:[[HI_V]]] define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) { %fabs = call double @llvm.fabs.f64(double %in) %fsub = fsub double -0.000000e+00, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll index a7cddd09b7628..a602a04037580 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -38,15 +38,15 @@ define amdgpu_kernel void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, ; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} ; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 -; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s[[[K0_LO]]:[[K0_HI]]] ; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] ; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]], [[TRUNC]] ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; CI: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll index 4f597eb3f32c3..8a86446472ed4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -38,15 +38,15 @@ define amdgpu_kernel void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, ; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} ; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 -; 
CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s[[[K0_LO]]:[[K0_HI]]] ; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] ; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]], [[TRUNC]] ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; CI: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 693c456f37a26..91dac92fb0b9e 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -20,8 +20,8 @@ entry: ; GCN-LABEL: {{^}}fpext_f16_to_f64 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}} +; GCN: v_cvt_f64_f32_e32 v[[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]], v[[A_F32]] +; GCN: buffer_store_dwordx2 v[[[R_F64_0]]:[[R_F64_1]]] ; GCN: s_endpgm define amdgpu_kernel void @fpext_f16_to_f64( double addrspace(1)* %r, @@ -39,7 +39,7 @@ entry: ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] ; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} +; 
GCN: buffer_store_dwordx2 v[[[R_F32_0]]:[[R_F32_1]]] ; GCN: s_endpgm define amdgpu_kernel void @fpext_v2f16_to_v2f32( diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 19de441c2301d..6fc9b7f9009d5 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -42,7 +42,7 @@ entry: ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; GCN: v_cvt_i32_f32_e32 v[[R_I64_Low:[0-9]+]], v[[A_F32]] ; GCN: v_ashrrev_i32_e32 v[[R_I64_High:[0-9]+]], 31, v[[R_I64_Low]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I64_Low]]{{\:}}[[R_I64_High]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[R_I64_Low]]{{\:}}[[R_I64_High]]] ; GCN: s_endpgm define amdgpu_kernel void @fptosi_f16_to_i64( i64 addrspace(1)* %r, @@ -121,7 +121,7 @@ entry: ; VI-NOT: DEADBEEF ; VI-DAG: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]] ; VI-DAG: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]] -; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}} +; GCN: buffer_store_dwordx4 v[[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]] ; GCN: s_endpgm define amdgpu_kernel void @fptosi_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index 3535e84bb422c..bf8677b143952 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -42,7 +42,7 @@ entry: ; GCN: v_mov_b32_e32 v[[R_I64_High:[0-9]+]], 0 ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; GCN: v_cvt_u32_f32_e32 v[[R_I64_Low:[0-9]+]], v[[A_F32]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_I64_Low]]{{\:}}[[R_I64_High]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[R_I64_Low]]{{\:}}[[R_I64_High]]] ; GCN: s_endpgm define amdgpu_kernel void @fptoui_f16_to_i64( i64 addrspace(1)* %r, @@ -117,7 +117,7 @@ entry: ; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] ; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], 
v[[A_F32_1]] ; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0 -; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}} +; GCN: buffer_store_dwordx4 v[[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]] ; GCN: s_endpgm define amdgpu_kernel void @fptoui_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index e17ac33a8bb7c..a50da7f08dda6 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -18,8 +18,8 @@ entry: } ; GCN-LABEL: {{^}}fptrunc_f64_to_f16: -; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}} -; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]] +; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v[[[A_F64_0]]:[[A_F64_1]]] ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -34,7 +34,7 @@ entry: } ; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16: -; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]] ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] @@ -60,9 +60,9 @@ entry: } ; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16: -; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}} -; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} -; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} +; GCN: buffer_load_dwordx4 v[[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]] +; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v[[[A_F64_0]]:{{[0-9]+}}] +; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v[{{[0-9]+}}:[[A_F64_3]]] ; VI: v_cvt_f16_f32_sdwa 
v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll index b9c3414e2d378..2df8388524cbe 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll @@ -9,15 +9,15 @@ declare double @llvm.fabs.f64(double) #0 declare double @llvm.floor.f64(double) #0 ; FUNC-LABEL: {{^}}fract_f64: -; SI-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]] -; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3 +; SI-DAG: v_min_f64 v[[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v[[[UPLO]]:[[UPHI]]] +; SI-DAG: v_cmp_class_f64_e64 vcc, v[[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc -; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]{{\]}}, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]{{\]}}, -[[SUB0]] +; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], v[[[LO]]:[[HI]]], -v[[[RESLO]]:[[RESHI]]] +; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], v[[[LO]]:[[HI]]], -[[SUB0]] ; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]] ; CI: v_floor_f64_e32 [[FLOORX:v\[[0-9]+:[0-9]+\]]], [[X]] @@ -36,15 +36,15 @@ define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace } ; FUNC-LABEL: {{^}}fract_f64_neg: -; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], 
-v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]] -; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3 +; SI-DAG: v_min_f64 v[[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v[[[UPLO]]:[[UPHI]]] +; SI-DAG: v_cmp_class_f64_e64 vcc, v[[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc -; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO]]:[[HI]]{{\]}}, -v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO]]:[[HI]]{{\]}}, -[[SUB0]] +; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -v[[[LO]]:[[HI]]], -v[[[RESLO]]:[[RESHI]]] +; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -v[[[LO]]:[[HI]]], -[[SUB0]] ; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]] ; CI: v_floor_f64_e64 [[FLOORX:v\[[0-9]+:[0-9]+\]]], -[[X]] @@ -64,15 +64,15 @@ define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrs } ; FUNC-LABEL: {{^}}fract_f64_neg_abs: -; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| +; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]| ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]] -; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3 +; SI-DAG: v_min_f64 v[[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v[[[UPLO]]:[[UPHI]]] +; SI-DAG: v_cmp_class_f64_e64 vcc, v[[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc -; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO]]:[[HI]]{{\]}}|, 
-v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO]]:[[HI]]{{\]}}|, -[[SUB0]] +; SI: v_add_f64 [[SUB0:v\[[0-9]+:[0-9]+\]]], -|v[[[LO]]:[[HI]]]|, -v[[[RESLO]]:[[RESHI]]] +; SI: v_add_f64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|v[[[LO]]:[[HI]]]|, -[[SUB0]] ; CI: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]] ; CI: v_floor_f64_e64 [[FLOORX:v\[[0-9]+:[0-9]+\]]], -|[[X]]| diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 47f3bbd0dd2be..26daa586d55c7 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -559,7 +559,7 @@ define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval({ ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off +; GCN-DAG: buffer_store_dwordx2 v[[[ARG1_LOAD0]]:[[ARG1_LOAD1]]], off define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval(i32) %arg0, i64 addrspace(5)* byval(i64) %arg1) #0 { %arg0.load = load i32, i32 addrspace(5)* %arg0 %arg1.load = load i64, i64 addrspace(5)* %arg1 @@ -583,7 +583,7 @@ define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval(i32) %arg0, i ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12 ; GCN: buffer_store_dword v[[LOAD_ARG1]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile i32 %arg1, i32 addrspace(1)* undef @@ -626,8 +626,8 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1 ; 
GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off -; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +; GCN: buffer_store_dwordx2 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]], off +; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef @@ -658,8 +658,8 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off +; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef @@ -678,8 +678,8 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off +; GCN: 
buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef @@ -706,10 +706,10 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}} ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}} -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off -; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]], off +; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off +; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]], off +; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll b/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll index 290529a3b739f..f776272fdc7d2 100644 --- a/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll @@ -7,31 +7,31 @@ declare protected void @protected_func(i32 addrspace(1)* %out) declare hidden void @hidden_func(i32 addrspace(1)* %out) ; CHECK-LABEL: call_func: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOT_ADDR_LO:[0-9]+]], s[[PC_LO]], 
func@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOT_ADDR_HI:[0-9]+]], s[[PC_HI]], func@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]{{\]}}, 0x0 -; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]], 0x0 +; CHECK: s_swappc_b64 s[{{[0-9]+:[0-9]+}}], s[[[ADDR_LO]]:[[ADDR_HI]]] define amdgpu_kernel void @call_func(i32 addrspace(1)* %out) { call void @func(i32 addrspace(1)* %out) ret void } ; CHECK-LABEL: call_protected_func: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], protected_func@rel32@lo+4 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], protected_func@rel32@hi+12 -; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +; CHECK: s_swappc_b64 s[{{[0-9]+:[0-9]+}}], s[[[ADDR_LO]]:[[ADDR_HI]]] define amdgpu_kernel void @call_protected_func(i32 addrspace(1)* %out) { call void @protected_func(i32 addrspace(1)* %out) ret void } ; CHECK-LABEL: call_hidden_func: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], hidden_func@rel32@lo+4 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], hidden_func@rel32@hi+12 -; CHECK: s_swappc_b64 s{{\[}}{{[0-9]+:[0-9]+}}{{\]}}, s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +; CHECK: s_swappc_b64 s[{{[0-9]+:[0-9]+}}], s[[[ADDR_LO]]:[[ADDR_HI]]] define amdgpu_kernel void @call_hidden_func(i32 addrspace(1)* %out) { call void @hidden_func(i32 addrspace(1)* %out) ret void @@ -40,11 +40,11 @@ define amdgpu_kernel void @call_hidden_func(i32 addrspace(1)* %out) { declare i64 @funci() ; CHECK-LABEL: tail_call_func: -; CHECK: s_getpc_b64 
s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOT_ADDR_LO:[0-9]+]], s[[PC_LO]], funci@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOT_ADDR_HI:[0-9]+]], s[[PC_HI]], funci@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]{{\]}}, 0x0 -; CHECK: s_setpc_b64 s{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]], 0x0 +; CHECK: s_setpc_b64 s[[[ADDR_LO]]:[[ADDR_HI]]] define i64 @tail_call_func() { %ret = tail call i64 @funci() ret i64 %ret diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll index e422853b1c4d6..3e3800ef62b75 100644 --- a/llvm/test/CodeGen/AMDGPU/global-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll @@ -9,12 +9,12 @@ @available_externally = available_externally addrspace(4) global [256 x i32] zeroinitializer ; GCN-LABEL: {{^}}private_test: -; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} +; GCN: s_getpc_b64 s[[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]] ; Non-R600 OSes use relocations. 
; GCN: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], private1@rel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], private1@rel32@hi+12 -; GCN: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}} +; GCN: s_getpc_b64 s[[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]] ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+12 @@ -30,7 +30,7 @@ define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) { } ; GCN-LABEL: {{^}}available_externally_test: -; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} +; GCN: s_getpc_b64 s[[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]] ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+12 ; R600-LABEL: available_externally_test diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll index 4e50f995d27e7..ef77513813741 100644 --- a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -156,7 +156,7 @@ define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace ; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]], ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 diff --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll index 254272bd81de1..ad14ac2280356 100644 --- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll @@ -13,7 +13,7 @@ @external_w_init = addrspace(1) global [256 x i32] zeroinitializer ; CHECK-LABEL: 
private_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private@rel32@lo+8 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], private@rel32@hi+16 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]] @@ -25,7 +25,7 @@ define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: internal_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], internal@rel32@lo+8 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], internal@rel32@hi+16 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]] @@ -37,10 +37,10 @@ define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: available_externally_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], available_externally@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], available_externally@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1 @@ -50,10 +50,10 @@ define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: linkonce_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 
s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1 @@ -63,10 +63,10 @@ define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: weak_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1 @@ -76,10 +76,10 @@ define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: common_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], common@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], common@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 
s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1 @@ -89,10 +89,10 @@ define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: extern_weak_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], extern_weak@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], extern_weak@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1 @@ -102,10 +102,10 @@ define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: linkonce_odr_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce_odr@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce_odr@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x 
i32] addrspace(1)* @linkonce_odr, i32 0, i32 1 @@ -115,10 +115,10 @@ define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: weak_odr_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak_odr@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak_odr@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1 @@ -128,10 +128,10 @@ define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: external_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1 @@ -141,10 +141,10 @@ define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) { } ; CHECK-LABEL: external_w_init_test: -; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; CHECK: s_getpc_b64 
s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external_w_init@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external_w_init@gotpcrel32@hi+12 -; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 +; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOTADDR_LO]]:[[GOTADDR_HI]]], 0x0 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 1c71858c3b830..e49bf25c93496 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -37,13 +37,13 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_huge_offset: ; SI-DAG: v_mov_b32_e32 v[[PTRLO:[0-9]+]], 0xdeac ; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd -; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; SI: buffer_atomic_add v{{[0-9]+}}, v[[[PTRLO]]:[[PTRHI]]], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add ; GFX9: s_add_u32 s[[LOW_K:[0-9]+]], s{{[0-9]+}}, 0xdeac ; GFX9: s_addc_u32 s[[HIGH_K:[0-9]+]], s{{[0-9]+}}, 0xabcd -; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[LOW_K]]:[[HIGH_K]]]{{$}} +; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[[[LOW_K]]:[[HIGH_K]]]{{$}} define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -951,7 +951,7 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset: -; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} +; SIVI: 
buffer_atomic_cmpswap v[[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; SIVI: buffer_store_dword v[[RET]] ; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}} @@ -978,7 +978,7 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset: -; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_atomic_cmpswap v[[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword v[[RET]] @@ -1004,7 +1004,7 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret: -; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SIVI: buffer_atomic_cmpswap v[[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; SIVI: buffer_store_dword v[[RET]] ; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} @@ -1028,7 +1028,7 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64: -; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_atomic_cmpswap v[[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; SIVI: buffer_store_dword v[[RET]] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll index b07921fc3521a..a27361e54cd18 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -1001,8 +1001,8 @@ entry: } ; GCN-LABEL: 
{{^}}atomic_cmpxchg_i64_ret_offset: -; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} -; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: +; CIVI: buffer_atomic_cmpswap_x2 v[[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} +; CIVI: buffer_store_dwordx2 v[[[RET]]: ; GFX9: global_atomic_cmpswap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { @@ -1027,11 +1027,11 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset: -; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} -; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} -; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: +; CI: buffer_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} +; VI: flat_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} +; CIVI: buffer_store_dwordx2 v[[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1052,10 +1052,10 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret: -; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 
glc -; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: +; CIVI: buffer_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; CIVI: buffer_store_dwordx2 v[[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst @@ -1076,11 +1076,11 @@ entry: } ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64: -; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} -; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]: +; CI: buffer_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; VI: flat_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} +; CIVI: buffer_store_dwordx2 v[[[RET]]: -; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v[[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}} define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1107,7 +1107,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64_neg_offset: ; CI: v_mov_b32_e32 v[[LO:[0-9]+]], 0xffffffe0 ; CI: v_mov_b32_e32 v[[HI:[0-9]+]], -1 -; CI: 
buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[[[LO]]:[[HI]]], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffffffe0 ; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1 diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll index 533a1be4e5f08..82c682c7294af 100644 --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -71,8 +71,8 @@ bb: ; uniform load dominated by no-alias store - scalarize ; CHECK-LABEL: @no_memdep_alias_arg -; CHECK: s_load_dwordx2 s{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[4:5], 0x0 -; CHECK: s_load_dword [[SVAL:s[0-9]+]], s{{\[}}[[IN_LO]]:[[IN_HI]]], 0x0 +; CHECK: s_load_dwordx2 s[[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]], s[4:5], 0x0 +; CHECK: s_load_dword [[SVAL:s[0-9]+]], s[[[IN_LO]]:[[IN_HI]]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @@ -120,7 +120,7 @@ entry: ; CHECK: flat_store_dword ; CHECK: v_mov_b32_e32 v[[ADDR_LO:[0-9]+]], s{{[0-9]+}} ; CHECK: v_mov_b32_e32 v[[ADDR_HI:[0-9]+]], s{{[0-9]+}} -; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}} +; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v[[[ADDR_LO]]:[[ADDR_HI]]] ; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) { diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll index 69191eb850db6..10adc3f3af9af 100644 --- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -54,7 +54,7 @@ define amdgpu_kernel void 
@illegal_agpr_to_sgpr_copy_i32() #1 { ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy ; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0 ; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1 -; GCN: ; illegal copy v{{\[}}[[COPY1L]]:[[COPY1H]]] to s[10:11] +; GCN: ; illegal copy v[[[COPY1L]]:[[COPY1H]]] to s[10:11] define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 { %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"() call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) diff --git a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll index 954dad01fa915..98b2f13cddf84 100644 --- a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll @@ -10,20 +10,20 @@ declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, < ; GCN-NEXT: v_readfirstlane_b32 s[[SREG1:[0-9]+]], v[[VREG1:[0-9]+]] ; GCN-NEXT: v_readfirstlane_b32 s[[SREG2:[0-9]+]], v[[VREG2:[0-9]+]] ; GCN-NEXT: v_readfirstlane_b32 s[[SREG3:[0-9]+]], v[[VREG3:[0-9]+]] -; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s{{\[}}[[SREG0]]:[[SREG1]]{{\]}}, v{{\[}}[[VREG0]]:[[VREG1]]{{\]}} -; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG2]]:[[SREG3]]{{\]}}, v{{\[}}[[VREG2]]:[[VREG3]]{{\]}} +; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s[[[SREG0]]:[[SREG1]]], v[[[VREG0]]:[[VREG1]]] +; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SREG2]]:[[SREG3]]], v[[[VREG2]]:[[VREG3]]] ; GCN-NEXT: v_readfirstlane_b32 s[[SREG4:[0-9]+]], v[[VREG4:[0-9]+]] ; GCN-NEXT: v_readfirstlane_b32 s[[SREG5:[0-9]+]], v[[VREG5:[0-9]+]] ; GCN-NEXT: s_and_b64 [[AND0:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP2:vcc]], s{{\[}}[[SREG4]]:[[SREG5]]{{\]}}, v{{\[}}[[VREG4]]:[[VREG5]]{{\]}} +; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP2:vcc]], s[[[SREG4]]:[[SREG5]]], v[[[VREG4]]:[[VREG5]]] ; 
GCN-NEXT: v_readfirstlane_b32 s[[SREG6:[0-9]+]], v[[VREG6:[0-9]+]] ; GCN-NEXT: v_readfirstlane_b32 s[[SREG7:[0-9]+]], v[[VREG7:[0-9]+]] -; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP3:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG6]]:[[SREG7]]{{\]}}, v{{\[}}[[VREG6]]:[[VREG7]]{{\]}} +; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP3:s\[[0-9]+:[0-9]+\]]], s[[[SREG6]]:[[SREG7]]], v[[[VREG6]]:[[VREG7]]] ; GCN-NEXT: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], [[AND0]], [[CMP2]] ; GCN-NEXT: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[AND1]], [[CMP3]] ; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG7]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[[[SREG0]]:[[SREG7]]], {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] @@ -43,13 +43,13 @@ main_body: ; GCN-NEXT: v_readfirstlane_b32 s[[SREG2:[0-9]+]], v[[VREG2:[0-9]+]] ; GCN-NEXT: v_readfirstlane_b32 s[[SREG3:[0-9]+]], v[[VREG3:[0-9]+]] -; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s{{\[}}[[SREG0]]:[[SREG1]]{{\]}}, v{{\[}}[[VREG0]]:[[VREG1]]{{\]}} -; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG2]]:[[SREG3]]{{\]}}, v{{\[}}[[VREG2]]:[[VREG3]]{{\]}} +; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s[[[SREG0]]:[[SREG1]]], v[[[VREG0]]:[[VREG1]]] +; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SREG2]]:[[SREG3]]], v[[[VREG2]]:[[VREG3]]] ; GCN-NEXT: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] ; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG3]]{{\]}} dmask:0x1 +; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s[[[SREG0]]:[[SREG3]]] dmask:0x1 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll index 730bcb44d1a8c..261df086aed7d 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}} +; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]] ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll index 46190f101c16a..732079b4d9ff0 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll @@ -10,7 +10,7 @@ ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}} +; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]] ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index e99b0043e402b..e1b9bb1af72a5 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -176,7 +176,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000 ; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]] -; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}} +; MOVREL: buffer_store_dwordx4 v[[[ELT0]]:[[ELT3]]] define amdgpu_kernel 
void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) { entry: %add = add i32 %in, 1 @@ -240,7 +240,7 @@ entry: ; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]: +; GCN: buffer_store_dwordx4 v[[[ELT0]]: define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) { entry: %ins = insertelement <16 x float> , float 17.0, i32 %in @@ -424,7 +424,7 @@ bb: ; offset puts outside of superegister bounaries, so clamp to 1st element. ; GCN-LABEL: {{^}}extract_largest_inbounds_offset: -; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]] +; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]] ; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]] ; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15 @@ -446,7 +446,7 @@ entry: } ; GCN-LABEL: {{^}}extract_out_of_bounds_offset: -; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}} +; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]] ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]] ; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll index 52baec321bcc3..48220392aea29 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -59,10 +59,10 @@ endif: ; CHECK-LABEL: {{^}}v_cmp_asm: ; CHECK: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; CHECK: v_cmp_ne_u32_e64 s{{\[}}[[MASK_LO:[0-9]+]]:[[MASK_HI:[0-9]+]]{{\]}}, 0, [[SRC]] +; CHECK: v_cmp_ne_u32_e64 s[[[MASK_LO:[0-9]+]]:[[MASK_HI:[0-9]+]]], 0, [[SRC]] ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: buffer_store_dwordx2 v[[[V_LO]]:[[V_HI]]] define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { %sgpr = tail call i64 asm 
"v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in) store i64 %sgpr, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 3a49236934706..252ceb0c7dc72 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -15,7 +15,7 @@ ; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0 ; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], -1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]] -; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] +; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v[[[ELT_FIRST]]:[[ELT_LAST]]] define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) { entry: %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel @@ -54,7 +54,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 v[[VELT_1:[0-9]+]], s[[ELT_1]] ; GCN-DAG: v_mov_b32_e32 v[[VELT_2:[0-9]+]], s[[ELT_2]] ; GCN-DAG: v_mov_b32_e32 v[[VELT_3:[0-9]+]], s[[ELT_3]] -; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[VELT_0]]:[[VELT_3]]] +; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v[[[VELT_0]]:[[VELT_3]]] define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) { entry: %v = insertelement <4 x i32> %vec, i32 1, i32 %sel @@ -71,7 +71,7 @@ entry: ; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0 ; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]] -; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]] +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v[[[ELT_FIRST]]:[[ELT_LAST]]] define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) { entry: %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel @@ -106,8 +106,8 @@ entry: ; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4 ; GCN-DAG: s_cselect_b64 [[CC8:[^,]+]], -1, 0 ; GCN-DAG: v_cndmask_b32_e32 
v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]] -; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]] -; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]] +; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v[[[ELT_FIRST0]]:[[ELT_LAST0]]] +; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v[[[ELT_FIRST1]]:[[ELT_LAST1]]] define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) { entry: %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel @@ -142,7 +142,7 @@ entry: ; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x3c003c00 ; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]] ; GCN: s_andn2_b64 -; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]] +; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[[[KLO]]:[[KHI]]] ; GCN: s_or_b64 define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) { entry: @@ -222,7 +222,7 @@ entry: ; GCN: s_mov_b32 s[[KLO:[0-9]+]], 0x10001 ; GCN: s_mov_b32 s[[KHI:[0-9]+]], s[[KLO]] ; GCN: s_andn2_b64 -; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{\[}}[[KLO]]:[[KHI]]] +; GCN: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[[[KLO]]:[[KHI]]] ; GCN: s_or_b64 define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll index d39ee12abde32..572ff60fc330f 100644 --- a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -19,9 +19,9 @@ define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointe } ; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_constant_pointer_load: -; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}} +; GCN: s_load_dwordx2 s[[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]] ; GCN: v_mov_b32_e32 
[[K:v[0-9]+]], 0x1c8007b -; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]: +; GCN: buffer_store_dword [[K]], off, s[[[SPTR_LO]]: define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(4)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(4)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll index 6a24049b1e2d8..c8b2a09874e0c 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -248,7 +248,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, ; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg: ; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8 ; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}} -; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}} define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) { %in = load i32, i32 addrspace(6)* %in.byref store i32 %in, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index f9a4c0d0a364f..dc991aeef4f16 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -202,7 +202,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} +; GCN: flat_atomic_dec_x2 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64* %out @@ -212,8 +212,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 { ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 glc{{$}} +; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} +; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr) #0 { %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) @@ -224,7 +224,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr) ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} +; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -233,8 +233,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind { ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 
v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} +; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} +; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind { %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) @@ -244,8 +244,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40 glc{{$}} +; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} +; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id @@ -259,8 +259,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40{{$}} +; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} +; GFX9: flat_atomic_dec_x2 
v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id @@ -293,7 +293,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} +; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -306,7 +306,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 ad ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 +; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32 define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -320,7 +320,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} +; GCN: ds_dec_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, 
i64 42, i32 0, i32 0, i1 false) ret void @@ -332,7 +332,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) noun ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} +; GCN: ds_dec_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -343,9 +343,9 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} +; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}} +; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -356,8 +356,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], 
v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} +; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} +; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -369,8 +369,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}} +; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GFX9: global_atomic_dec_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -380,8 +380,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) n ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} +; GFX9: global_atomic_dec_x2 v[[ZERO]], 
v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -392,8 +392,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} -; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} +; CI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} +; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -408,8 +408,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} -; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} +; CI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} +; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 
@llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index e8cbdc314405c..1ad00dd639385 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -153,7 +153,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} +; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -163,7 +163,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 ad ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 +; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32 define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -174,7 +174,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, ; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_u64 
v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} +; GCN: ds_inc_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -183,7 +183,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) noun ; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} +; GCN: ds_inc_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -194,8 +194,8 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}} +; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} +; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -206,8 +206,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 ; GCN: 
v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} +; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} +; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -219,9 +219,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}} +; GFX9: global_atomic_inc_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -231,8 +231,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) n ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, 
s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} +; GFX9: global_atomic_inc_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -243,8 +243,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} -; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} +; CI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} +; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -259,8 +259,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} -; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} +; CI: buffer_atomic_inc_x2 
v[[[KLO]]:[[KHI]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} +; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -351,7 +351,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} +; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64* %out @@ -361,8 +361,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 { ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 glc{{$}} +; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} +; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 { %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) @@ -373,7 +373,7 @@ define amdgpu_kernel void 
@flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} +; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -382,8 +382,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind { ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset: ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} +; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} +; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind { %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) @@ -393,8 +393,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40 glc{{$}} +; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} +; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
v[[[KLO]]:[[KHI]]] offset:40 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id @@ -408,8 +408,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} -; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:40{{$}} +; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} +; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll index b1c2a030ea9f5..df222548051ee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll @@ -11,7 +11,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 @@ -24,10 +24,10 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 
0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { main_body: @@ -37,10 +37,10 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll index c9a2b7fafd6f9..dcf06ae6a723f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -457,7 +457,7 @@ define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) { ; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset: ; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}} ; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s -; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]] +; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]] define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) { %alloca = alloca i32, addrspace(5) %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll 
index 0a914a2e47fb1..0ae12149de211 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -19,7 +19,7 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { @@ -35,7 +35,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -46,12 +46,12 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) 
{ main_body: call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll index 4badd8b753295..147d405ddccf8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[SX]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll index c264dc7ec6978..90c9874a65bf2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[SX]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll index 414fe1502406d..ad295f5763030 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,7 +4,7 @@ ; RUN: llc 
-march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll index cab6c8c0016b1..ce566f2faf2e3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}} +; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index 2944e89069c61..5279f8e997198 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -8,7 +8,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, i64 
addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll index 2dd7430ff78d8..8f3d77b3ff7fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -258,11 +258,11 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* % } ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1: -; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] ; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v[[[VA_LO]]:[[VA_HI]]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { @@ -274,10 +274,10 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* ; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2: ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x1d +; SI-DAG: s_load_dwordx2 s[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] ; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v[[[VB_LO]]:[[VB_HI]]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { @@ -442,7 +442,7 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1) ; SI-LABEL: {{^}}test_div_scale_f64_val_undef_val: ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000 -; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, v[0:1], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]], v[0:1], s[[[K_LO]]:[[K_HI]]] define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) %result0 = extractvalue { double, i1 } %result, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index 881301fee4292..9faacc8ebde71 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 { ; FIXME: Should be able to shift directly into m0 ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset: -; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} +; NOLOOP-DAG: s_load_dwordx2 s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} @@ -67,7 +67,7 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 { ; Variable offset in SGPR with constant add ; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1: -; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} 
+; NOLOOP-DAG: s_load_dwordx2 s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll index a311424a47bf0..9d271075eba54 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll @@ -46,7 +46,7 @@ define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 { ; FIXME: Should be able to shift directly into m0 ; GCN-LABEL: {{^}}gws_init_sgpr_offset: -; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} +; NOLOOP-DAG: s_load_dwordx2 s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} @@ -62,7 +62,7 @@ define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 { ; Variable offset in SGPR with constant add ; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1: -; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}} +; NOLOOP-DAG: s_load_dwordx2 s[[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]] ; NOLOOP-SDAG-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16 ; NOLOOP-SDAG-DAG: s_mov_b32 m0, [[SHL]]{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 2a7ee916f9cdf..9be69a9fb0861 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}is_private_vgpr: -; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 ; 
GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index b8115d9e353d9..7948f0c0a18b9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}is_local_vgpr: -; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]{{\]}} +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 ; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index d81cb58486201..29c3d69a3e025 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -46,7 +46,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() ; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 ; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 -; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GCN-NOT: v_accvgpr_read_b32 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) #0 { @@ -64,7 +64,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 ; GCN-DAG: v_mov_b32_e32 
v[[ONE:[0-9]+]], 1 ; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GCN-NOT: v_accvgpr_read_b32 ; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) #0 { @@ -82,7 +82,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 ; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GCN-NOT: v_accvgpr_read_b32 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) #0 { @@ -100,7 +100,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 ; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GCN-NOT: v_accvgpr_read_b32 ; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) #0 { @@ -118,7 +118,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 ; GCN-DAG: v_mov_b32_e32 
v[[ONE:[0-9]+]], 1 ; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GCN-NOT: v_accvgpr_read_b32 ; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index ee4ef82c0d3a9..65e97ef39beb3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -410,9 +410,9 @@ bb: ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 -; GFX90A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 +; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 +; GFX90A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], 1.0 ; GFX908-COUNT-16: v_accvgpr_read_b32 ; GFX908: global_store_dwordx4 ; GFX90A-NOT: v_accvgpr_read_b32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll index 2ebf3f6633a97..cad0d30ea3daf 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll @@ -11,7 +11,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 @@ -24,10 +24,10 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { main_body: %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) @@ -36,10 +36,10 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll index 7bfdb8966fc54..08551a22e6aa1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -19,7 +19,7 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %voffset) { @@ -29,7 +29,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -39,13 +39,13 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] -; UNPACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; UNPACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] -; PACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, 
s[{{[0-9]+:[0-9]+}}], 0 offen +; PACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -54,7 +54,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -65,12 +65,12 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) { main_body: call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll index 0ebc4e67b4fbe..8804aeb1f5001 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -13,7 +13,7 @@ main_body: } ; GCN-LABEL: 
{{^}}tbuffer_load_d16_xy: -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] ; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] @@ -27,12 +27,12 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xyz: -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; GFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-UNPACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; GFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] +; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { 
main_body: @@ -42,12 +42,12 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; GFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] +; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; GFX10-PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index 281c48513b6ae..fec7954a02255 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -22,7 +22,7 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 
v[[V_HI:[0-9]+]], [[SHR]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] ; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] ; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] @@ -33,7 +33,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -42,14 +42,14 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] ; PACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] -; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] -; GFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] +; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; GFX10-PACKED: tbuffer_store_format_d16_xyz 
v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -58,7 +58,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -68,13 +68,13 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] -; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] +; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] +; GFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { main_body: call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll index 67ced1a64e609..53c8a8c7a0600 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -28,8 +28,8 @@ define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) # ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} -; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]] +; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s[[[LOW1]]:[[HIGH1]]] +; VI-DAG: v_max_f64 v[0:1], v[0:1], s[[[LOW1]]:[[HIGH2]]] define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) store double %rsq_clamp, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll index e6c90336724b5..4369c60e12b05 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll @@ -11,7 +11,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] ; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen @@ -24,10 +24,10 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], 
{{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { main_body: @@ -37,10 +37,10 @@ main_body: } ; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll index dc92d48dfa914..30361a2b36ed3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -19,7 +19,7 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: 
buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { @@ -29,7 +29,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -39,13 +39,13 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] -; UNPACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] -; PACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -54,7 +54,7 @@ main_body: } ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -65,12 +65,12 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 
v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { main_body: call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll index b84257c2ae107..ae97f462e5693 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll @@ -15,7 +15,7 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_d16_xy: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen ; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] ; PREGFX10-PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen @@ -30,11 +30,11 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_d16_xyz: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; 
PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen ; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen -; GFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen +; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; GFX10-PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { main_body: @@ -45,11 +45,11 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 -; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen ; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen -; GFX10-PACKED: tbuffer_load_format_d16_xyzw 
v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen +; PREGFX10-PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen +; GFX10-PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 2c8855a6eaf55..5d05c955ddd0f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_x: ; GCN-DAG: s_load_dwordx4 -; GCN-DAG: s_load_dword{{[x0-2]*}} s{{\[}}[[S_LO:[0-9]+]] +; GCN-DAG: s_load_dword{{[x0-2]*}} s[[[S_LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] ; PREGFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; GFX10: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen @@ -22,7 +22,7 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PREGFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 
0 format:[BUF_NUM_FORMAT_USCALED] idxen ; GFX10-PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen @@ -33,7 +33,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -42,13 +42,13 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]] -; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; GFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; GFX10-PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> @@ -57,7 +57,7 @@ 
main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -67,12 +67,12 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -; GFX10-PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; PREGFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; GFX10-PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll index ef6b1fb883eca..ea1857a5357ed 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll @@ -11,7 +11,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xy: -; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; UNPACKED: tbuffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] ; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] @@ -24,10 +24,10 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xyz: -; UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; UNPACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -; PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { main_body: @@ -37,10 +37,10 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: -; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; UNPACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; UNPACKED: v_mov_b32_e32 
v{{[0-9]+}}, v[[HI]] -; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] +; PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] ; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index 740f068b9fd2f..9da9ca2cc99e9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -19,7 +19,7 @@ main_body: ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { @@ -29,7 +29,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -38,12 +38,12 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: 
v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; UNPACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED-DAG: s_and_b32 [[SHR0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR0]] -; PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) @@ -51,7 +51,7 @@ main_body: } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 @@ -61,11 +61,11 @@ main_body: ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; UNPACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] ; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PACKED: 
tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen +; PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index 18c4afd08e4f3..b5890ff085877 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -51,7 +51,7 @@ bb: } ; GCN-LABEL: {{^}}update_dpp64_test: -; GCN: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] +; GCN: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) { @@ -68,7 +68,7 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047 -; GCN-DAG: load_dwordx2 v{{\[}}[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] +; GCN-DAG: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] ; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} ; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index e2ebd5d2cd792..d4b2d11277e44 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -291,9 +291,9 @@ define amdgpu_kernel void @fma_v2f16_imm_c( } ; GCN-LABEL: {{^}}fma_v4f16 -; GCN: buffer_load_dwordx2 v{{\[}}[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]{{\]}} -; GCN: buffer_load_dwordx2 v{{\[}}[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]{{\]}} -; GCN: buffer_load_dwordx2 v{{\[}}[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]] +; GCN: buffer_load_dwordx2 v[[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]] +; GCN: buffer_load_dwordx2 v[[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]] @@ -345,7 +345,7 @@ define amdgpu_kernel void @fma_v2f16_imm_c( ; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]] ; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[R_V4_F16_LO]]:[[R_V4_F16_HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[R_V4_F16_LO]]:[[R_V4_F16_HI]]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v4f16( diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 7f782eeda5be0..9d0d387bfaa9e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -563,10 +563,10 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspac ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]], -; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] ; GCN-HSA-DAG: 
flat_load_ubyte v[[LO:[0-9]+]], -; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] +; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 @@ -582,8 +582,8 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i ; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]], ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] +; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index eaae56fe04f6d..b47cbcb6e00e4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -96,8 +96,8 @@ entry: ; GCN-HSA-DAG: {{flat|global}}_load_dword v[[LO:[0-9]+]], ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]] +; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] +; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { @@ -111,8 +111,8 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i3 ; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]] ; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]] ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, 
v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] +; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] ; EG: MEM_RAT @@ -143,8 +143,8 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1) ; GCN-NOHSA: buffer_load_dword v[[LO:[0-9]+]] ; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]] ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] +; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = sext <1 x i32> %ld to <1 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 6c93bd361d4f6..f5d1b6386fdb4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -569,10 +569,10 @@ define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace( ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN-NOHSA-DAG: buffer_load_ubyte v[[LO:[0-9]+]], -; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] ; GCN-HSA-DAG: flat_load_ubyte v[[LO:[0-9]+]], -; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] +; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 @@ -588,8 +588,8 @@ define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 ; GCN-HSA: flat_load_sbyte v[[LO:[0-9]+]], ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN-HSA: flat_store_dwordx2 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] +; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 343b37e6098e6..8d2407214752a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -606,7 +606,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace( ; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]], ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]] +; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] @@ -633,7 +633,7 @@ define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 ; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]] +; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll index 4a60ff79e206c..b8cbc7b0892c2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll @@ -74,8 +74,8 @@ entry: ; FUNC-LABEL: {{^}}local_load_v16i8: ; GFX9-NOT: m0 -; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} -; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}} +; GCN: ds_read2_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], v{{[0-9]+}} offset1:1{{$}} +; GCN: ds_write2_b64 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]] 
offset1:1{{$}} ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -455,7 +455,7 @@ define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]], -; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]] +; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] ; EG: LDS_UBYTE_READ_RET ; EG: MOV {{.*}}, literal @@ -474,7 +474,7 @@ define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 a ; GCN: ds_read_i8 v[[LO:[0-9]+]], ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] ; EG: LDS_UBYTE_READ_RET ; EG: ASHR diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll index 52e3fc2860974..027c441a0e372 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll @@ -61,7 +61,7 @@ define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 ad ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVDATA]]:[[HIVDATA]]] offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -77,7 +77,7 @@ define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 
v[[[LOVDATA]]:[[HIVDATA]]] ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -130,7 +130,7 @@ define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]] ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -378,7 +378,7 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) noun ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: ds_add_u64 {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]] offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 @@ -392,7 +392,7 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %pt ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: ds_add_u64 {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst @@ -440,7 +440,7 @@ define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %pt ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} ; GCN-DAG: 
v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_sub_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: ds_sub_u64 {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll b/llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll index 80231e1a182a3..faf18d6c403ca 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-scoped-aa.ll @@ -17,7 +17,7 @@ define i32 @test_memcpy(i32 addrspace(1)* nocapture %p, i32 addrspace(1)* nocapture readonly %q) { ; Check loads of %q are scheduled ahead of that store of the memcpy on %p. ; CHECK-LABEL: test_memcpy: -; CHECK-DAG: global_load_dwordx2 v{{\[}}[[Q0:[0-9]+]]:[[Q1:[0-9]+]]{{\]}}, v[2:3], off +; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: global_load_dwordx4 [[PVAL:v\[[0-9]+:[0-9]+\]]], v[0:1], off offset:16 ; CHECK-DAG: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] ; CHECK: global_store_dwordx4 v[0:1], [[PVAL]], off @@ -39,7 +39,7 @@ define i32 @test_memcpy(i32 addrspace(1)* nocapture %p, i32 addrspace(1)* nocapt define i32 @test_memcpy_inline(i32 addrspace(1)* nocapture %p, i32 addrspace(1)* nocapture readonly %q) { ; Check loads of %q are scheduled ahead of that store of the memcpy on %p. 
; CHECK-LABEL: test_memcpy_inline: -; CHECK-DAG: global_load_dwordx2 v{{\[}}[[Q0:[0-9]+]]:[[Q1:[0-9]+]]{{\]}}, v[2:3], off +; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: global_load_dwordx4 [[PVAL:v\[[0-9]+:[0-9]+\]]], v[0:1], off offset:16 ; CHECK-DAG: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] ; CHECK: global_store_dwordx4 v[0:1], [[PVAL]], off @@ -61,7 +61,7 @@ define i32 @test_memcpy_inline(i32 addrspace(1)* nocapture %p, i32 addrspace(1)* define i32 @test_memmove(i32 addrspace(1)* nocapture %p, i32 addrspace(1)* nocapture readonly %q) { ; Check loads of %q are scheduled ahead of that store of the memmove on %p. ; CHECK-LABEL: test_memmove: -; CHECK-DAG: global_load_dwordx2 v{{\[}}[[Q0:[0-9]+]]:[[Q1:[0-9]+]]{{\]}}, v[2:3], off +; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: global_load_dwordx4 [[PVAL:v\[[0-9]+:[0-9]+\]]], v[0:1], off offset:16 ; CHECK-DAG: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] ; CHECK: global_store_dwordx4 v[0:1], [[PVAL]] @@ -82,9 +82,9 @@ define i32 @test_memmove(i32 addrspace(1)* nocapture %p, i32 addrspace(1)* nocap define i32 @test_memset(i32 addrspace(1)* nocapture %p, i32 addrspace(1)* nocapture readonly %q) { ; Check loads of %q are scheduled ahead of that store of the memset on %p. 
; CHECK-LABEL: test_memset: -; CHECK-DAG: global_load_dwordx2 v{{\[}}[[Q0:[0-9]+]]:[[Q1:[0-9]+]]{{\]}}, v[2:3], off +; CHECK-DAG: global_load_dwordx2 v[[[Q0:[0-9]+]]:[[Q1:[0-9]+]]], v[2:3], off ; CHECK-DAG: v_mov_b32_e32 v[[PVAL:[0-9]+]], 0xaaaaaaaa -; CHECK: global_store_dwordx4 v[0:1], v{{\[}}[[PVAL]]{{:[0-9]+\]}}, off +; CHECK: global_store_dwordx4 v[0:1], v[[[PVAL]]{{:[0-9]+\]}}, off ; CHECK: v_add_nc_u32_e32 v{{[0-9]+}}, v[[Q0]], v[[Q1]] ; CHECK: s_setpc_b64 s[30:31] %p0 = bitcast i32 addrspace(1)* %p to i8 addrspace(1)* diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index 18f46a8b8cf13..0df7feb6516c5 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -67,7 +67,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 @@ -89,7 +89,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace( ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b -; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* @@ -103,7 +103,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspac ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 
0x1c8{{$}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]] define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -530,7 +530,7 @@ define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* % ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}} -; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} +; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]] ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dword v[[HI]] define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll index 4c93b7cde252c..650eddf900ea4 100644 --- a/llvm/test/CodeGen/AMDGPU/missing-store.ll +++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll @@ -10,7 +10,7 @@ ; SI-DAG: ds_read_b64 ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; SI-DAG: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} -; SI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}} +; SI-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]] ; SI-DAG: buffer_store_dword ; SI-DAG: buffer_store_dword ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll index 1644bdc1b270b..d93b55a013597 100644 --- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -6,8 +6,8 @@ ; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32 ; GCN-LABEL: 
{{^}}clobber_vgpr_pair_pointer_add: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} -; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN-DAG: buffer_load_dwordx2 v[[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]] +; GCN-DAG: s_load_dwordx2 s[[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] ; GCN-DAG: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] @@ -16,7 +16,7 @@ ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] -; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, +; GCN: buffer_load_ubyte v{{[0-9]+}}, v[[[PTRLO]]:[[PTRHI]]], define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index b2be8440322be..3af1aa69abc3e 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -12,11 +12,11 @@ ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; 
W64: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W64: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W64: s_xor_b64 exec, exec, [[AND]] ; W64: s_cbranch_execnz [[LOOPBB]] ; W64: s_mov_b64 exec, [[SAVEEXEC]] @@ -29,11 +29,11 @@ ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W32: buffer_load_format_x [[RES:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB]] ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]] @@ -53,11 +53,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], 
s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W64: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB0]] @@ -70,11 +70,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES1:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W64: buffer_load_format_x [[RES1:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB1]] @@ -91,11 +91,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, 
v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W32: buffer_load_format_x [[RES0:v[0-9]+]], v{{[0-9]+}}, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB0]] @@ -108,11 +108,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB1]] @@ -140,11 +140,11 @@ entry: ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W64: 
v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB0]] @@ -160,11 +160,11 @@ entry: ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W64: v_cmp_eq_u64_e32 vcc, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W64: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W64: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W64: s_xor_b64 exec, exec, [[SAVE]] ; W64: s_cbranch_execnz [[LOOPBB1]] @@ -184,11 +184,11 @@ entry: ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, 
s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB0]] @@ -204,11 +204,11 @@ entry: ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v[[VRSRC1:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v[[VRSRC2:[0-9]+]] ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v[[VRSRC3:[0-9]+]] -; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} -; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W32: v_cmp_eq_u64_e32 vcc_lo, s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] +; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W32: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[SAVE:s[0-9]+]], [[AND]] -; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen +; W32: buffer_load_format_x [[RES]], [[IDX]], s[[[SRSRC0]]:[[SRSRC3]]], 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[SAVE]] ; W32: s_cbranch_execnz [[LOOPBB1]] @@ -239,18 +239,18 @@ entry: ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] ; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]] ; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], 
s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} +; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]] ; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] ; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]] ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[S0]]:[[S3]]{{\]}}, {{.*}} idxen +; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; W64-O0: s_xor_b64 exec, exec, [[SAVE]] @@ -263,7 +263,7 @@ entry: ; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1 ; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill -; W64-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec +; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec ; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] @@ -278,18 +278,18 @@ entry: ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] ; 
W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[S0]] ; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}} +; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC0]]:[[SRSRC1]]], v[[[VRSRC0]]:[[VRSRC1]]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]] ; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]] -; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} +; W64-O0-DAG: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s[[[SRSRC2]]:[[SRSRC3]]], v[[[VRSRC2]]:[[VRSRC3]]] ; W64-O0-DAG: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] ; W64-O0-DAG: s_mov_b32 s[[S1:[0-9]+]], s[[SRSRCTMP1]] ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[S0]]:[[S3]]{{\]}}, {{.*}} idxen +; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; W64-O0: s_xor_b64 exec, exec, [[SAVE]] @@ -297,7 +297,7 @@ entry: ; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] ; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] -; W64-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}} +; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; 
W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll index 763d754c1b13b..22f15a2506ae9 100644 --- a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll +++ b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll @@ -1,10 +1,10 @@ ; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: @volatile_load -; GCN: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GCN: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +; GCN: flat_load_dword v{{[0-9]+}}, v[[[LO_VREG]]:[[HI_VREG]]] define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, [8 x i32], i32 addrspace(1)* nocapture %arg1) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll index 962ead6cc8284..643a872f76e29 100644 --- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -45,7 +45,7 @@ endif: ; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0 ; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]] ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, +; CHECK: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]], define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) #1 { entry: @@ -112,7 +112,7 @@ entry: ; A subregister use operand should not be tied. 
; CHECK-LABEL: {{^}}no_fold_tied_subregister: -; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; CHECK: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]] ; CHECK: buffer_store_dword v[[LO]] define amdgpu_kernel void @no_fold_tied_subregister() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index eeb2cc379bb29..118cfe515ec1c 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -63,7 +63,7 @@ define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) } ; FUNC-LABEL: {{^}}scalar_or_literal_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] @@ -75,10 +75,10 @@ define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i3 } ; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 -; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] @@ -92,7 +92,7 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou } ; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64: -; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; 
SI: s_load_dwordx2 s[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: or_b32 ; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63 ; SI-NOT: or_b32 @@ -100,7 +100,7 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou ; SI-NOT: or_b32 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out @@ -124,7 +124,7 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* ; SI-DAG: s_or_b32 [[VAL]], [[VAL]], -8 ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] -; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[V_LO]]:[[V_HI]]] define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = or i64 %a, -8 store i64 %or, i64 addrspace(1)* %out @@ -182,7 +182,7 @@ define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addr } ; FUNC-LABEL: {{^}}vector_or_i64_loadimm: -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm @@ -195,10 +195,10 @@ define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 add ; FIXME: The or 0 should really be removed. 
; FUNC-LABEL: {{^}}vector_or_i64_imm: -; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI: v_or_b32_e32 v[[LO_RESULT:[0-9]+]], 8, v[[LO_VREG]] ; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0 -; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 v[[[LO_RESULT]]:[[HI_VREG]]] ; SI: s_endpgm define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 @@ -211,7 +211,7 @@ define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspa ; SI-DAG: buffer_load_dword v[[LO_VREG:[0-9]+]] ; SI-DAG: v_or_b32_e32 v[[RES_LO:[0-9]+]], -8, v[[LO_VREG]] ; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[RES_LO]]:[[RES_HI]]] ; SI: s_endpgm define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 91f676506a836..10e8d96ed8227 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -52,7 +52,7 @@ define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x floa ; GCN-LABEL: {{^}}fadd_v2_v_imm: ; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 ; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} -; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, <2 x float> 
addrspace(1)* %a, i32 %id @@ -108,7 +108,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) { ; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} ; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 0 ; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0 -; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}} define amdgpu_kernel void @fadd_v2_v_lit_lo0(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id @@ -263,7 +263,7 @@ define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x floa ; GCN-LABEL: {{^}}fmul_v2_v_imm: ; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 ; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} -; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id @@ -382,7 +382,7 @@ define amdgpu_kernel void @fma_v32_vs(<32 x float> addrspace(1)* %a, <32 x float ; GCN-DAG: s_mov_b32 s[[K1:[0-9]+]], 0x42c80000 ; GCN-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000 ; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]] -; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K1]]:{{[0-9:]+}}], v{{\[}}[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}} +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}} define amdgpu_kernel void @fma_v2_v_imm(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, <2 x float> 
addrspace(1)* %a, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll index 490678b019620..234412c72b247 100644 --- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -8,9 +8,9 @@ ; shifted down to the end of the used registers. ; GCN-LABEL: {{^}}store_to_undef: -; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] -; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}} +; OPT-DAG: s_mov_b64 s[[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] +; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]], s[2:3] +; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[[[RSRC_LO]]:[[RSRC_HI]]], 0 offen{{$}} ; -O0 should assume spilling, so the input scratch resource descriptor ; -should be used directly without any copies. @@ -23,27 +23,27 @@ define amdgpu_kernel void @store_to_undef() #0 { } ; GCN-LABEL: {{^}}store_to_inttoptr: -; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] -; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}} +; OPT-DAG: s_mov_b64 s[[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] +; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]], s[2:3] +; OPT: buffer_store_dword v{{[0-9]+}}, off, s[[[RSRC_LO]]:[[RSRC_HI]]], 0 offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_from_undef: -; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] -; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen glc{{$}} +; OPT-DAG: 
s_mov_b64 s[[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] +; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]], s[2:3] +; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[[[RSRC_LO]]:[[RSRC_HI]]], 0 offen glc{{$}} define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32 addrspace(5)* undef ret void } ; GCN-LABEL: {{^}}load_from_inttoptr: -; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] -; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124 glc{{$}} +; OPT-DAG: s_mov_b64 s[[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] +; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]], s[2:3] +; OPT: buffer_load_dword v{{[0-9]+}}, off, s[[[RSRC_LO]]:[[RSRC_HI]]], 0 offset:124 glc{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll index 905694c90445b..c5c6467550ef9 100644 --- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll +++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll @@ -143,7 +143,7 @@ entry: ; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} ; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}} -; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] +; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -180,7 +180,7 @@ entry: ; HSA-ELT4-DAG: buffer_load_dword v[[HI:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} ; HSA-ELT4-DAG: buffer_load_dword v[[LO:[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen{{$}} -; HSA-ELT4: flat_store_dwordx2 
{{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] +; HSA-ELT4: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll index 63beb537fd4f8..9e39736e98701 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll @@ -75,7 +75,7 @@ entry: ; GCN-NOT: buffer_ ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00 -; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]] +; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]] ; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: store <4 x half> , <4 x half> addrspace(5)* %alloca, align 2 @@ -134,7 +134,7 @@ entry: ; GCN-NOT: buffer_ ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001 -; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]] +; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]] ; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 ; OPT: store <4 x i16> , <4 x i16> addrspace(5)* %alloca, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll index 7224a8576dffe..8fd2fb05cea22 100644 --- a/llvm/test/CodeGen/AMDGPU/read_register.ll +++ b/llvm/test/CodeGen/AMDGPU/read_register.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_exec: ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi -; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[}}[[LO]]:[[HI]]{{\]}} +; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 { %exec = call i64 @llvm.read_register.i64(metadata !1) store i64 %exec, i64 addrspace(1)* %out @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_flat_scratch: ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi -; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; CHECK: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { %flat_scratch = call i64 @llvm.read_register.i64(metadata !2) store i64 %flat_scratch, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index d67e10856db8e..f9de42f47b658 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -19,10 +19,10 @@ declare i64 @llvm.readcyclecounter() #0 ; GETREG-SDAG-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 ; GETREG-DAG: s_getreg_b32 [[CNT1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20) ; GETREG-DAG: v_mov_b32_e32 v[[VCNT1:[0-9]+]], [[CNT1]] -; GETREG: global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT1]]:[[ZERO]]] +; GETREG: global_store_dwordx2 v{{.+}}, v[[[VCNT1]]:[[ZERO]]] ; GETREG: s_getreg_b32 [[CNT2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20) ; GETREG: v_mov_b32_e32 v[[VCNT2:[0-9]+]], [[CNT2]] -; GETREG: global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT2]]:[[ZERO]]] +; GETREG: global_store_dwordx2 v{{.+}}, v[[[VCNT2]]:[[ZERO]]] define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.readcyclecounter() diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll index 44e3b54d0e67b..ee73a12733022 100644 --- 
a/llvm/test/CodeGen/AMDGPU/rel32.ll +++ b/llvm/test/CodeGen/AMDGPU/rel32.ll @@ -4,7 +4,7 @@ @g = protected local_unnamed_addr addrspace(4) externally_initialized global i32 0, align 4 ; CHECK-LABEL: rel32_neg_offset: -; CHECK: s_getpc_b64 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{]}} +; CHECK: s_getpc_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]{{]}} ; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4 ; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4 define i32 addrspace(4)* @rel32_neg_offset() { diff --git a/llvm/test/CodeGen/AMDGPU/returnaddress.ll b/llvm/test/CodeGen/AMDGPU/returnaddress.ll index 1c0139c4e1252..d9fff7c451174 100644 --- a/llvm/test/CodeGen/AMDGPU/returnaddress.ll +++ b/llvm/test/CodeGen/AMDGPU/returnaddress.ll @@ -67,9 +67,9 @@ declare void @callee() ; GCN-LABEL: {{^}}multi_use: ; GCN-DAG: v_mov_b32_e32 v[[LO:4[0-9]+]], s30 ; GCN-DAG: v_mov_b32_e32 v[[HI:4[0-9]+]], s31 -; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] ; GCN: s_swappc_b64 -; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] define void @multi_use() nounwind { entry: %ret0 = tail call i8* @llvm.returnaddress(i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll index 797fbc2712b0f..0b4e05c6d0f2f 100644 --- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -3,7 +3,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k0: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm @@ -17,7 +17,7 @@ define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* 
%out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k1: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm @@ -31,7 +31,7 @@ define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k2: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm @@ -45,7 +45,7 @@ define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k3: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm @@ -59,7 +59,7 @@ define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k4: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm @@ -74,7 +74,7 @@ define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k5: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} ; 
SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm @@ -88,7 +88,7 @@ define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k6: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm @@ -103,7 +103,7 @@ define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k7: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} ; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm @@ -118,7 +118,7 @@ define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k8: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm @@ -133,7 +133,7 @@ define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 
addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k9: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm @@ -148,7 +148,7 @@ define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1 ; SI-LABEL: {{^}}s_movk_i32_k10: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm @@ -163,7 +163,7 @@ define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace( ; SI-LABEL: {{^}}s_movk_i32_k11: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm @@ -178,7 +178,7 @@ define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace( ; SI-LABEL: {{^}}s_movk_i32_k12: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], 
v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index f3c2a925c15c4..f71bb5ce5ccbe 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -58,9 +58,9 @@ done: ; preds = %loop ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} ; SI-DAG: s_mov_b32 -; SI-DAG: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]] +; SI-DAG: s_load_dword [[OUT:s[0-9]+]], s[[[PTR_LO]]:[[PTR_HI]]], [[OFFSET]] -; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8 +; CI: s_load_dword [[OUT:s[0-9]+]], s[[[PTR_LO]]:[[PTR_HI]]], 0xbb8 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] ; GCN-NOHSA: buffer_store_dword [[V_OUT]] ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]] diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll index 9fbdc60e5cd80..0a68d9f1173c6 100644 --- a/llvm/test/CodeGen/AMDGPU/select-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -46,7 +46,7 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, fl ; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]] ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c @@ -62,7 +62,7 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i3 ; GCN: s_and_b64 vcc, vcc, [[CMP1]] ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: 
v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c @@ -115,7 +115,7 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, flo ; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]] ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c @@ -131,7 +131,7 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 ; GCN: s_or_b64 vcc, vcc, [[CMP1]] ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll index c7d6301de3419..61729cf88b778 100644 --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -216,7 +216,7 @@ define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> } ; GCN-LABEL: {{^}}s_select_v2f32: -; GCN-DAG: s_load_dwordx4 
s{{\[}}[[ALO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx4 s[[[ALO:[0-9]+]]:[[BHI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index cb77e8268699e..30487eac73d1a 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -78,10 +78,10 @@ define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, ; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64: ; GCN: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; GCN-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000 +; GCN-DAG: s_bfe_i64 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], [[VAL]], 0x10000 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 63 @@ -92,10 +92,10 @@ define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, ; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64: ; GCN: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; GCN-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000 +; GCN-DAG: s_bfe_i64 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], [[VAL]], 0x80000 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 56 @@ -106,10 +106,10 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, ; FUNC-LABEL: 
{{^}}sext_in_reg_i16_to_i64: ; GCN: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; GCN-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000 +; GCN-DAG: s_bfe_i64 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], [[VAL]], 0x100000 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b @@ -121,10 +121,10 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a ; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64: ; GCN: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; GCN-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000 +; GCN-DAG: s_bfe_i64 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], [[VAL]], 0x200000 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 32 @@ -150,16 +150,16 @@ define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a ; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64: ; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_lshl_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GFX89: {{flat|global}}_load_dwordx2 -; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: v_lshlrev_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]] 
+; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -177,16 +177,16 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 a ; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64: ; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_lshl_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GFX89: {{flat|global}}_load_dwordx2 -; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: v_lshlrev_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]] +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -204,16 +204,16 @@ define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 a ; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64: ; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_lshl_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GFX89: {{flat|global}}_load_dwordx2 -; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: v_lshlrev_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GFX89: 
{{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]] +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -231,13 +231,13 @@ define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 ; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64: ; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; SI: v_lshl_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], ; GFX89: {{flat|global}}_load_dwordx2 -; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; GFX89: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] -; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[SHR]]] define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -461,17 +461,17 @@ define amdgpu_kernel void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocaptu ; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use: ; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_lshl_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GFX89: {{flat|global}}_load_dwordx2 -; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: v_lshlrev_b64 v[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]] ; GCN-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] ; GCN-DAG: v_and_b32_e32 
v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]] +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[RESULT_LO]]:[[RESULT_HI]]] define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -491,17 +491,17 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %o ; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64_move_use: ; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; SI: v_lshl_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], ; GFX89: {{flat|global}}_load_dwordx2 -; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; GFX89: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], ; GCN-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]] +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[RESULT_LO]]:[[RESULT_HI]]] define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll index b574e5d98c402..4e796f3384e4b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-local-cse.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3 target triple = "amdgcn-amd-amdhsa" ; CHECK-LABEL: {{^}}t0: -; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0 +; CHECK: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] ; There should be no redundant copies from PTR_HI. ; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll index ef2acafa66d9a..d4cf884fbd08d 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -219,7 +219,7 @@ ENDIF: ; preds = %LOOP ; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 11 ; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 13 -; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}} +; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v[[[SAMPLE_LO]]:[[SAMPLE_HI]]] ; CHECK: exp ; CHECK: s_endpgm define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(4)* inreg %arg, [32 x <4 x i32>] addrspace(4)* inreg %arg1, [16 x <8 x i32>] addrspace(4)* inreg %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { @@ -322,7 +322,7 @@ ENDIF69: ; preds = %LOOP68 ; [[END]]: ; CHECK: v_add_{{[iu]}}32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}} -; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}} +; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]] ; CHECK: s_branch define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, 
[16 x <4 x i32>] addrspace(4)* inreg %arg2, [32 x <8 x i32>] addrspace(4)* inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index b08d54c7911b6..378212dc326fb 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -8,7 +8,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -26,7 +26,7 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -42,7 +42,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -58,7 +58,7 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addr ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -75,7 +75,7 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 add ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -92,7 +92,7 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}} ; GCN-DAG: v_mov_b32_e32 
v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -108,7 +108,7 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -124,7 +124,7 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -140,7 +140,7 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 a ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +; 
GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -155,11 +155,11 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 a ; Spans the dword boundary, so requires full shift. ; Truncated after the shift, so only low shift result is used. ; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] ; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -176,7 +176,7 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -190,11 +190,11 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 ; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64: ; GCN-DAG: v_mov_b32_e32 
v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] ; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -211,7 +211,7 @@ define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]] define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -225,9 +225,9 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 ; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64: ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] ; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 -; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() 
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -289,7 +289,7 @@ define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %ou } ; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32: -; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]] ; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 ; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]] ; GCN-NOT: v[[SHRLO]] @@ -314,7 +314,7 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* ; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]] ; GCN-NOT: v[[SHRLO]] ; GCN-NOT: v[[SHRHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]] define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -331,11 +331,11 @@ define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64: ; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27 +; GCN-DAG: v_lshr_b64 v[[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]], [[VAL]], 27 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} -; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]] +; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]] define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -354,8 +354,8 @@ define 
amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspac ; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]] ; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]] ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}} +; GCN-DAG: buffer_store_dwordx2 v[[[SHR]]:[[ZERO_SHR]]] +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO_BFE]]] define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -372,7 +372,7 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspac ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:{{[0-9]+\]}} ; GCN: buffer_store_dword v[[ZERO]] define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll index 9513c72275b4e..1c42fe4711ed8 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -7,7 +7,7 @@ ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 
addrspace(1)* %in %shl = lshr i64 %val, 35 @@ -19,7 +19,7 @@ define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 63 @@ -31,7 +31,7 @@ define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 33 @@ -42,7 +42,7 @@ define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-LABEL: {{^}}lshr_i64_32: ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 32 @@ -57,7 +57,7 @@ define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23 -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]] define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 
addrspace(1)* %in %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff @@ -72,7 +72,7 @@ define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 35 @@ -83,7 +83,7 @@ define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}shl_i64_const_32: ; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 32 @@ -95,7 +95,7 @@ define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspac ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 63 @@ -218,10 +218,10 @@ define amdgpu_kernel void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addr } ; GCN-LABEL: {{^}}trunc_shl_16_v2i32_v2i64: -; GCN: buffer_load_dwordx4 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx4 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}} ; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]] -; GCN: buffer_store_dwordx2 
v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESLO]]:[[RESHI]]] define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in %shl = shl <2 x i64> %val, @@ -232,9 +232,9 @@ define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out ; GCN-LABEL: {{^}}trunc_shl_31_i32_i64_multi_use: ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31 +; GCN: v_lshl_b64 v[[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]], [[VAL]], 31 ; GCN: buffer_store_dword v[[RESLO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[RESLO]]:[[RESHI]]] define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 31 diff --git a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll index 2fe064e5dc953..845b88b86e3da 100644 --- a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll @@ -8,7 +8,7 @@ ; CHECK: v_add_u32_e32 v[[ADD:[0-9]+]], vcc, 0xc80, v[[SHL]] ; CHECK-NOT: v_lshl ; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADD]] -; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]: define amdgpu_kernel void @add_const_offset(i32 addrspace(1)* nocapture %arg) { bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -25,7 +25,7 @@ bb: ; CHECK: v_or_b32_e32 v[[OR:[0-9]+]], 0x1000, v[[SHL]] ; CHECK-NOT: v_lshl ; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]] -; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]: define amdgpu_kernel void @or_const_offset(i32 addrspace(1)* nocapture %arg) { bb: %id = tail call i32 
@llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll index 098c69f7e21a9..80755a01f425c 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -54,7 +54,7 @@ define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspa } ; SI-LABEL: {{^}}test_add_shl_add_constant: -; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3 ; SI: s_add_i32 [[RESULT:s[0-9]+]], [[SHL3]], s[[Y]] ; SI: s_addk_i32 [[RESULT]], 0x3d8 @@ -69,7 +69,7 @@ define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8 } ; SI-LABEL: {{^}}test_add_shl_add_constant_inv: -; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13 +; SI-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 ; SI: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3 ; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], s[[Y]] ; SI: s_addk_i32 [[TMP]], 0x3d8 diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll index 0e56c2da87f0e..66b7b342b31df 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll @@ -1,12 +1,12 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s ; GCN-LABEL: {{^}}shl_base_atomicrmw_global_atomic_csub_ptr: -; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5] +; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 43 ; GCN: v_add_co_u32 v[[EXTRA_LO:[0-9]+]], vcc_lo, 0x80, v4 ; GCN: v_add_co_ci_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc_lo, 0, v5, vcc_lo -; GCN: global_atomic_csub 
v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512 glc -; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}} +; GCN: global_atomic_csub v{{[0-9]+}}, v[[[LO]]:[[HI]]], [[K]], off offset:512 glc +; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]] define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 { %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32 %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll index d1a6d7df63527..d4709f2f7c189 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -3,10 +3,10 @@ ; GCN-LABEL: {{^}}shl_base_atomicrmw_global_ptr: ; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 ; GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc -; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5] +; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5] ; GCN-DAG: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 -; GCN-DAG: global_atomic_and v{{\[}}[[LO]]:[[HI]]{{\]}}, [[THREE]], off offset:512 -; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}} +; GCN-DAG: global_atomic_and v[[[LO]]:[[HI]]], [[THREE]], off offset:512 +; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]] define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 { %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32 %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64 @@ -20,10 +20,10 @@ define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace ; 
GCN-LABEL: {{^}}shl_base_global_ptr_global_atomic_fadd: ; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4 ; GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc -; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5] +; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000 -; GCN-DAG: global_atomic_add_f32 v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512 -; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}} +; GCN-DAG: global_atomic_add_f32 v[[[LO]]:[[HI]]], [[K]], off offset:512 +; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]] define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 { %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32 %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index ad732f3df216e..0196f7d5eb447 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -83,13 +83,13 @@ define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load ; GCN-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} -; CI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 +; CI: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x1 ; CI: buffer_store_dword -; CI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 +; CI: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x3 -; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x4 +; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x4 ; GFX9: global_store_dword -; 
GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xc +; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0xc ; CI: buffer_store_dword ; GFX9: global_store_dword @@ -113,11 +113,11 @@ define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} -; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 -; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 +; CI-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x1 +; CI-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x3 -; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x4 -; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xc +; GFX9-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x4 +; GFX9-DAG: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0xc ; GCN-DAG: ds_write_b32 ; CI: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 62ae206572b6e..fb30f0716afbf 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -19,14 +19,14 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 ; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0 ; VI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]] -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}} +; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] ; VI: s_endpgm ; SI-DAG: s_cmp_eq_u32 ; SI-DAG: s_cselect_b64 vcc, -1, 0 ; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}, vcc ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}} +; SI: flat_store_dwordx2 
v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] ; SI: s_endpgm define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 @@ -54,7 +54,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i6 } ; GCN-LABEL: @v_sint_to_fp_i64_to_f64 -; GCN: flat_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: flat_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN-DAG: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] ; GCN-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] ; GCN-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index ff2c8e2381792..1d40d1e8e1d06 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -106,7 +106,7 @@ define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> } ; GCN-LABEL: {{^}}s_abs_v4i16: -; GFX9: s_load_dwordx2 s{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, s[0:1], 0x2c +; GFX9: s_load_dwordx2 s[[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]], s[0:1], 0x2c ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[VAL0]] ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]] @@ -131,7 +131,7 @@ define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> % } ; GCN-LABEL: {{^}}v_abs_v4i16: -; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; GFX9: global_load_dwordx2 v[[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]] ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]] ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]] diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll index 2e0633d6b70b2..10e243c3dca4a 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll @@ -1,7 +1,7 @@ ; RUN: llc 
-mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN ; GCN-LABEL: ; %bb.0: -; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]], 0x0 ; GCN: s_waitcnt lgkmcnt(0) ; GCN: global_store_dword v diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll index 39d9ac33278e2..5a0065ec14a0b 100644 --- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll @@ -590,7 +590,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i64_simm16: ; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2 ; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1 -; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] ; SI: v_cmp_eq_u64_e32 define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { @@ -628,7 +628,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ne_i64_simm16: ; VI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x4d2 ; VI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 1 -; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] ; SI: v_cmp_ne_u64_e32 define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll index 7fe003ff54209..1d38c520dfeb1 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll @@ -89,7 +89,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] -; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} +; VI: buffer_store_dwordx2 v[[[ADD]]:[[VZERO]]], off, 
{{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid @@ -130,7 +130,7 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1 ; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; VI-NEXT: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 64ee6bf7b6362..e38f8fa00f8ac 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable ; GCN-LABEL: {{^}}s_sub_i32: -; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}} +; GCN: s_load_dwordx2 s[[[A:[0-9]+]]:[[B:[0-9]+]]] ; GCN: s_load_dwordx2 ; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]] define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index af6fe1eddadd1..922d9e5f4eaa4 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -36,9 +36,9 @@ define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, [8 x i32], } ; GCN-LABEL: {{^}}trunc_shl_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN: 
s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2 +; SI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN: s_lshl_b64 s[[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s[[[LO_SREG]]:{{[0-9]+\]}}, 2 ; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] ; SI: buffer_store_dword v[[LO_VREG]], @@ -93,11 +93,11 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) } ; GCN-LABEL: {{^}}s_trunc_i64_to_i1: -; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c +; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 +; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: s_bitcmp1_b32 s[[SLO]], 0 -; SI: s_cselect_b64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], -1, 0 -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] +; SI: s_cselect_b64 s[[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], -1, 0 +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s[[[VLO]]:[[VHI]]] ; VI: s_cselect_b32 {{s[0-9]+}}, 63, -12 define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) { %trunc = trunc i64 %x to i1 @@ -107,8 +107,8 @@ define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], } ; GCN-LABEL: {{^}}v_trunc_i64_to_i1: -; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} -; VI: flat_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} +; SI: buffer_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}} +; VI: flat_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}} ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]] ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll 
b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 1f26cd39c4b81..b55a429cc7a89 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; GCN-LABEL: {{^}}v_uint_to_fp_i64_to_f64 -; GCN: flat_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: flat_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN-DAG: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] ; GCN-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] ; GCN-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 @@ -82,7 +82,7 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1) ; SI-DAG: s_cselect_b64 vcc, -1, 0 ; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, {{v[0-9]+}}, vcc ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}} +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] ; GCN: s_endpgm define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index bd4360cb2e1f3..b69140d408ee3 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -538,10 +538,10 @@ define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrs } ; SI-LABEL: {{^}}constant_align4_merge_load_2_i32: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void 
@constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 { %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1 %v0 = load i32, i32 addrspace(4)* %p, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index d6b37506fbb75..4b52bc7153539 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -248,7 +248,7 @@ ENDIF: ; preds = %IF, %main_body } ; GCN-LABEL: {{^}}icmp_users_different_blocks: -; GCN: s_load_dwordx2 s{{\[}}[[COND0:[0-9]+]]:[[COND1:[0-9]+]]{{\]}} +; GCN: s_load_dwordx2 s[[[COND0:[0-9]+]]:[[COND1:[0-9]+]]] ; GCN: s_cmp_lt_i32 s[[COND0]], 1 ; GCN: s_cbranch_scc1 [[EXIT:.L[0-9_A-Za-z]+]] ; GCN: s_cmp_gt_i32 s[[COND1]], 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index caf1f524a8070..6d86454708701 100644 --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -28,8 +28,8 @@ define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %o } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], s[[SGPR0]], [[VGPR1]] ; GCN: buffer_store_dword [[RESULT]] @@ -40,7 +40,7 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace( } ; GCN-LABEL: {{^}}test_use_s_v_s: -; SI: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: 
s_load_dwordx2 s[[[SA:[0-9]+]]:[[SB:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI: buffer_load_dword [[VA0:v[0-9]+]] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] @@ -52,7 +52,7 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; VI: s_load_dwordx2 s[[[SA:[0-9]+]]:[[SB:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]] @@ -73,8 +73,8 @@ define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, fl } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], [[VGPR1]], s[[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] @@ -85,8 +85,8 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace( } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] ; GCN: v_fma_f32 
[[RESULT:v[0-9]+]], [[VGPR1]], s[[SGPR0]], s[[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] @@ -151,7 +151,7 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspa } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]] ; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000 @@ -181,7 +181,7 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspa } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]] ; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000 @@ -211,7 +211,7 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspa } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]] ; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000 @@ -229,8 +229,8 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addr } ; GCN-LABEL: {{^}}test_s0_s1_k_f32: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 
s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]] @@ -251,17 +251,17 @@ define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, ; FIXME: Immediate in SGPRs just copied to VGPRs ; GCN-LABEL: {{^}}test_s0_s1_k_f64: ; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x1d|0x74}} +; GCN-DAG: s_load_dwordx2 s[[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x1d|0x74}} ; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000 ; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] -; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}} +; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK0_SUB1]]] ; Same zero component is re-used for half of each immediate. 
; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 -; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}} +; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v[[[VS1_SUB0]]:[[VS1_SUB1]]], v[[[VZERO]]:[[VK1_SUB1]]] ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]] diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index aa82148782062..44dc9aaf03d13 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -52,7 +52,7 @@ define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) ; (select (cmp (sgprX, constant)), constant, sgprZ) ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32: -; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s[0:1], {{0x4c|0x13}} +; GCN: s_load_dwordx2 s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s[0:1], {{0x4c|0x13}} ; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0 ; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0 @@ -87,7 +87,7 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* } ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32: -; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; GCN-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0 ; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0 ; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]] @@ -212,9 +212,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* % } ; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64: -; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]{{\]}} -; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, 
v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} +; GCN: {{buffer|flat|global}}_load_dwordx2 v[[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]] +; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v[[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]] +; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, v[[[X_LO]]:[[X_HI]]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll index 65207900f1e66..40b61ece2380f 100644 --- a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll @@ -25,7 +25,7 @@ entry: ; A little more complicated case where more sub-dword loads could be coalesced ; if they are not widening earlier. ; GCN-LABEL: {{^}}load_4i16: -; GCN: s_load_dwordx2 s{{\[}}[[D0:[0-9]+]]:[[D1:[0-9]+]]{{\]}}, s[4:5], 0x4 +; GCN: s_load_dwordx2 s[[[D0:[0-9]+]]:[[D1:[0-9]+]]], s[4:5], 0x4 ; GCN-NOT: s_load_dword {{s[0-9]+}}, s[4:5], 0x4 ; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D0]], 16 ; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D1]], 16 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index f42df585df2a1..8b8a430863dd8 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -22,8 +22,8 @@ ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 ; OFFREG is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload +; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword 
v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 768 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 84ee7c509f87e..476e0f718b20f 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -828,10 +828,10 @@ main_body: ; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}} ; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}| ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] -; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| +; GFX1064: v_cmp_eq_f32_e64 s[[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] -; GCN: store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s +; GCN: store_dwordx2 v{{[0-9]+}}, v[[[V_LO]]:[[V_HI]]], s define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) @@ -843,10 +843,10 @@ define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src ; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}} ; GFX1032-DAG: v_cmp_eq_u32_e64 [[C_LO:vcc_lo|s[0-9]+]], 0x64, {{s[0-9]+}} ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[C_LO]] -; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}} +; GFX1064: v_cmp_eq_u32_e64 s[[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}} ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] -; GCN: store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s +; GCN: store_dwordx2 v{{[0-9]+}}, v[[[V_LO]]:[[V_HI]]], s define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) { %result = 
call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, i64 addrspace(1)* %out @@ -856,7 +856,7 @@ define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) ; GCN-LABEL: {{^}}test_intr_fcmp_i32: ; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}| ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] -; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| +; GFX1064: v_cmp_eq_f32_e64 s[[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] ; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) { @@ -869,7 +869,7 @@ define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src ; GCN-LABEL: {{^}}test_intr_icmp_i32: ; GFX1032-DAG: v_cmp_eq_u32_e64 s[[C_LO:[0-9]+]], 0x64, {{s[0-9]+}} ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} -; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}} +; GFX1064: v_cmp_eq_u32_e64 s[[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}} ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} ; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) { diff --git a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll index 4ae219949c220..e2858fd259362 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll @@ -8,7 +8,7 @@ ; GCN: v_cmp_u_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], ; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]] ; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]] -; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}} +; GCN: v_cmp_lt_i64_e32 vcc, -1, v[[[VSEL]]:[[VSEL_EXT]]] define amdgpu_kernel void 
@widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 { bb: %tmp = extractelement <4 x double> %arg, i64 0 @@ -30,7 +30,7 @@ bb: ; GCN: v_cmp_eq_u64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], ; GCN: v_cndmask_b32_e64 v[[VSEL:[0-9]+]], 0, -1, [[CMP]] ; GCN: v_mov_b32_e32 v[[VSEL_EXT:[0-9]+]], v[[VSEL]] -; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[VSEL]]:[[VSEL_EXT]]{{\]}} +; GCN: v_cmp_lt_i64_e32 vcc, -1, v[[[VSEL]]:[[VSEL_EXT]]] define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 { bb: %tmp = extractelement <4 x i64> %arg, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index ca831154cde23..44f3d6d7a46f9 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -12,7 +12,7 @@ define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) { %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) -; GFX9: s_or_saveexec_b64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, -1 +; GFX9: s_or_saveexec_b64 s[{{[0-9]+}}:{{[0-9]+}}], -1 ; GFX9-DAG: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-DAG: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] @@ -31,7 +31,7 @@ define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) { %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] -; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] +; GFX9-O0: v_cmp_eq_u32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[FIRST]], v[[SECOND]] %tmp138 = icmp eq i32 %tmp122, %tmp137 %tmp139 = sext i1 %tmp138 to i32 %tmp140 = shl nsw i32 %tmp139, 1 @@ -53,7 +53,7 @@ entry: ; GFX9-O3: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] ; GFX9-O0: v_add_u32_e64 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] ; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] -; 
GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[FIRST]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) @@ -68,7 +68,7 @@ if: ; GFX9-O3: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] ; GFX9-O0: v_add_u32_e64 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] ; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] -; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[SECOND]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) @@ -77,9 +77,9 @@ if: merge: %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ] ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] -; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET]] -; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET]] -; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] +; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:[[FIRST_IMM_OFFSET]] +; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:[[SECOND_IMM_OFFSET]] +; GFX9-O0: v_cmp_eq_u32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[FIRST]], v[[SECOND]] %tmp138 = icmp eq i32 %tmp122, %merge_value %tmp139 = sext i1 %tmp138 to i32 %tmp140 = shl nsw i32 %tmp139, 1 @@ -138,9 +138,9 
@@ define i64 @called_i64(i64 %a) noinline { ; GFX9-LABEL: {{^}}call_i64: define amdgpu_kernel void @call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { -; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}} +; GFX9: s_load_dwordx2 s[[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]] -; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}} +; GFX9-O0: s_mov_b64 s[[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]], 0{{$}} ; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] ; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]] @@ -203,7 +203,7 @@ define amdgpu_cs void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) { %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) -; GFX9: s_or_saveexec_b64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, -1 +; GFX9: s_or_saveexec_b64 s[{{[0-9]+}}:{{[0-9]+}}], -1 ; GFX9-DAG: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-DAG: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] @@ -222,7 +222,7 @@ define amdgpu_cs void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) { %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] -; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] +; GFX9-O0: v_cmp_eq_u32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[FIRST]], v[[SECOND]] %tmp138 = icmp eq i32 %tmp122, %tmp137 %tmp139 = sext i1 %tmp138 to i32 %tmp140 = shl nsw i32 %tmp139, 1 @@ -244,7 +244,7 @@ entry: ; GFX9-O3: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] ; GFX9-O0: v_add_u32_e64 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] ; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] -; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[FIRST]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 
offset:[[FIRST_IMM_OFFSET:[0-9]+]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121) @@ -259,7 +259,7 @@ if: ; GFX9-O3: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] ; GFX9-O0: v_add_u32_e64 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] ; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] -; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[SECOND]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) @@ -268,9 +268,9 @@ if: merge: %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ] ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] -; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET]] -; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET]] -; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] +; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:[[FIRST_IMM_OFFSET]] +; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:[[SECOND_IMM_OFFSET]] +; GFX9-O0: v_cmp_eq_u32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[FIRST]], v[[SECOND]] %tmp138 = icmp eq i32 %tmp122, %merge_value %tmp139 = sext i1 %tmp138 to i32 %tmp140 = shl nsw i32 %tmp139, 1 @@ -329,9 +329,9 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline { ; GFX9-LABEL: {{^}}strict_wwm_call_i64: define amdgpu_kernel void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 
inreg %arg) { -; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}} +; GFX9: s_load_dwordx2 s[[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]] -; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}} +; GFX9-O0: s_mov_b64 s[[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]], 0{{$}} ; GFX9-O0-DAG: v_mov_b32_e32 v9, s[[ARG_HI]] ; GFX9-O0-DAG: v_mov_b32_e32 v8, s[[ARG_LO]] diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index df32706600699..d85733d85363c 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -173,7 +173,7 @@ endif: } ; FUNC-LABEL: {{^}}scalar_xor_literal_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} ; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s{{[0-9]+}}, 0xf237b ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s{{[0-9]+}}, 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] @@ -185,10 +185,10 @@ define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i } ; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64: -; SI: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} +; SI: s_load_dwordx4 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 -; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] @@ -202,7 +202,7 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %o } ; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64: -; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 
{{0x13|0x4c}} +; SI: s_load_dwordx2 s[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} ; SI-NOT: xor_b32 ; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63 ; SI-NOT: xor_b32 @@ -210,7 +210,7 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %o ; SI-NOT: xor_b32 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s{{[0-9]+}} ; SI-NOT: xor_b32 -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]] define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { %or = xor i64 %a, 63 store i64 %or, i64 addrspace(1)* %out @@ -227,7 +227,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, } ; FUNC-LABEL: {{^}}vector_xor_i64_neg_inline_imm: -; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]] ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}} ; SI: s_endpgm @@ -239,7 +239,7 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, } ; FUNC-LABEL: {{^}}vector_xor_literal_i64: -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index d7a7d1de23fc8..01820ea1854bb 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -8,7 +8,7 @@ ; GCN: {{^}}s_mad_zext_i32_to_i64: ; GCN: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} -; GCN: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v[0:[[V_ZERO]]] define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, 
i32 %c) #0 { entry: %tmp0 = mul i32 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll index 69c42afb9ad5a..78a367875ead6 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}zext_or_operand_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN: buffer_load_dword v[[LD32:[0-9]+]] ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] @@ -10,7 +10,7 @@ ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 @@ -21,7 +21,7 @@ define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrs } ; GCN-LABEL: {{^}}zext_or_operand_commute_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: buffer_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; GCN: buffer_load_dword v[[LD32:[0-9]+]] ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] @@ -30,7 +30,7 @@ define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrs ; GCN-NOT: v[[HI]] ; GCN-NOT: _or_ ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]] define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 diff --git a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll 
b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll index 8e48f612f3baa..3e9fac8b03f81 100644 --- a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll +++ b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll @@ -29,7 +29,7 @@ entry: ; CHECK: movw [[BASE:r[0-9]+]], :lower16:static_val ; CHECK: movt [[BASE]], :upper16:static_val ; ldm is not formed when the coalescer failed to coalesce everything. -; CHECK: ldrd r2, [[TMP:r[0-9]+]], {{\[}}[[BASE]]{{\]}} +; CHECK: ldrd r2, [[TMP:r[0-9]+]], [[[BASE]]] ; CHECK: movw r0, #555 define i32 @main() { entry: @@ -56,7 +56,7 @@ entry: ; CHECK: movw [[BASE:r[0-9]+]], :lower16:static_val ; CHECK: movt [[BASE]], :upper16:static_val ; ldm is not formed when the coalescer failed to coalesce everything. -; CHECK: ldrd r2, [[TMP:r[0-9]+]], {{\[}}[[BASE]]{{\]}} +; CHECK: ldrd r2, [[TMP:r[0-9]+]], [[[BASE]]] ; CHECK: movw r0, #555 define i32 @main_fixed_arg() { entry: diff --git a/llvm/test/CodeGen/ARM/Windows/tls.ll b/llvm/test/CodeGen/ARM/Windows/tls.ll index 931621a5d3930..4cd7f54deff34 100644 --- a/llvm/test/CodeGen/ARM/Windows/tls.ll +++ b/llvm/test/CodeGen/ARM/Windows/tls.ll @@ -17,13 +17,13 @@ define i32 @f() { ; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index ; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index -; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]] -; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44] -; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2] +; CHECK-NEXT: ldr [[INDEX:r[0-9]]], [[[TLS_INDEX]]] +; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], [[[TEB]], #44] +; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], [[[TLS_POINTER]], [[INDEX]], lsl #2] ; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:\.LCPI[0-9]+_[0-9]+]] -; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]] +; CHECK-NEXT: ldr r0, [[[TLS]], [[SLOT]]] ; CHECK: [[CPI]]: ; CHECK-NEXT: .long i(SECREL32) @@ -37,13 +37,13 @@ define i32 @e() { ; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index ; CHECK-NEXT: movt 
[[TLS_INDEX]], :upper16:_tls_index -; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]] -; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44] -; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2] +; CHECK-NEXT: ldr [[INDEX:r[0-9]]], [[[TLS_INDEX]]] +; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], [[[TEB]], #44] +; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], [[[TLS_POINTER]], [[INDEX]], lsl #2] ; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:\.LCPI[0-9]+_[0-9]+]] -; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]] +; CHECK-NEXT: ldr r0, [[[TLS]], [[SLOT]]] ; CHECK: [[CPI]]: ; CHECK-NEXT: .long j(SECREL32) @@ -57,13 +57,13 @@ define i32 @d() { ; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index ; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index -; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]] -; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44] -; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2] +; CHECK-NEXT: ldr [[INDEX:r[0-9]]], [[[TLS_INDEX]]] +; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], [[[TEB]], #44] +; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], [[[TLS_POINTER]], [[INDEX]], lsl #2] ; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:\.LCPI[0-9]+_[0-9]+]] -; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]] +; CHECK-NEXT: ldr r0, [[[TLS]], [[SLOT]]] ; CHECK: [[CPI]]: ; CHECK-NEXT: .long k(SECREL32) @@ -77,13 +77,13 @@ define i32 @c() { ; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index ; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index -; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]] -; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44] -; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2] +; CHECK-NEXT: ldr [[INDEX:r[0-9]]], [[[TLS_INDEX]]] +; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], [[[TEB]], #44] +; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], [[[TLS_POINTER]], [[INDEX]], lsl #2] ; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:\.LCPI[0-9]+_[0-9]+]] -; CHECK-NEXT: ldr 
r0, {{\[}}[[TLS]], [[SLOT]]] +; CHECK-NEXT: ldr r0, [[[TLS]], [[SLOT]]] ; CHECK: [[CPI]]: ; CHECK-NEXT: .long l(SECREL32) @@ -97,13 +97,13 @@ define i32 @b() { ; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index ; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index -; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]] -; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44] -; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2] +; CHECK-NEXT: ldr [[INDEX:r[0-9]]], [[[TLS_INDEX]]] +; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], [[[TEB]], #44] +; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], [[[TLS_POINTER]], [[INDEX]], lsl #2] ; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:\.LCPI[0-9]+_[0-9]+]] -; CHECK-NEXT: ldr r0, {{\[}}[[TLS]], [[SLOT]]] +; CHECK-NEXT: ldr r0, [[[TLS]], [[SLOT]]] ; CHECK: [[CPI]]: ; CHECK: .long m(SECREL32) @@ -117,13 +117,13 @@ define i16 @a() { ; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index ; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index -; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]] -; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44] -; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2] +; CHECK-NEXT: ldr [[INDEX:r[0-9]]], [[[TLS_INDEX]]] +; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], [[[TEB]], #44] +; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], [[[TLS_POINTER]], [[INDEX]], lsl #2] ; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:\.LCPI[0-9]+_[0-9]+]] -; CHECK-NEXT: ldrh r0, {{\[}}[[TLS]], [[SLOT]]] +; CHECK-NEXT: ldrh r0, [[[TLS]], [[SLOT]]] ; CHECK: [[CPI]]: ; CHECK: .long n(SECREL32) @@ -137,13 +137,13 @@ define i8 @Z() { ; CHECK: movw [[TLS_INDEX:r[0-9]]], :lower16:_tls_index ; CHECK-NEXT: movt [[TLS_INDEX]], :upper16:_tls_index -; CHECK-NEXT: ldr [[INDEX:r[0-9]]], {{\[}}[[TLS_INDEX]]] -; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], {{\[}}[[TEB]], #44] -; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], {{\[}}[[TLS_POINTER]], [[INDEX]], lsl #2] +; CHECK-NEXT: ldr [[INDEX:r[0-9]]], 
[[[TLS_INDEX]]] +; CHECK-NEXT: ldr [[TLS_POINTER:r[0-9]]], [[[TEB]], #44] +; CHECK-NEXT: ldr{{.w}} [[TLS:r[0-9]]], [[[TLS_POINTER]], [[INDEX]], lsl #2] ; CHECK-NEXT: ldr [[SLOT:r[0-9]]], [[CPI:\.LCPI[0-9]+_[0-9]+]] -; CHECK-NEXT: ldrb r0, {{\[}}[[TLS]], [[SLOT]]] +; CHECK-NEXT: ldrb r0, [[[TLS]], [[SLOT]]] ; CHECK: [[CPI]]: ; CHECK-NEXT: .long o(SECREL32) diff --git a/llvm/test/CodeGen/ARM/cmpxchg-O0-be.ll b/llvm/test/CodeGen/ARM/cmpxchg-O0-be.ll index 29336c2f7ffdf..cace190d15e6c 100644 --- a/llvm/test/CodeGen/ARM/cmpxchg-O0-be.ll +++ b/llvm/test/CodeGen/ARM/cmpxchg-O0-be.ll @@ -5,11 +5,11 @@ @z = global i64 20, align 8 ; CHECK_LABEL: main: -; CHECK: ldr [[R2:r[0-9]+]], {{\[}}[[R1:r[0-9]+]]{{\]}} -; CHECK-NEXT: ldr [[R1]], {{\[}}[[R1]], #4] +; CHECK: ldr [[R2:r[0-9]+]], [[[R1:r[0-9]+]]] +; CHECK-NEXT: ldr [[R1]], [[[R1]], #4] ; CHECK: mov [[R4:r[0-9]+]], [[R1]] -; CHECK: ldr [[R5:r[0-9]+]], {{\[}}[[R1]]{{\]}} -; CHECK-NEXT: ldr [[R6:r[0-9]+]], {{\[}}[[R1]], #4] +; CHECK: ldr [[R5:r[0-9]+]], [[[R1]]] +; CHECK-NEXT: ldr [[R6:r[0-9]+]], [[[R1]], #4] ; CHECK: mov [[R7:r[0-9]+]], [[R6]] define arm_aapcs_vfpcc i32 @main() #0 { diff --git a/llvm/test/CodeGen/ARM/cmpxchg-O0.ll b/llvm/test/CodeGen/ARM/cmpxchg-O0.ll index 1bc15dce20813..898ccd229499f 100644 --- a/llvm/test/CodeGen/ARM/cmpxchg-O0.ll +++ b/llvm/test/CodeGen/ARM/cmpxchg-O0.ll @@ -12,10 +12,10 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK: dmb ish ; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldrexb [[OLD:[lr0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldrexb [[OLD:[lr0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexb [[STATUS:r[0-9]+]], [[NEW]], {{\[}}[[ADDR]]{{\]}} +; CHECK: strexb [[STATUS:r[0-9]+]], [[NEW]], [[[ADDR]]] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -36,10 +36,10 @@ define { i16, i1 } @test_cmpxchg_16(i16* 
%addr, i16 %desired, i16 %new) nounwind ; CHECK: dmb ish ; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldrexh [[OLD:[lr0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldrexh [[OLD:[lr0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexh [[STATUS:r[0-9]+]], [[NEW]], {{\[}}[[ADDR]]{{\]}} +; CHECK: strexh [[STATUS:r[0-9]+]], [[NEW]], [[[ADDR]]] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -60,10 +60,10 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind ; CHECK: dmb ish ; CHECK-NOT: uxt ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldrex [[OLD:r[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldrex [[OLD:r[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strex [[STATUS:r[0-9]+]], [[NEW]], {{\[}}[[ADDR]]{{\]}} +; CHECK: strex [[STATUS:r[0-9]+]], [[NEW]], [[[ADDR]]] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -82,7 +82,7 @@ define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind ; CHECK: dmb ish ; CHECK-NOT: uxt ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLDLO]], r6 ; CHECK: cmpeq [[OLDHI]], r7 ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] @@ -101,11 +101,11 @@ define { i64, i1 } @test_nontrivial_args(i64* %addr, i64 %desired, i64 %new) { ; CHECK: dmb ish ; CHECK-NOT: uxt ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: -; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], {{\[}}[[ADDR]]{{\]}} +; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [[[ADDR]]] ; CHECK: cmp [[OLDLO]], {{r[0-9]+}} ; CHECK: cmpeq [[OLDHI]], {{r[0-9]+}} ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, {{\[}}[[ADDR]]{{\]}} +; 
CHECK: strexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [[[ADDR]]] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: diff --git a/llvm/test/CodeGen/ARM/constantpool-promote-ldrh.ll b/llvm/test/CodeGen/ARM/constantpool-promote-ldrh.ll index 0767d729a0ae6..6fa5288c643b8 100644 --- a/llvm/test/CodeGen/ARM/constantpool-promote-ldrh.ll +++ b/llvm/test/CodeGen/ARM/constantpool-promote-ldrh.ll @@ -9,7 +9,7 @@ target triple = "thumbv6m-arm-linux-gnueabi" ; CHECK-LABEL: fn1: ; CHECK: adr [[base:r[0-9]+]], .LCPI0_0 ; CHECK-NOT: ldrh {{r[0-9]+}}, .LCPI0_0 -; CHECK: ldrh r{{[0-9]+}}, {{\[}}[[base]]] +; CHECK: ldrh r{{[0-9]+}}, [[[base]]] define hidden i32 @fn1() #0 { entry: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 undef, i8* align 2 bitcast ([4 x i16]* @fn1.a to i8*), i32 8, i1 false) diff --git a/llvm/test/CodeGen/ARM/fast-isel-call.ll b/llvm/test/CodeGen/ARM/fast-isel-call.ll index f2d892d03d2d6..d1e5ebc30c254 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-call.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-call.ll @@ -107,11 +107,11 @@ entry: ; ARM-LONG-MACHO: {{(movw)|(ldr)}} [[R1:l?r[0-9]*]], {{(:lower16:L_bar\$non_lazy_ptr)|(.LCPI)}} ; ARM-LONG-MACHO: {{(movt [[R1]], :upper16:L_bar\$non_lazy_ptr)?}} -; ARM-LONG-MACHO: ldr [[R:r[0-9]+]], {{\[}}[[R1]]] +; ARM-LONG-MACHO: ldr [[R:r[0-9]+]], [[[R1]]] ; ARM-LONG-ELF: movw [[R1:r[0-9]*]], :lower16:bar ; ARM-LONG-ELF: movt [[R1]], :upper16:bar -; ARM-LONG-ELF: ldr [[R:r[0-9]+]], {{\[}}[[R1]]] +; ARM-LONG-ELF: ldr [[R:r[0-9]+]], [[[R1]]] ; ARM-LONG: blx [[R]] ; THUMB-LABEL: @t10 @@ -133,7 +133,7 @@ entry: ; THUMB-LONG-LABEL: @t10 ; THUMB-LONG: {{(movw)|(ldr.n)}} [[R1:l?r[0-9]*]], {{(:lower16:L_bar\$non_lazy_ptr)|(.LCPI)}} ; THUMB-LONG: {{(movt [[R1]], :upper16:L_bar\$non_lazy_ptr)?}} -; THUMB-LONG: ldr{{(.w)?}} [[R:r[0-9]+]], {{\[}}[[R1]]{{\]}} +; THUMB-LONG: ldr{{(.w)?}} [[R:r[0-9]+]], [[[R1]]] ; THUMB-LONG: blx [[R]] %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 
28, i8 zeroext 40, i8 zeroext -70) ret i32 0 diff --git a/llvm/test/CodeGen/ARM/fast-isel-intrinsic.ll b/llvm/test/CodeGen/ARM/fast-isel-intrinsic.ll index 4ad74df771b34..359ff73ef31be 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-intrinsic.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-intrinsic.ll @@ -144,12 +144,12 @@ define void @t4() nounwind ssp { ; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}} ; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}} ; ARM-MACHO: ldr [[REG0:r[0-9]+]], [r0] -; ARM-MACHO-NEXT: ldr [[REG1:r[0-9]+]], {{\[}}[[REG0]], #16] -; ARM-MACHO-NEXT: str [[REG1]], {{\[}}[[REG0]], #4] -; ARM-MACHO-NEXT: ldr [[REG2:r[0-9]+]], {{\[}}[[REG0]], #20] -; ARM-MACHO-NEXT: str [[REG2]], {{\[}}[[REG0]], #8] -; ARM-MACHO-NEXT: ldrh [[REG3:r[0-9]+]], {{\[}}[[REG0]], #24] -; ARM-MACHO-NEXT: strh [[REG3]], {{\[}}[[REG0]], #12] +; ARM-MACHO-NEXT: ldr [[REG1:r[0-9]+]], [[[REG0]], #16] +; ARM-MACHO-NEXT: str [[REG1]], [[[REG0]], #4] +; ARM-MACHO-NEXT: ldr [[REG2:r[0-9]+]], [[[REG0]], #20] +; ARM-MACHO-NEXT: str [[REG2]], [[[REG0]], #8] +; ARM-MACHO-NEXT: ldrh [[REG3:r[0-9]+]], [[[REG0]], #24] +; ARM-MACHO-NEXT: strh [[REG3]], [[[REG0]], #12] ; ARM-ELF: movw [[REG0:r[0-9]+]], :lower16:temp ; ARM-ELF: movt [[REG0]], :upper16:temp @@ -159,12 +159,12 @@ define void @t4() nounwind ssp { ; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}} ; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}} ; THUMB: ldr [[REG1:r[0-9]+]], [r0] -; THUMB: ldr [[REG2:r[0-9]+]], {{\[}}[[REG1]], #16] -; THUMB: str [[REG2]], {{\[}}[[REG1]], #4] -; THUMB: ldr [[REG3:r[0-9]+]], {{\[}}[[REG1]], #20] -; THUMB: str [[REG3]], {{\[}}[[REG1]], #8] -; THUMB: ldrh [[REG4:r[0-9]+]], {{\[}}[[REG1]], #24] -; THUMB: strh [[REG4]], {{\[}}[[REG1]], #12] +; THUMB: ldr [[REG2:r[0-9]+]], [[[REG1]], #16] +; THUMB: str [[REG2]], [[[REG1]], #4] +; THUMB: ldr [[REG3:r[0-9]+]], [[[REG1]], #20] +; THUMB: str [[REG3]], [[[REG1]], #8] +; THUMB: ldrh [[REG4:r[0-9]+]], 
[[[REG1]], #24] +; THUMB: strh [[REG4]], [[[REG1]], #12] ; THUMB: bx lr call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 getelementptr inbounds ([60 x i8], [60 x i8]* @temp, i32 0, i32 4), i8* align 4 getelementptr inbounds ([60 x i8], [60 x i8]* @temp, i32 0, i32 16), i32 10, i1 false) ret void @@ -178,16 +178,16 @@ define void @t5() nounwind ssp { ; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}} ; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}} ; ARM-MACHO: ldr [[REG0:r[0-9]+]], [r0] -; ARM-MACHO: ldrh [[REG1:r[0-9]+]], {{\[}}[[REG0]], #16] -; ARM-MACHO-NEXT: strh [[REG1]], {{\[}}[[REG0]], #4] -; ARM-MACHO-NEXT: ldrh [[REG2:r[0-9]+]], {{\[}}[[REG0]], #18] -; ARM-MACHO-NEXT: strh [[REG2]], {{\[}}[[REG0]], #6] -; ARM-MACHO-NEXT: ldrh [[REG3:r[0-9]+]], {{\[}}[[REG0]], #20] -; ARM-MACHO-NEXT: strh [[REG3]], {{\[}}[[REG0]], #8] -; ARM-MACHO-NEXT: ldrh [[REG4:r[0-9]+]], {{\[}}[[REG0]], #22] -; ARM-MACHO-NEXT: strh [[REG4]], {{\[}}[[REG0]], #10] -; ARM-MACHO-NEXT: ldrh [[REG5:r[0-9]+]], {{\[}}[[REG0]], #24] -; ARM-MACHO-NEXT: strh [[REG5]], {{\[}}[[REG0]], #12] +; ARM-MACHO: ldrh [[REG1:r[0-9]+]], [[[REG0]], #16] +; ARM-MACHO-NEXT: strh [[REG1]], [[[REG0]], #4] +; ARM-MACHO-NEXT: ldrh [[REG2:r[0-9]+]], [[[REG0]], #18] +; ARM-MACHO-NEXT: strh [[REG2]], [[[REG0]], #6] +; ARM-MACHO-NEXT: ldrh [[REG3:r[0-9]+]], [[[REG0]], #20] +; ARM-MACHO-NEXT: strh [[REG3]], [[[REG0]], #8] +; ARM-MACHO-NEXT: ldrh [[REG4:r[0-9]+]], [[[REG0]], #22] +; ARM-MACHO-NEXT: strh [[REG4]], [[[REG0]], #10] +; ARM-MACHO-NEXT: ldrh [[REG5:r[0-9]+]], [[[REG0]], #24] +; ARM-MACHO-NEXT: strh [[REG5]], [[[REG0]], #12] ; ARM-ELF: movw [[REG0:r[0-9]+]], :lower16:temp ; ARM-ELF: movt [[REG0]], :upper16:temp @@ -197,16 +197,16 @@ define void @t5() nounwind ssp { ; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}} ; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}} ; THUMB: ldr [[REG1:r[0-9]+]], [r0] -; THUMB: ldrh [[REG2:r[0-9]+]], {{\[}}[[REG1]], #16] 
-; THUMB: strh [[REG2]], {{\[}}[[REG1]], #4] -; THUMB: ldrh [[REG3:r[0-9]+]], {{\[}}[[REG1]], #18] -; THUMB: strh [[REG3]], {{\[}}[[REG1]], #6] -; THUMB: ldrh [[REG4:r[0-9]+]], {{\[}}[[REG1]], #20] -; THUMB: strh [[REG4]], {{\[}}[[REG1]], #8] -; THUMB: ldrh [[REG5:r[0-9]+]], {{\[}}[[REG1]], #22] -; THUMB: strh [[REG5]], {{\[}}[[REG1]], #10] -; THUMB: ldrh [[REG6:r[0-9]+]], {{\[}}[[REG1]], #24] -; THUMB: strh [[REG6]], {{\[}}[[REG1]], #12] +; THUMB: ldrh [[REG2:r[0-9]+]], [[[REG1]], #16] +; THUMB: strh [[REG2]], [[[REG1]], #4] +; THUMB: ldrh [[REG3:r[0-9]+]], [[[REG1]], #18] +; THUMB: strh [[REG3]], [[[REG1]], #6] +; THUMB: ldrh [[REG4:r[0-9]+]], [[[REG1]], #20] +; THUMB: strh [[REG4]], [[[REG1]], #8] +; THUMB: ldrh [[REG5:r[0-9]+]], [[[REG1]], #22] +; THUMB: strh [[REG5]], [[[REG1]], #10] +; THUMB: ldrh [[REG6:r[0-9]+]], [[[REG1]], #24] +; THUMB: strh [[REG6]], [[[REG1]], #12] ; THUMB: bx lr call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 getelementptr inbounds ([60 x i8], [60 x i8]* @temp, i32 0, i32 4), i8* align 2 getelementptr inbounds ([60 x i8], [60 x i8]* @temp, i32 0, i32 16), i32 10, i1 false) ret void @@ -218,26 +218,26 @@ define void @t6() nounwind ssp { ; ARM-MACHO: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr r0, .LCPI)}} ; ARM-MACHO: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}} ; ARM-MACHO: ldr [[REG0:r[0-9]+]], [r0] -; ARM-MACHO: ldrb [[REG1:r[0-9]+]], {{\[}}[[REG0]], #16] -; ARM-MACHO-NEXT: strb [[REG1]], {{\[}}[[REG0]], #4] -; ARM-MACHO-NEXT: ldrb [[REG2:r[0-9]+]], {{\[}}[[REG0]], #17] -; ARM-MACHO-NEXT: strb [[REG2]], {{\[}}[[REG0]], #5] -; ARM-MACHO-NEXT: ldrb [[REG3:r[0-9]+]], {{\[}}[[REG0]], #18] -; ARM-MACHO-NEXT: strb [[REG3]], {{\[}}[[REG0]], #6] -; ARM-MACHO-NEXT: ldrb [[REG4:r[0-9]+]], {{\[}}[[REG0]], #19] -; ARM-MACHO-NEXT: strb [[REG4]], {{\[}}[[REG0]], #7] -; ARM-MACHO-NEXT: ldrb [[REG5:r[0-9]+]], {{\[}}[[REG0]], #20] -; ARM-MACHO-NEXT: strb [[REG5]], {{\[}}[[REG0]], #8] -; ARM-MACHO-NEXT: ldrb [[REG6:r[0-9]+]], {{\[}}[[REG0]], 
#21] -; ARM-MACHO-NEXT: strb [[REG6]], {{\[}}[[REG0]], #9] -; ARM-MACHO-NEXT: ldrb [[REG7:r[0-9]+]], {{\[}}[[REG0]], #22] -; ARM-MACHO-NEXT: strb [[REG7]], {{\[}}[[REG0]], #10] -; ARM-MACHO-NEXT: ldrb [[REG8:r[0-9]+]], {{\[}}[[REG0]], #23] -; ARM-MACHO-NEXT: strb [[REG8]], {{\[}}[[REG0]], #11] -; ARM-MACHO-NEXT: ldrb [[REG9:r[0-9]+]], {{\[}}[[REG0]], #24] -; ARM-MACHO-NEXT: strb [[REG9]], {{\[}}[[REG0]], #12] -; ARM-MACHO-NEXT: ldrb [[REG10:r[0-9]+]], {{\[}}[[REG0]], #25] -; ARM-MACHO-NEXT: strb [[REG10]], {{\[}}[[REG0]], #13] +; ARM-MACHO: ldrb [[REG1:r[0-9]+]], [[[REG0]], #16] +; ARM-MACHO-NEXT: strb [[REG1]], [[[REG0]], #4] +; ARM-MACHO-NEXT: ldrb [[REG2:r[0-9]+]], [[[REG0]], #17] +; ARM-MACHO-NEXT: strb [[REG2]], [[[REG0]], #5] +; ARM-MACHO-NEXT: ldrb [[REG3:r[0-9]+]], [[[REG0]], #18] +; ARM-MACHO-NEXT: strb [[REG3]], [[[REG0]], #6] +; ARM-MACHO-NEXT: ldrb [[REG4:r[0-9]+]], [[[REG0]], #19] +; ARM-MACHO-NEXT: strb [[REG4]], [[[REG0]], #7] +; ARM-MACHO-NEXT: ldrb [[REG5:r[0-9]+]], [[[REG0]], #20] +; ARM-MACHO-NEXT: strb [[REG5]], [[[REG0]], #8] +; ARM-MACHO-NEXT: ldrb [[REG6:r[0-9]+]], [[[REG0]], #21] +; ARM-MACHO-NEXT: strb [[REG6]], [[[REG0]], #9] +; ARM-MACHO-NEXT: ldrb [[REG7:r[0-9]+]], [[[REG0]], #22] +; ARM-MACHO-NEXT: strb [[REG7]], [[[REG0]], #10] +; ARM-MACHO-NEXT: ldrb [[REG8:r[0-9]+]], [[[REG0]], #23] +; ARM-MACHO-NEXT: strb [[REG8]], [[[REG0]], #11] +; ARM-MACHO-NEXT: ldrb [[REG9:r[0-9]+]], [[[REG0]], #24] +; ARM-MACHO-NEXT: strb [[REG9]], [[[REG0]], #12] +; ARM-MACHO-NEXT: ldrb [[REG10:r[0-9]+]], [[[REG0]], #25] +; ARM-MACHO-NEXT: strb [[REG10]], [[[REG0]], #13] ; ARM-ELF: movw [[REG0:r[0-9]+]], :lower16:temp ; ARM-ELF: movt [[REG0]], :upper16:temp @@ -247,26 +247,26 @@ define void @t6() nounwind ssp { ; THUMB: {{(movw r0, :lower16:L_temp\$non_lazy_ptr)|(ldr.n r0, .LCPI)}} ; THUMB: {{(movt r0, :upper16:L_temp\$non_lazy_ptr)?}} ; THUMB: ldr [[REG0:r[0-9]+]], [r0] -; THUMB: ldrb [[REG2:r[0-9]+]], {{\[}}[[REG0]], #16] -; THUMB: strb [[REG2]], 
{{\[}}[[REG0]], #4] -; THUMB: ldrb [[REG3:r[0-9]+]], {{\[}}[[REG0]], #17] -; THUMB: strb [[REG3]], {{\[}}[[REG0]], #5] -; THUMB: ldrb [[REG4:r[0-9]+]], {{\[}}[[REG0]], #18] -; THUMB: strb [[REG4]], {{\[}}[[REG0]], #6] -; THUMB: ldrb [[REG5:r[0-9]+]], {{\[}}[[REG0]], #19] -; THUMB: strb [[REG5]], {{\[}}[[REG0]], #7] -; THUMB: ldrb [[REG6:r[0-9]+]], {{\[}}[[REG0]], #20] -; THUMB: strb [[REG6]], {{\[}}[[REG0]], #8] -; THUMB: ldrb [[REG7:r[0-9]+]], {{\[}}[[REG0]], #21] -; THUMB: strb [[REG7]], {{\[}}[[REG0]], #9] -; THUMB: ldrb [[REG8:r[0-9]+]], {{\[}}[[REG0]], #22] -; THUMB: strb [[REG8]], {{\[}}[[REG0]], #10] -; THUMB: ldrb [[REG9:r[0-9]+]], {{\[}}[[REG0]], #23] -; THUMB: strb [[REG9]], {{\[}}[[REG0]], #11] -; THUMB: ldrb [[REG10:r[0-9]+]], {{\[}}[[REG0]], #24] -; THUMB: strb [[REG10]], {{\[}}[[REG0]], #12] -; THUMB: ldrb [[REG11:r[0-9]+]], {{\[}}[[REG0]], #25] -; THUMB: strb [[REG11]], {{\[}}[[REG0]], #13] +; THUMB: ldrb [[REG2:r[0-9]+]], [[[REG0]], #16] +; THUMB: strb [[REG2]], [[[REG0]], #4] +; THUMB: ldrb [[REG3:r[0-9]+]], [[[REG0]], #17] +; THUMB: strb [[REG3]], [[[REG0]], #5] +; THUMB: ldrb [[REG4:r[0-9]+]], [[[REG0]], #18] +; THUMB: strb [[REG4]], [[[REG0]], #6] +; THUMB: ldrb [[REG5:r[0-9]+]], [[[REG0]], #19] +; THUMB: strb [[REG5]], [[[REG0]], #7] +; THUMB: ldrb [[REG6:r[0-9]+]], [[[REG0]], #20] +; THUMB: strb [[REG6]], [[[REG0]], #8] +; THUMB: ldrb [[REG7:r[0-9]+]], [[[REG0]], #21] +; THUMB: strb [[REG7]], [[[REG0]], #9] +; THUMB: ldrb [[REG8:r[0-9]+]], [[[REG0]], #22] +; THUMB: strb [[REG8]], [[[REG0]], #10] +; THUMB: ldrb [[REG9:r[0-9]+]], [[[REG0]], #23] +; THUMB: strb [[REG9]], [[[REG0]], #11] +; THUMB: ldrb [[REG10:r[0-9]+]], [[[REG0]], #24] +; THUMB: strb [[REG10]], [[[REG0]], #12] +; THUMB: ldrb [[REG11:r[0-9]+]], [[[REG0]], #25] +; THUMB: strb [[REG11]], [[[REG0]], #13] ; THUMB: bx lr call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr inbounds ([60 x i8], [60 x i8]* @temp, i32 0, i32 4), i8* align 1 getelementptr inbounds ([60 x i8], [60 
x i8]* @temp, i32 0, i32 16), i32 10, i1 false) ret void diff --git a/llvm/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll b/llvm/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll index b692b4bd6ffc2..512796d42b009 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll @@ -87,7 +87,7 @@ entry: %add.ptr = getelementptr inbounds i32, i32* %ptr, i32 -1 store i32 0, i32* %add.ptr, align 4 ; THUMB: mov [[REG:r[0-9]+]], r0 -; THUMB: str r{{[0-9]}}, {{\[}}[[REG]], #-4] +; THUMB: str r{{[0-9]}}, [[[REG]], #-4] ret void } @@ -97,7 +97,7 @@ entry: %add.ptr = getelementptr inbounds i32, i32* %ptr, i32 -63 store i32 0, i32* %add.ptr, align 4 ; THUMB: mov [[REG:r[0-9]+]], r0 -; THUMB: str r{{[0-9]}}, {{\[}}[[REG]], #-252] +; THUMB: str r{{[0-9]}}, [[[REG]], #-252] ret void } @@ -111,7 +111,7 @@ entry: ; THUMB: movw [[REG:r[0-9]+]], #65280 ; THUMB: movt [[REG]], #65535 ; THUMB: add [[PTR]], [[REG]] -; THUMB: str [[VAL]], {{\[}}[[PTR]]] +; THUMB: str [[VAL]], [[[PTR]]] ret void } @@ -121,7 +121,7 @@ entry: %add.ptr = getelementptr inbounds i16, i16* %ptr, i32 -1 store i16 0, i16* %add.ptr, align 2 ; THUMB: mov [[REG:r[0-9]+]], r0 -; THUMB: strh r{{[0-9]}}, {{\[}}[[REG]], #-2] +; THUMB: strh r{{[0-9]}}, [[[REG]], #-2] ret void } @@ -131,7 +131,7 @@ entry: %add.ptr = getelementptr inbounds i16, i16* %ptr, i32 -127 store i16 0, i16* %add.ptr, align 2 ; THUMB: mov [[REG:r[0-9]+]], r0 -; THUMB: strh r{{[0-9]}}, {{\[}}[[REG]], #-254] +; THUMB: strh r{{[0-9]}}, [[[REG]], #-254] ret void } @@ -145,7 +145,7 @@ entry: ; THUMB: movw [[REG:r[0-9]+]], #65280 ; THUMB: movt [[REG]], #65535 ; THUMB: add [[PTR]], [[REG]] -; THUMB: strh [[VAL]], {{\[}}[[PTR]]] +; THUMB: strh [[VAL]], [[[PTR]]] ret void } @@ -155,7 +155,7 @@ entry: %add.ptr = getelementptr inbounds i8, i8* %ptr, i32 -1 store i8 0, i8* %add.ptr, align 1 ; THUMB: mov [[REG:r[0-9]+]], r0 -; THUMB: strb r{{[0-9]}}, {{\[}}[[REG]], #-1] +; THUMB: 
strb r{{[0-9]}}, [[[REG]], #-1] ret void } @@ -165,7 +165,7 @@ entry: %add.ptr = getelementptr inbounds i8, i8* %ptr, i32 -255 store i8 0, i8* %add.ptr, align 1 ; THUMB: mov [[REG:r[0-9]+]], r0 -; THUMB: strb r{{[0-9]}}, {{\[}}[[REG]], #-255] +; THUMB: strb r{{[0-9]}}, [[[REG]], #-255] ret void } @@ -179,6 +179,6 @@ entry: ; THUMB: movw [[REG:r[0-9]+]], #65280 ; THUMB: movt [[REG]], #65535 ; THUMB: add [[PTR]], [[REG]] -; THUMB: strb [[VAL]], {{\[}}[[PTR]]] +; THUMB: strb [[VAL]], [[[PTR]]] ret void } diff --git a/llvm/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll b/llvm/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll index f2d728c8b361b..9da10ffc4fe13 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll @@ -98,7 +98,7 @@ entry: ; ARM: movw [[REG1:r[0-9]+]], #0 ; ARM: mvn [[REG2:r[0-9]+]], #255 ; ARM: add [[REG0:r[0-9]+]], r1, [[REG2]] -; ARM: strh [[REG1]], {{\[}}[[REG0]]] +; ARM: strh [[REG1]], [[[REG0]]] ret void } @@ -122,7 +122,7 @@ entry: ; ARM: mov r1, r0 ; ARM: movw [[REG1:r[0-9]+]], #0 ; ARM: add [[REG0:r[0-9]+]], r1, #256 -; ARM: strh [[REG1]], {{\[}}[[REG0]]] +; ARM: strh [[REG1]], [[[REG0]]] ret void } diff --git a/llvm/test/CodeGen/ARM/fast-isel-vararg.ll b/llvm/test/CodeGen/ARM/fast-isel-vararg.ll index aaed0ca19627f..078bd466e1905 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-vararg.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-vararg.ll @@ -18,9 +18,9 @@ entry: ; ARM: VarArg ; ARM: mov [[FP:r[0-9]+]], sp ; ARM: sub sp, sp, #32 -; ARM: ldr r1, {{\[}}[[FP]], #-4] -; ARM: ldr r2, {{\[}}[[FP]], #-8] -; ARM: ldr r3, {{\[}}[[FP]], #-12] +; ARM: ldr r1, [[[FP]], #-4] +; ARM: ldr r2, [[[FP]], #-8] +; ARM: ldr r3, [[[FP]], #-12] ; ARM: ldr [[Ra:r[0-9]+|lr]], [sp, #16] ; ARM: ldr [[Rb:[lr]+[0-9]*]], [sp, #12] ; ARM: movw r0, #5 diff --git a/llvm/test/CodeGen/ARM/fast-isel.ll b/llvm/test/CodeGen/ARM/fast-isel.ll index 2437bda498bad..ebb9d0e9103fb 100644 --- a/llvm/test/CodeGen/ARM/fast-isel.ll +++ 
b/llvm/test/CodeGen/ARM/fast-isel.ll @@ -150,12 +150,12 @@ define void @test4() { ; THUMB: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr.n r0, .LCPI)}} ; THUMB: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}} ; THUMB: ldr [[REG:r[0-9]+]], [r0] -; THUMB: ldr [[REG1:r[0-9]+]], {{\[}}[[REG]]] +; THUMB: ldr [[REG1:r[0-9]+]], [[[REG]]] ; THUMB: adds [[REG1]], #1 ; THUMB: {{(movw r1, :lower16:L_test4g\$non_lazy_ptr)|(ldr.n r0, .LCPI)}} ; THUMB: {{(movt r1, :upper16:L_test4g\$non_lazy_ptr)?}} ; THUMB: ldr [[REG2:r[0-9]+]], [r1] -; THUMB: str [[REG1]], {{\[}}[[REG2]]] +; THUMB: str [[REG1]], [[[REG2]]] ; ARM-MACHO: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr r0, .LCPI)}} ; ARM-MACHO: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}} @@ -164,7 +164,7 @@ define void @test4() { ; ARM-ELF: movw [[REG:r[0-9]+]], :lower16:test4g ; ARM-ELF: movt [[REG]], :upper16:test4g -; ARM: ldr [[REG1:r[0-9]+]], {{\[}}[[REG]]] +; ARM: ldr [[REG1:r[0-9]+]], [[[REG]]] ; ARM: add [[REG2:r[0-9]+]], [[REG1]], #1 ; ARM-MACHO: {{(movw r1, :lower16:L_test4g\$non_lazy_ptr)|(ldr r0, .LCPI)}} @@ -174,7 +174,7 @@ define void @test4() { ; ARM-ELF: movw [[REG3:r[0-9]+]], :lower16:test4g ; ARM-ELF: movt [[REG3]], :upper16:test4g -; ARM: str [[REG2]], {{\[}}[[REG3]]] +; ARM: str [[REG2]], [[[REG3]]] } ; ARM: @urem_fold diff --git a/llvm/test/CodeGen/ARM/fp16-load-store.ll b/llvm/test/CodeGen/ARM/fp16-load-store.ll index 4f0003a3e7aca..272827135b72e 100644 --- a/llvm/test/CodeGen/ARM/fp16-load-store.ll +++ b/llvm/test/CodeGen/ARM/fp16-load-store.ll @@ -24,7 +24,7 @@ define void @load_256(half* %in, half* %out) { entry: ; CHECK-LABEL: load_256: ; CHECK: add [[ADDR:r[0-9]+]], r0, #512 -; CHECK: vldr.16 {{s[0-9]+}}, {{\[}}[[ADDR]]{{\]}} +; CHECK: vldr.16 {{s[0-9]+}}, [[[ADDR]]] %arrayidx = getelementptr inbounds half, half* %in, i32 256 %load = load half, half* %arrayidx, align 2 store half %load, half* %out @@ -45,7 +45,7 @@ define void @load_neg_256(half* %in, half* %out) { entry: ; CHECK-LABEL: 
load_neg_256: ; CHECK: sub [[ADDR:r[0-9]+]], r0, #512 -; CHECK: vldr.16 {{s[0-9]+}}, {{\[}}[[ADDR]]{{\]}} +; CHECK: vldr.16 {{s[0-9]+}}, [[[ADDR]]] %arrayidx = getelementptr inbounds half, half* %in, i32 -256 %load = load half, half* %arrayidx, align 2 store half %load, half* %out @@ -77,7 +77,7 @@ entry: ; CHECK-LABEL: store_256: %load = load half, half* %in, align 2 ; CHECK: add [[ADDR:r[0-9]+]], r1, #512 -; CHECK: vstr.16 {{s[0-9]+}}, {{\[}}[[ADDR]]{{\]}} +; CHECK: vstr.16 {{s[0-9]+}}, [[[ADDR]]] %arrayidx = getelementptr inbounds half, half* %out, i32 256 store half %load, half* %arrayidx ret void @@ -98,7 +98,7 @@ entry: ; CHECK-LABEL: store_neg_256: %load = load half, half* %in, align 2 ; CHECK: sub [[ADDR:r[0-9]+]], r1, #512 -; CHECK: vstr.16 {{s[0-9]+}}, {{\[}}[[ADDR]]{{\]}} +; CHECK: vstr.16 {{s[0-9]+}}, [[[ADDR]]] %arrayidx = getelementptr inbounds half, half* %out, i32 -256 store half %load, half* %arrayidx ret void diff --git a/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll b/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll index 43479b7e541b7..ae54256b7a285 100644 --- a/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll +++ b/llvm/test/CodeGen/ARM/i64_volatile_load_store.ll @@ -10,20 +10,20 @@ entry: ; CHECK-LABEL: test: ; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], [[[ADDR1]]] ; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x ; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y ; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x ; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y -; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]]] +; 
CHECK-T2-NEXT: strd [[R0]], [[R1]], [[[ADDR1]]] ; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #4] -; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #4] -; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], [[[ADDR0]], #4] +; CHECK-ARMV4T-NEXT: str [[R0]], [[[ADDR1]], #4] +; CHECK-ARMV4T-NEXT: str [[R1]], [[[ADDR1]]] %0 = load volatile i64, i64* @x, align 8 store volatile i64 %0, i64* @y, align 8 ret void @@ -34,20 +34,20 @@ entry: ; CHECK-LABEL: test_offset: ; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #-4] -; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #-4] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]], #-4] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], [[[ADDR1]], #-4] ; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x ; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y ; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x ; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y -; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #-4] -; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #-4] +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]], #-4] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], [[[ADDR1]], #-4] ; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #-4] -; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]]] -; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #-4] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], [[[ADDR0]], #-4] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-ARMV4T-NEXT: str [[R1]], 
[[[ADDR1]]] +; CHECK-ARMV4T-NEXT: str [[R0]], [[[ADDR1]], #-4] %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 -4) to i64*), align 8 store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 -4) to i64*), align 8 ret void @@ -57,18 +57,18 @@ define void @test_offset_1() { ; CHECK-LABEL: test_offset_1: ; CHECK-ARMV5TE: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #255] -; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #255] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]], #255] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], [[[ADDR1]], #255] ; CHECK-T2: adds [[ADDR0:r[0-9]+]], #255 ; CHECK-T2-NEXT: adds [[ADDR1:r[0-9]+]], #255 -; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], [[[ADDR1]]] ; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #255] -; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #259] -; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #259] -; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #255] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], [[[ADDR0]], #255] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], [[[ADDR0]], #259] +; CHECK-ARMV4T-NEXT: str [[R1]], [[[ADDR1]], #259] +; CHECK-ARMV4T-NEXT: str [[R0]], [[[ADDR1]], #255] entry: %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 255) to i64*), align 8 store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 255) to i64*), align 8 @@ -81,20 +81,20 @@ define void @test_offset_2() { ; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] ; CHECK-ARMV5TE-NEXT: 
add [[ADDR0]], [[ADDR0]], #256 ; CHECK-ARMV5TE-NEXT: add [[ADDR1]], [[ADDR1]], #256 -; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], [[[ADDR1]]] ; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x ; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y ; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x ; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y -; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #256] -; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #256] +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]], #256] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], [[[ADDR1]], #256] ; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #256] -; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #260] -; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #260] -; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #256] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], [[[ADDR0]], #256] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], [[[ADDR0]], #260] +; CHECK-ARMV4T-NEXT: str [[R1]], [[[ADDR1]], #260] +; CHECK-ARMV4T-NEXT: str [[R0]], [[[ADDR1]], #256] entry: %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 256) to i64*), align 8 store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 256) to i64*), align 8 @@ -107,20 +107,20 @@ define void @test_offset_3() { ; CHECK-ARMV5TE-NEXT: ldr [[ADDR1:r[0-9]+]] ; CHECK-ARMV5TE-NEXT: add [[ADDR0]], [[ADDR0]], #1020 ; CHECK-ARMV5TE-NEXT: add [[ADDR1]], [[ADDR1]], #1020 -; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], 
[[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], [[[ADDR1]]] ; CHECK-T2: movw [[ADDR0:r[0-9]+]], :lower16:x ; CHECK-T2-NEXT: movw [[ADDR1:r[0-9]+]], :lower16:y ; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x ; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y -; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1020] -; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]], #1020] +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]], #1020] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], [[[ADDR1]], #1020] ; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #1020] -; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1024] -; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #1024] -; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #1020] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], [[[ADDR0]], #1020] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], [[[ADDR0]], #1024] +; CHECK-ARMV4T-NEXT: str [[R1]], [[[ADDR1]], #1024] +; CHECK-ARMV4T-NEXT: str [[R0]], [[[ADDR1]], #1020] entry: %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 1020) to i64*), align 8 store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 1020) to i64*), align 8 @@ -133,22 +133,22 @@ define void @test_offset_4() { ; CHECK-ARMV5TE: ldr [[ADDR1:r[0-9]+]] ; CHECK-ARMV5TE-NEXT: add [[ADDR0]], [[ADDR0]], #1024 ; CHECK-ARMV5TE-NEXT: add [[ADDR1]], [[ADDR1]], #1024 -; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-ARMV5TE-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-ARMV5TE-NEXT: strd [[R0]], [[R1]], [[[ADDR1]]] ; CHECK-T2: movw [[ADDR1:r[0-9]+]], :lower16:y ; CHECK-T2-NEXT: movw [[ADDR0:r[0-9]+]], :lower16:x ; CHECK-T2-NEXT: movt [[ADDR1]], :upper16:y ; CHECK-T2-NEXT: movt [[ADDR0]], :upper16:x 
; CHECK-T2-NEXT: add.w [[ADDR0]], [[ADDR0]], #1024 ; CHECK-T2-NEXT: add.w [[ADDR1]], [[ADDR1]], #1024 -; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], {{\[}}[[ADDR0]]] -; CHECK-T2-NEXT: strd [[R0]], [[R1]], {{\[}}[[ADDR1]]] +; CHECK-T2-NEXT: ldrd [[R0:r[0-9]+]], [[R1:r[0-9]+]], [[[ADDR0]]] +; CHECK-T2-NEXT: strd [[R0]], [[R1]], [[[ADDR1]]] ; CHECK-ARMV4T: ldr [[ADDR0:r[0-9]+]] ; CHECK-ARMV4T-NEXT: ldr [[ADDR1:r[0-9]+]] -; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], {{\[}}[[ADDR0]], #1024] -; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], {{\[}}[[ADDR0]], #1028] -; CHECK-ARMV4T-NEXT: str [[R1]], {{\[}}[[ADDR1]], #1028] -; CHECK-ARMV4T-NEXT: str [[R0]], {{\[}}[[ADDR1]], #1024] +; CHECK-ARMV4T-NEXT: ldr [[R0:r[0-9]+]], [[[ADDR0]], #1024] +; CHECK-ARMV4T-NEXT: ldr [[R1:r[0-9]+]], [[[ADDR0]], #1028] +; CHECK-ARMV4T-NEXT: str [[R1]], [[[ADDR1]], #1028] +; CHECK-ARMV4T-NEXT: str [[R0]], [[[ADDR1]], #1024] entry: %0 = load volatile i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @x to i8*), i32 1024) to i64*), align 8 store volatile i64 %0, i64* bitcast (i8* getelementptr (i8, i8* bitcast (i64* @y to i8*), i32 1024) to i64*), align 8 diff --git a/llvm/test/CodeGen/ARM/indirectbr.ll b/llvm/test/CodeGen/ARM/indirectbr.ll index db5014f937531..68ad606b01cef 100644 --- a/llvm/test/CodeGen/ARM/indirectbr.ll +++ b/llvm/test/CodeGen/ARM/indirectbr.ll @@ -60,7 +60,7 @@ L1: ; preds = %L2, %bb2 ; ARM: ldr [[R1:r[0-9]+]], LCPI ; ARM: add [[R_NEXTADDR_b:r[0-9]+]], pc, [[R_NEXTADDR]] ; ARM: add [[R1b:r[0-9]+]], pc, [[R1]] -; ARM: str [[R1b]], {{\[}}[[R_NEXTADDR_b]]] +; ARM: str [[R1b]], [[[R_NEXTADDR_b]]] ; THUMB-LABEL: %L1 ; THUMB: ldr [[R2:r[0-9]+]], LCPI diff --git a/llvm/test/CodeGen/ARM/jump-table-tbh.ll b/llvm/test/CodeGen/ARM/jump-table-tbh.ll index ab2c579e514ea..eea54d42202ea 100644 --- a/llvm/test/CodeGen/ARM/jump-table-tbh.ll +++ b/llvm/test/CodeGen/ARM/jump-table-tbh.ll @@ -20,7 +20,7 @@ define i32 @test_tbh(i1 %tst, i32 %sw, i32 %l) { ; T1-LABEL: test_tbh: ; T1: lsls 
[[x:r[0-9]+]], r4, #1 ; T1: add [[x]], pc -; T1: ldrh [[x]], {{\[}}[[x]], #4] +; T1: ldrh [[x]], [[[x]], #4] ; T1: lsls [[x]], [[x]], #1 ; T1: [[ANCHOR:.LCPI[0-9_]+]]: ; T1: add pc, [[x]] diff --git a/llvm/test/CodeGen/ARM/ldrd.ll b/llvm/test/CodeGen/ARM/ldrd.ll index 2bba841413803..3b3724fd7b7ba 100644 --- a/llvm/test/CodeGen/ARM/ldrd.ll +++ b/llvm/test/CodeGen/ARM/ldrd.ll @@ -83,10 +83,10 @@ define void @Func1() nounwind ssp "frame-pointer"="all" { entry: ; A8: movw [[BASER:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}} ; A8: movt [[BASER]], :upper16:{{.*}}TestVar{{.*}} -; A8: ldr [[BASE:r[0-9]+]], {{\[}}[[BASER]]] -; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4] +; A8: ldr [[BASE:r[0-9]+]], [[[BASER]]] +; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], [[[BASE]], #4] ; A8-NEXT: add [[FIELD2]], [[FIELD1]] -; A8-NEXT: str [[FIELD2]], {{\[}}[[BASE]]{{\]}} +; A8-NEXT: str [[FIELD2]], [[[BASE]]] ; CONSERVATIVE-NOT: ldrd %orig_blocks = alloca [256 x i16], align 2 %0 = bitcast [256 x i16]* %orig_blocks to i8*call void @llvm.lifetime.start.p0i8(i64 512, i8* %0) nounwind diff --git a/llvm/test/CodeGen/ARM/memcpy-ldm-stm.ll b/llvm/test/CodeGen/ARM/memcpy-ldm-stm.ll index 0bda070f7b28b..77d2efb50a92b 100644 --- a/llvm/test/CodeGen/ARM/memcpy-ldm-stm.ll +++ b/llvm/test/CodeGen/ARM/memcpy-ldm-stm.ll @@ -21,9 +21,9 @@ entry: ; CHECKV7-NEXT: movt [[SB:[rl0-9]+]], :upper16:s ; CHECK-NEXT: ldm{{(\.w)?}} [[LB]]!, ; CHECK-NEXT: stm{{(\.w)?}} [[SB]]!, -; Think of the monstrosity '{{\[}}[[LB]]]' as '[ [[LB]] ]' without the spaces. -; CHECK-NEXT: ldrb{{(\.w)?}} {{.*}}, {{\[}}[[LB]]] -; CHECK-NEXT: strb{{(\.w)?}} {{.*}}, {{\[}}[[SB]]] +; Think of the monstrosity '[[[LB]]]' as '[ [[LB]] ]' without the spaces. 
+; CHECK-NEXT: ldrb{{(\.w)?}} {{.*}}, [[[LB]]] +; CHECK-NEXT: strb{{(\.w)?}} {{.*}}, [[[SB]]] tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast ([64 x i32]* @s to i8*), i8* align 4 bitcast ([64 x i32]* @d to i8*), i32 17, i1 false) ret void } @@ -36,14 +36,14 @@ entry: ; CHECKV6-NEXT: ldr [[SB:r[0-7]]], ; CHECKV6-NEXT: ldm{{(\.w)?}} [[LB]]!, ; CHECKV6-NEXT: stm{{(\.w)?}} [[SB]]!, -; CHECKV6-DAG: ldrh{{(\.w)?}} {{.*}}, {{\[}}[[LB]]] -; CHECKV6-DAG: ldrb{{(\.w)?}} {{.*}}, {{\[}}[[LB]], #2] -; CHECKV6-DAG: strb{{(\.w)?}} {{.*}}, {{\[}}[[SB]], #2] -; CHECKV6-DAG: strh{{(\.w)?}} {{.*}}, {{\[}}[[SB]]] +; CHECKV6-DAG: ldrh{{(\.w)?}} {{.*}}, [[[LB]]] +; CHECKV6-DAG: ldrb{{(\.w)?}} {{.*}}, [[[LB]], #2] +; CHECKV6-DAG: strb{{(\.w)?}} {{.*}}, [[[SB]], #2] +; CHECKV6-DAG: strh{{(\.w)?}} {{.*}}, [[[SB]]] ; CHECKV7: movt [[LB:[rl0-9]+]], :upper16:d ; CHECKV7-NEXT: movt [[SB:[rl0-9]+]], :upper16:s -; CHECKV7: ldr{{(\.w)?}} {{.*}}, {{\[}}[[LB]], #11] -; CHECKV7-NEXT: str{{(\.w)?}} {{.*}}, {{\[}}[[SB]], #11] +; CHECKV7: ldr{{(\.w)?}} {{.*}}, [[[LB]], #11] +; CHECKV7-NEXT: str{{(\.w)?}} {{.*}}, [[[SB]], #11] tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast ([64 x i32]* @s to i8*), i8* align 4 bitcast ([64 x i32]* @d to i8*), i32 15, i1 false) ret void } diff --git a/llvm/test/CodeGen/ARM/setjmp_longjmp.ll b/llvm/test/CodeGen/ARM/setjmp_longjmp.ll index 22458a4e7288a..21eac987f5e82 100644 --- a/llvm/test/CodeGen/ARM/setjmp_longjmp.ll +++ b/llvm/test/CodeGen/ARM/setjmp_longjmp.ll @@ -20,26 +20,26 @@ declare i8* @llvm.stacksave() ; ; setjmp sequence: ; CHECK: add [[PCREG:r[0-9]+]], pc, #8 -; CHECK-NEXT: str [[PCREG]], {{\[}}[[BUFREG:r[0-9]+]], #4] +; CHECK-NEXT: str [[PCREG]], [[[BUFREG:r[0-9]+]], #4] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: add pc, pc, #0 ; CHECK-NEXT: mov r0, #1 ; ; longjmp sequence: ; CHECK: ldr sp, [{{\s*}}[[BUFREG:r[0-9]+]], #8] -; CHECK-NEXT: ldr [[DESTREG:r[0-9]+]], {{\[}}[[BUFREG]], #4] -; CHECK-NEXT: ldr r7, {{\[}}[[BUFREG]]{{\]}} +; 
CHECK-NEXT: ldr [[DESTREG:r[0-9]+]], [[[BUFREG]], #4] +; CHECK-NEXT: ldr r7, [[[BUFREG]]] ; CHECK-NEXT: bx [[DESTREG]] ; CHECK-LINUX: ldr sp, [{{\s*}}[[BUFREG:r[0-9]+]], #8] -; CHECK-LINUX-NEXT: ldr [[DESTREG:r[0-9]+]], {{\[}}[[BUFREG]], #4] -; CHECK-LINUX-NEXT: ldr r7, {{\[}}[[BUFREG]]{{\]}} -; CHECK-LINUX-NEXT: ldr r11, {{\[}}[[BUFREG]]{{\]}} +; CHECK-LINUX-NEXT: ldr [[DESTREG:r[0-9]+]], [[[BUFREG]], #4] +; CHECK-LINUX-NEXT: ldr r7, [[[BUFREG]]] +; CHECK-LINUX-NEXT: ldr r11, [[[BUFREG]]] ; CHECK-LINUX-NEXT: bx [[DESTREG]] ; CHECK-WIN32: ldr.w r11, [{{\s*}}[[BUFREG:r[0-9]+]]] -; CHECK-WIN32-NEXT: ldr.w sp, {{\[}}[[BUFREG]], #8] -; CHECK-WIN32-NEXT: ldr.w pc, {{\[}}[[BUFREG]], #4] +; CHECK-WIN32-NEXT: ldr.w sp, [[[BUFREG]], #8] +; CHECK-WIN32-NEXT: ldr.w pc, [[[BUFREG]], #4] define void @foobar() { entry: %buf = alloca [5 x i8*], align 4 @@ -70,15 +70,15 @@ if.end: ; ; setjmp sequence: ; CHECK: add [[PCREG:r[0-9]+]], pc, #8 -; CHECK-NEXT: str [[PCREG]], {{\[}}[[BUFREG:r[0-9]+]], #4] +; CHECK-NEXT: str [[PCREG]], [[[BUFREG:r[0-9]+]], #4] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: add pc, pc, #0 ; CHECK-NEXT: mov r0, #1 ; ; longjmp sequence: ; CHECK: ldr sp, [{{\s*}}[[BUFREG:r[0-9]+]], #8] -; CHECK-NEXT: ldr [[DESTREG:r[0-9]+]], {{\[}}[[BUFREG]], #4] -; CHECK-NEXT: ldr r7, {{\[}}[[BUFREG]]{{\]}} +; CHECK-NEXT: ldr [[DESTREG:r[0-9]+]], [[[BUFREG]], #4] +; CHECK-NEXT: ldr r7, [[[BUFREG]]] ; CHECK-NEXT: bx [[DESTREG]] define void @combine_sjlj_eh_and_setjmp_longjmp() personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) { entry: diff --git a/llvm/test/CodeGen/ARM/stack-guard-tls.ll b/llvm/test/CodeGen/ARM/stack-guard-tls.ll index a891aacbd2ae2..5d405e276d8ec 100644 --- a/llvm/test/CodeGen/ARM/stack-guard-tls.ll +++ b/llvm/test/CodeGen/ARM/stack-guard-tls.ll @@ -28,11 +28,11 @@ define void @foo(i64 %t) sspstrong { !2 = !{i32 2, !"stack-protector-guard-offset", i32 4296} ; CHECK: mrc p15, #0, [[REG1:r[0-9]+]], c13, c0, #3 -; CHECK-SMALL-NEXT: ldr{{(\.w)?}} 
[[REG1]], {{\[}}[[REG1]], #1296] +; CHECK-SMALL-NEXT: ldr{{(\.w)?}} [[REG1]], [[[REG1]], #1296] ; CHECK-LARGE-NEXT: add{{(\.w)?}} [[REG1]], [[REG1]], #4096 -; CHECK-LARGE-NEXT: ldr{{(\.w)?}} [[REG1]], {{\[}}[[REG1]], #200] +; CHECK-LARGE-NEXT: ldr{{(\.w)?}} [[REG1]], [[[REG1]], #200] ; CHECK: bl baz ; CHECK: mrc p15, #0, [[REG2:r[0-9]+]], c13, c0, #3 -; CHECK-SMALL-NEXT: ldr{{(\.w)?}} [[REG2]], {{\[}}[[REG2]], #1296] +; CHECK-SMALL-NEXT: ldr{{(\.w)?}} [[REG2]], [[[REG2]], #1296] ; CHECK-LARGE-NEXT: add{{(\.w)?}} [[REG2]], [[REG2]], #4096 -; CHECK-LARGE-NEXT: ldr{{(\.w)?}} [[REG2]], {{\[}}[[REG2]], #200] +; CHECK-LARGE-NEXT: ldr{{(\.w)?}} [[REG2]], [[[REG2]], #200] diff --git a/llvm/test/CodeGen/ARM/stack_guard_remat.ll b/llvm/test/CodeGen/ARM/stack_guard_remat.ll index eb6538603e1e1..12c9b85ab8d0a 100644 --- a/llvm/test/CodeGen/ARM/stack_guard_remat.ll +++ b/llvm/test/CodeGen/ARM/stack_guard_remat.ll @@ -9,8 +9,8 @@ ;PIC: ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]] ;PIC: [[LABEL1:LPC0_1]]: ;PIC: add [[R1:r[0-9]+]], pc, [[R0]] -;PIC: ldr [[R2:r[0-9]+]], {{\[}}[[R1]]{{\]}} -;PIC: ldr {{r[0-9]+}}, {{\[}}[[R2]]{{\]}} +;PIC: ldr [[R2:r[0-9]+]], [[[R1]]] +;PIC: ldr {{r[0-9]+}}, [[[R2]]] ;PIC: [[LABEL0]]: ;PIC-NEXT: .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+8) @@ -18,7 +18,7 @@ ;NO-PIC: foo2 ;NO-PIC: ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]] ;NO-PIC-NOT: LPC -;NO-PIC: ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}} +;NO-PIC: ldr {{r[0-9]+}}, [[[R0]]] ;STATIC: [[LABEL0]]: ;STATIC-NEXT: .long ___stack_chk_guard @@ -29,20 +29,20 @@ ;PIC-V7: movw [[R0:r[0-9]+]], :lower16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0:LPC[0-9_]+]]+8)) ;PIC-V7: movt [[R0]], :upper16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0]]+8)) ;PIC-V7: [[LABEL0]]: -;PIC-V7: ldr [[R0]], {{\[}}pc, [[R0]]{{\]}} -;PIC-V7: ldr [[R0]], {{\[}}[[R0]]{{\]}} +;PIC-V7: ldr [[R0]], [pc, [[R0]]] +;PIC-V7: ldr [[R0]], [[[R0]]] ;PIC-V7: L___stack_chk_guard$non_lazy_ptr: ;PIC-V7: .indirect_symbol ___stack_chk_guard ;STATIC-V7: 
movw [[R0:r[0-9]+]], :lower16:___stack_chk_guard ;STATIC-V7: movt [[R0]], :upper16:___stack_chk_guard -;STATIC-V7: ldr [[R0]], {{\[}}[[R0]]{{\]}} +;STATIC-V7: ldr [[R0]], [[[R0]]] ;DYNAMIC-NO-PIC-V7: movw [[R0:r[0-9]+]], :lower16:L___stack_chk_guard$non_lazy_ptr ;DYNAMIC-NO-PIC-V7: movt [[R0]], :upper16:L___stack_chk_guard$non_lazy_ptr -;DYNAMIC-NO-PIC-V7: ldr [[R0]], {{\[}}[[R0]]{{\]}} -;DYNAMIC-NO-PIC-V7: ldr [[R0]], {{\[}}[[R0]]{{\]}} +;DYNAMIC-NO-PIC-V7: ldr [[R0]], [[[R0]]] +;DYNAMIC-NO-PIC-V7: ldr [[R0]], [[[R0]]] ;DYNAMIC-NO-PIC-V7: L___stack_chk_guard$non_lazy_ptr: ;DYNAMIC-NO-PIC-V7: .indirect_symbol ___stack_chk_guard diff --git a/llvm/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll b/llvm/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll index eb1e710ae6c11..b881d472e1f4a 100644 --- a/llvm/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll +++ b/llvm/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll @@ -65,7 +65,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1 @@ -86,7 +86,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 @@ -107,7 +107,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 @@ -129,7 +129,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! @@ -153,7 +153,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! @@ -174,7 +174,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1 @@ -198,9 +198,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 entry: @@ -223,9 +223,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 entry: @@ -249,9 +249,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! entry: @@ -275,9 +275,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! entry: @@ -297,7 +297,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1 @@ -321,9 +321,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 entry: @@ -346,9 +346,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 @@ -373,9 +373,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) 
;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! @@ -400,9 +400,9 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! @@ -426,7 +426,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;THUMB1: bne @@ -451,7 +451,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 ;NO_NEON: bne -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;THUMB1: bne @@ -476,7 +476,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON: bne -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -502,7 +502,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -528,7 +528,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -553,7 +553,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;THUMB1: bne @@ -581,10 +581,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 entry: @@ -610,10 +610,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 entry: @@ -640,10 +640,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! 
entry: @@ -670,10 +670,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! entry: @@ -696,7 +696,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;THUMB1: bne @@ -724,10 +724,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 entry: @@ -753,10 +753,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 @@ -784,10 +784,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! @@ -815,10 +815,10 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! @@ -839,7 +839,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1 @@ -860,7 +860,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 @@ -881,7 +881,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 @@ -903,7 +903,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! @@ -925,7 +925,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! @@ -946,7 +946,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1 @@ -967,7 +967,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 @@ -988,7 +988,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 @@ -1010,7 +1010,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! @@ -1032,7 +1032,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! @@ -1053,7 +1053,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1 @@ -1074,7 +1074,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2 @@ -1095,7 +1095,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4 @@ -1117,7 +1117,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! @@ -1139,7 +1139,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! 
@@ -1163,7 +1163,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;THUMB1: bne @@ -1188,7 +1188,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 ;NO_NEON: bne -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;THUMB1: bne @@ -1213,7 +1213,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON: bne -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1239,7 +1239,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1265,7 +1265,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1290,7 +1290,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;THUMB1: bne @@ -1315,7 +1315,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 ;NO_NEON: bne -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;THUMB1: bne @@ -1340,7 +1340,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON: bne -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1366,7 +1366,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1392,7 +1392,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1417,7 +1417,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1 ;NO_NEON: bne -;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #1 ;THUMB1: bne @@ -1442,7 +1442,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2 ;NO_NEON: bne -;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldrh r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #2 ;THUMB1: bne @@ -1467,7 +1467,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4 ;NO_NEON: bne -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1493,7 +1493,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [{{.*}}]! -;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1519,7 +1519,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;NO_NEON: bne ;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [{{.*}}]! 
-;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;THUMB1: ldr r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;THUMB1: adds [[BASE]], #4 ;THUMB1: bne @@ -1532,7 +1532,7 @@ declare void @use_N(%struct.N* byval(%struct.N)) ;V8MBASE-LABEL: : define void @test_M() { -;V8MBASE: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}} +;V8MBASE: ldrb r{{[0-9]+}}, [[[BASE:r[0-9]+]]] ;V8MBASE: adds [[BASE]], #1 ;V8MBASE-NOT: movw entry: diff --git a/llvm/test/CodeGen/ARM/swiftself.ll b/llvm/test/CodeGen/ARM/swiftself.ll index 3878b85a8aded..337cb5851f34d 100644 --- a/llvm/test/CodeGen/ARM/swiftself.ll +++ b/llvm/test/CodeGen/ARM/swiftself.ll @@ -72,7 +72,7 @@ declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself) ; OPT-DAG: mov [[CSREG:r[1-9].*]], r0 ; OPT-DAG: ldr r10, [r10] ; OPT: bl {{_?}}thisreturn_attribute -; OPT: str r0, {{\[}}[[CSREG]] +; OPT: str r0, [[[CSREG]] define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret(i8**), i8** noalias nocapture readonly swiftself) { entry: %2 = load i8*, i8** %1, align 8 diff --git a/llvm/test/CodeGen/ARM/thumb-big-stack.ll b/llvm/test/CodeGen/ARM/thumb-big-stack.ll index e5cbb9747a7e8..cfb10ff2eccbf 100644 --- a/llvm/test/CodeGen/ARM/thumb-big-stack.ll +++ b/llvm/test/CodeGen/ARM/thumb-big-stack.ll @@ -11,7 +11,7 @@ target triple = "thumbv7s-apple-ios" ; CHECK-LABEL: f: ; CHECK: movw [[ADDR:(r[0-9]+|lr)]], # ; CHECK-NEXT: add [[ADDR]], sp -; CHECK-NEXT: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, {{\[}}[[ADDR]]:128] +; CHECK-NEXT: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [[[ADDR]]:128] define <4 x float> @f(<4 x float> %x) { entry: %.compoundliteral7837 = alloca <4 x float>, align 16 diff --git a/llvm/test/CodeGen/ARM/thumb_indirect_calls.ll b/llvm/test/CodeGen/ARM/thumb_indirect_calls.ll index f5ecd04280c98..317d0a424689d 100644 --- a/llvm/test/CodeGen/ARM/thumb_indirect_calls.ll +++ b/llvm/test/CodeGen/ARM/thumb_indirect_calls.ll @@ -11,7 +11,7 @@ entry: ret void ; CHECK: ldr [[TMP:r[0-3]]], [[F:\.[A-Z0-9_]+]] -; CHECK: ldr 
[[CALLEE:r[0-3]]], {{\[}}[[TMP]]{{\]}} +; CHECK: ldr [[CALLEE:r[0-3]]], [[[TMP]]] ; CHECK-V4T-NOT: blx ; CHECK-V4T: bl [[INDIRECT_PAD:\.Ltmp[0-9]+]] diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll index ba045a8c5303e..5556e2d103335 100644 --- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll +++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll @@ -239,7 +239,7 @@ define <2 x i8> @test_truncate(<2 x i128> %in) { ; CHECK-LABEL: test_truncate: ; CHECK: vmov.32 [[REG:d[0-9]+]][0], r0 ; CHECK-NEXT: mov [[BASE:r[0-9]+]], sp -; CHECK-NEXT: vld1.32 {[[REG]][1]}, {{\[}}[[BASE]]:32] +; CHECK-NEXT: vld1.32 {[[REG]][1]}, [[[BASE]]:32] ; CHECK-NEXT: vmov r0, r1, [[REG]] entry: %res = trunc <2 x i128> %in to <2 x i8> diff --git a/llvm/test/CodeGen/ARM/vld3.ll b/llvm/test/CodeGen/ARM/vld3.ll index 46e17c97e6e7a..142b0f1b643cc 100644 --- a/llvm/test/CodeGen/ARM/vld3.ll +++ b/llvm/test/CodeGen/ARM/vld3.ll @@ -148,8 +148,8 @@ define <4 x i32> @vld3Qi32(i32* %A) nounwind { ;Check for a post-increment updating load. define <4 x i32> @vld3Qi32_update(i32** %ptr) nounwind { ;CHECK-LABEL: vld3Qi32_update: -;CHECK: vld3.32 {d16, d18, d20}, {{\[}}[[R:r[0-9]+|lr]]]! -;CHECK: vld3.32 {d17, d19, d21}, {{\[}}[[R]]]! +;CHECK: vld3.32 {d16, d18, d20}, [[[R:r[0-9]+|lr]]]! +;CHECK: vld3.32 {d17, d19, d21}, [[[R]]]! 
%A = load i32*, i32** %ptr %tmp0 = bitcast i32* %A to i8* %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32.p0i8(i8* %tmp0, i32 1) diff --git a/llvm/test/CodeGen/ARM/win32-ssp.ll b/llvm/test/CodeGen/ARM/win32-ssp.ll index 679f2ea4011ba..7f446e2de4049 100644 --- a/llvm/test/CodeGen/ARM/win32-ssp.ll +++ b/llvm/test/CodeGen/ARM/win32-ssp.ll @@ -9,13 +9,13 @@ entry: ; MINGW-LABEL: func: ; MINGW: movw [[REG:r[0-9]+]], :lower16:.refptr.__stack_chk_guard ; MINGW: movt [[REG]], :upper16:.refptr.__stack_chk_guard -; MINGW: ldr [[REG2:r[0-9]+]], {{\[}}[[REG]]] -; MINGW: ldr {{r[0-9]+}}, {{\[}}[[REG2]]] +; MINGW: ldr [[REG2:r[0-9]+]], [[[REG]]] +; MINGW: ldr {{r[0-9]+}}, [[[REG2]]] ; MINGW: bl other ; MINGW: movw [[REG3:r[0-9]+]], :lower16:.refptr.__stack_chk_guard ; MINGW: movt [[REG3]], :upper16:.refptr.__stack_chk_guard -; MINGW: ldr [[REG4:r[0-9]+]], {{\[}}[[REG3]]] -; MINGW: ldr {{r[0-9]+}}, {{\[}}[[REG4]]] +; MINGW: ldr [[REG4:r[0-9]+]], [[[REG3]]] +; MINGW: ldr {{r[0-9]+}}, [[[REG4]]] ; MINGW: bl __stack_chk_fail %c = alloca i8, align 1 diff --git a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll index 675888496a05d..11feabfeb2470 100644 --- a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll +++ b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll @@ -5,12 +5,12 @@ ;PIC: foo2 ;PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]] ;PIC-NEXT: add [[SAVED_GUARD]], sp -;PIC-NEXT: ldr [[SAVED_GUARD]], {{\[}}[[SAVED_GUARD]]{{\]}} +;PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]] ;PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]] ;PIC-NEXT: [[LABEL1:LPC[0-9_]+]]: ;PIC-NEXT: add [[ORIGINAL_GUARD]], pc -;PIC-NEXT: ldr [[ORIGINAL_GUARD]], {{\[}}[[ORIGINAL_GUARD]]{{\]}} -;PIC-NEXT: ldr [[ORIGINAL_GUARD]], {{\[}}[[ORIGINAL_GUARD]]{{\]}} +;PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] +;PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] ;PIC-NEXT: cmp [[ORIGINAL_GUARD]], 
[[SAVED_GUARD]] ;PIC: [[GUARD_STACK_OFFSET]]: @@ -21,11 +21,11 @@ ;NO-PIC: foo2 ;NO-PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]] ;NO-PIC-NEXT: add [[SAVED_GUARD]], sp -;NO-PIC-NEXT: ldr [[SAVED_GUARD]], {{\[}}[[SAVED_GUARD]]{{\]}} +;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]] ;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]] ;NO-PIC-NOT: LPC -;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], {{\[}}[[ORIGINAL_GUARD]]{{\]}} -;DYNAMIC-NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], {{\[}}[[ORIGINAL_GUARD]]{{\]}} +;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] +;DYNAMIC-NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] ;NO-PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]] ;STATIC: [[GUARD_STACK_OFFSET]]: diff --git a/llvm/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll b/llvm/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll index b9bfdcbec4e49..75bce22ebd246 100644 --- a/llvm/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll +++ b/llvm/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll @@ -5,7 +5,7 @@ ; CHECK: Perl_ck_sort ; CHECK: ldr ; CHECK: mov [[REGISTER:(r[0-9]+)|(lr)]] -; CHECK: str {{(r[0-9])|(lr)}}, {{\[}}[[REGISTER]]{{\]}}, #24 +; CHECK: str {{(r[0-9])|(lr)}}, [[[REGISTER]]], #24 define void @Perl_ck_sort() nounwind optsize { entry: diff --git a/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll index b007419b1e136..4d96dc277f0dc 100644 --- a/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll +++ b/llvm/test/CodeGen/Thumb2/stack_guard_remat.ll @@ -7,18 +7,18 @@ ;PIC: movt [[R0]], :upper16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0]]+4)) ;PIC: [[LABEL0]]: ;PIC: add [[R0]], pc -;PIC: ldr [[R1:r[0-9]+]], {{\[}}[[R0]]{{\]}} -;PIC: ldr {{r[0-9]+}}, {{\[}}[[R1]]{{\]}} +;PIC: ldr [[R1:r[0-9]+]], [[[R0]]] +;PIC: ldr {{r[0-9]+}}, [[[R1]]] ;STATIC: foo2 ;STATIC: movw [[R0:r[0-9]+]], :lower16:___stack_chk_guard ;STATIC: movt [[R0]], 
:upper16:___stack_chk_guard -;STATIC: ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}} +;STATIC: ldr {{r[0-9]+}}, [[[R0]]] ;DYNAMIC-NO-PIC: foo2 ;DYNAMIC-NO-PIC: movw [[R0:r[0-9]+]], :lower16:L___stack_chk_guard$non_lazy_ptr ;DYNAMIC-NO-PIC: movt [[R0]], :upper16:L___stack_chk_guard$non_lazy_ptr -;DYNAMIC-NO-PIC: ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}} +;DYNAMIC-NO-PIC: ldr {{r[0-9]+}}, [[[R0]]] ; Function Attrs: nounwind ssp define i32 @test_stack_guard_remat() #0 { diff --git a/llvm/test/CodeGen/XCore/epilogue_prologue.ll b/llvm/test/CodeGen/XCore/epilogue_prologue.ll index a7194419c6547..c4a9add9da0b6 100644 --- a/llvm/test/CodeGen/XCore/epilogue_prologue.ll +++ b/llvm/test/CodeGen/XCore/epilogue_prologue.ll @@ -190,7 +190,7 @@ entry: ; CHECKFP-NEXT: stw r10, sp[1] ; CHECKFP-NEXT: ldaw r10, sp[0] ; CHECKFP-NEXT: mkmsk [[REG:r[0-9]+]], 8 -; CHECKFP-NEXT: ldaw r0, r10{{\[}}[[REG]]{{\]}} +; CHECKFP-NEXT: ldaw r0, r10[[[REG]]] ; CHECKFP-NEXT: extsp 1 ; CHECKFP-NEXT: bl f5 ; CHECKFP-NEXT: ldaw sp, sp[1] @@ -218,7 +218,7 @@ entry: ; CHECKFP-NEXT: stw r10, sp[1] ; CHECKFP-NEXT: ldaw r10, sp[0] ; CHECKFP-NEXT: ldc [[REG:r[0-9]+]], 32767 -; CHECKFP-NEXT: ldaw r0, r10{{\[}}[[REG]]{{\]}} +; CHECKFP-NEXT: ldaw r0, r10[[[REG]]] ; CHECKFP-NEXT: extsp 1 ; CHECKFP-NEXT: bl f5 ; CHECKFP-NEXT: ldaw sp, sp[1] diff --git a/llvm/test/CodeGen/XCore/scavenging.ll b/llvm/test/CodeGen/XCore/scavenging.ll index b46c75a4aaf60..adf313e4a9890 100644 --- a/llvm/test/CodeGen/XCore/scavenging.ll +++ b/llvm/test/CodeGen/XCore/scavenging.ll @@ -75,26 +75,26 @@ declare void @g(i32*, i32*) ; CHECK: ldaw r11, sp[0] ; scavenge r4 using SR spill slot ; CHECK: stw r4, sp[1] -; CHECK: ldw r4, cp{{\[}}[[ARG5]]{{\]}} +; CHECK: ldw r4, cp[[[ARG5]]] ; r11 used to load 5th argument ; CHECK: ldw r11, r11[r4] ; CHECK: ldaw r4, sp[0] ; scavenge r5 using SR spill slot ; CHECK: stw r5, sp[0] -; CHECK: ldw r5, cp{{\[}}[[INDEX0]]{{\]}} +; CHECK: ldw r5, cp[[[INDEX0]]] ; r4 & r5 used by InsertSPConstInst() to emit STW_l3r 
instruction. ; CHECK: stw r0, r4[r5] ; CHECK: ldaw r0, sp[0] -; CHECK: ldw r5, cp{{\[}}[[INDEX1]]{{\]}} +; CHECK: ldw r5, cp[[[INDEX1]]] ; CHECK: stw r1, r0[r5] ; CHECK: ldaw r0, sp[0] -; CHECK: ldw r1, cp{{\[}}[[INDEX2]]{{\]}} +; CHECK: ldw r1, cp[[[INDEX2]]] ; CHECK: stw r2, r0[r1] ; CHECK: ldaw r0, sp[0] -; CHECK: ldw r1, cp{{\[}}[[INDEX3]]{{\]}} +; CHECK: ldw r1, cp[[[INDEX3]]] ; CHECK: stw r3, r0[r1] ; CHECK: ldaw r0, sp[0] -; CHECK: ldw r1, cp{{\[}}[[INDEX4]]{{\]}} +; CHECK: ldw r1, cp[[[INDEX4]]] ; CHECK: stw r11, r0[r1] ; CHECK: ldaw sp, sp[65535] ; CHECK: ldw r4, sp[1] diff --git a/llvm/test/CodeGen/XCore/varargs.ll b/llvm/test/CodeGen/XCore/varargs.ll index b6f716d66c9df..f88395e5945c6 100644 --- a/llvm/test/CodeGen/XCore/varargs.ll +++ b/llvm/test/CodeGen/XCore/varargs.ll @@ -5,13 +5,13 @@ entry: ; CHECK-LABEL: _Z1fz: ; CHECK: extsp 3 ; CHECK: stw r[[REG:[0-3]{1,1}]] -; CHECK: , sp{{\[}}[[REG]]{{\]}} +; CHECK: , sp[[[REG]]] ; CHECK: stw r[[REG:[0-3]{1,1}]] -; CHECK: , sp{{\[}}[[REG]]{{\]}} +; CHECK: , sp[[[REG]]] ; CHECK: stw r[[REG:[0-3]{1,1}]] -; CHECK: , sp{{\[}}[[REG]]{{\]}} +; CHECK: , sp[[[REG]]] ; CHECK: stw r[[REG:[0-3]{1,1}]] -; CHECK: , sp{{\[}}[[REG]]{{\]}} +; CHECK: , sp[[[REG]]] ; CHECK: ldaw sp, sp[3] ; CHECK: retsp 0 ret void diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll index 19957689286d8..8b278a6c4efb0 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -45,10 +45,10 @@ define void @sum_of_array(i32 %x, i32 %y, float* nocapture %output) { ret void } ; PTX-LABEL: sum_of_array( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX-DAG: 
ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] ; IR-LABEL: @sum_of_array( ; TODO: GVN is unable to preserve the "inbounds" keyword on the first GEP. Need @@ -90,10 +90,10 @@ define void @sum_of_array2(i32 %x, i32 %y, float* nocapture %output) { ret void } ; PTX-LABEL: sum_of_array2( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] ; IR-LABEL: @sum_of_array2( ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} @@ -140,10 +140,10 @@ define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) { ret void } ; PTX-LABEL: sum_of_array3( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] ; IR-LABEL: @sum_of_array3( ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 
x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} @@ -186,10 +186,10 @@ define void @sum_of_array4(i32 %x, i32 %y, float* nocapture %output) { ret void } ; PTX-LABEL: sum_of_array4( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] ; IR-LABEL: @sum_of_array4( ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} @@ -218,7 +218,7 @@ entry: %0 = sext i32 %xy to i64 %p0 = getelementptr inbounds float, float* %input, i64 %0 %v0 = load float, float* %p0, align 4 -; PTX: ld.f32 %f{{[0-9]+}}, {{\[}}[[p0:%rd[0-9]+]]{{\]}} +; PTX: ld.f32 %f{{[0-9]+}}, [[[p0:%rd[0-9]+]]] call void @use(float %v0) %y5 = add nsw i32 %y, 5 @@ -227,7 +227,7 @@ entry: %p1 = getelementptr inbounds float, float* %input, i64 %1 ; IR: getelementptr inbounds float, float* %p0, i64 5 %v1 = load float, float* %p1, align 4 -; PTX: ld.f32 %f{{[0-9]+}}, {{\[}}[[p0]]+20{{\]}} +; PTX: ld.f32 %f{{[0-9]+}}, [[[p0]]+20] call void @use(float %v1) ret void diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll index 3a272d07d6fca..9e204593d4e26 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll @@ -42,7 +42,7 @@ define 
void @slsr_after_reassociate_geps(float* %arr, i32 %i) { ; PTX: mul.wide.s32 [[i4:%rd[0-9]+]], [[i]], 4; ; PTX: add.s64 [[base1:%rd[0-9]+]], [[arr]], [[i4]]; %v1 = load float, float* %p1, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base1]]+20]; +; PTX: ld.f32 {{%f[0-9]+}}, [[[base1]]+20]; call void @foo(float %v1) %j2 = add nsw i32 %i2, 5 @@ -50,7 +50,7 @@ define void @slsr_after_reassociate_geps(float* %arr, i32 %i) { ; CHECK: [[b2:%[0-9]+]] = getelementptr float, float* [[b1]], i64 [[bump]] ; PTX: add.s64 [[base2:%rd[0-9]+]], [[base1]], [[i4]]; %v2 = load float, float* %p2, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base2]]+20]; +; PTX: ld.f32 {{%f[0-9]+}}, [[[base2]]+20]; call void @foo(float %v2) %j3 = add nsw i32 %i3, 5 @@ -58,7 +58,7 @@ define void @slsr_after_reassociate_geps(float* %arr, i32 %i) { ; CHECK: [[b3:%[0-9]+]] = getelementptr float, float* [[b2]], i64 [[bump]] ; PTX: add.s64 [[base3:%rd[0-9]+]], [[base2]], [[i4]]; %v3 = load float, float* %p3, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base3]]+20]; +; PTX: ld.f32 {{%f[0-9]+}}, [[[base3]]+20]; call void @foo(float %v3) %j4 = add nsw i32 %i4, 5 @@ -66,7 +66,7 @@ define void @slsr_after_reassociate_geps(float* %arr, i32 %i) { ; CHECK: [[b4:%[0-9]+]] = getelementptr float, float* [[b3]], i64 [[bump]] ; PTX: add.s64 [[base4:%rd[0-9]+]], [[base3]], [[i4]]; %v4 = load float, float* %p4, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, {{\[}}[[base4]]+20]; +; PTX: ld.f32 {{%f[0-9]+}}, [[[base4]]+20]; call void @foo(float %v4) ret void From a259e62bb68d56a77578d7c720fc2f3baf70ba10 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 08:18:26 -0800 Subject: [PATCH 280/748] [instsimplify] Add a couple more pointer compare folding tests [NFC] --- llvm/test/Transforms/InstSimplify/compare.ll | 28 +++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index b82134d58db9b..5afb56203a0f9 
100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -2700,7 +2700,7 @@ define <2 x i1> @cttz_slt_bitwidth_splat(<2 x i13> %x) { ret <2 x i1> %cmp } -; FIXME: A zero sized alloca *can* be equal to another alloca +; A zero sized alloca *can* be equal to another alloca define i1 @zero_sized_alloca1() { ; CHECK-LABEL: @zero_sized_alloca1( ; CHECK-NEXT: [[A:%.*]] = alloca i32, i32 0, align 4 @@ -2798,6 +2798,32 @@ define i1 @globals_inequal() { ret i1 %res } +; TODO: Never equal +define i1 @globals_offset_inequal() { +; CHECK-LABEL: @globals_offset_inequal( +; CHECK-NEXT: ret i1 icmp ne (i8* getelementptr (i8, i8* bitcast (i32* @A to i8*), i32 1), i8* getelementptr (i8, i8* bitcast (i32* @B to i8*), i32 1)) +; + %a.cast = bitcast i32* @A to i8* + %a.off = getelementptr i8, i8* %a.cast, i32 1 + %b.cast = bitcast i32* @B to i8* + %b.off = getelementptr i8, i8* %b.cast, i32 1 + %res = icmp ne i8* %a.off, %b.off + ret i1 %res +} + + +; TODO: Never equal +define i1 @test_byval_global_inequal(i32* byval(i32) %a) { +; CHECK-LABEL: @test_byval_global_inequal( +; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A:%.*]], @B +; CHECK-NEXT: ret i1 [[RES]] +; + %b = alloca i32 + %res = icmp ne i32* %a, @B + ret i1 %res +} + + define i1 @neg_global_alias() { ; CHECK-LABEL: @neg_global_alias( ; CHECK-NEXT: ret i1 icmp ne (i32* @A, i32* @A.alias) From f20f9f5a32c02e7df3fc9a9d48643471dc6d41a5 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Mon, 14 Feb 2022 10:11:29 -0800 Subject: [PATCH 281/748] [lldb] Add llvm_unreachable in RichManglingContext Add `llvm_unreachable` to prevent warnings/errors in gcc and msvc. 
Differential Revision: https://reviews.llvm.org/D119737 --- lldb/source/Core/RichManglingContext.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/source/Core/RichManglingContext.cpp b/lldb/source/Core/RichManglingContext.cpp index f1e81be417b0e..64b18b401f2d6 100644 --- a/lldb/source/Core/RichManglingContext.cpp +++ b/lldb/source/Core/RichManglingContext.cpp @@ -123,6 +123,7 @@ llvm::StringRef RichManglingContext::ParseFunctionBaseName() { case None: return {}; } + llvm_unreachable("Fully covered switch above!"); } llvm::StringRef RichManglingContext::ParseFunctionDeclContextName() { @@ -139,6 +140,7 @@ llvm::StringRef RichManglingContext::ParseFunctionDeclContextName() { case None: return {}; } + llvm_unreachable("Fully covered switch above!"); } llvm::StringRef RichManglingContext::ParseFullName() { @@ -156,4 +158,5 @@ llvm::StringRef RichManglingContext::ParseFullName() { case None: return {}; } + llvm_unreachable("Fully covered switch above!"); } From 5ecf218eca3558aa647bbc8120dfa734a233953c Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 08:40:58 -0800 Subject: [PATCH 282/748] [instsimplify] Add a comment hinting how compares involving two globals are handled [NFC] --- llvm/lib/Analysis/InstructionSimplify.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 5fa6b69c1014f..54895def7970b 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2512,8 +2512,12 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, /// *are* possible, and that zero sized regions do not overlap with any other. static bool HaveNonOverlappingStorage(const Value *V1, const Value *V2) { // Global variables always exist, so they always exist during the lifetime - // of each other and all allocas. Two different allocas usually have - // different addresses... 
+ // of each other and all allocas. Global variables themselves usually have + // non-overlapping storage, but since their addresses are constants, the + // case involving two globals does not reach here and is instead handled in + // constant folding. + // + // Two different allocas usually have different addresses... // // However, if there's an @llvm.stackrestore dynamically in between two // allocas, they may have the same address. It's tempting to reduce the @@ -2532,7 +2536,6 @@ static bool HaveNonOverlappingStorage(const Value *V1, const Value *V2) { // // So, we'll assume that two non-empty allocas have different addresses // for now. - // return isa(V1) && (isa(V2) || isa(V2)); } From bf296ea6bbb13177dea9e5a72c3e446d5c5a1d81 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 08:51:18 -0800 Subject: [PATCH 283/748] [instsimplify] Clarify assumptions about disjoint memory regions [NFC] --- llvm/lib/Analysis/InstructionSimplify.cpp | 38 +++++++++++++---------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 54895def7970b..b6692319c09dc 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2507,6 +2507,25 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, return nullptr; } +/// Return true if the underlying object (storage) must be disjoint from +/// storage returned by any noalias return call. +static bool IsAllocDisjoint(const Value *V) { + // For allocas, we consider only static ones (dynamic + // allocas might be transformed into calls to malloc not simultaneously + // live with the compared-to allocation). For globals, we exclude symbols + // that might be resolve lazily to symbols in another dynamically-loaded + // library (and, thus, could be malloc'ed by the implementation). 
+ if (const AllocaInst *AI = dyn_cast(V)) + return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); + if (const GlobalValue *GV = dyn_cast(V)) + return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || + GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) && + !GV->isThreadLocal(); + if (const Argument *A = dyn_cast(V)) + return A->hasByValAttr(); + return false; +} + /// Return true if V1 and V2 are each the base of some distict storage region /// [V, object_size(V)] which do not overlap. Note that zero sized regions /// *are* possible, and that zero sized regions do not overlap with any other. @@ -2666,23 +2685,10 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, }; // Is the set of underlying objects all things which must be disjoint from - // noalias calls. For allocas, we consider only static ones (dynamic - // allocas might be transformed into calls to malloc not simultaneously - // live with the compared-to allocation). For globals, we exclude symbols - // that might be resolve lazily to symbols in another dynamically-loaded - // library (and, thus, could be malloc'ed by the implementation). + // noalias calls. We assume that indexing from such disjoint storage + // into the heap is undefined, and thus offsets can be safely ignored. 
auto IsAllocDisjoint = [](ArrayRef Objects) { - return all_of(Objects, [](const Value *V) { - if (const AllocaInst *AI = dyn_cast(V)) - return AI->getParent() && AI->getFunction() && AI->isStaticAlloca(); - if (const GlobalValue *GV = dyn_cast(V)) - return (GV->hasLocalLinkage() || GV->hasHiddenVisibility() || - GV->hasProtectedVisibility() || GV->hasGlobalUnnamedAddr()) && - !GV->isThreadLocal(); - if (const Argument *A = dyn_cast(V)) - return A->hasByValAttr(); - return false; - }); + return all_of(Objects, ::IsAllocDisjoint); }; if ((IsNAC(LHSUObjs) && IsAllocDisjoint(RHSUObjs)) || From a2963d871ee5c2786de409b42f67ffcc39e53184 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 18 Feb 2022 11:43:03 -0500 Subject: [PATCH 284/748] [SDAG] fold sub-of-shift to add-of-shift This fold is done in IR: https://alive2.llvm.org/ce/z/jWyFrP There is an x86 test that shows an improvement from the added flexibility of using add (commutative). The other diffs are presumed neutral. Note that this could also be folded to an 'xor', but I'm not sure if that would be universally better (eg, x86 can convert adds more easily into LEA). This helps prevent regressions from a potential fold for issue #53829. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 +++++++++ llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll | 2 +- llvm/test/CodeGen/X86/combine-srem.ll | 6 +++--- llvm/test/CodeGen/X86/imul.ll | 5 ++--- llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll | 8 ++++---- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a0708336d26a2..89c3e41392882 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3656,6 +3656,15 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { } } + // As with the previous fold, prefer add for more folding potential. 
+ // Subtracting SMIN/0 is the same as adding SMIN/0: + // N0 - (X << BW-1) --> N0 + (X << BW-1) + if (N1.getOpcode() == ISD::SHL) { + ConstantSDNode *ShlC = isConstOrConstSplat(N1.getOperand(1)); + if (ShlC && ShlC->getAPIntValue() == VT.getScalarSizeInBits() - 1) + return DAG.getNode(ISD::ADD, DL, VT, N1, N0); + } + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) { // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry) if (SDValue Carry = getAsCarry(TLI, N0)) { diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll index fc033bc741c19..c6f7377794413 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -208,7 +208,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v3.4s, #128, lsl #24 ; CHECK-NEXT: usra v1.4s, v2.4s, #1 ; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll index be2b7b86c8aed..575f37117f2da 100644 --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -74,7 +74,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) { ; SSE-NEXT: psrld $1, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_vec_srem_by_minsigned: @@ -83,7 +83,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) { ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; 
AVX2-LABEL: combine_vec_srem_by_minsigned: @@ -93,7 +93,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq %1 = srem <4 x i32> %x, ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/X86/imul.ll b/llvm/test/CodeGen/X86/imul.ll index 4a4b159c68ef7..9131688c4efcc 100644 --- a/llvm/test/CodeGen/X86/imul.ll +++ b/llvm/test/CodeGen/X86/imul.ll @@ -529,9 +529,8 @@ define i64 @testNegOverflow(i64 %a) { ; X64-LABEL: testNegOverflow: ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: shlq $63, %rcx -; X64-NEXT: subq %rcx, %rax +; X64-NEXT: shlq $63, %rax +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: retq ; ; X86-LABEL: testNegOverflow: diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll index 4aa45ecde8736..95eb23fc3cd5d 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -622,7 +622,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-SSE-NEXT: psrld $1, %xmm1 ; CHECK-SSE-NEXT: paddd %xmm0, %xmm1 ; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE-NEXT: paddd %xmm1, %xmm0 ; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE-NEXT: psrld $31, %xmm0 @@ -634,7 +634,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -647,7 +647,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -659,7 +659,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 From 12c4e65a76ed16ce90a9db0b47fdd41133fef4e9 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Fri, 18 Feb 2022 04:05:06 -0800 Subject: [PATCH 285/748] [demangler][NFC] Reformatting The linter complains about the formatting in subsequent changes. Fixing that now. 
Reviewed By: iains Differential Revision: https://reviews.llvm.org/D120117 --- libcxxabi/src/demangle/ItaniumDemangle.h | 154 +++++++++---------- llvm/include/llvm/Demangle/ItaniumDemangle.h | 154 +++++++++---------- 2 files changed, 154 insertions(+), 154 deletions(-) diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 5318a4c7e7b02..d005db5476cd6 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -32,83 +32,83 @@ #include #include -#define FOR_EACH_NODE_KIND(X) \ - X(NodeArrayNode) \ - X(DotSuffix) \ - X(VendorExtQualType) \ - X(QualType) \ - X(ConversionOperatorType) \ - X(PostfixQualifiedType) \ - X(ElaboratedTypeSpefType) \ - X(NameType) \ - X(AbiTagAttr) \ - X(EnableIfAttr) \ - X(ObjCProtoName) \ - X(PointerType) \ - X(ReferenceType) \ - X(PointerToMemberType) \ - X(ArrayType) \ - X(FunctionType) \ - X(NoexceptSpec) \ - X(DynamicExceptionSpec) \ - X(FunctionEncoding) \ - X(LiteralOperator) \ - X(SpecialName) \ - X(CtorVtableSpecialName) \ - X(QualifiedName) \ - X(NestedName) \ - X(LocalName) \ - X(VectorType) \ - X(PixelVectorType) \ - X(BinaryFPType) \ - X(SyntheticTemplateParamName) \ - X(TypeTemplateParamDecl) \ - X(NonTypeTemplateParamDecl) \ - X(TemplateTemplateParamDecl) \ - X(TemplateParamPackDecl) \ - X(ParameterPack) \ - X(TemplateArgumentPack) \ - X(ParameterPackExpansion) \ - X(TemplateArgs) \ - X(ForwardTemplateReference) \ - X(NameWithTemplateArgs) \ - X(GlobalQualifiedName) \ - X(ExpandedSpecialSubstitution) \ - X(SpecialSubstitution) \ - X(CtorDtorName) \ - X(DtorName) \ - X(UnnamedTypeName) \ - X(ClosureTypeName) \ - X(StructuredBindingName) \ - X(BinaryExpr) \ - X(ArraySubscriptExpr) \ - X(PostfixExpr) \ - X(ConditionalExpr) \ - X(MemberExpr) \ - X(SubobjectExpr) \ - X(EnclosingExpr) \ - X(CastExpr) \ - X(SizeofParamPackExpr) \ - X(CallExpr) \ - X(NewExpr) \ - X(DeleteExpr) \ - X(PrefixExpr) \ - X(FunctionParam) \ - X(ConversionExpr) \ - 
X(PointerToMemberConversionExpr) \ - X(InitListExpr) \ - X(FoldExpr) \ - X(ThrowExpr) \ - X(BoolExpr) \ - X(StringLiteral) \ - X(LambdaExpr) \ - X(EnumLiteral) \ - X(IntegerLiteral) \ - X(FloatLiteral) \ - X(DoubleLiteral) \ - X(LongDoubleLiteral) \ - X(BracedExpr) \ - X(BracedRangeExpr) +#define FOR_EACH_NODE_KIND(X) \ + X(NodeArrayNode) \ + X(DotSuffix) \ + X(VendorExtQualType) \ + X(QualType) \ + X(ConversionOperatorType) \ + X(PostfixQualifiedType) \ + X(ElaboratedTypeSpefType) \ + X(NameType) \ + X(AbiTagAttr) \ + X(EnableIfAttr) \ + X(ObjCProtoName) \ + X(PointerType) \ + X(ReferenceType) \ + X(PointerToMemberType) \ + X(ArrayType) \ + X(FunctionType) \ + X(NoexceptSpec) \ + X(DynamicExceptionSpec) \ + X(FunctionEncoding) \ + X(LiteralOperator) \ + X(SpecialName) \ + X(CtorVtableSpecialName) \ + X(QualifiedName) \ + X(NestedName) \ + X(LocalName) \ + X(VectorType) \ + X(PixelVectorType) \ + X(BinaryFPType) \ + X(SyntheticTemplateParamName) \ + X(TypeTemplateParamDecl) \ + X(NonTypeTemplateParamDecl) \ + X(TemplateTemplateParamDecl) \ + X(TemplateParamPackDecl) \ + X(ParameterPack) \ + X(TemplateArgumentPack) \ + X(ParameterPackExpansion) \ + X(TemplateArgs) \ + X(ForwardTemplateReference) \ + X(NameWithTemplateArgs) \ + X(GlobalQualifiedName) \ + X(ExpandedSpecialSubstitution) \ + X(SpecialSubstitution) \ + X(CtorDtorName) \ + X(DtorName) \ + X(UnnamedTypeName) \ + X(ClosureTypeName) \ + X(StructuredBindingName) \ + X(BinaryExpr) \ + X(ArraySubscriptExpr) \ + X(PostfixExpr) \ + X(ConditionalExpr) \ + X(MemberExpr) \ + X(SubobjectExpr) \ + X(EnclosingExpr) \ + X(CastExpr) \ + X(SizeofParamPackExpr) \ + X(CallExpr) \ + X(NewExpr) \ + X(DeleteExpr) \ + X(PrefixExpr) \ + X(FunctionParam) \ + X(ConversionExpr) \ + X(PointerToMemberConversionExpr) \ + X(InitListExpr) \ + X(FoldExpr) \ + X(ThrowExpr) \ + X(BoolExpr) \ + X(StringLiteral) \ + X(LambdaExpr) \ + X(EnumLiteral) \ + X(IntegerLiteral) \ + X(FloatLiteral) \ + X(DoubleLiteral) \ + X(LongDoubleLiteral) \ + 
X(BracedExpr) \ + X(BracedRangeExpr) DEMANGLE_NAMESPACE_BEGIN diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index 2ec76f401c6b5..ae7be8a0ac258 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -32,83 +32,83 @@ #include #include -#define FOR_EACH_NODE_KIND(X) \ - X(NodeArrayNode) \ - X(DotSuffix) \ - X(VendorExtQualType) \ - X(QualType) \ - X(ConversionOperatorType) \ - X(PostfixQualifiedType) \ - X(ElaboratedTypeSpefType) \ - X(NameType) \ - X(AbiTagAttr) \ - X(EnableIfAttr) \ - X(ObjCProtoName) \ - X(PointerType) \ - X(ReferenceType) \ - X(PointerToMemberType) \ - X(ArrayType) \ - X(FunctionType) \ - X(NoexceptSpec) \ - X(DynamicExceptionSpec) \ - X(FunctionEncoding) \ - X(LiteralOperator) \ - X(SpecialName) \ - X(CtorVtableSpecialName) \ - X(QualifiedName) \ - X(NestedName) \ - X(LocalName) \ - X(VectorType) \ - X(PixelVectorType) \ - X(BinaryFPType) \ - X(SyntheticTemplateParamName) \ - X(TypeTemplateParamDecl) \ - X(NonTypeTemplateParamDecl) \ - X(TemplateTemplateParamDecl) \ - X(TemplateParamPackDecl) \ - X(ParameterPack) \ - X(TemplateArgumentPack) \ - X(ParameterPackExpansion) \ - X(TemplateArgs) \ - X(ForwardTemplateReference) \ - X(NameWithTemplateArgs) \ - X(GlobalQualifiedName) \ - X(ExpandedSpecialSubstitution) \ - X(SpecialSubstitution) \ - X(CtorDtorName) \ - X(DtorName) \ - X(UnnamedTypeName) \ - X(ClosureTypeName) \ - X(StructuredBindingName) \ - X(BinaryExpr) \ - X(ArraySubscriptExpr) \ - X(PostfixExpr) \ - X(ConditionalExpr) \ - X(MemberExpr) \ - X(SubobjectExpr) \ - X(EnclosingExpr) \ - X(CastExpr) \ - X(SizeofParamPackExpr) \ - X(CallExpr) \ - X(NewExpr) \ - X(DeleteExpr) \ - X(PrefixExpr) \ - X(FunctionParam) \ - X(ConversionExpr) \ - X(PointerToMemberConversionExpr) \ - X(InitListExpr) \ - X(FoldExpr) \ - X(ThrowExpr) \ - X(BoolExpr) \ - X(StringLiteral) \ - X(LambdaExpr) \ - X(EnumLiteral) \ - X(IntegerLiteral) \ - 
X(FloatLiteral) \ - X(DoubleLiteral) \ - X(LongDoubleLiteral) \ - X(BracedExpr) \ - X(BracedRangeExpr) +#define FOR_EACH_NODE_KIND(X) \ + X(NodeArrayNode) \ + X(DotSuffix) \ + X(VendorExtQualType) \ + X(QualType) \ + X(ConversionOperatorType) \ + X(PostfixQualifiedType) \ + X(ElaboratedTypeSpefType) \ + X(NameType) \ + X(AbiTagAttr) \ + X(EnableIfAttr) \ + X(ObjCProtoName) \ + X(PointerType) \ + X(ReferenceType) \ + X(PointerToMemberType) \ + X(ArrayType) \ + X(FunctionType) \ + X(NoexceptSpec) \ + X(DynamicExceptionSpec) \ + X(FunctionEncoding) \ + X(LiteralOperator) \ + X(SpecialName) \ + X(CtorVtableSpecialName) \ + X(QualifiedName) \ + X(NestedName) \ + X(LocalName) \ + X(VectorType) \ + X(PixelVectorType) \ + X(BinaryFPType) \ + X(SyntheticTemplateParamName) \ + X(TypeTemplateParamDecl) \ + X(NonTypeTemplateParamDecl) \ + X(TemplateTemplateParamDecl) \ + X(TemplateParamPackDecl) \ + X(ParameterPack) \ + X(TemplateArgumentPack) \ + X(ParameterPackExpansion) \ + X(TemplateArgs) \ + X(ForwardTemplateReference) \ + X(NameWithTemplateArgs) \ + X(GlobalQualifiedName) \ + X(ExpandedSpecialSubstitution) \ + X(SpecialSubstitution) \ + X(CtorDtorName) \ + X(DtorName) \ + X(UnnamedTypeName) \ + X(ClosureTypeName) \ + X(StructuredBindingName) \ + X(BinaryExpr) \ + X(ArraySubscriptExpr) \ + X(PostfixExpr) \ + X(ConditionalExpr) \ + X(MemberExpr) \ + X(SubobjectExpr) \ + X(EnclosingExpr) \ + X(CastExpr) \ + X(SizeofParamPackExpr) \ + X(CallExpr) \ + X(NewExpr) \ + X(DeleteExpr) \ + X(PrefixExpr) \ + X(FunctionParam) \ + X(ConversionExpr) \ + X(PointerToMemberConversionExpr) \ + X(InitListExpr) \ + X(FoldExpr) \ + X(ThrowExpr) \ + X(BoolExpr) \ + X(StringLiteral) \ + X(LambdaExpr) \ + X(EnumLiteral) \ + X(IntegerLiteral) \ + X(FloatLiteral) \ + X(DoubleLiteral) \ + X(LongDoubleLiteral) \ + X(BracedExpr) \ + X(BracedRangeExpr) DEMANGLE_NAMESPACE_BEGIN From 47b749e5be2190d1ccb214fd6364da462a9098cf Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Fri, 18 Feb 2022 13:28:16 +0100 
Subject: [PATCH 286/748] [clangd] Tweak --query-driver to ignore slash direction on windows See https://github.com/clangd/clangd/issues/1022 Differential Revision: https://reviews.llvm.org/D120115 --- clang-tools-extra/clangd/QueryDriverDatabase.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clangd/QueryDriverDatabase.cpp b/clang-tools-extra/clangd/QueryDriverDatabase.cpp index 5e51837b4820f..0daa0c5cd0be0 100644 --- a/clang-tools-extra/clangd/QueryDriverDatabase.cpp +++ b/clang-tools-extra/clangd/QueryDriverDatabase.cpp @@ -38,12 +38,10 @@ #include "clang/Basic/TargetOptions.h" #include "clang/Driver/Types.h" #include "clang/Tooling/CompilationDatabase.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -276,6 +274,10 @@ std::string convertGlobToRegex(llvm::StringRef Glob) { // Single star, accept any sequence without a slash. RegStream << "[^/]*"; } + } else if (llvm::sys::path::is_separator(Glob[I]) && + llvm::sys::path::is_separator('/') && + llvm::sys::path::is_separator('\\')) { + RegStream << R"([/\\])"; // Accept either slash on windows. } else { RegStream << llvm::Regex::escape(Glob.substr(I, 1)); } @@ -293,6 +295,7 @@ llvm::Regex convertGlobsToRegex(llvm::ArrayRef Globs) { for (llvm::StringRef Glob : Globs) RegTexts.push_back(convertGlobToRegex(Glob)); + // Tempting to pass IgnoreCase, but we don't know the FS sensitivity. 
llvm::Regex Reg(llvm::join(RegTexts, "|")); assert(Reg.isValid(RegTexts.front()) && "Created an invalid regex from globs"); From e4a03b26898e9ca5917a425fc4b089bf5ba19272 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 17 Feb 2022 20:51:01 -0800 Subject: [PATCH 287/748] [lldb] Default initialize CommandOptions fields (NFC) Make sure all fields are default initialized to the same values. --- lldb/source/Commands/CommandObjectCommands.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index ebb84bce7323f..a828ba16e8781 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -1480,7 +1480,7 @@ class CommandObjectCommandsScriptAdd : public CommandObjectParsed, std::string m_class_name; std::string m_funct_name; std::string m_short_help; - bool m_overwrite; + bool m_overwrite = false; ScriptedCommandSynchronicity m_synchronicity = eScriptedCommandSynchronicitySynchronous; }; @@ -1637,8 +1637,9 @@ class CommandObjectCommandsScriptAdd : public CommandObjectParsed, std::string m_cmd_name; CommandObjectMultiword *m_container = nullptr; std::string m_short_help; - bool m_overwrite; - ScriptedCommandSynchronicity m_synchronicity; + bool m_overwrite = false; + ScriptedCommandSynchronicity m_synchronicity = + eScriptedCommandSynchronicitySynchronous; }; // CommandObjectCommandsScriptList From 622ea723ccfdc4495ac3a1283598a8d15f1524a3 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 18 Feb 2022 09:05:00 -0800 Subject: [PATCH 288/748] [dsymutil] Make verification test resilient against output ordering I didn't mean the checks for QUIET-OUTPUT-FAIL, QUIET-INPUT-FAIL and VERBOSE-INPUT-FAIL to have any specific ordering. 
--- llvm/test/tools/dsymutil/X86/verify.test | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/tools/dsymutil/X86/verify.test b/llvm/test/tools/dsymutil/X86/verify.test index 752d79a9871d5..eb8aa045c0f7b 100644 --- a/llvm/test/tools/dsymutil/X86/verify.test +++ b/llvm/test/tools/dsymutil/X86/verify.test @@ -18,9 +18,9 @@ # RUN: not dsymutil -verify-dwarf=bogus -verbose -oso-prepend-path=%p/../Inputs -y %s -o %t 2>&1 | FileCheck %s --check-prefixes=BOGUS # RUN: not dsymutil -verify-dwarf=all -oso-prepend-path=%p/../Inputs -y %s -o %t 2>&1 | FileCheck %s --check-prefixes=QUIET-OUTPUT-FAIL,QUIET-INPUT-FAIL,VERBOSE-INPUT-FAIL -# VERBOSE-INPUT-FAIL: error: Abbreviation declaration contains multiple DW_AT_language attributes. -# QUIET-INPUT-FAIL: warning: input verification failed -# QUIET-OUTPUT-FAIL: error: output verification failed +# VERBOSE-INPUT-FAIL-DAG: error: Abbreviation declaration contains multiple DW_AT_language attributes. +# QUIET-INPUT-FAIL-DAG: warning: input verification failed +# QUIET-OUTPUT-FAIL-DAG: error: output verification failed # QUIET-SUCCESS-NOT: input verification failed # QUIET-SUCCESS-NOT: output verification failed # BOGUS: error: invalid verify type specified: 'bogus' From fd3669c2567302d34d9dd2222ee97204e4e26d4a Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Fri, 18 Feb 2022 12:03:11 -0500 Subject: [PATCH 289/748] [lld-macho] Improve hiding of unnamed_addr symbols Symbols for which `canBeOmittedFromSymbolTable()` is true should be treated as private externs. This diff tries to do that by unsetting the ExportDynamic bit. It seems to mostly work with the FullLTO backend, but with the ThinLTO backend, the `local_unnamed_addr` symbols still fail to be properly hidden. Nonetheless, this is a step in the right direction. I've documented all the remaining differences between our behavior and LD64's in the lto-internalized-unnamed-addr.ll test. 
See also https://discourse.llvm.org/t/mach-o-lto-handling-of-linkonce-odr-unnamed-addr/60015 Reviewed By: #lld-macho, thevinster Differential Revision: https://reviews.llvm.org/D119767 --- lld/MachO/InputFiles.cpp | 1 + .../MachO/lto-internalize-unnamed-addr.ll | 75 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 lld/test/MachO/lto-internalize-unnamed-addr.ll diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 12df200b7347e..ac329536dc426 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -1559,6 +1559,7 @@ static macho::Symbol *createBitcodeSymbol(const lto::InputFile::Symbol &objSym, case GlobalValue::DefaultVisibility: break; } + isPrivateExtern = isPrivateExtern || objSym.canBeOmittedFromSymbolTable(); if (objSym.isCommon()) return symtab->addCommon(name, &file, objSym.getCommonSize(), diff --git a/lld/test/MachO/lto-internalize-unnamed-addr.ll b/lld/test/MachO/lto-internalize-unnamed-addr.ll new file mode 100644 index 0000000000000..e0622d691cdc4 --- /dev/null +++ b/lld/test/MachO/lto-internalize-unnamed-addr.ll @@ -0,0 +1,75 @@ +; REQUIRES: x86 +; RUN: rm -rf %t; split-file %s %t +;; This test covers both FullLTO and ThinLTO code paths because we have observed +;; (unexpected) differences between the two. 
+; RUN: llvm-as %t/test.ll -o %t/test.o +; RUN: llvm-as %t/test2.ll -o %t/test2.o +; RUN: opt -module-summary %t/test.ll -o %t/test.thinlto.o +; RUN: opt -module-summary %t/test2.ll -o %t/test2.thinlto.o + +; RUN: %lld -lSystem %t/test.o %t/test2.o -o %t/test +; RUN: llvm-nm -m %t/test | FileCheck %s --check-prefix=LTO + +; RUN: %lld -lSystem -dylib %t/test.o %t/test2.o -o %t/test.dylib +; RUN: llvm-nm -m %t/test.dylib | FileCheck %s --check-prefix=LTO-DYLIB + +; RUN: %lld -lSystem %t/test.thinlto.o %t/test2.o -o %t/test.thinlto +; RUN: llvm-nm -m %t/test.thinlto | FileCheck %s --check-prefix=THINLTO + +; RUN: %lld -lSystem -dylib %t/test.thinlto.o %t/test2.o -o %t/test.thinlto.dylib +; RUN: llvm-nm -m %t/test.thinlto.dylib | FileCheck %s --check-prefix=THINLTO + +; LTO-DAG: (__DATA,__data) non-external _global_unnamed +; LTO-DAG: (__DATA,__data) non-external _local_unnamed +;; LD64 marks this with (was a private external). IMO both LD64 and LLD should +;; mark all the other internalized symbols with (was a private external). +; LTO-DAG: (__TEXT,__const) non-external _local_unnamed_always_const +; LTO-DAG: (__TEXT,__const) non-external _local_unnamed_const +;; LD64 doesn't internalize this -- it emits it as a weak external -- which I +;; think is a missed optimization on its end. +; LTO-DAG: (__TEXT,__const) non-external _local_unnamed_sometimes_const + +;; The output here is largely identical to LD64's, except that the non-external +;; symbols here are all marked as (was a private external) by LD64. LLD should +;; follow suit. 
+; LTO-DYLIB-DAG: (__DATA,__data) non-external _global_unnamed +; LTO-DYLIB-DAG: (__DATA,__data) weak external _local_unnamed +; LTO-DYLIB-DAG: (__TEXT,__const) non-external _local_unnamed_always_const +; LTO-DYLIB-DAG: (__TEXT,__const) non-external _local_unnamed_const +; LTO-DYLIB-DAG: (__TEXT,__const) weak external _local_unnamed_sometimes_const + +; THINLTO-DAG: (__DATA,__data) non-external (was a private external) _global_unnamed +; THINLTO-DAG: (__DATA,__data) weak external _local_unnamed +;; The next two symbols are rendered as non-external (was a private external) +;; by LD64. This is a missed optimization on LLD's end. +; THINLTO-DAG: (__TEXT,__const) weak external _local_unnamed_always_const +; THINLTO-DAG: (__TEXT,__const) weak external _local_unnamed_const +;; LD64 actually fails to link when the following symbol is included in the test +;; input, instead producing this error: +;; reference to bitcode symbol '_local_unnamed_sometimes_const' which LTO has not compiled in '_used' from /tmp/lto.o for architecture x86_64 +; THINLTO-DAG: (__TEXT,__const) weak external _local_unnamed_sometimes_const + +;--- test.ll +target triple = "x86_64-apple-darwin" +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +@global_unnamed = linkonce_odr unnamed_addr global i8 42 +@local_unnamed_const = linkonce_odr local_unnamed_addr constant i8 42 +@local_unnamed_always_const = linkonce_odr local_unnamed_addr constant i8 42 +@local_unnamed_sometimes_const = linkonce_odr local_unnamed_addr constant i8 42 +@local_unnamed = linkonce_odr local_unnamed_addr global i8 42 +@used = hidden constant [5 x i8*] [i8* @global_unnamed, i8* @local_unnamed, + i8* @local_unnamed_const, i8* @local_unnamed_always_const, + i8* @local_unnamed_sometimes_const] +@llvm.used = appending global [1 x [5 x i8*]*] [[5 x i8*]* @used] + +define void @main() { + ret void +} + +;--- test2.ll +target triple = "x86_64-apple-darwin" +target datalayout = 
"e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +@local_unnamed_always_const = linkonce_odr local_unnamed_addr constant i8 42 +@local_unnamed_sometimes_const = linkonce_odr local_unnamed_addr global i8 42 From 34313583331e5c8cb0d3df28efb6c34c428fd235 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 18 Feb 2022 18:23:51 +0100 Subject: [PATCH 290/748] [libc++] Replace _LIBCPP_INLINE_VISIBILITY with _LIBCPP_HIDE_FROM_ABI in __filesystem/operations.h --- libcxx/include/__filesystem/operations.h | 192 +++++++++++------------ 1 file changed, 96 insertions(+), 96 deletions(-) diff --git a/libcxx/include/__filesystem/operations.h b/libcxx/include/__filesystem/operations.h index 92730ca40a2ba..894c501d4ace1 100644 --- a/libcxx/include/__filesystem/operations.h +++ b/libcxx/include/__filesystem/operations.h @@ -62,41 +62,41 @@ _LIBCPP_FUNC_VIS void __rename(const path& from, const path& to, error_code* ec _LIBCPP_FUNC_VIS void __resize_file(const path& p, uintmax_t size, error_code* ec = nullptr); _LIBCPP_FUNC_VIS path __temp_directory_path(error_code* __ec = nullptr); -inline _LIBCPP_INLINE_VISIBILITY path absolute(const path& __p) { return __absolute(__p); } -inline _LIBCPP_INLINE_VISIBILITY path absolute(const path& __p, error_code& __ec) { return __absolute(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY path canonical(const path& __p) { return __canonical(__p); } -inline _LIBCPP_INLINE_VISIBILITY path canonical(const path& __p, error_code& __ec) { return __canonical(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool copy_file(const path& __from, const path& __to) { return __copy_file(__from, __to, copy_options::none); } -inline _LIBCPP_INLINE_VISIBILITY bool copy_file(const path& __from, const path& __to, error_code& __ec) { return __copy_file(__from, __to, copy_options::none, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool copy_file(const path& __from, const path& __to, copy_options __opt) { return __copy_file(__from, __to, 
__opt); } -inline _LIBCPP_INLINE_VISIBILITY bool copy_file(const path& __from, const path& __to, copy_options __opt, error_code& __ec) { return __copy_file(__from, __to, __opt, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void copy_symlink(const path& __from, const path& __to) { __copy_symlink(__from, __to); } -inline _LIBCPP_INLINE_VISIBILITY void copy_symlink(const path& __from, const path& __to, error_code& __ec) noexcept { __copy_symlink(__from, __to, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void copy(const path& __from, const path& __to) { __copy(__from, __to, copy_options::none); } -inline _LIBCPP_INLINE_VISIBILITY void copy(const path& __from, const path& __to, error_code& __ec) { __copy(__from, __to, copy_options::none, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void copy(const path& __from, const path& __to, copy_options __opt) { __copy(__from, __to, __opt); } -inline _LIBCPP_INLINE_VISIBILITY void copy(const path& __from, const path& __to, copy_options __opt, error_code& __ec) { __copy(__from, __to, __opt, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool create_directories(const path& __p) { return __create_directories(__p); } -inline _LIBCPP_INLINE_VISIBILITY bool create_directories(const path& __p, error_code& __ec) { return __create_directories(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void create_directory_symlink(const path& __target, const path& __link) { __create_directory_symlink(__target, __link); } -inline _LIBCPP_INLINE_VISIBILITY void create_directory_symlink(const path& __target, const path& __link, error_code& __ec) noexcept { __create_directory_symlink(__target, __link, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool create_directory(const path& __p) { return __create_directory(__p); } -inline _LIBCPP_INLINE_VISIBILITY bool create_directory(const path& __p, error_code& __ec) noexcept { return __create_directory(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool create_directory(const path& __p, const path& __attrs) { return 
__create_directory(__p, __attrs); } -inline _LIBCPP_INLINE_VISIBILITY bool create_directory(const path& __p, const path& __attrs, error_code& __ec) noexcept { return __create_directory(__p, __attrs, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void create_hard_link(const path& __target, const path& __link) { __create_hard_link(__target, __link); } -inline _LIBCPP_INLINE_VISIBILITY void create_hard_link(const path& __target, const path& __link, error_code& __ec) noexcept { __create_hard_link(__target, __link, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void create_symlink(const path& __target, const path& __link) { __create_symlink(__target, __link); } -inline _LIBCPP_INLINE_VISIBILITY void create_symlink(const path& __target, const path& __link, error_code& __ec) noexcept { return __create_symlink(__target, __link, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY path current_path() { return __current_path(); } -inline _LIBCPP_INLINE_VISIBILITY path current_path(error_code& __ec) { return __current_path(&__ec); } -inline _LIBCPP_INLINE_VISIBILITY void current_path(const path& __p) { __current_path(__p); } -inline _LIBCPP_INLINE_VISIBILITY void current_path(const path& __p, error_code& __ec) noexcept { __current_path(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool equivalent(const path& __p1, const path& __p2) { return __equivalent(__p1, __p2); } -inline _LIBCPP_INLINE_VISIBILITY bool equivalent(const path& __p1, const path& __p2, error_code& __ec) noexcept { return __equivalent(__p1, __p2, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool status_known(file_status __s) noexcept { return __s.type() != file_type::none; } -inline _LIBCPP_INLINE_VISIBILITY bool exists(file_status __s) noexcept { return status_known(__s) && __s.type() != file_type::not_found; } -inline _LIBCPP_INLINE_VISIBILITY bool exists(const path& __p) { return exists(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI path absolute(const path& __p) { return __absolute(__p); } +inline 
_LIBCPP_HIDE_FROM_ABI path absolute(const path& __p, error_code& __ec) { return __absolute(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI path canonical(const path& __p) { return __canonical(__p); } +inline _LIBCPP_HIDE_FROM_ABI path canonical(const path& __p, error_code& __ec) { return __canonical(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool copy_file(const path& __from, const path& __to) { return __copy_file(__from, __to, copy_options::none); } +inline _LIBCPP_HIDE_FROM_ABI bool copy_file(const path& __from, const path& __to, error_code& __ec) { return __copy_file(__from, __to, copy_options::none, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool copy_file(const path& __from, const path& __to, copy_options __opt) { return __copy_file(__from, __to, __opt); } +inline _LIBCPP_HIDE_FROM_ABI bool copy_file(const path& __from, const path& __to, copy_options __opt, error_code& __ec) { return __copy_file(__from, __to, __opt, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void copy_symlink(const path& __from, const path& __to) { __copy_symlink(__from, __to); } +inline _LIBCPP_HIDE_FROM_ABI void copy_symlink(const path& __from, const path& __to, error_code& __ec) noexcept { __copy_symlink(__from, __to, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void copy(const path& __from, const path& __to) { __copy(__from, __to, copy_options::none); } +inline _LIBCPP_HIDE_FROM_ABI void copy(const path& __from, const path& __to, error_code& __ec) { __copy(__from, __to, copy_options::none, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void copy(const path& __from, const path& __to, copy_options __opt) { __copy(__from, __to, __opt); } +inline _LIBCPP_HIDE_FROM_ABI void copy(const path& __from, const path& __to, copy_options __opt, error_code& __ec) { __copy(__from, __to, __opt, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool create_directories(const path& __p) { return __create_directories(__p); } +inline _LIBCPP_HIDE_FROM_ABI bool create_directories(const path& __p, error_code& __ec) { return 
__create_directories(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void create_directory_symlink(const path& __target, const path& __link) { __create_directory_symlink(__target, __link); } +inline _LIBCPP_HIDE_FROM_ABI void create_directory_symlink(const path& __target, const path& __link, error_code& __ec) noexcept { __create_directory_symlink(__target, __link, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool create_directory(const path& __p) { return __create_directory(__p); } +inline _LIBCPP_HIDE_FROM_ABI bool create_directory(const path& __p, error_code& __ec) noexcept { return __create_directory(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool create_directory(const path& __p, const path& __attrs) { return __create_directory(__p, __attrs); } +inline _LIBCPP_HIDE_FROM_ABI bool create_directory(const path& __p, const path& __attrs, error_code& __ec) noexcept { return __create_directory(__p, __attrs, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void create_hard_link(const path& __target, const path& __link) { __create_hard_link(__target, __link); } +inline _LIBCPP_HIDE_FROM_ABI void create_hard_link(const path& __target, const path& __link, error_code& __ec) noexcept { __create_hard_link(__target, __link, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void create_symlink(const path& __target, const path& __link) { __create_symlink(__target, __link); } +inline _LIBCPP_HIDE_FROM_ABI void create_symlink(const path& __target, const path& __link, error_code& __ec) noexcept { return __create_symlink(__target, __link, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI path current_path() { return __current_path(); } +inline _LIBCPP_HIDE_FROM_ABI path current_path(error_code& __ec) { return __current_path(&__ec); } +inline _LIBCPP_HIDE_FROM_ABI void current_path(const path& __p) { __current_path(__p); } +inline _LIBCPP_HIDE_FROM_ABI void current_path(const path& __p, error_code& __ec) noexcept { __current_path(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool equivalent(const path& __p1, const 
path& __p2) { return __equivalent(__p1, __p2); } +inline _LIBCPP_HIDE_FROM_ABI bool equivalent(const path& __p1, const path& __p2, error_code& __ec) noexcept { return __equivalent(__p1, __p2, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool status_known(file_status __s) noexcept { return __s.type() != file_type::none; } +inline _LIBCPP_HIDE_FROM_ABI bool exists(file_status __s) noexcept { return status_known(__s) && __s.type() != file_type::not_found; } +inline _LIBCPP_HIDE_FROM_ABI bool exists(const path& __p) { return exists(__status(__p)); } inline _LIBCPP_INLINE_VISIBILITY bool exists(const path& __p, error_code& __ec) noexcept { auto __s = __status(__p, &__ec); @@ -105,45 +105,45 @@ inline _LIBCPP_INLINE_VISIBILITY bool exists(const path& __p, error_code& __ec) return exists(__s); } -inline _LIBCPP_INLINE_VISIBILITY uintmax_t file_size(const path& __p) { return __file_size(__p); } -inline _LIBCPP_INLINE_VISIBILITY uintmax_t file_size(const path& __p, error_code& __ec) noexcept { return __file_size(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY uintmax_t hard_link_count(const path& __p) { return __hard_link_count(__p); } -inline _LIBCPP_INLINE_VISIBILITY uintmax_t hard_link_count(const path& __p, error_code& __ec) noexcept { return __hard_link_count(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool is_block_file(file_status __s) noexcept { return __s.type() == file_type::block; } -inline _LIBCPP_INLINE_VISIBILITY bool is_block_file(const path& __p) { return is_block_file(__status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_block_file(const path& __p, error_code& __ec) noexcept { return is_block_file(__status(__p, &__ec)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_character_file(file_status __s) noexcept { return __s.type() == file_type::character; } -inline _LIBCPP_INLINE_VISIBILITY bool is_character_file(const path& __p) { return is_character_file(__status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_character_file(const path& __p, error_code& 
__ec) noexcept { return is_character_file(__status(__p, &__ec)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_directory(file_status __s) noexcept { return __s.type() == file_type::directory; } -inline _LIBCPP_INLINE_VISIBILITY bool is_directory(const path& __p) { return is_directory(__status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_directory(const path& __p, error_code& __ec) noexcept { return is_directory(__status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI uintmax_t file_size(const path& __p) { return __file_size(__p); } +inline _LIBCPP_HIDE_FROM_ABI uintmax_t file_size(const path& __p, error_code& __ec) noexcept { return __file_size(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI uintmax_t hard_link_count(const path& __p) { return __hard_link_count(__p); } +inline _LIBCPP_HIDE_FROM_ABI uintmax_t hard_link_count(const path& __p, error_code& __ec) noexcept { return __hard_link_count(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool is_block_file(file_status __s) noexcept { return __s.type() == file_type::block; } +inline _LIBCPP_HIDE_FROM_ABI bool is_block_file(const path& __p) { return is_block_file(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_block_file(const path& __p, error_code& __ec) noexcept { return is_block_file(__status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_character_file(file_status __s) noexcept { return __s.type() == file_type::character; } +inline _LIBCPP_HIDE_FROM_ABI bool is_character_file(const path& __p) { return is_character_file(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_character_file(const path& __p, error_code& __ec) noexcept { return is_character_file(__status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_directory(file_status __s) noexcept { return __s.type() == file_type::directory; } +inline _LIBCPP_HIDE_FROM_ABI bool is_directory(const path& __p) { return is_directory(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_directory(const path& __p, error_code& __ec) noexcept { 
return is_directory(__status(__p, &__ec)); } _LIBCPP_FUNC_VIS bool __fs_is_empty(const path& p, error_code* ec = nullptr); -inline _LIBCPP_INLINE_VISIBILITY bool is_empty(const path& __p) { return __fs_is_empty(__p); } -inline _LIBCPP_INLINE_VISIBILITY bool is_empty(const path& __p, error_code& __ec) { return __fs_is_empty(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool is_fifo(file_status __s) noexcept { return __s.type() == file_type::fifo; } -inline _LIBCPP_INLINE_VISIBILITY bool is_fifo(const path& __p) { return is_fifo(__status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_fifo(const path& __p, error_code& __ec) noexcept { return is_fifo(__status(__p, &__ec)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_regular_file(file_status __s) noexcept { return __s.type() == file_type::regular; } -inline _LIBCPP_INLINE_VISIBILITY bool is_regular_file(const path& __p) { return is_regular_file(__status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_regular_file(const path& __p, error_code& __ec) noexcept { return is_regular_file(__status(__p, &__ec)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_symlink(file_status __s) noexcept { return __s.type() == file_type::symlink; } -inline _LIBCPP_INLINE_VISIBILITY bool is_symlink(const path& __p) { return is_symlink(__symlink_status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_symlink(const path& __p, error_code& __ec) noexcept { return is_symlink(__symlink_status(__p, &__ec)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_other(file_status __s) noexcept { return exists(__s) && !is_regular_file(__s) && !is_directory(__s) && !is_symlink(__s); } -inline _LIBCPP_INLINE_VISIBILITY bool is_other(const path& __p) { return is_other(__status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_other(const path& __p, error_code& __ec) noexcept { return is_other(__status(__p, &__ec)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_socket(file_status __s) noexcept { return __s.type() == file_type::socket; } -inline 
_LIBCPP_INLINE_VISIBILITY bool is_socket(const path& __p) { return is_socket(__status(__p)); } -inline _LIBCPP_INLINE_VISIBILITY bool is_socket(const path& __p, error_code& __ec) noexcept { return is_socket(__status(__p, &__ec)); } -inline _LIBCPP_INLINE_VISIBILITY file_time_type last_write_time(const path& __p) { return __last_write_time(__p); } -inline _LIBCPP_INLINE_VISIBILITY file_time_type last_write_time(const path& __p, error_code& __ec) noexcept { return __last_write_time(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void last_write_time(const path& __p, file_time_type __t) { __last_write_time(__p, __t); } -inline _LIBCPP_INLINE_VISIBILITY void last_write_time(const path& __p, file_time_type __t, error_code& __ec) noexcept { __last_write_time(__p, __t, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool is_empty(const path& __p) { return __fs_is_empty(__p); } +inline _LIBCPP_HIDE_FROM_ABI bool is_empty(const path& __p, error_code& __ec) { return __fs_is_empty(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI bool is_fifo(file_status __s) noexcept { return __s.type() == file_type::fifo; } +inline _LIBCPP_HIDE_FROM_ABI bool is_fifo(const path& __p) { return is_fifo(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_fifo(const path& __p, error_code& __ec) noexcept { return is_fifo(__status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_regular_file(file_status __s) noexcept { return __s.type() == file_type::regular; } +inline _LIBCPP_HIDE_FROM_ABI bool is_regular_file(const path& __p) { return is_regular_file(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_regular_file(const path& __p, error_code& __ec) noexcept { return is_regular_file(__status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_symlink(file_status __s) noexcept { return __s.type() == file_type::symlink; } +inline _LIBCPP_HIDE_FROM_ABI bool is_symlink(const path& __p) { return is_symlink(__symlink_status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_symlink(const path& __p, 
error_code& __ec) noexcept { return is_symlink(__symlink_status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_other(file_status __s) noexcept { return exists(__s) && !is_regular_file(__s) && !is_directory(__s) && !is_symlink(__s); } +inline _LIBCPP_HIDE_FROM_ABI bool is_other(const path& __p) { return is_other(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_other(const path& __p, error_code& __ec) noexcept { return is_other(__status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_socket(file_status __s) noexcept { return __s.type() == file_type::socket; } +inline _LIBCPP_HIDE_FROM_ABI bool is_socket(const path& __p) { return is_socket(__status(__p)); } +inline _LIBCPP_HIDE_FROM_ABI bool is_socket(const path& __p, error_code& __ec) noexcept { return is_socket(__status(__p, &__ec)); } +inline _LIBCPP_HIDE_FROM_ABI file_time_type last_write_time(const path& __p) { return __last_write_time(__p); } +inline _LIBCPP_HIDE_FROM_ABI file_time_type last_write_time(const path& __p, error_code& __ec) noexcept { return __last_write_time(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void last_write_time(const path& __p, file_time_type __t) { __last_write_time(__p, __t); } +inline _LIBCPP_HIDE_FROM_ABI void last_write_time(const path& __p, file_time_type __t, error_code& __ec) noexcept { __last_write_time(__p, __t, &__ec); } _LIBCPP_FUNC_VIS void __permissions(const path&, perms, perm_options, error_code* = nullptr); -inline _LIBCPP_INLINE_VISIBILITY void permissions(const path& __p, perms __prms, perm_options __opts = perm_options::replace) { __permissions(__p, __prms, __opts); } -inline _LIBCPP_INLINE_VISIBILITY void permissions(const path& __p, perms __prms, error_code& __ec) noexcept { __permissions(__p, __prms, perm_options::replace, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void permissions(const path& __p, perms __prms, perm_options __opts, error_code& __ec) { __permissions(__p, __prms, __opts, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void 
permissions(const path& __p, perms __prms, perm_options __opts = perm_options::replace) { __permissions(__p, __prms, __opts); } +inline _LIBCPP_HIDE_FROM_ABI void permissions(const path& __p, perms __prms, error_code& __ec) noexcept { __permissions(__p, __prms, perm_options::replace, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void permissions(const path& __p, perms __prms, perm_options __opts, error_code& __ec) { __permissions(__p, __prms, __opts, &__ec); } inline _LIBCPP_INLINE_VISIBILITY path proximate(const path& __p, const path& __base, error_code& __ec) { path __tmp = __weakly_canonical(__p, &__ec); @@ -155,10 +155,10 @@ inline _LIBCPP_INLINE_VISIBILITY path proximate(const path& __p, const path& __b return __tmp.lexically_proximate(__tmp_base); } -inline _LIBCPP_INLINE_VISIBILITY path proximate(const path& __p, error_code& __ec) { return proximate(__p, current_path(), __ec); } -inline _LIBCPP_INLINE_VISIBILITY path proximate(const path& __p, const path& __base = current_path()) { return __weakly_canonical(__p).lexically_proximate(__weakly_canonical(__base)); } -inline _LIBCPP_INLINE_VISIBILITY path read_symlink(const path& __p) { return __read_symlink(__p); } -inline _LIBCPP_INLINE_VISIBILITY path read_symlink(const path& __p, error_code& __ec) { return __read_symlink(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI path proximate(const path& __p, error_code& __ec) { return proximate(__p, current_path(), __ec); } +inline _LIBCPP_HIDE_FROM_ABI path proximate(const path& __p, const path& __base = current_path()) { return __weakly_canonical(__p).lexically_proximate(__weakly_canonical(__base)); } +inline _LIBCPP_HIDE_FROM_ABI path read_symlink(const path& __p) { return __read_symlink(__p); } +inline _LIBCPP_HIDE_FROM_ABI path read_symlink(const path& __p, error_code& __ec) { return __read_symlink(__p, &__ec); } inline _LIBCPP_INLINE_VISIBILITY path relative(const path& __p, const path& __base, error_code& __ec) { path __tmp = __weakly_canonical(__p, &__ec); @@ -170,27 
+170,27 @@ inline _LIBCPP_INLINE_VISIBILITY path relative(const path& __p, const path& __ba return __tmp.lexically_relative(__tmpbase); } -inline _LIBCPP_INLINE_VISIBILITY path relative(const path& __p, error_code& __ec) { return relative(__p, current_path(), __ec); } -inline _LIBCPP_INLINE_VISIBILITY path relative(const path& __p, const path& __base = current_path()) { return __weakly_canonical(__p).lexically_relative(__weakly_canonical(__base)); } -inline _LIBCPP_INLINE_VISIBILITY uintmax_t remove_all(const path& __p) { return __remove_all(__p); } -inline _LIBCPP_INLINE_VISIBILITY uintmax_t remove_all(const path& __p, error_code& __ec) { return __remove_all(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY bool remove(const path& __p) { return __remove(__p); } -inline _LIBCPP_INLINE_VISIBILITY bool remove(const path& __p, error_code& __ec) noexcept { return __remove(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void rename(const path& __from, const path& __to) { return __rename(__from, __to); } -inline _LIBCPP_INLINE_VISIBILITY void rename(const path& __from, const path& __to, error_code& __ec) noexcept { return __rename(__from, __to, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY void resize_file(const path& __p, uintmax_t __ns) { return __resize_file(__p, __ns); } -inline _LIBCPP_INLINE_VISIBILITY void resize_file(const path& __p, uintmax_t __ns, error_code& __ec) noexcept { return __resize_file(__p, __ns, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI path relative(const path& __p, error_code& __ec) { return relative(__p, current_path(), __ec); } +inline _LIBCPP_HIDE_FROM_ABI path relative(const path& __p, const path& __base = current_path()) { return __weakly_canonical(__p).lexically_relative(__weakly_canonical(__base)); } +inline _LIBCPP_HIDE_FROM_ABI uintmax_t remove_all(const path& __p) { return __remove_all(__p); } +inline _LIBCPP_HIDE_FROM_ABI uintmax_t remove_all(const path& __p, error_code& __ec) { return __remove_all(__p, &__ec); } +inline 
_LIBCPP_HIDE_FROM_ABI bool remove(const path& __p) { return __remove(__p); } +inline _LIBCPP_HIDE_FROM_ABI bool remove(const path& __p, error_code& __ec) noexcept { return __remove(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void rename(const path& __from, const path& __to) { return __rename(__from, __to); } +inline _LIBCPP_HIDE_FROM_ABI void rename(const path& __from, const path& __to, error_code& __ec) noexcept { return __rename(__from, __to, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI void resize_file(const path& __p, uintmax_t __ns) { return __resize_file(__p, __ns); } +inline _LIBCPP_HIDE_FROM_ABI void resize_file(const path& __p, uintmax_t __ns, error_code& __ec) noexcept { return __resize_file(__p, __ns, &__ec); } _LIBCPP_FUNC_VIS space_info __space(const path&, error_code* __ec = nullptr); -inline _LIBCPP_INLINE_VISIBILITY space_info space(const path& __p) { return __space(__p); } -inline _LIBCPP_INLINE_VISIBILITY space_info space(const path& __p, error_code& __ec) noexcept { return __space(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY file_status status(const path& __p) { return __status(__p); } -inline _LIBCPP_INLINE_VISIBILITY file_status status(const path& __p, error_code& __ec) noexcept { return __status(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY file_status symlink_status(const path& __p) { return __symlink_status(__p); } -inline _LIBCPP_INLINE_VISIBILITY file_status symlink_status(const path& __p, error_code& __ec) noexcept { return __symlink_status(__p, &__ec); } -inline _LIBCPP_INLINE_VISIBILITY path temp_directory_path() { return __temp_directory_path(); } -inline _LIBCPP_INLINE_VISIBILITY path temp_directory_path(error_code& __ec) { return __temp_directory_path(&__ec); } -inline _LIBCPP_INLINE_VISIBILITY path weakly_canonical(path const& __p) { return __weakly_canonical(__p); } -inline _LIBCPP_INLINE_VISIBILITY path weakly_canonical(path const& __p, error_code& __ec) { return __weakly_canonical(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI 
space_info space(const path& __p) { return __space(__p); } +inline _LIBCPP_HIDE_FROM_ABI space_info space(const path& __p, error_code& __ec) noexcept { return __space(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI file_status status(const path& __p) { return __status(__p); } +inline _LIBCPP_HIDE_FROM_ABI file_status status(const path& __p, error_code& __ec) noexcept { return __status(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI file_status symlink_status(const path& __p) { return __symlink_status(__p); } +inline _LIBCPP_HIDE_FROM_ABI file_status symlink_status(const path& __p, error_code& __ec) noexcept { return __symlink_status(__p, &__ec); } +inline _LIBCPP_HIDE_FROM_ABI path temp_directory_path() { return __temp_directory_path(); } +inline _LIBCPP_HIDE_FROM_ABI path temp_directory_path(error_code& __ec) { return __temp_directory_path(&__ec); } +inline _LIBCPP_HIDE_FROM_ABI path weakly_canonical(path const& __p) { return __weakly_canonical(__p); } +inline _LIBCPP_HIDE_FROM_ABI path weakly_canonical(path const& __p, error_code& __ec) { return __weakly_canonical(__p, &__ec); } _LIBCPP_AVAILABILITY_FILESYSTEM_POP From 74036dbafd3d742e86464043f3e2f4d52bf79f1e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 17:16:48 +0000 Subject: [PATCH 291/748] Fix Wdocumentation unknown parameter warning --- clang/lib/Sema/SemaTemplateDeduction.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 22dd395d99439..a53d83ea700b6 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -533,9 +533,9 @@ DeduceTemplateArguments(Sema &S, /// /// \param TemplateParams the template parameters that we are deducing /// -/// \param Param the parameter type +/// \param P the parameter type /// -/// \param Arg the argument type +/// \param A the argument type /// /// \param Info information about the template 
argument deduction itself /// @@ -1199,11 +1199,11 @@ static CXXRecordDecl *getCanonicalRD(QualType T) { /// /// \param S the semantic analysis object within which we are deducing. /// -/// \param RecordT the top level record object we are deducing against. +/// \param RD the top level record object we are deducing against. /// /// \param TemplateParams the template parameters that we are deducing. /// -/// \param SpecParam the template specialization parameter type. +/// \param P the template specialization parameter type. /// /// \param Info information about the template argument deduction itself. /// @@ -1315,9 +1315,9 @@ DeduceTemplateBases(Sema &S, const CXXRecordDecl *RD, /// /// \param TemplateParams the template parameters that we are deducing /// -/// \param ParamIn the parameter type +/// \param P the parameter type /// -/// \param ArgIn the argument type +/// \param A the argument type /// /// \param Info information about the template argument deduction itself /// From 9415fbbbcf73ab90692fafdac5bd6e302d07ba4b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 17:27:32 +0000 Subject: [PATCH 292/748] [clangd] getHover - pass FormatStyle argument by const reference Reported by coverity --- clang-tools-extra/clangd/Hover.cpp | 2 +- clang-tools-extra/clangd/Hover.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index dda5ad36e9b89..8b22018411404 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -963,7 +963,7 @@ void maybeAddCalleeArgInfo(const SelectionTree::Node *N, HoverInfo &HI, } // namespace llvm::Optional getHover(ParsedAST &AST, Position Pos, - format::FormatStyle Style, + const format::FormatStyle &Style, const SymbolIndex *Index) { PrintingPolicy PP = getPrintingPolicy(AST.getASTContext().getPrintingPolicy()); diff --git a/clang-tools-extra/clangd/Hover.h b/clang-tools-extra/clangd/Hover.h index 
7478ede88a46c..1a46ff5ad772b 100644 --- a/clang-tools-extra/clangd/Hover.h +++ b/clang-tools-extra/clangd/Hover.h @@ -137,7 +137,7 @@ inline bool operator==(const HoverInfo::Param &LHS, /// Get the hover information when hovering at \p Pos. llvm::Optional getHover(ParsedAST &AST, Position Pos, - format::FormatStyle Style, + const format::FormatStyle &Style, const SymbolIndex *Index); } // namespace clangd From be1ffda0a5b9a0fee866ec01cc3a7c3cb0cd712e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 17:33:11 +0000 Subject: [PATCH 293/748] [InstCombine] visitCallInst - pull out repeated bswap scalar type bitwidth. NFC. --- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index eecd583740c30..0abe34da2c262 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1295,9 +1295,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { KnownBits Known = computeKnownBits(IIOperand, 0, II); uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8); uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8); + unsigned BW = Known.getBitWidth(); // bswap(x) -> shift(x) if x has exactly one "active byte" - if (Known.getBitWidth() - LZ - TZ == 8) { + if (BW - LZ - TZ == 8) { assert(LZ != TZ && "active byte cannot be in the middle"); if (LZ > TZ) // -> shl(x) if the "active byte" is in the low part of x return BinaryOperator::CreateNUWShl( @@ -1309,8 +1310,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { - unsigned C = X->getType()->getScalarSizeInBits() - - IIOperand->getType()->getScalarSizeInBits(); + unsigned C = X->getType()->getScalarSizeInBits() - BW; Value *CV = 
ConstantInt::get(X->getType(), C); Value *V = Builder.CreateLShr(X, CV); return new TruncInst(V, IIOperand->getType()); From bfdf28f9638c3904f9bcbb5747f2ec9b16f71535 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 18 Feb 2022 10:41:41 -0800 Subject: [PATCH 294/748] [docs][NewPM] Remove buildDefaultAAPipeline() in example With D113210 we're already using the default AA pipeline by default. --- llvm/docs/NewPassManager.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/docs/NewPassManager.rst b/llvm/docs/NewPassManager.rst index 9074603e4aa40..74ed077935a76 100644 --- a/llvm/docs/NewPassManager.rst +++ b/llvm/docs/NewPassManager.rst @@ -28,10 +28,6 @@ Just Tell Me How To Run The Default Optimization Pipeline With The New Pass Mana // options. PassBuilder PB; - // Make sure to use the default alias analysis pipeline, otherwise we'll end - // up only using a subset of the available analyses. - FAM.registerPass([&] { return PB.buildDefaultAAPipeline(); }); - // Register all the basic analyses with the managers. 
PB.registerModuleAnalyses(MAM); PB.registerCGSCCAnalyses(CGAM); From 3ad0bdae8f0b2dbf2ddc8505ccebc921b39c1af4 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 10:57:08 -0800 Subject: [PATCH 295/748] [SLP] Address post commit comment from 2e50760 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 024890e5845ef..4a731107f46de 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7930,7 +7930,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { BS->verify(); #endif -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) // Check that all schedulable entities got scheduled for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [&](ScheduleData *SD) { From eae62b226630377e7b88ed019fa3b6c7cec12fd8 Mon Sep 17 00:00:00 2001 From: Groverkss Date: Sat, 19 Feb 2022 00:24:18 +0530 Subject: [PATCH 296/748] [mlir][Presburger] Introduce Domain and Range identifiers in PresburgerSpace This patch introducing seperating dimensions into two types: Domain and Range. This allows building relations over PresburgerSpace. This patch is part of a series of patches to introduce relations in Presburger library. 
Reviewed By: arjunp Differential Revision: https://reviews.llvm.org/D119709 --- .../Analysis/Presburger/PresburgerSpace.h | 95 +++++++++++++++--- .../Analysis/Presburger/IntegerPolyhedron.cpp | 13 ++- .../Analysis/Presburger/PresburgerSpace.cpp | 99 +++++++++++++++---- .../Affine/Analysis/AffineStructures.cpp | 10 +- .../Analysis/Presburger/CMakeLists.txt | 1 + .../Presburger/IntegerPolyhedronTest.cpp | 2 +- .../Presburger/PresburgerSpaceTest.cpp | 50 ++++++++++ 7 files changed, 224 insertions(+), 46 deletions(-) create mode 100644 mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp diff --git a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h index b97c8f67b28af..a7a93980d4aed 100644 --- a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h +++ b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h @@ -15,6 +15,7 @@ #define MLIR_ANALYSIS_PRESBURGER_PRESBURGERSPACE_H #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" namespace mlir { @@ -31,23 +32,45 @@ class PresburgerLocalSpace; /// /// Local: Local identifiers correspond to existentially quantified variables. /// -/// PresburgerSpace only supports identifiers of kind Dimension and Symbol. +/// Dimension identifiers are further divided into Domain and Range identifiers +/// to support building relations. +/// +/// Spaces with distinction between domain and range identifiers should use +/// IdKind::Domain and IdKind::Range to refer to domain and range identifiers. +/// +/// Spaces with no distinction between domain and range identifiers should use +/// IdKind::SetDim to refer to dimension identifiers. +/// +/// PresburgerSpace does not support identifiers of kind Local. See +/// PresburgerLocalSpace for an extension that supports Local ids. class PresburgerSpace { friend PresburgerLocalSpace; public: - /// Kind of identifier (column). - enum IdKind { Dimension, Symbol, Local }; + /// Kind of identifier. 
Implementation wise SetDims are treated as Range + /// ids, and spaces with no distinction between dimension ids are treated + /// as relations with zero domain ids. + enum IdKind { Symbol, Local, Domain, Range, SetDim = Range }; - PresburgerSpace(unsigned numDims, unsigned numSymbols) - : numDims(numDims), numSymbols(numSymbols), numLocals(0) {} + static PresburgerSpace getRelationSpace(unsigned numDomain, unsigned numRange, + unsigned numSymbols); + + static PresburgerSpace getSetSpace(unsigned numDims, unsigned numSymbols); virtual ~PresburgerSpace() = default; - unsigned getNumIds() const { return numDims + numSymbols + numLocals; } - unsigned getNumDimIds() const { return numDims; } + unsigned getNumDomainIds() const { return numDomain; } + unsigned getNumRangeIds() const { return numRange; } unsigned getNumSymbolIds() const { return numSymbols; } - unsigned getNumDimAndSymbolIds() const { return numDims + numSymbols; } + unsigned getNumSetDimIds() const { return numRange; } + + unsigned getNumDimIds() const { return numDomain + numRange; } + unsigned getNumDimAndSymbolIds() const { + return numDomain + numRange + numSymbols; + } + unsigned getNumIds() const { + return numDomain + numRange + numSymbols + numLocals; + } /// Get the number of ids of the specified kind. unsigned getNumIdKind(IdKind kind) const; @@ -78,12 +101,36 @@ class PresburgerSpace { /// split become dimensions. void setDimSymbolSeparation(unsigned newSymbolCount); + void print(llvm::raw_ostream &os) const; + void dump() const; + +protected: + /// Space constructor for Relation space type. + PresburgerSpace(unsigned numDomain, unsigned numRange, unsigned numSymbols) + : PresburgerSpace(Relation, numDomain, numRange, numSymbols, + /*numLocals=*/0) {} + + /// Space constructor for Set space type. 
+ PresburgerSpace(unsigned numDims, unsigned numSymbols) + : PresburgerSpace(Set, /*numDomain=*/0, numDims, numSymbols, + /*numLocals=*/0) {} + private: - PresburgerSpace(unsigned numDims, unsigned numSymbols, unsigned numLocals) - : numDims(numDims), numSymbols(numSymbols), numLocals(numLocals) {} + /// Kind of space. + enum SpaceKind { Set, Relation }; + + PresburgerSpace(SpaceKind spaceKind, unsigned numDomain, unsigned numRange, + unsigned numSymbols, unsigned numLocals) + : spaceKind(spaceKind), numDomain(numDomain), numRange(numRange), + numSymbols(numSymbols), numLocals(numLocals) {} - /// Number of identifiers corresponding to real dimensions. - unsigned numDims; + SpaceKind spaceKind; + + // Number of identifiers corresponding to domain identifiers. + unsigned numDomain; + + // Number of identifiers corresponding to range identifiers. + unsigned numRange; /// Number of identifiers corresponding to symbols (unknown but constant for /// analysis). @@ -96,9 +143,13 @@ class PresburgerSpace { /// Extension of PresburgerSpace supporting Local identifiers. class PresburgerLocalSpace : public PresburgerSpace { public: - PresburgerLocalSpace(unsigned numDims, unsigned numSymbols, - unsigned numLocals) - : PresburgerSpace(numDims, numSymbols, numLocals) {} + static PresburgerLocalSpace getRelationSpace(unsigned numDomain, + unsigned numRange, + unsigned numSymbols, + unsigned numLocals); + + static PresburgerLocalSpace getSetSpace(unsigned numDims, unsigned numSymbols, + unsigned numLocals); unsigned getNumLocalIds() const { return numLocals; } @@ -110,6 +161,20 @@ class PresburgerLocalSpace : public PresburgerSpace { /// Removes identifiers in the column range [idStart, idLimit). void removeIdRange(unsigned idStart, unsigned idLimit) override; + + void print(llvm::raw_ostream &os) const; + void dump() const; + +protected: + /// Local Space constructor for Relation space type. 
+ PresburgerLocalSpace(unsigned numDomain, unsigned numRange, + unsigned numSymbols, unsigned numLocals) + : PresburgerSpace(Relation, numDomain, numRange, numSymbols, numLocals) {} + + /// Local Space constructor for Set space type. + PresburgerLocalSpace(unsigned numDims, unsigned numSymbols, + unsigned numLocals) + : PresburgerSpace(Set, /*numDomain=*/0, numDims, numSymbols, numLocals) {} }; } // namespace mlir diff --git a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp index 60a1361ec81aa..d7d1b47d3b09b 100644 --- a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp @@ -93,7 +93,7 @@ IntegerPolyhedron::getRationalLexMin() const { } unsigned IntegerPolyhedron::insertDimId(unsigned pos, unsigned num) { - return insertId(IdKind::Dimension, pos, num); + return insertId(IdKind::SetDim, pos, num); } unsigned IntegerPolyhedron::insertSymbolId(unsigned pos, unsigned num) { @@ -107,16 +107,15 @@ unsigned IntegerPolyhedron::insertLocalId(unsigned pos, unsigned num) { unsigned IntegerPolyhedron::insertId(IdKind kind, unsigned pos, unsigned num) { assert(pos <= getNumIdKind(kind)); - unsigned absolutePos = getIdKindOffset(kind) + pos; - inequalities.insertColumns(absolutePos, num); - equalities.insertColumns(absolutePos, num); - - return PresburgerLocalSpace::insertId(kind, pos, num); + unsigned insertPos = PresburgerLocalSpace::insertId(kind, pos, num); + inequalities.insertColumns(insertPos, num); + equalities.insertColumns(insertPos, num); + return insertPos; } unsigned IntegerPolyhedron::appendDimId(unsigned num) { unsigned pos = getNumDimIds(); - insertId(IdKind::Dimension, pos, num); + insertId(IdKind::SetDim, pos, num); return pos; } diff --git a/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp b/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp index 8e63eb24717ae..b65fc2214dbe7 100644 --- a/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp +++ 
b/mlir/lib/Analysis/Presburger/PresburgerSpace.cpp @@ -12,24 +12,56 @@ using namespace mlir; +PresburgerSpace PresburgerSpace::getRelationSpace(unsigned numDomain, + unsigned numRange, + unsigned numSymbols) { + return PresburgerSpace(numDomain, numRange, numSymbols); +} + +PresburgerSpace PresburgerSpace::getSetSpace(unsigned numDims, + unsigned numSymbols) { + return PresburgerSpace(numDims, numSymbols); +} + +PresburgerLocalSpace +PresburgerLocalSpace::getRelationSpace(unsigned numDomain, unsigned numRange, + unsigned numSymbols, + unsigned numLocals) { + return PresburgerLocalSpace(numDomain, numRange, numSymbols, numLocals); +} + +PresburgerLocalSpace PresburgerLocalSpace::getSetSpace(unsigned numDims, + unsigned numSymbols, + unsigned numLocals) { + return PresburgerLocalSpace(numDims, numSymbols, numLocals); +} + unsigned PresburgerSpace::getNumIdKind(IdKind kind) const { - if (kind == IdKind::Dimension) - return getNumDimIds(); + if (kind == IdKind::Domain) { + assert(spaceKind == Relation && "IdKind::Domain is not supported in Set."); + return getNumDomainIds(); + } + if (kind == IdKind::Range) + return getNumRangeIds(); if (kind == IdKind::Symbol) return getNumSymbolIds(); if (kind == IdKind::Local) return numLocals; - llvm_unreachable("IdKind does not exit!"); + llvm_unreachable("IdKind does not exist!"); } unsigned PresburgerSpace::getIdKindOffset(IdKind kind) const { - if (kind == IdKind::Dimension) + if (kind == IdKind::Domain) { + assert(spaceKind == Relation && "IdKind::Domain is not supported in Set."); return 0; + } + if (kind == IdKind::Range) + return getNumDomainIds(); if (kind == IdKind::Symbol) return getNumDimIds(); if (kind == IdKind::Local) return getNumDimAndSymbolIds(); - llvm_unreachable("IdKind does not exit!"); + llvm_unreachable("IdKind does not exist!"); } unsigned PresburgerSpace::getIdKindEnd(IdKind kind) const { @@ -56,13 +88,16 @@ unsigned PresburgerSpace::insertId(IdKind kind, unsigned pos, unsigned num) { unsigned absolutePos 
= getIdKindOffset(kind) + pos; - if (kind == IdKind::Dimension) - numDims += num; - else if (kind == IdKind::Symbol) + if (kind == IdKind::Domain) { + assert(spaceKind == Relation && "IdKind::Domain is not supported in Set."); + numDomain += num; + } else if (kind == IdKind::Range) { + numRange += num; + } else if (kind == IdKind::Symbol) { numSymbols += num; - else - llvm_unreachable( - "PresburgerSpace only supports Dimensions and Symbol identifiers!"); + } else { + llvm_unreachable("PresburgerSpace does not support local identifiers!"); + } return absolutePos; } @@ -76,13 +111,17 @@ void PresburgerSpace::removeIdRange(unsigned idStart, unsigned idLimit) { // We are going to be removing one or more identifiers from the range. assert(idStart < getNumIds() && "invalid idStart position"); - // Update members numDims, numSymbols and numIds. - unsigned numDimsEliminated = - getIdKindOverlap(IdKind::Dimension, idStart, idLimit); + // Update members numDomain, numRange, numSymbols and numIds. + unsigned numDomainEliminated = 0; + if (spaceKind == Relation) + numDomainEliminated = getIdKindOverlap(IdKind::Domain, idStart, idLimit); + unsigned numRangeEliminated = + getIdKindOverlap(IdKind::Range, idStart, idLimit); unsigned numSymbolsEliminated = getIdKindOverlap(IdKind::Symbol, idStart, idLimit); - numDims -= numDimsEliminated; + numDomain -= numDomainEliminated; + numRange -= numRangeEliminated; numSymbols -= numSymbolsEliminated; } @@ -108,8 +147,7 @@ void PresburgerLocalSpace::removeIdRange(unsigned idStart, unsigned idLimit) { getIdKindOverlap(IdKind::Local, idStart, idLimit); // Update space parameters. - PresburgerSpace::removeIdRange( - idStart, std::min(idLimit, PresburgerSpace::getNumIds())); + PresburgerSpace::removeIdRange(idStart, idLimit); // Update local ids. 
numLocals -= numLocalsEliminated; @@ -118,6 +156,31 @@ void PresburgerLocalSpace::removeIdRange(unsigned idStart, unsigned idLimit) { void PresburgerSpace::setDimSymbolSeparation(unsigned newSymbolCount) { assert(newSymbolCount <= getNumDimAndSymbolIds() && "invalid separation position"); - numDims = numDims + numSymbols - newSymbolCount; + numRange = numRange + numSymbols - newSymbolCount; numSymbols = newSymbolCount; } + +void PresburgerSpace::print(llvm::raw_ostream &os) const { + if (spaceKind == Relation) { + os << "Domain: " << getNumDomainIds() << ", " + << "Range: " << getNumRangeIds() << ", "; + } else { + os << "Dimension: " << getNumDomainIds() << ", "; + } + os << "Symbols: " << getNumSymbolIds() << "\n"; +} + +void PresburgerSpace::dump() const { print(llvm::errs()); } + +void PresburgerLocalSpace::print(llvm::raw_ostream &os) const { + if (spaceKind == Relation) { + os << "Domain: " << getNumDomainIds() << ", " + << "Range: " << getNumRangeIds() << ", "; + } else { + os << "Dimension: " << getNumDomainIds() << ", "; + } + os << "Symbols: " << getNumSymbolIds() << ", " + << "Locals" << getNumLocalIds() << "\n"; +} + +void PresburgerLocalSpace::dump() const { print(llvm::errs()); } diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp index cc9e497072183..c8d8d328246f5 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp @@ -268,7 +268,7 @@ void FlatAffineValueConstraints::reset(unsigned newNumDims, unsigned FlatAffineValueConstraints::appendDimId(ValueRange vals) { unsigned pos = getNumDimIds(); - insertId(IdKind::Dimension, pos, vals); + insertId(IdKind::SetDim, pos, vals); return pos; } @@ -280,7 +280,7 @@ unsigned FlatAffineValueConstraints::appendSymbolId(ValueRange vals) { unsigned FlatAffineValueConstraints::insertDimId(unsigned pos, ValueRange vals) { - return insertId(IdKind::Dimension, pos, vals); + 
return insertId(IdKind::SetDim, pos, vals); } unsigned FlatAffineValueConstraints::insertSymbolId(unsigned pos, @@ -365,7 +365,7 @@ areIdsUnique(const FlatAffineConstraints &cst) { static bool LLVM_ATTRIBUTE_UNUSED areIdsUnique( const FlatAffineValueConstraints &cst, FlatAffineConstraints::IdKind kind) { - if (kind == FlatAffineConstraints::IdKind::Dimension) + if (kind == FlatAffineConstraints::IdKind::SetDim) return areIdsUnique(cst, 0, cst.getNumDimIds()); if (kind == FlatAffineConstraints::IdKind::Symbol) return areIdsUnique(cst, cst.getNumDimIds(), cst.getNumDimAndSymbolIds()); @@ -1214,8 +1214,8 @@ FlatAffineValueConstraints::computeAlignedMap(AffineMap map, dims.reserve(getNumDimIds()); syms.reserve(getNumSymbolIds()); - for (unsigned i = getIdKindOffset(IdKind::Dimension), - e = getIdKindEnd(IdKind::Dimension); + for (unsigned i = getIdKindOffset(IdKind::SetDim), + e = getIdKindEnd(IdKind::SetDim); i < e; ++i) dims.push_back(values[i] ? *values[i] : Value()); for (unsigned i = getIdKindOffset(IdKind::Symbol), diff --git a/mlir/unittests/Analysis/Presburger/CMakeLists.txt b/mlir/unittests/Analysis/Presburger/CMakeLists.txt index e7142a7f87509..11ab72d8c1f88 100644 --- a/mlir/unittests/Analysis/Presburger/CMakeLists.txt +++ b/mlir/unittests/Analysis/Presburger/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_unittest(MLIRPresburgerTests LinearTransformTest.cpp MatrixTest.cpp PresburgerSetTest.cpp + PresburgerSpaceTest.cpp PWMAFunctionTest.cpp SimplexTest.cpp ../../Dialect/Affine/Analysis/AffineStructuresParser.cpp diff --git a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp index d3c40237c692d..933467f191d4c 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp @@ -158,7 +158,7 @@ TEST(IntegerPolyhedronTest, removeIdRange) { EXPECT_THAT(set.getInequality(0), testing::ElementsAre(10, 11, 12, 20, 30, 40)); - 
set.removeIdRange(IntegerPolyhedron::IdKind::Dimension, 0, 2); + set.removeIdRange(IntegerPolyhedron::IdKind::SetDim, 0, 2); EXPECT_THAT(set.getInequality(0), testing::ElementsAre(12, 20, 30, 40)); set.removeIdRange(IntegerPolyhedron::IdKind::Local, 1, 1); diff --git a/mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp b/mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp new file mode 100644 index 0000000000000..0716397b7c730 --- /dev/null +++ b/mlir/unittests/Analysis/Presburger/PresburgerSpaceTest.cpp @@ -0,0 +1,50 @@ +//===- PresburgerSpaceTest.cpp - Tests for PresburgerSpace ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Presburger/PresburgerSpace.h" +#include +#include + +using namespace mlir; +using IdKind = PresburgerSpace::IdKind; + +TEST(PresburgerSpaceTest, insertId) { + PresburgerSpace space = PresburgerSpace::getRelationSpace(2, 2, 1); + + // Try inserting 2 domain ids. + space.insertId(PresburgerSpace::IdKind::Domain, 0, 2); + EXPECT_EQ(space.getNumDomainIds(), 4u); + + // Try inserting 1 range ids. + space.insertId(PresburgerSpace::IdKind::Range, 0, 1); + EXPECT_EQ(space.getNumRangeIds(), 3u); +} + +TEST(PresburgerSpaceTest, insertIdSet) { + PresburgerSpace space = PresburgerSpace::getSetSpace(2, 1); + + // Try inserting 2 dimension ids. The space should have 4 range ids since + // spaces which do not distinguish between domain, range are implemented like + // this. + space.insertId(PresburgerSpace::IdKind::SetDim, 0, 2); + EXPECT_EQ(space.getNumRangeIds(), 4u); +} + +TEST(PresburgerSpaceTest, removeIdRange) { + PresburgerSpace space = PresburgerSpace::getRelationSpace(2, 1, 3); + + // Remove 1 domain identifier. 
+ space.removeIdRange(0, 1); + EXPECT_EQ(space.getNumDomainIds(), 1u); + + // Remove 1 symbol and 1 range identifier. + space.removeIdRange(1, 3); + EXPECT_EQ(space.getNumDomainIds(), 1u); + EXPECT_EQ(space.getNumRangeIds(), 0u); + EXPECT_EQ(space.getNumSymbolIds(), 2u); +} From ff2e4c04c46a9173489a6aa7dd70b804105f4d03 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 11:06:54 -0800 Subject: [PATCH 297/748] [instsimplify] Assume storage for byval args doesn't overlap allocas, globals, or other byval args This allows us to discharge many pointer comparisons based on byval arguments. Differential Revision: https://reviews.llvm.org/D120133 --- llvm/lib/Analysis/InstructionSimplify.cpp | 22 +++++++++++++++++--- llvm/test/Transforms/InstSimplify/compare.ll | 16 ++++++-------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index b6692319c09dc..b3459b5ffb013 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2555,7 +2555,19 @@ static bool HaveNonOverlappingStorage(const Value *V1, const Value *V2) { // // So, we'll assume that two non-empty allocas have different addresses // for now. - return isa(V1) && + auto isByValArg = [](const Value *V) { + const Argument *A = dyn_cast(V); + return A && A->hasByValAttr(); + }; + + // Byval args are backed by store which does not overlap with each other, + // allocas, or globals. 
+ if (isByValArg(V1)) + return isa(V2) || isa(V2) || isByValArg(V2); + if (isByValArg(V2)) + return isa(V1) || isa(V1) || isByValArg(V1); + + return isa(V1) && (isa(V2) || isa(V2)); } @@ -2659,8 +2671,12 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, uint64_t LHSSize, RHSSize; ObjectSizeOpts Opts; Opts.EvalMode = ObjectSizeOpts::Mode::Min; - Opts.NullIsUnknownSize = - NullPointerIsDefined(cast(LHS)->getFunction()); + auto *F = [](Value *V) { + if (auto *I = dyn_cast(V)) + return I->getFunction(); + return cast(V)->getParent(); + }(LHS); + Opts.NullIsUnknownSize = NullPointerIsDefined(F); if (getObjectSize(LHS, LHSSize, DL, TLI, Opts) && getObjectSize(RHS, RHSSize, DL, TLI, Opts) && !LHSOffset.isNegative() && !RHSOffset.isNegative() && diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 5afb56203a0f9..efe1d6e225012 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -2737,11 +2737,10 @@ define i1 @scalar_vectors_are_non_empty() { ret i1 %res } -; TODO: Never equal +; Never equal define i1 @byval_args_inequal(i32* byval(i32) %a, i32* byval(i32) %b) { ; CHECK-LABEL: @byval_args_inequal( -; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: ret i1 true ; %res = icmp ne i32* %a, %b ret i1 %res @@ -2759,12 +2758,10 @@ define i1 @neg_args_adjacent(i32* byval(i32) %a, i32* byval(i32) %b) { ret i1 %res } -; TODO: Never equal +; Never equal define i1 @test_byval_alloca_inequal(i32* byval(i32) %a) { ; CHECK-LABEL: @test_byval_alloca_inequal( -; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A:%.*]], [[B]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: ret i1 true ; %b = alloca i32 %res = icmp ne i32* %a, %b @@ -2812,11 +2809,10 @@ define i1 @globals_offset_inequal() { } -; TODO: Never equal +; Never equal define i1 
@test_byval_global_inequal(i32* byval(i32) %a) { ; CHECK-LABEL: @test_byval_global_inequal( -; CHECK-NEXT: [[RES:%.*]] = icmp ne i32* [[A:%.*]], @B -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: ret i1 true ; %b = alloca i32 %res = icmp ne i32* %a, @B From 1e116867dbc84ebc6e7165996a8bbef7261c3ccd Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Fri, 18 Feb 2022 19:13:55 +0000 Subject: [PATCH 298/748] [ifs] Add --exclude flag Use to remove certain symbols which match the glob pattern. Can be used with --strip-undefined Reviewed By: haowei, mcgrathr Differential Revision: https://reviews.llvm.org/D119962 --- llvm/include/llvm/InterfaceStub/IFSHandler.h | 6 ++-- llvm/lib/InterfaceStub/IFSHandler.cpp | 33 +++++++++++++++----- llvm/test/tools/llvm-ifs/exclude.test | 30 ++++++++++++++++++ llvm/tools/llvm-ifs/llvm-ifs.cpp | 13 +++++--- 4 files changed, 69 insertions(+), 13 deletions(-) create mode 100644 llvm/test/tools/llvm-ifs/exclude.test diff --git a/llvm/include/llvm/InterfaceStub/IFSHandler.h b/llvm/include/llvm/InterfaceStub/IFSHandler.h index 6ae6a421318eb..bfa5692811d76 100644 --- a/llvm/include/llvm/InterfaceStub/IFSHandler.h +++ b/llvm/include/llvm/InterfaceStub/IFSHandler.h @@ -19,6 +19,8 @@ #include "llvm/Support/Error.h" #include "llvm/Support/VersionTuple.h" #include +#include +#include namespace llvm { @@ -51,8 +53,8 @@ Error validateIFSTarget(IFSStub &Stub, bool ParseTriple); void stripIFSTarget(IFSStub &Stub, bool StripTriple, bool StripArch, bool StripEndianness, bool StripBitWidth); -/// Strips symbols from IFS symbol table that are undefined. -void stripIFSUndefinedSymbols(IFSStub &Stub); +Error filterIFSSyms(IFSStub &Stub, bool StripUndefined, + const std::vector &Exclude = {}); /// Parse llvm triple string into a IFSTarget struct. 
IFSTarget parseTriple(StringRef TripleStr); diff --git a/llvm/lib/InterfaceStub/IFSHandler.cpp b/llvm/lib/InterfaceStub/IFSHandler.cpp index 4ccbb18ca04a4..92c88309a1587 100644 --- a/llvm/lib/InterfaceStub/IFSHandler.cpp +++ b/llvm/lib/InterfaceStub/IFSHandler.cpp @@ -7,14 +7,17 @@ //===-----------------------------------------------------------------------===/ #include "llvm/InterfaceStub/IFSHandler.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/InterfaceStub/IFSStub.h" #include "llvm/Support/Error.h" +#include "llvm/Support/GlobPattern.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/YAMLTraits.h" +#include using namespace llvm; using namespace llvm::ifs; @@ -328,12 +331,28 @@ void ifs::stripIFSTarget(IFSStub &Stub, bool StripTriple, bool StripArch, } } -void ifs::stripIFSUndefinedSymbols(IFSStub &Stub) { - for (auto Iter = Stub.Symbols.begin(); Iter != Stub.Symbols.end();) { - if (Iter->Undefined) { - Iter = Stub.Symbols.erase(Iter); - } else { - Iter++; - } +Error ifs::filterIFSSyms(IFSStub &Stub, bool StripUndefined, + const std::vector &Exclude) { + std::function Filter = [](const IFSSymbol &) { + return false; + }; + + if (StripUndefined) { + Filter = [Filter](const IFSSymbol &Sym) { + return Sym.Undefined || Filter(Sym); + }; + } + + for (StringRef Glob : Exclude) { + Expected PatternOrErr = llvm::GlobPattern::create(Glob); + if (!PatternOrErr) + return PatternOrErr.takeError(); + Filter = [Pattern = *PatternOrErr, Filter](const IFSSymbol &Sym) { + return Pattern.match(Sym.Name) || Filter(Sym); + }; } + + llvm::erase_if(Stub.Symbols, Filter); + + return Error::success(); } diff --git a/llvm/test/tools/llvm-ifs/exclude.test b/llvm/test/tools/llvm-ifs/exclude.test new file mode 100644 index 0000000000000..29f9ab8515124 --- /dev/null +++ b/llvm/test/tools/llvm-ifs/exclude.test @@ -0,0 +1,30 @@ +## Test --exclude 
flag + +# RUN: llvm-ifs --input-format=IFS --output-ifs=- --exclude='exclude*' %s | FileCheck %s + +# RUN: llvm-ifs --input-format=IFS --output-ifs=- --exclude='exclude*' \ +# RUN: --strip-undefined %s | FileCheck %s --check-prefix=BOTH + +# RUN: not llvm-ifs --input-format=IFS --output-ifs=- --exclude='[' %s 2>&1 | \ +# RUN: FileCheck %s --check-prefix=BAD-GLOB + +# BAD-GLOB: error: invalid glob pattern: [ + +--- !ifs-v1 +SoName: somelib.so +IfsVersion: 3.0 +Symbols: + - { Name: dont_exclude, Type: Func, Undefined: true } + - { Name: exclude_1, Type: Func } + - { Name: exclude_2, Type: Func, Undefined: true } + - { Name: no_match_not_undef, Type: Func } +... + +# CHECK: Symbols: +# CHECK-NEXT: - { Name: dont_exclude, Type: Func, Undefined: true } +# CHECK-NEXT: - { Name: no_match_not_undef, Type: Func } +# CHECK-NEXT: ... + +# BOTH: Symbols: +# BOTH-NEXT: - { Name: no_match_not_undef, Type: Func } +# BOTH-NEXT: ... diff --git a/llvm/tools/llvm-ifs/llvm-ifs.cpp b/llvm/tools/llvm-ifs/llvm-ifs.cpp index ef8864e08fdbf..feced928f854d 100644 --- a/llvm/tools/llvm-ifs/llvm-ifs.cpp +++ b/llvm/tools/llvm-ifs/llvm-ifs.cpp @@ -106,6 +106,11 @@ cl::opt cl::opt StripNeededLibs("strip-needed", cl::desc("Strip needed libs from output"), cl::cat(IfsCategory)); +cl::list + ExcludeSyms("exclude", + cl::desc("Remove symbols which match the pattern. 
Can be " + "specified multiple times"), + cl::cat(IfsCategory)); cl::opt SoName("soname", @@ -479,8 +484,8 @@ int main(int argc, char *argv[]) { stripIFSTarget(Stub, StripIFSTarget, StripIFSArch, StripIFSEndiannessWidth, StripIFSBitWidth); } - if (StripUndefined) - stripIFSUndefinedSymbols(Stub); + if (Error E = filterIFSSyms(Stub, StripUndefined, ExcludeSyms)) + fatalError(std::move(E)); Error IFSWriteError = writeIFS(OutputFilePath.getValue(), Stub); if (IFSWriteError) fatalError(std::move(IFSWriteError)); @@ -531,8 +536,8 @@ int main(int argc, char *argv[]) { stripIFSTarget(Stub, StripIFSTarget, StripIFSArch, StripIFSEndiannessWidth, StripIFSBitWidth); } - if (StripUndefined) - stripIFSUndefinedSymbols(Stub); + if (Error E = filterIFSSyms(Stub, StripUndefined, ExcludeSyms)) + fatalError(std::move(E)); Error IFSWriteError = writeIFS(OutputIFSFilePath.getValue(), Stub); if (IFSWriteError) fatalError(std::move(IFSWriteError)); From cb0a4bb5be10636aaec3ecb56ed586dee3eb0b9e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 18 Feb 2022 11:20:36 -0800 Subject: [PATCH 299/748] [ELF] Change (NOLOAD) section type mismatch error to warning Making a (NOLOAD) section SHT_PROGBITS is fishy (the user may expect all-zero content, but the linker does not check that), but some projects (e.g. Linux kernel https://github.com/ClangBuiltLinux/linux/issues/1597) traditionally rely on the behavior. Issue a warning to not break them. 
--- lld/ELF/OutputSections.cpp | 15 ++++++++++----- lld/test/ELF/linkerscript/noload.s | 11 ++++++++--- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 2b5deecdcec75..252108b464b2b 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -112,11 +112,16 @@ void OutputSection::commitSection(InputSection *isec) { if (hasInputSections || typeIsSet) { if (typeIsSet || !canMergeToProgbits(type) || !canMergeToProgbits(isec->type)) { - errorOrWarn("section type mismatch for " + isec->name + "\n>>> " + - toString(isec) + ": " + - getELFSectionTypeName(config->emachine, isec->type) + - "\n>>> output section " + name + ": " + - getELFSectionTypeName(config->emachine, type)); + // Changing the type of a (NOLOAD) section is fishy, but some projects + // (e.g. https://github.com/ClangBuiltLinux/linux/issues/1597) + // traditionally rely on the behavior. Issue a warning to not break + // them. Other types get an error. + auto diagnose = type == SHT_NOBITS ? warn : errorOrWarn; + diagnose("section type mismatch for " + isec->name + "\n>>> " + + toString(isec) + ": " + + getELFSectionTypeName(config->emachine, isec->type) + + "\n>>> output section " + name + ": " + + getELFSectionTypeName(config->emachine, type)); } type = SHT_PROGBITS; } else { diff --git a/lld/test/ELF/linkerscript/noload.s b/lld/test/ELF/linkerscript/noload.s index 92afadc9b263f..1cc09670e8b16 100644 --- a/lld/test/ELF/linkerscript/noload.s +++ b/lld/test/ELF/linkerscript/noload.s @@ -17,9 +17,14 @@ # CHECK: 00 .data_noload_a .data_noload_b .no_input_sec_noload {{$}} # CHECK: 01 .text {{$}} -# RUN: not ld.lld --script %t/lds %t.o %t/mismatch.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR - -# ERR: error: section type mismatch for .data_noload_a +## The output SHT_PROBITS is contrary to the user expectation of SHT_NOBITS. +## Issue a warning. 
See https://github.com/ClangBuiltLinux/linux/issues/1597 +# RUN: ld.lld --script %t/lds %t.o %t/mismatch.o -o %t/out 2>&1 |& FileCheck %s --check-prefix=WARN +# RUN: llvm-readelf -S -l %t/out | FileCheck %s --check-prefix=CHECK2 + +# WARN: warning: section type mismatch for .data_noload_a +# CHECK2: Name Type Address Off Size +# CHECK2: .data_noload_a PROGBITS 0000000000000000 [[OFF:[0-9a-f]+]] 001001 #--- asm .section .text,"ax",@progbits From 3a6be124cc01191ec52192017791bb04a6c7295a Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 11:32:28 -0800 Subject: [PATCH 300/748] [instsimplify] Simplify HaveNonOverlappingStorage per review suggestion on D120133 [NFC] --- llvm/lib/Analysis/InstructionSimplify.cpp | 26 +++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index b3459b5ffb013..7475b995cbd86 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2555,20 +2555,24 @@ static bool HaveNonOverlappingStorage(const Value *V1, const Value *V2) { // // So, we'll assume that two non-empty allocas have different addresses // for now. - auto isByValArg = [](const Value *V) { - const Argument *A = dyn_cast(V); - return A && A->hasByValAttr(); + + auto isByValArgOrGlobalVarOrAlloca = [](const Value *V) { + if (const Argument *A = dyn_cast(V)) + return A->hasByValAttr(); + return isa(V) || isa(V); }; - // Byval args are backed by store which does not overlap with each other, - // allocas, or globals. 
- if (isByValArg(V1)) - return isa(V2) || isa(V2) || isByValArg(V2); - if (isByValArg(V2)) - return isa(V1) || isa(V1) || isByValArg(V1); + if (!isByValArgOrGlobalVarOrAlloca(V1) || + !isByValArgOrGlobalVarOrAlloca(V2)) + return false; - return isa(V1) && - (isa(V2) || isa(V2)); + // Both sides being globals shouldn't reach here - as the resulting compare + // is a constantexpr - but we want to guard against it to be safe. The + // semantics of globals are complicated by e.g. unnamed_addr. The assumption + // in this code is that while two globals could end up overlapping, they'll + // never overlap with any alloca or byval, and thus we can still reason about + // *one* global and one *non* global as disjoint storage. + return !isa(V1) || !isa(V2); } // A significant optimization not implemented here is assuming that alloca From 5ee500acbbe7eadc1b46a289c8f7fc86a4be4af5 Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Fri, 18 Feb 2022 23:17:34 +0530 Subject: [PATCH 301/748] [mlir][OpenMP] Remove clauses that are not being handled This patch removes the following clauses from OpenMP Dialect: - private - firstprivate - lastprivate - shared - default - copyin - copyprivate The privatization clauses are being handled in the flang frontend. The data copying clauses are not being handled anywhere for now. Once we have a better picture of how to handle these clauses in OpenMP Dialect, we can add these. For the time being, removing unneeded clauses. 
For detailed discussion about this refer to [[ https://discourse.llvm.org/t/rfc-privatisation-in-openmp-dialect/3526 | Privatisation in OpenMP dialect ]] Reviewed By: kiranchandramohan, clementval Differential Revision: https://reviews.llvm.org/D120029 --- flang/lib/Lower/OpenMP.cpp | 52 +---- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 71 ++---- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 221 ++---------------- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 9 +- .../OpenMPToLLVM/convert-to-llvmir.mlir | 2 +- mlir/test/Dialect/OpenMP/invalid.mlir | 149 +----------- mlir/test/Dialect/OpenMP/ops.mlir | 185 ++++----------- mlir/test/Target/LLVMIR/openmp-llvm.mlir | 6 +- 8 files changed, 97 insertions(+), 598 deletions(-) diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 0d713b320ce01..bf02f577cc408 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -143,9 +143,7 @@ genOMP(Fortran::lower::AbstractConverter &converter, if (blockDirective.v == llvm::omp::OMPD_parallel) { mlir::Value ifClauseOperand, numThreadsClauseOperand; - SmallVector privateClauseOperands, firstprivateClauseOperands, - sharedClauseOperands, copyinClauseOperands; - Attribute defaultClauseOperand, procBindClauseOperand; + Attribute procBindClauseOperand; const auto ¶llelOpClauseList = std::get(beginBlockDirective.t); @@ -162,59 +160,17 @@ genOMP(Fortran::lower::AbstractConverter &converter, // OMPIRBuilder expects `NUM_THREAD` clause as a `Value`. 
numThreadsClauseOperand = fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(numThreadsClause->v))); - } else if (const auto &privateClause = - std::get_if( - &clause.u)) { - const Fortran::parser::OmpObjectList &ompObjectList = privateClause->v; - genObjectList(ompObjectList, converter, privateClauseOperands); - } else if (const auto &firstprivateClause = - std::get_if( - &clause.u)) { - const Fortran::parser::OmpObjectList &ompObjectList = - firstprivateClause->v; - genObjectList(ompObjectList, converter, firstprivateClauseOperands); - } else if (const auto &sharedClause = - std::get_if( - &clause.u)) { - const Fortran::parser::OmpObjectList &ompObjectList = sharedClause->v; - genObjectList(ompObjectList, converter, sharedClauseOperands); - } else if (const auto ©inClause = - std::get_if( - &clause.u)) { - const Fortran::parser::OmpObjectList &ompObjectList = copyinClause->v; - genObjectList(ompObjectList, converter, copyinClauseOperands); } + // TODO: Handle private, firstprivate, shared and copyin } // Create and insert the operation. auto parallelOp = firOpBuilder.create( currentLocation, argTy, ifClauseOperand, numThreadsClauseOperand, - defaultClauseOperand.dyn_cast_or_null(), - privateClauseOperands, firstprivateClauseOperands, sharedClauseOperands, - copyinClauseOperands, ValueRange(), ValueRange(), + ValueRange(), ValueRange(), procBindClauseOperand.dyn_cast_or_null()); // Handle attribute based clauses. 
for (const auto &clause : parallelOpClauseList.v) { - if (const auto &defaultClause = - std::get_if(&clause.u)) { - const auto &ompDefaultClause{defaultClause->v}; - omp::ClauseDefault clause; - switch (ompDefaultClause.v) { - case Fortran::parser::OmpDefaultClause::Type::Private: - clause = omp::ClauseDefault::defprivate; - break; - case Fortran::parser::OmpDefaultClause::Type::Firstprivate: - clause = omp::ClauseDefault::deffirstprivate; - break; - case Fortran::parser::OmpDefaultClause::Type::Shared: - clause = omp::ClauseDefault::defshared; - break; - case Fortran::parser::OmpDefaultClause::Type::None: - clause = omp::ClauseDefault::defnone; - break; - } - parallelOp.default_valAttr( - omp::ClauseDefaultAttr::get(firOpBuilder.getContext(), clause)); - } + // TODO: Handle default clause if (const auto &procBindClause = std::get_if(&clause.u)) { const auto &ompProcBindClause{procBindClause->v}; diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 51ae0dc5bea2b..5933326489dd3 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -63,25 +63,8 @@ def OpenMP_PointerLikeType : Type< // 2.6 parallel Construct //===----------------------------------------------------------------------===// -// Possible values for the default clause -def ClauseDefaultPrivate : I32EnumAttrCase<"defprivate", 0>; -def ClauseDefaultFirstPrivate : I32EnumAttrCase<"deffirstprivate", 1>; -def ClauseDefaultShared : I32EnumAttrCase<"defshared", 2>; -def ClauseDefaultNone : I32EnumAttrCase<"defnone", 3>; - -def ClauseDefault : I32EnumAttr< - "ClauseDefault", - "default clause", - [ClauseDefaultPrivate, ClauseDefaultFirstPrivate, ClauseDefaultShared, - ClauseDefaultNone]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::omp"; -} -def ClauseDefaultAttr : EnumAttr; - -def ParallelOp : OpenMP_Op<"parallel", [AutomaticAllocationScope, - AttrSizedOperandSegments, +def 
ParallelOp : OpenMP_Op<"parallel", [ + AutomaticAllocationScope, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { let summary = "parallel construct"; let description = [{ @@ -96,14 +79,6 @@ def ParallelOp : OpenMP_Op<"parallel", [AutomaticAllocationScope, The optional $num_threads_var parameter specifies the number of threads which should be used to execute the parallel region. - The optional $default_val attribute specifies the default data sharing attribute - of values used in the parallel region that are not passed explicitly as parameters - to the operation. - - The $private_vars, $firstprivate_vars, $shared_vars and $copyin_vars parameters - are a variadic list of values that specify the data sharing attribute of - those values. - The $allocators_vars and $allocate_vars parameters are a variadic list of values that specify the memory allocator to be used to obtain storage for private values. @@ -113,11 +88,6 @@ def ParallelOp : OpenMP_Op<"parallel", [AutomaticAllocationScope, let arguments = (ins Optional:$if_expr_var, Optional:$num_threads_var, - OptionalAttr:$default_val, - Variadic:$private_vars, - Variadic:$firstprivate_vars, - Variadic:$shared_vars, - Variadic:$copyin_vars, Variadic:$allocate_vars, Variadic:$allocators_vars, OptionalAttr:$proc_bind_val); @@ -182,10 +152,6 @@ def SectionsOp : OpenMP_Op<"sections", [AttrSizedOperandSegments]> { is executed once by one of the threads in the team in the context of its implicit task. - `private_vars`, `firstprivate_vars` and`lastprivate_vars` arguments are - variadic list of operands that specify the data sharing attributes of the - list of values. They are optional. - Reductions can be performed in a sections construct by specifying reduction accumulator variables in `reduction_vars` and symbols referring to reduction declarations in the `reductions` attribute. 
Each reduction is identified @@ -204,10 +170,7 @@ def SectionsOp : OpenMP_Op<"sections", [AttrSizedOperandSegments]> { The `nowait` attribute, when present, signifies that there should be no implicit barrier at the end of the construct. }]; - let arguments = (ins Variadic:$private_vars, - Variadic:$firstprivate_vars, - Variadic:$lastprivate_vars, - Variadic:$reduction_vars, + let arguments = (ins Variadic:$reduction_vars, OptionalAttr:$reductions, Variadic:$allocate_vars, Variadic:$allocators_vars, @@ -247,12 +210,10 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, } ``` - `private_vars`, `firstprivate_vars`, `lastprivate_vars` and `linear_vars` - arguments are variadic list of operands that specify the data sharing - attributes of the list of values. The `linear_step_vars` operand - additionally specifies the step for each associated linear operand. Note - that the `linear_vars` and `linear_step_vars` variadic lists should contain - the same number of elements. + The `linear_step_vars` operand additionally specifies the step for each + associated linear operand. Note that the `linear_vars` and + `linear_step_vars` variadic lists should contain the same number of + elements. 
Reductions can be performed in a workshare loop by specifying reduction accumulator variables in `reduction_vars` and symbols referring to reduction @@ -288,9 +249,6 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, let arguments = (ins Variadic:$lowerBound, Variadic:$upperBound, Variadic:$step, - Variadic:$private_vars, - Variadic:$firstprivate_vars, - Variadic:$lastprivate_vars, Variadic:$linear_vars, Variadic:$linear_step_vars, Variadic:$reduction_vars, @@ -313,13 +271,12 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, CArg<"ArrayRef", "{}">:$attributes)>, OpBuilder<(ins "TypeRange":$resultTypes, "ValueRange":$lowerBound, "ValueRange":$upperBound, "ValueRange":$step, - "ValueRange":$privateVars, "ValueRange":$firstprivateVars, - "ValueRange":$lastprivate_vars, "ValueRange":$linear_vars, - "ValueRange":$linear_step_vars, "ValueRange":$reduction_vars, - "StringAttr":$schedule_val, "Value":$schedule_chunk_var, - "IntegerAttr":$collapse_val, "UnitAttr":$nowait, - "IntegerAttr":$ordered_val, "StringAttr":$order_val, - "UnitAttr":$inclusive, CArg<"bool", "true">:$buildBody)>, + "ValueRange":$linear_vars, "ValueRange":$linear_step_vars, + "ValueRange":$reduction_vars, "StringAttr":$schedule_val, + "Value":$schedule_chunk_var, "IntegerAttr":$collapse_val, + "UnitAttr":$nowait, "IntegerAttr":$ordered_val, + "StringAttr":$order_val, "UnitAttr":$inclusive, + CArg<"bool", "true">:$buildBody)>, OpBuilder<(ins "TypeRange":$resultTypes, "ValueRange":$operands, CArg<"ArrayRef", "{}">:$attributes)> ]; @@ -404,7 +361,7 @@ def TargetOp : OpenMP_Op<"target",[AttrSizedOperandSegments]> { The optional $nowait elliminates the implicit barrier so the parent task can make progress even if the target task is not yet completed. 
- TODO: private, map, is_device_ptr, firstprivate, depend, defaultmap, in_reduction + TODO: map, is_device_ptr, depend, defaultmap, in_reduction }]; diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 4ff38e2b455a1..b6f45bf044f06 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -66,56 +66,11 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state, ArrayRef attributes) { ParallelOp::build( builder, state, /*if_expr_var=*/nullptr, /*num_threads_var=*/nullptr, - /*default_val=*/nullptr, /*private_vars=*/ValueRange(), - /*firstprivate_vars=*/ValueRange(), /*shared_vars=*/ValueRange(), - /*copyin_vars=*/ValueRange(), /*allocate_vars=*/ValueRange(), - /*allocators_vars=*/ValueRange(), /*proc_bind_val=*/nullptr); + /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(), + /*proc_bind_val=*/nullptr); state.addAttributes(attributes); } -//===----------------------------------------------------------------------===// -// Parser and printer for Operand and type list -//===----------------------------------------------------------------------===// - -/// Parse a list of operands with types. 
-/// -/// operand-and-type-list ::= `(` ssa-id-and-type-list `)` -/// ssa-id-and-type-list ::= ssa-id-and-type | -/// ssa-id-and-type `,` ssa-id-and-type-list -/// ssa-id-and-type ::= ssa-id `:` type -static ParseResult -parseOperandAndTypeList(OpAsmParser &parser, - SmallVectorImpl &operands, - SmallVectorImpl &types) { - return parser.parseCommaSeparatedList( - OpAsmParser::Delimiter::Paren, [&]() -> ParseResult { - OpAsmParser::OperandType operand; - Type type; - if (parser.parseOperand(operand) || parser.parseColonType(type)) - return failure(); - operands.push_back(operand); - types.push_back(type); - return success(); - }); -} - -/// Print an operand and type list with parentheses -static void printOperandAndTypeList(OpAsmPrinter &p, OperandRange operands) { - p << "("; - llvm::interleaveComma( - operands, p, [&](const Value &v) { p << v << " : " << v.getType(); }); - p << ") "; -} - -/// Print data variables corresponding to a data-sharing clause `name` -static void printDataVars(OpAsmPrinter &p, OperandRange operands, - StringRef name) { - if (!operands.empty()) { - p << name; - printOperandAndTypeList(p, operands); - } -} - //===----------------------------------------------------------------------===// // Parser and printer for Allocate Clause //===----------------------------------------------------------------------===// @@ -180,17 +135,9 @@ void ParallelOp::print(OpAsmPrinter &p) { if (auto threads = num_threads_var()) p << "num_threads(" << threads << " : " << threads.getType() << ") "; - printDataVars(p, private_vars(), "private"); - printDataVars(p, firstprivate_vars(), "firstprivate"); - printDataVars(p, shared_vars(), "shared"); - printDataVars(p, copyin_vars(), "copyin"); - if (!allocate_vars().empty()) printAllocateAndAllocator(p, allocate_vars(), allocators_vars()); - if (auto def = default_val()) - p << "default(" << stringifyClauseDefault(*def).drop_front(3) << ") "; - if (auto bind = proc_bind_val()) p << "proc_bind(" << 
stringifyClauseProcBindKind(*bind) << ") "; @@ -542,13 +489,7 @@ enum ClauseType { numThreadsClause, deviceClause, threadLimitClause, - privateClause, - firstprivateClause, - lastprivateClause, - sharedClause, - copyinClause, allocateClause, - defaultClause, procBindClause, reductionClause, nowaitClause, @@ -589,19 +530,11 @@ static ParseResult parseClauseAttr(AsmParser &parser, OperationState &state, /// `clauses` list. The operand segments are added over the prevSegments /// clause-list ::= clause clause-list | empty -/// clause ::= if | num-threads | private | firstprivate | lastprivate | -/// shared | copyin | allocate | default | proc-bind | reduction | -/// nowait | linear | schedule | collapse | order | ordered | -/// inclusive +/// clause ::= if | num-threads | allocate | proc-bind | reduction | nowait +/// | linear | schedule | collapse | order | ordered | inclusive /// if ::= `if` `(` ssa-id-and-type `)` /// num-threads ::= `num_threads` `(` ssa-id-and-type `)` -/// private ::= `private` operand-and-type-list -/// firstprivate ::= `firstprivate` operand-and-type-list -/// lastprivate ::= `lastprivate` operand-and-type-list -/// shared ::= `shared` operand-and-type-list -/// copyin ::= `copyin` operand-and-type-list /// allocate ::= `allocate` `(` allocate-operand-list `)` -/// default ::= `default` `(` (`private` | `firstprivate` | `shared` | `none`) /// proc-bind ::= `proc_bind` `(` (`master` | `close` | `spread`) `)` /// reduction ::= `reduction` `(` reduction-entry-list `)` /// nowait ::= `nowait` @@ -633,11 +566,6 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, std::pair device; std::pair threadLimit; - SmallVector privates, firstprivates, lastprivates, - shareds, copyins; - SmallVector privateTypes, firstprivateTypes, lastprivateTypes, - sharedTypes, copyinTypes; - SmallVector allocates, allocators; SmallVector allocateTypes, allocatorTypes; @@ -660,9 +588,9 @@ static ParseResult parseClauses(OpAsmParser &parser, 
OperationState &result, // Skip the following clauses - they do not take any position in operand // segments - if (clause == defaultClause || clause == procBindClause || - clause == nowaitClause || clause == collapseClause || - clause == orderClause || clause == orderedClause) + if (clause == procBindClause || clause == nowaitClause || + clause == collapseClause || clause == orderClause || + clause == orderedClause) continue; pos[clause] = currPos++; @@ -714,31 +642,6 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, parser.parseColonType(threadLimit.second) || parser.parseRParen()) return failure(); clauseSegments[pos[threadLimitClause]] = 1; - } else if (clauseKeyword == "private") { - if (checkAllowed(privateClause) || - parseOperandAndTypeList(parser, privates, privateTypes)) - return failure(); - clauseSegments[pos[privateClause]] = privates.size(); - } else if (clauseKeyword == "firstprivate") { - if (checkAllowed(firstprivateClause) || - parseOperandAndTypeList(parser, firstprivates, firstprivateTypes)) - return failure(); - clauseSegments[pos[firstprivateClause]] = firstprivates.size(); - } else if (clauseKeyword == "lastprivate") { - if (checkAllowed(lastprivateClause) || - parseOperandAndTypeList(parser, lastprivates, lastprivateTypes)) - return failure(); - clauseSegments[pos[lastprivateClause]] = lastprivates.size(); - } else if (clauseKeyword == "shared") { - if (checkAllowed(sharedClause) || - parseOperandAndTypeList(parser, shareds, sharedTypes)) - return failure(); - clauseSegments[pos[sharedClause]] = shareds.size(); - } else if (clauseKeyword == "copyin") { - if (checkAllowed(copyinClause) || - parseOperandAndTypeList(parser, copyins, copyinTypes)) - return failure(); - clauseSegments[pos[copyinClause]] = copyins.size(); } else if (clauseKeyword == "allocate") { if (checkAllowed(allocateClause) || parseAllocateAndAllocator(parser, allocates, allocateTypes, @@ -746,21 +649,6 @@ static ParseResult 
parseClauses(OpAsmParser &parser, OperationState &result, return failure(); clauseSegments[pos[allocateClause]] = allocates.size(); clauseSegments[pos[allocateClause] + 1] = allocators.size(); - } else if (clauseKeyword == "default") { - StringRef defval; - SMLoc loc = parser.getCurrentLocation(); - if (checkAllowed(defaultClause) || parser.parseLParen() || - parser.parseKeyword(&defval) || parser.parseRParen()) - return failure(); - // The def prefix is required for the attribute as "private" is a keyword - // in C++. - if (Optional def = - symbolizeClauseDefault(("def" + defval).str())) { - result.addAttribute("default_val", - ClauseDefaultAttr::get(parser.getContext(), *def)); - } else { - return parser.emitError(loc, "invalid default clause"); - } } else if (clauseKeyword == "proc_bind") { if (checkAllowed(procBindClause) || parseClauseAttr(parser, result, @@ -857,37 +745,6 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, result.operands))) return failure(); - // Add private parameters. - if (done[privateClause] && clauseSegments[pos[privateClause]] && - failed(parser.resolveOperands(privates, privateTypes, - privates[0].location, result.operands))) - return failure(); - - // Add firstprivate parameters. - if (done[firstprivateClause] && clauseSegments[pos[firstprivateClause]] && - failed(parser.resolveOperands(firstprivates, firstprivateTypes, - firstprivates[0].location, - result.operands))) - return failure(); - - // Add lastprivate parameters. - if (done[lastprivateClause] && clauseSegments[pos[lastprivateClause]] && - failed(parser.resolveOperands(lastprivates, lastprivateTypes, - lastprivates[0].location, result.operands))) - return failure(); - - // Add shared parameters. - if (done[sharedClause] && clauseSegments[pos[sharedClause]] && - failed(parser.resolveOperands(shareds, sharedTypes, shareds[0].location, - result.operands))) - return failure(); - - // Add copyin parameters. 
- if (done[copyinClause] && clauseSegments[pos[copyinClause]] && - failed(parser.resolveOperands(copyins, copyinTypes, copyins[0].location, - result.operands))) - return failure(); - // Add allocate parameters. if (done[allocateClause] && clauseSegments[pos[allocateClause]] && failed(parser.resolveOperands(allocates, allocateTypes, @@ -967,14 +824,11 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, /// /// operation ::= `omp.parallel` clause-list /// clause-list ::= clause | clause clause-list -/// clause ::= if | num-threads | private | firstprivate | shared | copyin | -/// allocate | default | proc-bind +/// clause ::= if | num-threads | allocate | proc-bind /// ParseResult ParallelOp::parse(OpAsmParser &parser, OperationState &result) { - SmallVector clauses = { - ifClause, numThreadsClause, privateClause, - firstprivateClause, sharedClause, copyinClause, - allocateClause, defaultClause, procBindClause}; + SmallVector clauses = {ifClause, numThreadsClause, allocateClause, + procBindClause}; SmallVector segments; @@ -1027,12 +881,10 @@ ParseResult TargetOp::parse(OpAsmParser &parser, OperationState &result) { /// /// sections ::= `omp.sections` clause-list /// clause-list ::= clause clause-list | empty -/// clause ::= private | firstprivate | lastprivate | reduction | allocate | -/// nowait +/// clause ::= reduction | allocate | nowait ParseResult SectionsOp::parse(OpAsmParser &parser, OperationState &result) { - SmallVector clauses = {privateClause, firstprivateClause, - lastprivateClause, reductionClause, - allocateClause, nowaitClause}; + SmallVector clauses = {reductionClause, allocateClause, + nowaitClause}; SmallVector segments; @@ -1051,9 +903,6 @@ ParseResult SectionsOp::parse(OpAsmParser &parser, OperationState &result) { void SectionsOp::print(OpAsmPrinter &p) { p << " "; - printDataVars(p, private_vars(), "private"); - printDataVars(p, firstprivate_vars(), "firstprivate"); - printDataVars(p, lastprivate_vars(), 
"lastprivate"); if (!reduction_vars().empty()) printReductionVarList(p, reductions(), reduction_vars()); @@ -1069,18 +918,6 @@ void SectionsOp::print(OpAsmPrinter &p) { } LogicalResult SectionsOp::verify() { - // A list item may not appear in more than one clause on the same directive, - // except that it may be specified in both firstprivate and lastprivate - // clauses. - for (auto var : private_vars()) { - if (llvm::is_contained(firstprivate_vars(), var)) - return emitOpError() - << "operand used in both private and firstprivate clauses"; - if (llvm::is_contained(lastprivate_vars(), var)) - return emitOpError() - << "operand used in both private and lastprivate clauses"; - } - if (allocate_vars().size() != allocators_vars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); @@ -1102,8 +939,8 @@ LogicalResult SectionsOp::verify() { /// loop-bounds := `(` ssa-id-list `)` to `(` ssa-id-list `)` inclusive? steps /// steps := `step` `(`ssa-id-list`)` /// clause-list ::= clause clause-list | empty -/// clause ::= private | firstprivate | lastprivate | linear | schedule | -// collapse | nowait | ordered | order | reduction +/// clause ::= linear | schedule | collapse | nowait | ordered | order +/// | reduction ParseResult WsLoopOp::parse(OpAsmParser &parser, OperationState &result) { // Parse an opening `(` followed by induction variables followed by `)` SmallVector ivs; @@ -1142,9 +979,8 @@ ParseResult WsLoopOp::parse(OpAsmParser &parser, OperationState &result) { return failure(); SmallVector clauses = { - privateClause, firstprivateClause, lastprivateClause, linearClause, - reductionClause, collapseClause, orderClause, orderedClause, - nowaitClause, scheduleClause}; + linearClause, reductionClause, collapseClause, orderClause, + orderedClause, nowaitClause, scheduleClause}; SmallVector segments{numIVs, numIVs, numIVs}; if (failed(parseClauses(parser, result, clauses, segments))) return failure(); @@ -1170,10 +1006,6 @@ void 
WsLoopOp::print(OpAsmPrinter &p) { } p << "step (" << step() << ") "; - printDataVars(p, private_vars(), "private"); - printDataVars(p, firstprivate_vars(), "firstprivate"); - printDataVars(p, lastprivate_vars(), "lastprivate"); - if (!linear_vars().empty()) printLinearClause(p, linear_vars(), linear_step_vars()); @@ -1288,8 +1120,6 @@ void WsLoopOp::build(OpBuilder &builder, OperationState &state, ValueRange lowerBound, ValueRange upperBound, ValueRange step, ArrayRef attributes) { build(builder, state, TypeRange(), lowerBound, upperBound, step, - /*privateVars=*/ValueRange(), - /*firstprivateVars=*/ValueRange(), /*lastprivate_vars=*/ValueRange(), /*linear_vars=*/ValueRange(), /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(), /*schedule_val=*/nullptr, /*schedule_chunk_var=*/nullptr, /*collapse_val=*/nullptr, @@ -1310,18 +1140,14 @@ void WsLoopOp::build(OpBuilder &, OperationState &state, TypeRange resultTypes, void WsLoopOp::build(OpBuilder &builder, OperationState &result, TypeRange typeRange, ValueRange lowerBounds, ValueRange upperBounds, ValueRange steps, - ValueRange privateVars, ValueRange firstprivateVars, - ValueRange lastprivateVars, ValueRange linearVars, - ValueRange linearStepVars, ValueRange reductionVars, - StringAttr scheduleVal, Value scheduleChunkVar, - IntegerAttr collapseVal, UnitAttr nowait, - IntegerAttr orderedVal, StringAttr orderVal, - UnitAttr inclusive, bool buildBody) { + ValueRange linearVars, ValueRange linearStepVars, + ValueRange reductionVars, StringAttr scheduleVal, + Value scheduleChunkVar, IntegerAttr collapseVal, + UnitAttr nowait, IntegerAttr orderedVal, + StringAttr orderVal, UnitAttr inclusive, bool buildBody) { result.addOperands(lowerBounds); result.addOperands(upperBounds); result.addOperands(steps); - result.addOperands(privateVars); - result.addOperands(firstprivateVars); result.addOperands(linearVars); result.addOperands(linearStepVars); if (scheduleChunkVar) @@ -1345,9 +1171,6 @@ void 
WsLoopOp::build(OpBuilder &builder, OperationState &result, {static_cast(lowerBounds.size()), static_cast(upperBounds.size()), static_cast(steps.size()), - static_cast(privateVars.size()), - static_cast(firstprivateVars.size()), - static_cast(lastprivateVars.size()), static_cast(linearVars.size()), static_cast(linearStepVars.size()), static_cast(reductionVars.size()), diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 4b44197c4366f..40d8582d91287 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -580,15 +580,12 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, // TODO: Support the following clauses: private, firstprivate, lastprivate, // reduction, allocate - if (!sectionsOp.private_vars().empty() || - !sectionsOp.firstprivate_vars().empty() || - !sectionsOp.lastprivate_vars().empty() || - !sectionsOp.reduction_vars().empty() || sectionsOp.reductions() || + if (!sectionsOp.reduction_vars().empty() || sectionsOp.reductions() || !sectionsOp.allocate_vars().empty() || !sectionsOp.allocators_vars().empty()) return emitError(sectionsOp.getLoc()) - << "private, firstprivate, lastprivate, reduction and allocate " - "clauses are not supported for sections construct"; + << "reduction and allocate clauses are not supported for sections " + "construct"; LogicalResult bodyGenStatus = success(); SmallVector sectionCBs; diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index 460ea9045ae07..f9aa110d815ed 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -57,7 +57,7 @@ func @wsloop(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: inde // CHECK: "test.payload"(%[[CAST_ARG6]], 
%[[CAST_ARG7]]) : (index, index) -> () "test.payload"(%arg6, %arg7) : (index, index) -> () omp.yield - }) {operand_segment_sizes = dense<[2, 2, 2, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (index, index, index, index, index, index) -> () + }) {operand_segment_sizes = dense<[2, 2, 2, 0, 0, 0, 0]> : vector<7xi32>} : (index, index, index, index, index, index) -> () omp.terminator } return diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index e2dbd0f7e7268..6646410183c74 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -30,54 +30,6 @@ func @num_threads_once(%n : si32) { // ----- -func @private_once(%n : memref) { - // expected-error@+1 {{at most one private clause can appear on the omp.parallel operation}} - omp.parallel private(%n : memref) private(%n : memref) { - } - - return -} - -// ----- - -func @firstprivate_once(%n : memref) { - // expected-error@+1 {{at most one firstprivate clause can appear on the omp.parallel operation}} - omp.parallel firstprivate(%n : memref) firstprivate(%n : memref) { - } - - return -} - -// ----- - -func @shared_once(%n : memref) { - // expected-error@+1 {{at most one shared clause can appear on the omp.parallel operation}} - omp.parallel shared(%n : memref) shared(%n : memref) { - } - - return -} - -// ----- - -func @copyin_once(%n : memref) { - // expected-error@+1 {{at most one copyin clause can appear on the omp.parallel operation}} - omp.parallel copyin(%n : memref) copyin(%n : memref) { - } - - return -} - -// ----- - -func @lastprivate_not_allowed(%n : memref) { - // expected-error@+1 {{lastprivate is not a valid clause for the omp.parallel operation}} - omp.parallel lastprivate(%n : memref) {} - return -} - -// ----- - func @nowait_not_allowed(%n : memref) { // expected-error@+1 {{nowait is not a valid clause for the omp.parallel operation}} omp.parallel nowait {} @@ -125,16 +77,6 @@ func @ordered_not_allowed() { // ----- -func 
@default_once() { - // expected-error@+1 {{at most one default clause can appear on the omp.parallel operation}} - omp.parallel default(private) default(firstprivate) { - } - - return -} - -// ----- - func @proc_bind_once() { // expected-error@+1 {{at most one proc_bind clause can appear on the omp.parallel operation}} omp.parallel proc_bind(close) proc_bind(spread) { @@ -163,24 +105,6 @@ func @order_value(%lb : index, %ub : index, %step : index) { // ----- -func @shared_not_allowed(%lb : index, %ub : index, %step : index, %var : memref) { - // expected-error @below {{shared is not a valid clause for the omp.wsloop operation}} - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) shared(%var) { - omp.yield - } -} - -// ----- - -func @copyin(%lb : index, %ub : index, %step : index, %var : memref) { - // expected-error @below {{copyin is not a valid clause for the omp.wsloop operation}} - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) copyin(%var) { - omp.yield - } -} - -// ----- - func @if_not_allowed(%lb : index, %ub : index, %step : index, %bool_var : i1) { // expected-error @below {{if is not a valid clause for the omp.wsloop operation}} omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) if(%bool_var: i1) { @@ -199,15 +123,6 @@ func @num_threads_not_allowed(%lb : index, %ub : index, %step : index, %int_var // ----- -func @default_not_allowed(%lb : index, %ub : index, %step : index) { - // expected-error @below {{default is not a valid clause for the omp.wsloop operation}} - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) default(private) { - omp.yield - } -} - -// ----- - func @proc_bind_not_allowed(%lb : index, %ub : index, %step : index) { // expected-error @below {{proc_bind is not a valid clause for the omp.wsloop operation}} omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) proc_bind(close) { @@ -847,41 +762,11 @@ func @omp_atomic_capture(%x: memref, %y: memref, %v: memref, %exp // ----- -func @omp_sections(%data_var1 : memref, 
%data_var2 : memref, %data_var3 : memref) -> () { - // expected-error @below {{operand used in both private and firstprivate clauses}} - omp.sections private(%data_var1 : memref) firstprivate(%data_var1 : memref) { - omp.terminator - } - return -} - -// ----- - -func @omp_sections(%data_var1 : memref, %data_var2 : memref, %data_var3 : memref) -> () { - // expected-error @below {{operand used in both private and lastprivate clauses}} - omp.sections private(%data_var1 : memref) lastprivate(%data_var1 : memref) { - omp.terminator - } - return -} - -// ----- - -func @omp_sections(%data_var1 : memref, %data_var2 : memref, %data_var3 : memref) -> () { - // expected-error @below {{operand used in both private and lastprivate clauses}} - omp.sections private(%data_var1 : memref, %data_var2 : memref) lastprivate(%data_var3 : memref, %data_var2 : memref) { - omp.terminator - } - return -} - -// ----- - func @omp_sections(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.sections" (%data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[0,0,0,0,1,0]> : vector<6xi32>} : (memref) -> () + }) {operand_segment_sizes = dense<[0,1,0]> : vector<3xi32>} : (memref) -> () return } @@ -891,7 +776,7 @@ func @omp_sections(%data_var : memref) -> () { // expected-error @below {{expected as many reduction symbol references as reduction variables}} "omp.sections" (%data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[0,0,0,1,0,0]> : vector<6xi32>} : (memref) -> () + }) {operand_segment_sizes = dense<[1,0,0]> : vector<3xi32>} : (memref) -> () return } @@ -927,36 +812,6 @@ func @omp_sections() { // ----- -func @omp_sections(%datavar : memref) { - // expected-error @below {{shared is not a valid clause for the omp.sections operation}} - omp.sections shared(%datavar : memref) { - omp.terminator - } - return -} - -// ----- - -func @omp_sections(%datavar : memref) { - // expected-error @below {{copyin is not 
a valid clause for the omp.sections operation}} - omp.sections copyin(%datavar : memref) { - omp.terminator - } - return -} - -// ----- - -func @omp_sections() { - // expected-error @below {{default is not a valid clause for the omp.sections operation}} - omp.sections default(private) { - omp.terminator - } - return -} - -// ----- - func @omp_sections() { // expected-error @below {{proc_bind is not a valid clause for the omp.sections operation}} omp.sections proc_bind(close) { diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 5732712a2c056..1c16ae75c68af 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -52,38 +52,38 @@ func @omp_terminator() -> () { } func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : si32) -> () { - // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) shared(%{{.*}} : memref) copyin(%{{.*}} : memref) allocate(%{{.*}} : memref -> %{{.*}} : memref) - "omp.parallel" (%if_cond, %num_threads, %data_var, %data_var, %data_var, %data_var, %data_var, %data_var) ({ + // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) allocate(%{{.*}} : memref -> %{{.*}} : memref) + "omp.parallel" (%if_cond, %num_threads, %data_var, %data_var) ({ // test without if condition - // CHECK: omp.parallel num_threads(%{{.*}} : si32) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) shared(%{{.*}} : memref) copyin(%{{.*}} : memref) allocate(%{{.*}} : memref -> %{{.*}} : memref) - "omp.parallel"(%num_threads, %data_var, %data_var, %data_var, %data_var, %data_var, %data_var) ({ + // CHECK: omp.parallel num_threads(%{{.*}} : si32) allocate(%{{.*}} : memref -> %{{.*}} : memref) + "omp.parallel"(%num_threads, %data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[0,1,1,1,1,1,1,1]>: vector<8xi32>, default_val = #omp<"clause_default defshared">} : (si32, memref, memref, memref, memref, 
memref, memref) -> () + }) {operand_segment_sizes = dense<[0,1,1,1]>: vector<4xi32>} : (si32, memref, memref) -> () // CHECK: omp.barrier omp.barrier // test without num_threads - // CHECK: omp.parallel if(%{{.*}}) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) shared(%{{.*}} : memref) copyin(%{{.*}} : memref) allocate(%{{.*}} : memref -> %{{.*}} : memref) - "omp.parallel"(%if_cond, %data_var, %data_var, %data_var, %data_var, %data_var, %data_var) ({ + // CHECK: omp.parallel if(%{{.*}}) allocate(%{{.*}} : memref -> %{{.*}} : memref) + "omp.parallel"(%if_cond, %data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[1,0,1,1,1,1,1,1]> : vector<8xi32>} : (i1, memref, memref, memref, memref, memref, memref) -> () + }) {operand_segment_sizes = dense<[1,0,1,1]> : vector<4xi32>} : (i1, memref, memref) -> () // test without allocate - // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) shared(%{{.*}} : memref) copyin(%{{.*}} : memref) - "omp.parallel"(%if_cond, %num_threads, %data_var, %data_var, %data_var, %data_var) ({ + // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) + "omp.parallel"(%if_cond, %num_threads) ({ omp.terminator - }) {operand_segment_sizes = dense<[1,1,1,1,1,1,0,0]> : vector<8xi32>} : (i1, si32, memref, memref, memref, memref) -> () + }) {operand_segment_sizes = dense<[1,1,0,0]> : vector<4xi32>} : (i1, si32) -> () omp.terminator - }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1]> : vector<8xi32>, proc_bind_val = #omp<"procbindkind spread">} : (i1, si32, memref, memref, memref, memref, memref, memref) -> () + }) {operand_segment_sizes = dense<[1,1,1,1]> : vector<4xi32>, proc_bind_val = #omp<"procbindkind spread">} : (i1, si32, memref, memref) -> () // test with multiple parameters for single variadic argument - // CHECK: omp.parallel private(%{{.*}} : memref) firstprivate(%{{.*}} : memref, %{{.*}} : memref) shared(%{{.*}} : memref) 
copyin(%{{.*}} : memref) allocate(%{{.*}} : memref -> %{{.*}} : memref) - "omp.parallel" (%data_var, %data_var, %data_var, %data_var, %data_var, %data_var, %data_var) ({ + // CHECK: omp.parallel allocate(%{{.*}} : memref -> %{{.*}} : memref) + "omp.parallel" (%data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[0,0,1,2,1,1,1,1]> : vector<8xi32>} : (memref, memref, memref, memref, memref, memref, memref) -> () + }) {operand_segment_sizes = dense<[0,0,1,1]> : vector<4xi32>} : (memref, memref) -> () return } @@ -104,13 +104,9 @@ func @omp_parallel_pretty(%data_var : memref, %if_cond : i1, %num_threads : omp.terminator } - // CHECK: omp.parallel private(%{{.*}} : memref, %{{.*}} : memref) firstprivate(%{{.*}} : memref) - omp.parallel private(%data_var : memref, %data_var : memref) firstprivate(%data_var : memref) { - omp.terminator - } - - // CHECK omp.parallel shared(%{{.*}} : memref) copyin(%{{.*}} : memref, %{{.*}} : memref) - omp.parallel shared(%data_var : memref) copyin(%data_var : memref, %data_var : memref) { + // CHECK: omp.parallel + // CHECK-NEXT: omp.parallel if(%{{.*}} : i1) + omp.parallel { omp.parallel if(%if_cond: i1) { omp.terminator } @@ -118,71 +114,50 @@ func @omp_parallel_pretty(%data_var : memref, %if_cond : i1, %num_threads : } // CHECK omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) private(%{{.*}} : memref) proc_bind(close) - omp.parallel num_threads(%num_threads : si32) if(%if_cond: i1) - private(%data_var : memref) proc_bind(close) { + omp.parallel num_threads(%num_threads : si32) if(%if_cond: i1) proc_bind(close) { omp.terminator } - // CHECK: omp.parallel default(private) - omp.parallel default(private) { - omp.terminator - } - - // CHECK: omp.parallel default(firstprivate) - omp.parallel default(firstprivate) { - omp.terminator - } - - // CHECK: omp.parallel default(shared) - omp.parallel default(shared) { - omp.terminator - } - - // CHECK: omp.parallel default(none) - omp.parallel default(none) { - 
omp.terminator - } - return } // CHECK-LABEL: omp_wsloop func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memref, %linear_var : i32, %chunk_var : i32) -> () { - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref, %{{.*}} : memref) collapse(2) ordered(1) - "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) collapse(2) ordered(1) + "omp.wsloop" (%lb, %ub, %step) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,2,0,0,0,0,0,0]> : vector<10xi32>, collapse_val = 2, ordered_val = 1} : - (index, index, index, memref, memref) -> () + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,0]> : vector<7xi32>, collapse_val = 2, ordered_val = 1} : + (index, index, index) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref) schedule(static) "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,0,0,0,1,1,0,0]> : vector<10xi32>, schedule_val = #omp<"schedulekind Static">} : + }) {operand_segment_sizes = dense<[1,1,1,1,1,0,0]> : vector<7xi32>, schedule_val = #omp<"schedulekind Static">} : (index, index, index, memref, i32) -> () // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref, %{{.*}} = %{{.*}} : memref) schedule(static) "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %linear_var, %linear_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,0,0,0,2,2,0,0]> : vector<10xi32>, schedule_val = #omp<"schedulekind Static">} : + }) {operand_segment_sizes = dense<[1,1,1,2,2,0,0]> : vector<7xi32>, schedule_val = #omp<"schedulekind Static">} : (index, index, index, memref, memref, i32, i32) -> () - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) 
private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}}) collapse(3) ordered(2) - "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %data_var, %data_var, %linear_var, %chunk_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}}) collapse(3) ordered(2) + "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var, %chunk_var) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1,0,1]> : vector<10xi32>, schedule_val = #omp<"schedulekind Dynamic">, collapse_val = 3, ordered_val = 2} : - (index, index, index, memref, memref, memref, memref, i32, i32) -> () + }) {operand_segment_sizes = dense<[1,1,1,1,1,0,1]> : vector<7xi32>, schedule_val = #omp<"schedulekind Dynamic">, collapse_val = 3, ordered_val = 2} : + (index, index, index, memref, i32, i32) -> () - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) schedule(auto) nowait - "omp.wsloop" (%lb, %ub, %step, %data_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) schedule(auto) nowait + "omp.wsloop" (%lb, %ub, %step) ({ ^bb0(%iv: index): omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0,0]> : vector<10xi32>, nowait, schedule_val = #omp<"schedulekind Auto">} : - (index, index, index, memref) -> () + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,0]> : vector<7xi32>, nowait, schedule_val = #omp<"schedulekind Auto">} : + (index, index, index) -> () return } @@ -190,39 +165,36 @@ func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memref, %linear_var : i32, %chunk_var : i32, %chunk_var2 : i16) -> () { - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : 
memref) collapse(2) ordered(2) { + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) collapse(2) ordered(2) { omp.yield } // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref) schedule(static) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) schedule(static) lastprivate(%data_var : memref) linear(%data_var = %linear_var : memref) { + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) schedule(static) linear(%data_var = %linear_var : memref) { omp.yield } - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) linear(%{{.*}} = %{{.*}} : memref) schedule(static = %{{.*}} : i32) collapse(3) ordered(2) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) private(%data_var : memref) - firstprivate(%data_var : memref) lastprivate(%data_var : memref) linear(%data_var = %linear_var : memref) + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref) schedule(static = %{{.*}} : i32) collapse(3) ordered(2) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) linear(%data_var = %linear_var : memref) schedule(static = %chunk_var : i32) collapse(3) { omp.yield } - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}} : i32, nonmonotonic) collapse(3) ordered(2) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) private(%data_var : memref) - firstprivate(%data_var : memref) lastprivate(%data_var : memref) linear(%data_var = %linear_var : memref) + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = 
%{{.*}} : memref) schedule(dynamic = %{{.*}} : i32, nonmonotonic) collapse(3) ordered(2) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) linear(%data_var = %linear_var : memref) schedule(dynamic = %chunk_var : i32, nonmonotonic) collapse(3) { omp.yield } - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}} : i16, monotonic) collapse(3) ordered(2) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) private(%data_var : memref) - firstprivate(%data_var : memref) lastprivate(%data_var : memref) linear(%data_var = %linear_var : memref) + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}} : i16, monotonic) collapse(3) ordered(2) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) linear(%data_var = %linear_var : memref) schedule(dynamic = %chunk_var2 : i16, monotonic) collapse(3) { omp.yield } - // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private({{.*}} : memref) - omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : memref) { + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) { omp.yield } @@ -648,81 +620,20 @@ func @omp_atomic_capture(%v: memref, %x: memref, %expr: i32) { // CHECK-LABEL: omp_sectionsop func @omp_sectionsop(%data_var1 : memref, %data_var2 : memref, %data_var3 : memref, %redn_var : !llvm.ptr) { - - // CHECK: omp.sections private(%{{.*}} : memref) { - "omp.sections" (%data_var1) ({ - // CHECK: omp.terminator - omp.terminator - }) {operand_segment_sizes = dense<[1,0,0,0,0,0]> : vector<6xi32>} : (memref) -> () - - // CHECK: omp.sections firstprivate(%{{.*}} : memref) { - "omp.sections" (%data_var1) ({ - 
// CHECK: omp.terminator - omp.terminator - }) {operand_segment_sizes = dense<[0,1,0,0,0,0]> : vector<6xi32>} : (memref) -> () - - // CHECK: omp.sections lastprivate(%{{.*}} : memref) { - "omp.sections" (%data_var1) ({ - // CHECK: omp.terminator - omp.terminator - }) {operand_segment_sizes = dense<[0,0,1,0,0,0]> : vector<6xi32>} : (memref) -> () - - // CHECK: omp.sections private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) { - "omp.sections" (%data_var1, %data_var2, %data_var3) ({ - // CHECK: omp.terminator - omp.terminator - }) {operand_segment_sizes = dense<[1,1,1,0,0,0]> : vector<6xi32>} : (memref, memref, memref) -> () - // CHECK: omp.sections allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.sections" (%data_var1, %data_var1) ({ // CHECK: omp.terminator omp.terminator - }) {operand_segment_sizes = dense<[0,0,0,0,1,1]> : vector<6xi32>} : (memref, memref) -> () + }) {operand_segment_sizes = dense<[0,1,1]> : vector<3xi32>} : (memref, memref) -> () // CHECK: omp.sections reduction(@add_f32 -> %{{.*}} : !llvm.ptr) "omp.sections" (%redn_var) ({ // CHECK: omp.terminator omp.terminator - }) {operand_segment_sizes = dense<[0,0,0,1,0,0]> : vector<6xi32>, reductions=[@add_f32]} : (!llvm.ptr) -> () - - // CHECK: omp.sections private(%{{.*}} : memref) { - omp.sections private(%data_var1 : memref) { - // CHECK: omp.terminator - omp.terminator - } + }) {operand_segment_sizes = dense<[1,0,0]> : vector<3xi32>, reductions=[@add_f32]} : (!llvm.ptr) -> () - // CHECK: omp.sections firstprivate(%{{.*}} : memref) - omp.sections firstprivate(%data_var1 : memref) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.sections lastprivate(%{{.*}} : memref) - omp.sections lastprivate(%data_var1 : memref) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.sections private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) { - omp.sections private(%data_var1 : memref) firstprivate(%data_var2 : 
memref) lastprivate(%data_var3 : memref) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.sections private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) { - omp.sections lastprivate(%data_var1 : memref) firstprivate(%data_var2 : memref) private(%data_var3 : memref) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.sections private(%{{.*}} : memref) nowait { - omp.sections nowait private(%data_var1 : memref) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.sections firstprivate(%{{.*}} : memref, %{{.*}} : memref) lastprivate(%{{.*}} : memref) { - omp.sections firstprivate(%data_var1 : memref, %data_var2 : memref) lastprivate(%data_var1 : memref) { + // CHECK: omp.sections nowait { + omp.sections nowait { // CHECK: omp.terminator omp.terminator } diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 94a0e0b3ae376..0de8c4d4e019a 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -379,7 +379,7 @@ llvm.func @wsloop_simple(%arg0: !llvm.ptr) { llvm.store %3, %4 : !llvm.ptr omp.yield // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @[[$wsloop_loc_struct]], - }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (i64, i64, i64) -> () + }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0]> : vector<7xi32>} : (i64, i64, i64) -> () omp.terminator } llvm.return @@ -399,7 +399,7 @@ llvm.func @wsloop_inclusive_1(%arg0: !llvm.ptr) { %4 = llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr llvm.store %3, %4 : !llvm.ptr omp.yield - }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (i64, i64, i64) -> () + }) {operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0]> : vector<7xi32>} : (i64, i64, i64) -> () llvm.return } @@ -417,7 +417,7 @@ llvm.func @wsloop_inclusive_2(%arg0: !llvm.ptr) { %4 = 
llvm.getelementptr %arg0[%arg1] : (!llvm.ptr, i64) -> !llvm.ptr llvm.store %3, %4 : !llvm.ptr omp.yield - }) {inclusive, operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]> : vector<10xi32>} : (i64, i64, i64) -> () + }) {inclusive, operand_segment_sizes = dense<[1, 1, 1, 0, 0, 0, 0]> : vector<7xi32>} : (i64, i64, i64) -> () llvm.return } From 60210f9acbd760272856495175636bc2da0b1fcd Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Sat, 19 Feb 2022 01:14:01 +0530 Subject: [PATCH 302/748] [mlir][OpenMP] Added assemblyformat for TargetOp This patch removes custom parser/printer for `omp.target` and adds assemblyformat. Reviewed By: rriddle Differential Revision: https://reviews.llvm.org/D120138 --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 8 +++- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 44 ------------------- mlir/test/Dialect/OpenMP/ops.mlir | 6 +-- 3 files changed, 10 insertions(+), 48 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 5933326489dd3..ec535edf81d9f 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -372,7 +372,13 @@ def TargetOp : OpenMP_Op<"target",[AttrSizedOperandSegments]> { let regions = (region AnyRegion:$region); - let hasCustomAssemblyFormat = 1; + let assemblyFormat = [{ + oilist( `if` `(` $if_expr `)` + | `device` `(` $device `:` type($device) `)` + | `thread_limit` `(` $thread_limit `:` type($thread_limit) `)` + | `nowait` + ) $region attr-dict + }]; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index b6f45bf044f06..bc3b595483d78 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -145,23 +145,6 @@ void ParallelOp::print(OpAsmPrinter &p) { p.printRegion(getRegion()); } -void TargetOp::print(OpAsmPrinter &p) { - p << " "; - if (auto ifCond = if_expr()) - p << 
"if(" << ifCond << " : " << ifCond.getType() << ") "; - - if (auto device = this->device()) - p << "device(" << device << " : " << device.getType() << ") "; - - if (auto threads = thread_limit()) - p << "thread_limit(" << threads << " : " << threads.getType() << ") "; - - if (nowait()) - p << "nowait "; - - p.printRegion(getRegion()); -} - //===----------------------------------------------------------------------===// // Parser and printer for Linear Clause //===----------------------------------------------------------------------===// @@ -846,33 +829,6 @@ ParseResult ParallelOp::parse(OpAsmParser &parser, OperationState &result) { return success(); } -/// Parses a target operation. -/// -/// operation ::= `omp.target` clause-list -/// clause-list ::= clause | clause clause-list -/// clause ::= if | device | thread_limit | nowait -/// -ParseResult TargetOp::parse(OpAsmParser &parser, OperationState &result) { - SmallVector clauses = {ifClause, deviceClause, threadLimitClause, - nowaitClause}; - - SmallVector segments; - - if (failed(parseClauses(parser, result, clauses, segments))) - return failure(); - - result.addAttribute( - TargetOp::AttrSizedOperandSegments::getOperandSegmentSizeAttr(), - parser.getBuilder().getI32VectorAttr(segments)); - - Region *body = result.addRegion(); - SmallVector regionArgs; - SmallVector regionArgTypes; - if (parser.parseRegion(*body, regionArgs, regionArgTypes)) - return failure(); - return success(); -} - //===----------------------------------------------------------------------===// // Parser, printer and verifier for SectionsOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 1c16ae75c68af..573b036f5746a 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -307,7 +307,7 @@ func @omp_target(%if_cond : i1, %device : si32, %num_threads : si32) -> () { "omp.target"(%if_cond, 
%device, %num_threads) ({ // CHECK: omp.terminator omp.terminator - }) {operand_segment_sizes = dense<[1,1,1]>: vector<3xi32>, nowait } : ( i1, si32, si32 ) -> () + }) {if, device, nowait, operand_segment_sizes = dense<[1,1,1]>: vector<3xi32>, thread_limit} : ( i1, si32, si32 ) -> () // CHECK: omp.barrier omp.barrier @@ -318,12 +318,12 @@ func @omp_target(%if_cond : i1, %device : si32, %num_threads : si32) -> () { // CHECK-LABEL: omp_target_pretty func @omp_target_pretty(%if_cond : i1, %device : si32, %num_threads : si32) -> () { // CHECK: omp.target if({{.*}}) device({{.*}}) - omp.target if(%if_cond : i1) device(%device : si32) { + omp.target if(%if_cond) device(%device : si32) { omp.terminator } // CHECK: omp.target if({{.*}}) device({{.*}}) nowait - omp.target if(%if_cond : i1) device(%device : si32) thread_limit(%num_threads : si32) nowait { + omp.target if(%if_cond) device(%device : si32) thread_limit(%num_threads : si32) nowait { omp.terminator } From 0712c575b90a7eb508bf43d15c38c1c0b0d69695 Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Fri, 18 Feb 2022 14:56:16 -0500 Subject: [PATCH 303/748] [ADT] Have ArrayRef::copy() return a MutableArrayRef The allocated memory itself is mutable, so let's expose that to the caller. LLD has a use case for this. Reviewed By: MaskRay, #lld-macho Differential Revision: https://reviews.llvm.org/D120144 --- llvm/include/llvm/ADT/ArrayRef.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h index b6896395dae8a..9af4232414e57 100644 --- a/llvm/include/llvm/ADT/ArrayRef.h +++ b/llvm/include/llvm/ADT/ArrayRef.h @@ -25,6 +25,7 @@ #include namespace llvm { + template class LLVM_NODISCARD MutableArrayRef; /// ArrayRef - Represent a constant reference to an array (0 or more elements /// consecutively in memory), i.e. a start pointer and a length. 
It allows @@ -175,10 +176,10 @@ namespace llvm { } // copy - Allocate copy in Allocator and return ArrayRef to it. - template ArrayRef copy(Allocator &A) { + template MutableArrayRef copy(Allocator &A) { T *Buff = A.template Allocate(Length); std::uninitialized_copy(begin(), end(), Buff); - return ArrayRef(Buff, Length); + return MutableArrayRef(Buff, Length); } /// equals - Check for element-wise equality. From 805f7a4fa4ce97277c3b73d0c204fc3aa4b072e1 Mon Sep 17 00:00:00 2001 From: David Goldman Date: Wed, 9 Feb 2022 14:50:26 -0500 Subject: [PATCH 304/748] [clang] Add `ObjCProtocolLoc` to represent protocol references Add `ObjCProtocolLoc` which behaves like `TypeLoc` but for `ObjCProtocolDecl` references. RecursiveASTVisitor now synthesizes `ObjCProtocolLoc` during traversal and the `ObjCProtocolLoc` can be stored in a `DynTypedNode`. In a follow up patch, I'll update clangd to make use of this to properly support protocol references for hover + goto definition. Differential Revision: https://reviews.llvm.org/D119363 --- clang/include/clang/AST/ASTFwd.h | 1 + clang/include/clang/AST/ASTTypeTraits.h | 8 ++- clang/include/clang/AST/RecursiveASTVisitor.h | 47 +++++++++++++-- clang/include/clang/AST/TypeLoc.h | 16 +++++ clang/lib/AST/ASTTypeTraits.cpp | 6 ++ clang/lib/AST/ParentMapContext.cpp | 14 ++++- .../unittests/AST/RecursiveASTVisitorTest.cpp | 60 ++++++++++++++++++- 7 files changed, 143 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/AST/ASTFwd.h b/clang/include/clang/AST/ASTFwd.h index fdbd603ce5d04..f84b3238e32b5 100644 --- a/clang/include/clang/AST/ASTFwd.h +++ b/clang/include/clang/AST/ASTFwd.h @@ -33,6 +33,7 @@ class OMPClause; class Attr; #define ATTR(A) class A##Attr; #include "clang/Basic/AttrList.inc" +class ObjCProtocolLoc; } // end namespace clang diff --git a/clang/include/clang/AST/ASTTypeTraits.h b/clang/include/clang/AST/ASTTypeTraits.h index 6d96146a4d455..cd6b5143bf790 100644 --- a/clang/include/clang/AST/ASTTypeTraits.h +++ 
b/clang/include/clang/AST/ASTTypeTraits.h @@ -160,6 +160,7 @@ class ASTNodeKind { NKI_Attr, #define ATTR(A) NKI_##A##Attr, #include "clang/Basic/AttrList.inc" + NKI_ObjCProtocolLoc, NKI_NumberOfKinds }; @@ -213,6 +214,7 @@ KIND_TO_KIND_ID(Stmt) KIND_TO_KIND_ID(Type) KIND_TO_KIND_ID(OMPClause) KIND_TO_KIND_ID(Attr) +KIND_TO_KIND_ID(ObjCProtocolLoc) KIND_TO_KIND_ID(CXXBaseSpecifier) #define DECL(DERIVED, BASE) KIND_TO_KIND_ID(DERIVED##Decl) #include "clang/AST/DeclNodes.inc" @@ -499,7 +501,7 @@ class DynTypedNode { /// have storage or unique pointers and thus need to be stored by value. llvm::AlignedCharArrayUnion + QualType, TypeLoc, ObjCProtocolLoc> Storage; }; @@ -570,6 +572,10 @@ template <> struct DynTypedNode::BaseConverter : public PtrConverter {}; +template <> +struct DynTypedNode::BaseConverter + : public ValueConverter {}; + // The only operation we allow on unsupported types is \c get. // This allows to conveniently use \c DynTypedNode when having an arbitrary // AST node that is not supported, but prevents misuse - a user cannot create diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index f62dc36de556e..16da64100d424 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -324,6 +324,12 @@ template class RecursiveASTVisitor { /// \returns false if the visitation was terminated early, true otherwise. bool TraverseConceptReference(const ConceptReference &C); + /// Recursively visit an Objective-C protocol reference with location + /// information. + /// + /// \returns false if the visitation was terminated early, true otherwise. + bool TraverseObjCProtocolLoc(ObjCProtocolLoc ProtocolLoc); + // ---- Methods on Attrs ---- // Visit an attribute. 
@@ -1340,7 +1346,12 @@ DEF_TRAVERSE_TYPELOC(DependentTemplateSpecializationType, { DEF_TRAVERSE_TYPELOC(PackExpansionType, { TRY_TO(TraverseTypeLoc(TL.getPatternLoc())); }) -DEF_TRAVERSE_TYPELOC(ObjCTypeParamType, {}) +DEF_TRAVERSE_TYPELOC(ObjCTypeParamType, { + for (unsigned I = 0, N = TL.getNumProtocols(); I != N; ++I) { + ObjCProtocolLoc ProtocolLoc(TL.getProtocol(I), TL.getProtocolLoc(I)); + TRY_TO(TraverseObjCProtocolLoc(ProtocolLoc)); + } +}) DEF_TRAVERSE_TYPELOC(ObjCInterfaceType, {}) @@ -1351,6 +1362,10 @@ DEF_TRAVERSE_TYPELOC(ObjCObjectType, { TRY_TO(TraverseTypeLoc(TL.getBaseLoc())); for (unsigned i = 0, n = TL.getNumTypeArgs(); i != n; ++i) TRY_TO(TraverseTypeLoc(TL.getTypeArgTInfo(i)->getTypeLoc())); + for (unsigned I = 0, N = TL.getNumProtocols(); I != N; ++I) { + ObjCProtocolLoc ProtocolLoc(TL.getProtocol(I), TL.getProtocolLoc(I)); + TRY_TO(TraverseObjCProtocolLoc(ProtocolLoc)); + } }) DEF_TRAVERSE_TYPELOC(ObjCObjectPointerType, @@ -1541,12 +1556,16 @@ DEF_TRAVERSE_DECL( DEF_TRAVERSE_DECL(ObjCCompatibleAliasDecl, {// FIXME: implement }) -DEF_TRAVERSE_DECL(ObjCCategoryDecl, {// FIXME: implement +DEF_TRAVERSE_DECL(ObjCCategoryDecl, { if (ObjCTypeParamList *typeParamList = D->getTypeParamList()) { for (auto typeParam : *typeParamList) { TRY_TO(TraverseObjCTypeParamDecl(typeParam)); } } + for (auto It : llvm::zip(D->protocols(), D->protocol_locs())) { + ObjCProtocolLoc ProtocolLoc(std::get<0>(It), std::get<1>(It)); + TRY_TO(TraverseObjCProtocolLoc(ProtocolLoc)); + } }) DEF_TRAVERSE_DECL(ObjCCategoryImplDecl, {// FIXME: implement @@ -1555,7 +1574,7 @@ DEF_TRAVERSE_DECL(ObjCCategoryImplDecl, {// FIXME: implement DEF_TRAVERSE_DECL(ObjCImplementationDecl, {// FIXME: implement }) -DEF_TRAVERSE_DECL(ObjCInterfaceDecl, {// FIXME: implement +DEF_TRAVERSE_DECL(ObjCInterfaceDecl, { if (ObjCTypeParamList *typeParamList = D->getTypeParamListAsWritten()) { for (auto typeParam : *typeParamList) { TRY_TO(TraverseObjCTypeParamDecl(typeParam)); @@ -1565,10 +1584,22 @@ 
DEF_TRAVERSE_DECL(ObjCInterfaceDecl, {// FIXME: implement if (TypeSourceInfo *superTInfo = D->getSuperClassTInfo()) { TRY_TO(TraverseTypeLoc(superTInfo->getTypeLoc())); } + if (D->isThisDeclarationADefinition()) { + for (auto It : llvm::zip(D->protocols(), D->protocol_locs())) { + ObjCProtocolLoc ProtocolLoc(std::get<0>(It), std::get<1>(It)); + TRY_TO(TraverseObjCProtocolLoc(ProtocolLoc)); + } + } }) -DEF_TRAVERSE_DECL(ObjCProtocolDecl, {// FIXME: implement - }) +DEF_TRAVERSE_DECL(ObjCProtocolDecl, { + if (D->isThisDeclarationADefinition()) { + for (auto It : llvm::zip(D->protocols(), D->protocol_locs())) { + ObjCProtocolLoc ProtocolLoc(std::get<0>(It), std::get<1>(It)); + TRY_TO(TraverseObjCProtocolLoc(ProtocolLoc)); + } + } +}) DEF_TRAVERSE_DECL(ObjCMethodDecl, { if (D->getReturnTypeSourceInfo()) { @@ -2409,6 +2440,12 @@ bool RecursiveASTVisitor::TraverseConceptReference( return true; } +template +bool RecursiveASTVisitor::TraverseObjCProtocolLoc( + ObjCProtocolLoc ProtocolLoc) { + return true; +} + // If shouldVisitImplicitCode() returns false, this method traverses only the // syntactic form of InitListExpr. // If shouldVisitImplicitCode() return true, this method is called once for diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index 8cfa579a22da7..59dfa8a9a54d8 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -2607,6 +2607,22 @@ class DependentBitIntTypeLoc final : public InheritingConcreteTypeLoc {}; +class ObjCProtocolLoc { + ObjCProtocolDecl *Protocol = nullptr; + SourceLocation Loc = SourceLocation(); + +public: + ObjCProtocolLoc(ObjCProtocolDecl *protocol, SourceLocation loc) + : Protocol(protocol), Loc(loc) {} + ObjCProtocolDecl *getProtocol() const { return Protocol; } + SourceLocation getLocation() const { return Loc; } + + /// The source range is just the protocol name. 
+ SourceRange getSourceRange() const LLVM_READONLY { + return SourceRange(Loc, Loc); + } +}; + } // namespace clang #endif // LLVM_CLANG_AST_TYPELOC_H diff --git a/clang/lib/AST/ASTTypeTraits.cpp b/clang/lib/AST/ASTTypeTraits.cpp index b333f4618efb8..64823f77e58a1 100644 --- a/clang/lib/AST/ASTTypeTraits.cpp +++ b/clang/lib/AST/ASTTypeTraits.cpp @@ -16,6 +16,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclObjC.h" #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/OpenMPClause.h" #include "clang/AST/TypeLoc.h" @@ -52,6 +53,7 @@ const ASTNodeKind::KindInfo ASTNodeKind::AllKindInfo[] = { {NKI_None, "Attr"}, #define ATTR(A) {NKI_Attr, #A "Attr"}, #include "clang/Basic/AttrList.inc" + {NKI_None, "ObjCProtocolLoc"}, }; bool ASTNodeKind::isBaseOf(ASTNodeKind Other, unsigned *Distance) const { @@ -193,6 +195,8 @@ void DynTypedNode::print(llvm::raw_ostream &OS, QualType(T, 0).print(OS, PP); else if (const Attr *A = get()) A->printPretty(OS, PP); + else if (const ObjCProtocolLoc *P = get()) + P->getProtocol()->print(OS, PP); else OS << "Unable to print values of type " << NodeKind.asStringRef() << "\n"; } @@ -228,5 +232,7 @@ SourceRange DynTypedNode::getSourceRange() const { return CBS->getSourceRange(); if (const auto *A = get()) return A->getRange(); + if (const ObjCProtocolLoc *P = get()) + return P->getSourceRange(); return SourceRange(); } diff --git a/clang/lib/AST/ParentMapContext.cpp b/clang/lib/AST/ParentMapContext.cpp index d216be5b59e89..e0d4700e4b10b 100644 --- a/clang/lib/AST/ParentMapContext.cpp +++ b/clang/lib/AST/ParentMapContext.cpp @@ -330,6 +330,9 @@ template <> DynTypedNode createDynTypedNode(const NestedNameSpecifierLoc &Node) { return DynTypedNode::create(Node); } +template <> DynTypedNode createDynTypedNode(const ObjCProtocolLoc &Node) { + return DynTypedNode::create(Node); +} /// @} /// A \c RecursiveASTVisitor that builds a map from nodes to their @@ -398,11 
+401,14 @@ class ParentMapContext::ParentMap::ASTVisitor } } + template static bool isNull(T Node) { return !Node; } + static bool isNull(ObjCProtocolLoc Node) { return false; } + template bool TraverseNode(T Node, MapNodeTy MapNode, BaseTraverseFn BaseTraverse, MapTy *Parents) { - if (!Node) + if (isNull(Node)) return true; addParent(MapNode, Parents); ParentStack.push_back(createDynTypedNode(Node)); @@ -433,6 +439,12 @@ class ParentMapContext::ParentMap::ASTVisitor AttrNode, AttrNode, [&] { return VisitorBase::TraverseAttr(AttrNode); }, &Map.PointerParents); } + bool TraverseObjCProtocolLoc(ObjCProtocolLoc ProtocolLocNode) { + return TraverseNode( + ProtocolLocNode, DynTypedNode::create(ProtocolLocNode), + [&] { return VisitorBase::TraverseObjCProtocolLoc(ProtocolLocNode); }, + &Map.OtherParents); + } // Using generic TraverseNode for Stmt would prevent data-recursion. bool dataTraverseStmtPre(Stmt *StmtNode) { diff --git a/clang/unittests/AST/RecursiveASTVisitorTest.cpp b/clang/unittests/AST/RecursiveASTVisitorTest.cpp index f44a5eca18728..9d7ff5947fe53 100644 --- a/clang/unittests/AST/RecursiveASTVisitorTest.cpp +++ b/clang/unittests/AST/RecursiveASTVisitorTest.cpp @@ -60,6 +60,12 @@ enum class VisitEvent { EndTraverseEnum, StartTraverseTypedefType, EndTraverseTypedefType, + StartTraverseObjCInterface, + EndTraverseObjCInterface, + StartTraverseObjCProtocol, + EndTraverseObjCProtocol, + StartTraverseObjCProtocolLoc, + EndTraverseObjCProtocolLoc, }; class CollectInterestingEvents @@ -97,18 +103,43 @@ class CollectInterestingEvents return Ret; } + bool TraverseObjCInterfaceDecl(ObjCInterfaceDecl *ID) { + Events.push_back(VisitEvent::StartTraverseObjCInterface); + bool Ret = RecursiveASTVisitor::TraverseObjCInterfaceDecl(ID); + Events.push_back(VisitEvent::EndTraverseObjCInterface); + + return Ret; + } + + bool TraverseObjCProtocolDecl(ObjCProtocolDecl *PD) { + Events.push_back(VisitEvent::StartTraverseObjCProtocol); + bool Ret = 
RecursiveASTVisitor::TraverseObjCProtocolDecl(PD); + Events.push_back(VisitEvent::EndTraverseObjCProtocol); + + return Ret; + } + + bool TraverseObjCProtocolLoc(ObjCProtocolLoc ProtocolLoc) { + Events.push_back(VisitEvent::StartTraverseObjCProtocolLoc); + bool Ret = RecursiveASTVisitor::TraverseObjCProtocolLoc(ProtocolLoc); + Events.push_back(VisitEvent::EndTraverseObjCProtocolLoc); + + return Ret; + } + std::vector takeEvents() && { return std::move(Events); } private: std::vector Events; }; -std::vector collectEvents(llvm::StringRef Code) { +std::vector collectEvents(llvm::StringRef Code, + const Twine &FileName = "input.cc") { CollectInterestingEvents Visitor; clang::tooling::runToolOnCode( std::make_unique( [&](clang::ASTContext &Ctx) { Visitor.TraverseAST(Ctx); }), - Code); + Code, FileName); return std::move(Visitor).takeEvents(); } } // namespace @@ -139,3 +170,28 @@ TEST(RecursiveASTVisitorTest, EnumDeclWithBase) { VisitEvent::EndTraverseTypedefType, VisitEvent::EndTraverseEnum)); } + +TEST(RecursiveASTVisitorTest, InterfaceDeclWithProtocols) { + // Check interface and its protocols are visited. 
+ llvm::StringRef Code = R"cpp( + @protocol Foo + @end + @protocol Bar + @end + + @interface SomeObject + @end + )cpp"; + + EXPECT_THAT(collectEvents(Code, "input.m"), + ElementsAre(VisitEvent::StartTraverseObjCProtocol, + VisitEvent::EndTraverseObjCProtocol, + VisitEvent::StartTraverseObjCProtocol, + VisitEvent::EndTraverseObjCProtocol, + VisitEvent::StartTraverseObjCInterface, + VisitEvent::StartTraverseObjCProtocolLoc, + VisitEvent::EndTraverseObjCProtocolLoc, + VisitEvent::StartTraverseObjCProtocolLoc, + VisitEvent::EndTraverseObjCProtocolLoc, + VisitEvent::EndTraverseObjCInterface)); +} From 54a962bbfee86d5af90d5fdd39b4ff4ec8030f12 Mon Sep 17 00:00:00 2001 From: David Goldman Date: Wed, 9 Feb 2022 15:01:17 -0500 Subject: [PATCH 305/748] [clangd] Use `ObjCProtocolLoc` for generalized ObjC protocol support This removes clangd's existing workaround in favor of proper support via the newly added `ObjCProtocolLoc`. This improves support by allowing clangd to properly identify which protocol is selected now that `ObjCProtocolLoc` gets its own ASTNode. Differential Revision: https://reviews.llvm.org/D119366 --- clang-tools-extra/clangd/FindTarget.cpp | 56 +++++-------------- clang-tools-extra/clangd/Selection.cpp | 3 + .../clangd/unittests/FindTargetTests.cpp | 21 ++++--- .../clangd/unittests/HoverTests.cpp | 16 ++++++ 4 files changed, 46 insertions(+), 50 deletions(-) diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index e96aa25fd780c..1b7b7de4f9047 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -453,15 +453,6 @@ struct TargetFinder { void VisitObjCInterfaceType(const ObjCInterfaceType *OIT) { Outer.add(OIT->getDecl(), Flags); } - void VisitObjCObjectType(const ObjCObjectType *OOT) { - // Make all of the protocols targets since there's no child nodes for - // protocols. This isn't needed for the base type, which *does* have a - // child `ObjCInterfaceTypeLoc`. 
This structure is a hack, but it works - // well for go-to-definition. - unsigned NumProtocols = OOT->getNumProtocols(); - for (unsigned I = 0; I < NumProtocols; I++) - Outer.add(OOT->getProtocol(I), Flags); - } }; Visitor(*this, Flags).Visit(T.getTypePtr()); } @@ -547,6 +538,8 @@ allTargetDecls(const DynTypedNode &N, const HeuristicResolver *Resolver) { Finder.add(TAL->getArgument(), Flags); else if (const CXXBaseSpecifier *CBS = N.get()) Finder.add(CBS->getTypeSourceInfo()->getType(), Flags); + else if (const ObjCProtocolLoc *PL = N.get()) + Finder.add(PL->getProtocol(), Flags); return Finder.takeDecls(); } @@ -669,25 +662,7 @@ llvm::SmallVector refInDecl(const Decl *D, {OMD}}); } - void visitProtocolList( - llvm::iterator_range Protocols, - llvm::iterator_range Locations) { - for (const auto &P : llvm::zip(Protocols, Locations)) { - Refs.push_back(ReferenceLoc{NestedNameSpecifierLoc(), - std::get<1>(P), - /*IsDecl=*/false, - {std::get<0>(P)}}); - } - } - - void VisitObjCInterfaceDecl(const ObjCInterfaceDecl *OID) { - if (OID->isThisDeclarationADefinition()) - visitProtocolList(OID->protocols(), OID->protocol_locs()); - Base::VisitObjCInterfaceDecl(OID); // Visit the interface's name. - } - void VisitObjCCategoryDecl(const ObjCCategoryDecl *OCD) { - visitProtocolList(OCD->protocols(), OCD->protocol_locs()); // getLocation is the extended class's location, not the category's. Refs.push_back(ReferenceLoc{NestedNameSpecifierLoc(), OCD->getLocation(), @@ -709,12 +684,6 @@ llvm::SmallVector refInDecl(const Decl *D, /*IsDecl=*/true, {OCID->getCategoryDecl()}}); } - - void VisitObjCProtocolDecl(const ObjCProtocolDecl *OPD) { - if (OPD->isThisDeclarationADefinition()) - visitProtocolList(OPD->protocols(), OPD->protocol_locs()); - Base::VisitObjCProtocolDecl(OPD); // Visit the protocol's name. 
- } }; Visitor V{Resolver}; @@ -944,16 +913,6 @@ refInTypeLoc(TypeLoc L, const HeuristicResolver *Resolver) { /*IsDecl=*/false, {L.getIFaceDecl()}}); } - - void VisitObjCObjectTypeLoc(ObjCObjectTypeLoc L) { - unsigned NumProtocols = L.getNumProtocols(); - for (unsigned I = 0; I < NumProtocols; I++) { - Refs.push_back(ReferenceLoc{NestedNameSpecifierLoc(), - L.getProtocolLoc(I), - /*IsDecl=*/false, - {L.getProtocol(I)}}); - } - } }; Visitor V{Resolver}; @@ -1049,6 +1008,11 @@ class ExplicitReferenceCollector return RecursiveASTVisitor::TraverseNestedNameSpecifierLoc(L); } + bool TraverseObjCProtocolLoc(ObjCProtocolLoc ProtocolLoc) { + visitNode(DynTypedNode::create(ProtocolLoc)); + return true; + } + bool TraverseConstructorInitializer(CXXCtorInitializer *Init) { visitNode(DynTypedNode::create(*Init)); return RecursiveASTVisitor::TraverseConstructorInitializer(Init); @@ -1094,6 +1058,12 @@ class ExplicitReferenceCollector {CCI->getAnyMember()}}}; } } + if (const ObjCProtocolLoc *PL = N.get()) + return {ReferenceLoc{NestedNameSpecifierLoc(), + PL->getLocation(), + /*IsDecl=*/false, + {PL->getProtocol()}}}; + // We do not have location information for other nodes (QualType, etc) return {}; } diff --git a/clang-tools-extra/clangd/Selection.cpp b/clang-tools-extra/clangd/Selection.cpp index dbfe05c8e8b5c..ba2f253eb0757 100644 --- a/clang-tools-extra/clangd/Selection.cpp +++ b/clang-tools-extra/clangd/Selection.cpp @@ -684,6 +684,9 @@ class SelectionVisitor : public RecursiveASTVisitor { return traverseNode( &QX, [&] { return TraverseTypeLoc(QX.getUnqualifiedLoc()); }); } + bool TraverseObjCProtocolLoc(ObjCProtocolLoc PL) { + return traverseNode(&PL, [&] { return Base::TraverseObjCProtocolLoc(PL); }); + } // Uninteresting parts of the AST that don't have locations within them. 
bool TraverseNestedNameSpecifier(NestedNameSpecifier *) { return true; } bool TraverseType(QualType) { return true; } diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 4887ee5b5deb3..7026f7fced3c9 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -946,11 +946,9 @@ TEST_F(TargetDeclTest, ObjC) { EXPECT_DECLS("ObjCCategoryImplDecl", "@interface Foo(Ext)"); Code = R"cpp( - @protocol Foo - @end - void test([[id]] p); + void test(id p); )cpp"; - EXPECT_DECLS("ObjCObjectTypeLoc", "@protocol Foo"); + EXPECT_DECLS("ParmVarDecl", "id p"); Code = R"cpp( @class C; @@ -966,7 +964,7 @@ TEST_F(TargetDeclTest, ObjC) { @end void test(C<[[Foo]]> *p); )cpp"; - EXPECT_DECLS("ObjCObjectTypeLoc", "@protocol Foo"); + EXPECT_DECLS("ObjCProtocolLoc", "@protocol Foo"); Code = R"cpp( @class C; @@ -976,8 +974,17 @@ TEST_F(TargetDeclTest, ObjC) { @end void test(C<[[Foo]], Bar> *p); )cpp"; - // FIXME: We currently can't disambiguate between multiple protocols. 
- EXPECT_DECLS("ObjCObjectTypeLoc", "@protocol Foo", "@protocol Bar"); + EXPECT_DECLS("ObjCProtocolLoc", "@protocol Foo"); + + Code = R"cpp( + @class C; + @protocol Foo + @end + @protocol Bar + @end + void test(C *p); + )cpp"; + EXPECT_DECLS("ObjCProtocolLoc", "@protocol Bar"); Code = R"cpp( @interface Foo diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 0b1203ae81797..0f07aa9d3b421 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -2522,6 +2522,22 @@ TEST(Hover, All) { HI.Definition = "@property(nonatomic, assign, unsafe_unretained, " "readwrite) int prop1;"; }}, + { + R"cpp( + @protocol MYProtocol + @end + @interface MYObject + @end + + @interface MYObject (Ext) <[[MYProt^ocol]]> + @end + )cpp", + [](HoverInfo &HI) { + HI.Name = "MYProtocol"; + HI.Kind = index::SymbolKind::Protocol; + HI.NamespaceScope = ""; + HI.Definition = "@protocol MYProtocol\n@end"; + }}, {R"objc( @interface Foo @end From 93e2b59c076e266e78627ff7cf1dc9ed7d85550d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 18 Feb 2022 12:32:27 -0800 Subject: [PATCH 306/748] [ELF][test] Avoid non-portable |& in notest.s --- lld/test/ELF/linkerscript/noload.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/ELF/linkerscript/noload.s b/lld/test/ELF/linkerscript/noload.s index 1cc09670e8b16..fbee54b9e5b4e 100644 --- a/lld/test/ELF/linkerscript/noload.s +++ b/lld/test/ELF/linkerscript/noload.s @@ -19,7 +19,7 @@ ## The output SHT_PROBITS is contrary to the user expectation of SHT_NOBITS. ## Issue a warning. 
See https://github.com/ClangBuiltLinux/linux/issues/1597 -# RUN: ld.lld --script %t/lds %t.o %t/mismatch.o -o %t/out 2>&1 |& FileCheck %s --check-prefix=WARN +# RUN: ld.lld --script %t/lds %t.o %t/mismatch.o -o %t/out 2>&1 | FileCheck %s --check-prefix=WARN # RUN: llvm-readelf -S -l %t/out | FileCheck %s --check-prefix=CHECK2 # WARN: warning: section type mismatch for .data_noload_a From e7afbea8ca4e1c8239614aa37c2bd975172ddfa6 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Fri, 18 Feb 2022 15:34:24 -0500 Subject: [PATCH 307/748] [MemorySSA] Clear VisitedBlocks per query The problem can be shown from the newly added test case. There are two invocations to MemorySSAUpdater::moveToPlace, and the internal data structure VisitedBlocks is changed in the first invocation, and reused in the second invocation. In between the two invocations, there is a change to the CFG, and MemorySSAUpdater is notified about the change. Reviewed By: asbirlea Differential Revision: https://reviews.llvm.org/D119898 --- llvm/lib/Analysis/MemorySSAUpdater.cpp | 2 + llvm/unittests/Analysis/MemorySSATest.cpp | 92 +++++++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index 9c841883de6db..66e7167038c9e 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -243,6 +243,7 @@ MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi, } void MemorySSAUpdater::insertUse(MemoryUse *MU, bool RenameUses) { + VisitedBlocks.clear(); InsertedPHIs.clear(); MU->setDefiningAccess(getPreviousDef(MU)); @@ -311,6 +312,7 @@ static void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB, // point to the correct new defs, to ensure we only have one variable, and no // disconnected stores. void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { + VisitedBlocks.clear(); InsertedPHIs.clear(); // See if we had a local def, and if not, go hunting. 
diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp index 959b60adb069d..4db076715c389 100644 --- a/llvm/unittests/Analysis/MemorySSATest.cpp +++ b/llvm/unittests/Analysis/MemorySSATest.cpp @@ -1772,3 +1772,95 @@ TEST_F(MemorySSATest, TestInvariantGroup) { EXPECT_EQ(CallAccess, LClobber); } } + +static BasicBlock *getBasicBlockByName(Function &F, StringRef Name) { + for (BasicBlock &BB : F) + if (BB.getName() == Name) + return &BB; + llvm_unreachable("Expected to find basic block!"); +} + +static Instruction *getInstructionByName(Function &F, StringRef Name) { + for (BasicBlock &BB : F) + for (Instruction &I : BB) + if (I.getName() == Name) + return &I; + llvm_unreachable("Expected to find instruction!"); +} + +TEST_F(MemorySSATest, TestVisitedBlocks) { + SMDiagnostic E; + auto M = parseAssemblyString( + "define void @test(i64* noalias %P, i64 %N) {\n" + "preheader.n:\n" + " br label %header.n\n" + "header.n:\n" + " %n = phi i64 [ 0, %preheader.n ], [ %inc.n, %latch.n ]\n" + " %guard.cond.i = icmp slt i64 0, %N\n" + " br i1 %guard.cond.i, label %header.i.check, label %other.i\n" + "header.i.check:\n" + " br label %preheader.i\n" + "preheader.i:\n" + " br label %header.i\n" + "header.i:\n" + " %i = phi i64 [ 0, %preheader.i ], [ %inc.i, %header.i ]\n" + " %v1 = load i64, i64* %P, align 8\n" + " %v2 = load i64, i64* %P, align 8\n" + " %inc.i = add nsw i64 %i, 1\n" + " %cmp.i = icmp slt i64 %inc.i, %N\n" + " br i1 %cmp.i, label %header.i, label %exit.i\n" + "exit.i:\n" + " br label %commonexit\n" + "other.i:\n" + " br label %commonexit\n" + "commonexit:\n" + " br label %latch.n\n" + "latch.n:\n" + " %inc.n = add nsw i64 %n, 1\n" + " %cmp.n = icmp slt i64 %inc.n, %N\n" + " br i1 %cmp.n, label %header.n, label %exit.n\n" + "exit.n:\n" + " ret void\n" + "}\n", + E, C); + ASSERT_TRUE(M); + F = M->getFunction("test"); + ASSERT_TRUE(F); + setupAnalyses(); + MemorySSA &MSSA = *Analyses->MSSA; + MemorySSAUpdater Updater(&MSSA); + 
+ { + // Move %v1 before the terminator of %header.i.check + BasicBlock *BB = getBasicBlockByName(*F, "header.i.check"); + Instruction *LI = getInstructionByName(*F, "v1"); + LI->moveBefore(BB->getTerminator()); + if (MemoryUseOrDef *MUD = MSSA.getMemoryAccess(LI)) + Updater.moveToPlace(MUD, BB, MemorySSA::BeforeTerminator); + + // Change the termiantor of %header.i.check to `br label true, label + // %preheader.i, label %other.i` + BB->getTerminator()->eraseFromParent(); + ConstantInt *BoolTrue = ConstantInt::getTrue(F->getContext()); + BranchInst::Create(getBasicBlockByName(*F, "preheader.i"), + getBasicBlockByName(*F, "other.i"), BoolTrue, BB); + SmallVector DTUpdates; + DTUpdates.push_back(DominatorTree::UpdateType( + DominatorTree::Insert, BB, getBasicBlockByName(*F, "other.i"))); + Updater.applyUpdates(DTUpdates, Analyses->DT, true); + } + + // After the first moveToPlace(), %other.i is in VisitedBlocks, even after + // there is a new edge to %other.i, which makes the second moveToPlace() + // traverse incorrectly. 
+ { + // Move %v2 before the terminator of %preheader.i + BasicBlock *BB = getBasicBlockByName(*F, "preheader.i"); + Instruction *LI = getInstructionByName(*F, "v2"); + LI->moveBefore(BB->getTerminator()); + // Check that there is no assertion of "Incomplete phi during partial + // rename" + if (MemoryUseOrDef *MUD = MSSA.getMemoryAccess(LI)) + Updater.moveToPlace(MUD, BB, MemorySSA::BeforeTerminator); + } +} From 1c1e2cce9a50ac9fe6b884b79925d71914cf5a30 Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Fri, 18 Feb 2022 12:42:56 -0800 Subject: [PATCH 308/748] Add a new reflection section for multi-payload enum mask information Differential Revision: https://reviews.llvm.org/D120151 --- llvm/include/llvm/BinaryFormat/Swift.def | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/include/llvm/BinaryFormat/Swift.def b/llvm/include/llvm/BinaryFormat/Swift.def index 1ea0bc548b37e..05b60e40632cd 100644 --- a/llvm/include/llvm/BinaryFormat/Swift.def +++ b/llvm/include/llvm/BinaryFormat/Swift.def @@ -30,3 +30,4 @@ HANDLE_SWIFT_SECTION(protocs, "__swift5_protos", "swift5_protocols", ".sw5prt$B") HANDLE_SWIFT_SECTION(acfuncs, "__swift5_acfuncs", "swift5_accessible_functions", ".sw5acfn$B") +HANDLE_SWIFT_SECTION(mpenum, "__swift5_mpenum", "swift5_mpenum", ".sw5mpen$B") From 6438783fdaf1a89bcc0945c3c03455793d802352 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Fri, 18 Feb 2022 12:06:28 -0800 Subject: [PATCH 309/748] [mlir][sparse] provide more types for external to/from MLIR routines These routines will need to be specialized a lot more based on value types, index types, pointer types, and permutation/dimension ordering. This is a careful first step, providing some functionality needed in PyTACO bridge. 
Reviewed By: bixia Differential Revision: https://reviews.llvm.org/D120154 --- .../lib/ExecutionEngine/SparseTensorUtils.cpp | 193 ++++++++++-------- .../python/tools/np_to_sparse_tensor.py | 11 +- .../taco/tools/mlir_pytaco_utils.py | 19 +- 3 files changed, 128 insertions(+), 95 deletions(-) diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp index 665dd8663a6c2..a93836cefdc26 100644 --- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp @@ -717,10 +717,15 @@ static SparseTensorCOO *openSparseTensorCOO(char *filename, uint64_t rank, /// Writes the sparse tensor to extended FROSTT format. template -void outSparseTensor(const SparseTensorCOO &tensor, char *filename) { - auto &sizes = tensor.getSizes(); - auto &elements = tensor.getElements(); - uint64_t rank = tensor.getRank(); +void outSparseTensor(void *tensor, void *dest, bool sort) { + assert(tensor && dest); + auto coo = static_cast *>(tensor); + if (sort) + coo->sort(); + char *filename = static_cast(dest); + auto &sizes = coo->getSizes(); + auto &elements = coo->getElements(); + uint64_t rank = coo->getRank(); uint64_t nnz = elements.size(); std::fstream file; file.open(filename, std::ios_base::out | std::ios_base::trunc); @@ -738,6 +743,67 @@ void outSparseTensor(const SparseTensorCOO &tensor, char *filename) { file.flush(); file.close(); assert(file.good()); + delete coo; +} + +/// Initializes sparse tensor from an external COO-flavored format. +template +SparseTensorStorage * +toMLIRSparseTensor(uint64_t rank, uint64_t nse, uint64_t *shape, V *values, + uint64_t *indices) { + // Setup all-dims compressed and default ordering. + std::vector sparse(rank, DimLevelType::kCompressed); + std::vector perm(rank); + std::iota(perm.begin(), perm.end(), 0); + // Convert external format to internal COO. 
+ auto *tensor = + SparseTensorCOO::newSparseTensorCOO(rank, shape, perm.data(), nse); + std::vector idx(rank); + for (uint64_t i = 0, base = 0; i < nse; i++) { + for (uint64_t r = 0; r < rank; r++) + idx[r] = indices[base + r]; + tensor->add(idx, values[i]); + base += rank; + } + // Return sparse tensor storage format as opaque pointer. + return SparseTensorStorage::newSparseTensor( + rank, shape, perm.data(), sparse.data(), tensor); +} + +/// Converts a sparse tensor to an external COO-flavored format. +template +void fromMLIRSparseTensor(void *tensor, uint64_t *pRank, uint64_t *pNse, + uint64_t **pShape, V **pValues, uint64_t **pIndices) { + auto sparseTensor = + static_cast *>(tensor); + uint64_t rank = sparseTensor->getRank(); + std::vector perm(rank); + std::iota(perm.begin(), perm.end(), 0); + SparseTensorCOO *coo = sparseTensor->toCOO(perm.data()); + + const std::vector> &elements = coo->getElements(); + uint64_t nse = elements.size(); + + uint64_t *shape = new uint64_t[rank]; + for (uint64_t i = 0; i < rank; i++) + shape[i] = coo->getSizes()[i]; + + V *values = new V[nse]; + uint64_t *indices = new uint64_t[rank * nse]; + + for (uint64_t i = 0, base = 0; i < nse; i++) { + values[i] = elements[i].value; + for (uint64_t j = 0; j < rank; j++) + indices[base + j] = elements[i].indices[j]; + base += rank; + } + + delete coo; + *pRank = rank; + *pNse = nse; + *pShape = shape; + *pValues = values; + *pIndices = indices; } } // namespace @@ -873,17 +939,6 @@ extern "C" { cursor, values, filled, added, count); \ } -#define IMPL_OUT(NAME, V) \ - void NAME(void *tensor, void *dest, bool sort) { \ - assert(tensor &&dest); \ - auto coo = static_cast *>(tensor); \ - if (sort) \ - coo->sort(); \ - char *filename = static_cast(dest); \ - outSparseTensor(*coo, filename); \ - delete coo; \ - } - // Assume index_type is in fact uint64_t, so that _mlir_ciface_newSparseTensor // can safely rewrite kIndex to kU64. 
We make this assertion to guarantee // that this file cannot get out of sync with its header. @@ -1048,8 +1103,7 @@ IMPL_GETNEXT(getNextI32, int32_t) IMPL_GETNEXT(getNextI16, int16_t) IMPL_GETNEXT(getNextI8, int8_t) -/// Helper to insert elements in lexicographical index order, one per value -/// type. +/// Insert elements in lexicographical index order, one per value type. IMPL_LEXINSERT(lexInsertF64, double) IMPL_LEXINSERT(lexInsertF32, float) IMPL_LEXINSERT(lexInsertI64, int64_t) @@ -1057,7 +1111,7 @@ IMPL_LEXINSERT(lexInsertI32, int32_t) IMPL_LEXINSERT(lexInsertI16, int16_t) IMPL_LEXINSERT(lexInsertI8, int8_t) -/// Helper to insert using expansion, one per value type. +/// Insert using expansion, one per value type. IMPL_EXPINSERT(expInsertF64, double) IMPL_EXPINSERT(expInsertF32, float) IMPL_EXPINSERT(expInsertI64, int64_t) @@ -1065,14 +1119,6 @@ IMPL_EXPINSERT(expInsertI32, int32_t) IMPL_EXPINSERT(expInsertI16, int16_t) IMPL_EXPINSERT(expInsertI8, int8_t) -/// Helper to output a sparse tensor, one per value type. -IMPL_OUT(outSparseTensorF64, double) -IMPL_OUT(outSparseTensorF32, float) -IMPL_OUT(outSparseTensorI64, int64_t) -IMPL_OUT(outSparseTensorI32, int32_t) -IMPL_OUT(outSparseTensorI16, int16_t) -IMPL_OUT(outSparseTensorI8, int8_t) - #undef CASE #undef IMPL_SPARSEVALUES #undef IMPL_GETOVERHEAD @@ -1080,7 +1126,26 @@ IMPL_OUT(outSparseTensorI8, int8_t) #undef IMPL_GETNEXT #undef IMPL_LEXINSERT #undef IMPL_EXPINSERT -#undef IMPL_OUT + +/// Output a sparse tensor, one per value type. 
+void outSparseTensorF64(void *tensor, void *dest, bool sort) { + return outSparseTensor(tensor, dest, sort); +} +void outSparseTensorF32(void *tensor, void *dest, bool sort) { + return outSparseTensor(tensor, dest, sort); +} +void outSparseTensorI64(void *tensor, void *dest, bool sort) { + return outSparseTensor(tensor, dest, sort); +} +void outSparseTensorI32(void *tensor, void *dest, bool sort) { + return outSparseTensor(tensor, dest, sort); +} +void outSparseTensorI16(void *tensor, void *dest, bool sort) { + return outSparseTensor(tensor, dest, sort); +} +void outSparseTensorI8(void *tensor, void *dest, bool sort) { + return outSparseTensor(tensor, dest, sort); +} //===----------------------------------------------------------------------===// // @@ -1134,27 +1199,16 @@ void delSparseTensor(void *tensor) { /// values = [1.0, 5.0, 3.0] /// indices = [ 0, 0, 1, 1, 1, 2] // -// TODO: for now f64 tensors only, no dim ordering, all dimensions compressed +// TODO: generalize beyond 64-bit indices, no dim ordering, all dimensions +// compressed // -void *convertToMLIRSparseTensor(uint64_t rank, uint64_t nse, uint64_t *shape, - double *values, uint64_t *indices) { - // Setup all-dims compressed and default ordering. - std::vector sparse(rank, DimLevelType::kCompressed); - std::vector perm(rank); - std::iota(perm.begin(), perm.end(), 0); - // Convert external format to internal COO. - SparseTensorCOO *tensor = SparseTensorCOO::newSparseTensorCOO( - rank, shape, perm.data(), nse); - std::vector idx(rank); - for (uint64_t i = 0, base = 0; i < nse; i++) { - for (uint64_t r = 0; r < rank; r++) - idx[r] = indices[base + r]; - tensor->add(idx, values[i]); - base += rank; - } - // Return sparse tensor storage format as opaque pointer. 
- return SparseTensorStorage::newSparseTensor( - rank, shape, perm.data(), sparse.data(), tensor); +void *convertToMLIRSparseTensorF64(uint64_t rank, uint64_t nse, uint64_t *shape, + double *values, uint64_t *indices) { + return toMLIRSparseTensor(rank, nse, shape, values, indices); +} +void *convertToMLIRSparseTensorF32(uint64_t rank, uint64_t nse, uint64_t *shape, + float *values, uint64_t *indices) { + return toMLIRSparseTensor(rank, nse, shape, values, indices); } /// Converts a sparse tensor to COO-flavored format expressed using C-style @@ -1174,41 +1228,18 @@ void *convertToMLIRSparseTensor(uint64_t rank, uint64_t nse, uint64_t *shape, // SparseTensorCOO, then to the output. We may want to reduce the number of // copies. // -// TODO: for now f64 tensors only, no dim ordering, all dimensions compressed +// TODO: generalize beyond 64-bit indices, no dim ordering, all dimensions +// compressed // -void convertFromMLIRSparseTensor(void *tensor, uint64_t *pRank, uint64_t *pNse, - uint64_t **pShape, double **pValues, - uint64_t **pIndices) { - SparseTensorStorage *sparseTensor = - static_cast *>(tensor); - uint64_t rank = sparseTensor->getRank(); - std::vector perm(rank); - std::iota(perm.begin(), perm.end(), 0); - SparseTensorCOO *coo = sparseTensor->toCOO(perm.data()); - - const std::vector> &elements = coo->getElements(); - uint64_t nse = elements.size(); - - uint64_t *shape = new uint64_t[rank]; - for (uint64_t i = 0; i < rank; i++) - shape[i] = coo->getSizes()[i]; - - double *values = new double[nse]; - uint64_t *indices = new uint64_t[rank * nse]; - - for (uint64_t i = 0, base = 0; i < nse; i++) { - values[i] = elements[i].value; - for (uint64_t j = 0; j < rank; j++) - indices[base + j] = elements[i].indices[j]; - base += rank; - } - - delete coo; - *pRank = rank; - *pNse = nse; - *pShape = shape; - *pValues = values; - *pIndices = indices; +void convertFromMLIRSparseTensorF64(void *tensor, uint64_t *pRank, + uint64_t *pNse, uint64_t **pShape, + double 
**pValues, uint64_t **pIndices) { + fromMLIRSparseTensor(tensor, pRank, pNse, pShape, pValues, pIndices); +} +void convertFromMLIRSparseTensorF32(void *tensor, uint64_t *pRank, + uint64_t *pNse, uint64_t **pShape, + float **pValues, uint64_t **pIndices) { + fromMLIRSparseTensor(tensor, pRank, pNse, pShape, pValues, pIndices); } } // extern "C" diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py b/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py index d238e6fdb79b4..f5b0ab60e85e9 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py @@ -28,9 +28,9 @@ def _get_c_shared_lib(lib_name: str): c_lib = ctypes.CDLL(lib_name) try: - c_lib.convertFromMLIRSparseTensor.restype = ctypes.c_void_p + c_lib.convertFromMLIRSparseTensorF64.restype = ctypes.c_void_p except Exception as e: - raise ValueError('Missing function convertFromMLIRSparseTensor from ' + raise ValueError('Missing function convertFromMLIRSparseTensorF64 from ' f'the C shared library: {e} ') from e return c_lib @@ -64,9 +64,10 @@ def sparse_tensor_to_coo_tensor(support_lib, sparse, dtype): shape = ctypes.POINTER(ctypes.c_ulonglong)() values = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))() indices = ctypes.POINTER(ctypes.c_ulonglong)() - c_lib.convertFromMLIRSparseTensor(sparse, ctypes.byref(rank), - ctypes.byref(nse), ctypes.byref(shape), - ctypes.byref(values), ctypes.byref(indices)) + c_lib.convertFromMLIRSparseTensorF64(sparse, ctypes.byref(rank), + ctypes.byref(nse), ctypes.byref(shape), + ctypes.byref(values), + ctypes.byref(indices)) # Convert the returned values to the corresponding numpy types. 
shape = np.ctypeslib.as_array(shape, shape=[rank.value]) values = np.ctypeslib.as_array(values, shape=[nse.value]) diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py index 62cd6baff6388..62aa98ee8aaf8 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py +++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py @@ -55,15 +55,15 @@ def _get_c_shared_lib() -> ctypes.CDLL: c_lib = ctypes.CDLL(_get_support_lib_name()) try: - c_lib.convertToMLIRSparseTensor.restype = ctypes.c_void_p + c_lib.convertToMLIRSparseTensorF64.restype = ctypes.c_void_p except Exception as e: - raise ValueError("Missing function convertToMLIRSparseTensor from " + raise ValueError("Missing function convertToMLIRSparseTensorF64 from " f"the supporting C shared library: {e} ") from e try: - c_lib.convertFromMLIRSparseTensor.restype = ctypes.c_void_p + c_lib.convertFromMLIRSparseTensorF64.restype = ctypes.c_void_p except Exception as e: - raise ValueError("Missing function convertFromMLIRSparseTensor from " + raise ValueError("Missing function convertFromMLIRSparseTensorF64 from " f"the C shared library: {e} ") from e return c_lib @@ -100,9 +100,10 @@ def sparse_tensor_to_coo_tensor( shape = ctypes.POINTER(ctypes.c_ulonglong)() values = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))() indices = ctypes.POINTER(ctypes.c_ulonglong)() - c_lib.convertFromMLIRSparseTensor(sparse_tensor, ctypes.byref(rank), - ctypes.byref(nse), ctypes.byref(shape), - ctypes.byref(values), ctypes.byref(indices)) + c_lib.convertFromMLIRSparseTensorF64(sparse_tensor, ctypes.byref(rank), + ctypes.byref(nse), ctypes.byref(shape), + ctypes.byref(values), + ctypes.byref(indices)) # Convert the returned values to the corresponding numpy types. 
shape = np.ctypeslib.as_array(shape, shape=[rank.value]) @@ -138,8 +139,8 @@ def coo_tensor_to_sparse_tensor(np_shape: np.ndarray, np_values: np.ndarray, indices = np_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_ulonglong)) c_lib = _get_c_shared_lib() - ptr = c_lib.convertToMLIRSparseTensor(rank, nse, shape, values, indices) - assert ptr is not None, "Problem with calling convertToMLIRSparseTensor" + ptr = c_lib.convertToMLIRSparseTensorF64(rank, nse, shape, values, indices) + assert ptr is not None, "Problem with calling convertToMLIRSparseTensorF64" return ptr From deb73a285b92ece59c93c2c3b4b398bdd540513c Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Fri, 18 Feb 2022 13:10:42 -0800 Subject: [PATCH 310/748] [AArch64][GlobalISel] Constrain the right MOs when lowering calls. This was constraining the stale Info.Callee MO instead of the one we copied into the MI. In addition, with c8b8c8e989e, when there's an attachedcall, the Callee is at position 1 rather than 0. Differential Revision: https://reviews.llvm.org/D120161 --- .../lib/Target/AArch64/GISel/AArch64CallLowering.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 3027b9a36a5c3..3b4eaad4133ad 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -1059,10 +1059,10 @@ bool AArch64CallLowering::lowerTailCall( // If Callee is a reg, since it is used by a target specific instruction, // it must have a register class matching the constraint of that instruction. 
- if (Info.Callee.isReg()) + if (MIB->getOperand(0).isReg()) constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), *MF.getSubtarget().getRegBankInfo(), *MIB, - MIB->getDesc(), Info.Callee, 0); + MIB->getDesc(), MIB->getOperand(0), 0); MF.getFrameInfo().setHasTailCall(); Info.LoweredTailCall = true; @@ -1139,12 +1139,16 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, Opc = getCallOpcode(MF, Info.Callee.isReg(), false); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); + unsigned CalleeOpNo = 0; + if (Opc == AArch64::BLR_RVMARKER) { // Add a target global address for the retainRV/claimRV runtime function // just before the call target. Function *ARCFn = *objcarc::getAttachedARCFunction(Info.CB); MIB.addGlobalAddress(ARCFn); + ++CalleeOpNo; } + MIB.add(Info.Callee); // Tell the call which registers are clobbered. @@ -1175,10 +1179,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. - if (Info.Callee.isReg()) + if (MIB->getOperand(CalleeOpNo).isReg()) constrainOperandRegClass(MF, *TRI, MRI, *Subtarget.getInstrInfo(), *Subtarget.getRegBankInfo(), *MIB, MIB->getDesc(), - Info.Callee, 0); + MIB->getOperand(CalleeOpNo), CalleeOpNo); // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an From 3c8fc215cc28533c52212037dcb7b8028b0aeae5 Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Fri, 18 Feb 2022 10:35:40 -0800 Subject: [PATCH 311/748] [memprof] Remove packed qualifier for MemprofRecord::Frame. Now that we use dedicated serialize and deserialize methods in order to ensure consistency across big and small endian systems. The packed qualifier on the Frame struct can be removed. 
Reviewed By: davidxl, tejohnson Differential Revision: https://reviews.llvm.org/D120147 --- llvm/include/llvm/ProfileData/MemProf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 784927e4805d7..a18033f93633f 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -138,7 +138,7 @@ struct MemProfRecord { // Describes a call frame for a dynamic allocation context. The contents of // the frame are populated by symbolizing the stack depot call frame from the // compiler runtime. - PACKED(struct Frame { + struct Frame { // A uuid (uint64_t) identifying the function. It is obtained by // llvm::md5(FunctionName) which returns the lower 64 bits. GlobalValue::GUID Function; @@ -194,7 +194,7 @@ struct MemProfRecord { return sizeof(Frame::Function) + sizeof(Frame::LineOffset) + sizeof(Frame::Column) + sizeof(Frame::IsInlineFrame); } - }); + }; // The dynamic calling context for the allocation. std::vector CallStack; @@ -208,7 +208,8 @@ struct MemProfRecord { size_t serializedSize() const { return sizeof(uint64_t) + // The number of frames to serialize. - sizeof(Frame) * CallStack.size() + // The contents of the frames. + Frame::serializedSize() * + CallStack.size() + // The contents of the frames. PortableMemInfoBlock::serializedSize(); // The size of the payload. } From e3b9bb5a1847483338190c82ee63a962e82696fd Mon Sep 17 00:00:00 2001 From: Med Ismail Bennani Date: Fri, 18 Feb 2022 14:11:29 -0800 Subject: [PATCH 312/748] [lldb/bindings] Expose the progress reporting machinery to the SWIG interface This patch defines the SBDebugger::eBroadcastBitProgress enum in the SWIG interface and exposes the SBDebugger::{GetProgressFromEvent,GetBroadcaster} methods as well. This allows to exercise the API from the script interpreter using python. 
Differential Revision: https://reviews.llvm.org/D120100 Signed-off-by: Med Ismail Bennani --- lldb/bindings/interface/SBDebugger.i | 16 ++++++ .../progress_reporting/Makefile | 3 + .../TestProgressReporting.py | 57 +++++++++++++++++++ .../functionalities/progress_reporting/main.c | 11 ++++ 4 files changed, 87 insertions(+) create mode 100644 lldb/test/API/functionalities/progress_reporting/Makefile create mode 100644 lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py create mode 100644 lldb/test/API/functionalities/progress_reporting/main.c diff --git a/lldb/bindings/interface/SBDebugger.i b/lldb/bindings/interface/SBDebugger.i index f21e60d628738..3790857b8ab61 100644 --- a/lldb/bindings/interface/SBDebugger.i +++ b/lldb/bindings/interface/SBDebugger.i @@ -117,6 +117,22 @@ or the equivalent arguments for :py:class:`SBTarget.AttachToProcessWithID` .") S class SBDebugger { public: + enum + { + eBroadcastBitProgress = (1 << 0) + }; + + + %apply uint64_t& INOUT { uint64_t& progress_id }; + %apply uint64_t& INOUT { uint64_t& completed }; + %apply uint64_t& INOUT { uint64_t& total }; + %apply bool& INOUT { bool& is_debugger_specific }; + static const char *GetProgressFromEvent(const lldb::SBEvent &event, + uint64_t &progress_id, + uint64_t &completed, uint64_t &total, + bool &is_debugger_specific); + + SBBroadcaster GetBroadcaster(); static void Initialize(); diff --git a/lldb/test/API/functionalities/progress_reporting/Makefile b/lldb/test/API/functionalities/progress_reporting/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/functionalities/progress_reporting/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py new file mode 100644 index 0000000000000..b9d9953539c11 --- /dev/null +++ 
b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py @@ -0,0 +1,57 @@ +""" +Test that we are able to broadcast and receive progress events from lldb +""" +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +import lldbsuite.test.lldbutil as lldbutil +import threading + +class TestProgressReporting(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + eBroadcastBitStopProgressThread = (1 << 0) + + def setUp(self): + TestBase.setUp(self) + self.progress_events = [] + + def fetch_events(self, test_broadcaster): + listener = lldb.SBListener("lldb.progress.listener") + listener.StartListeningForEvents(test_broadcaster, + self.eBroadcastBitStopProgressThread) + + progress_broadcaster = self.dbg.GetBroadcaster() + progress_broadcaster.AddListener(listener, lldb.SBDebugger.eBroadcastBitProgress) + + event = lldb.SBEvent() + + done = False + while not done: + if listener.WaitForEvent(1, event): + event_mask = event.GetType(); + if event.BroadcasterMatchesRef(test_broadcaster): + if event_mask & self.eBroadcastBitStopProgressThread: + done = True; + elif event.BroadcasterMatchesRef(progress_broadcaster): + message = lldb.SBDebugger().GetProgressFromEvent(event, 0, 0, 0, False); + if message: + self.progress_events.append((message, event)) + + @skipUnlessDarwin + def test_dwarf_symbol_loading_progress_report(self): + """Test that we are able to fetch dwarf symbol loading progress events""" + self.build() + + test_broadcaster = lldb.SBBroadcaster('lldb.broadcaster.test') + listener_thread = threading.Thread(target=self.fetch_events, + args=[test_broadcaster]) + listener_thread.start() + + lldbutil.run_to_source_breakpoint(self, 'break here', lldb.SBFileSpec('main.c')) + + test_broadcaster.BroadcastEventByType(self.eBroadcastBitStopProgressThread) + listener_thread.join() + + self.assertTrue(len(self.progress_events) > 0) diff --git a/lldb/test/API/functionalities/progress_reporting/main.c 
b/lldb/test/API/functionalities/progress_reporting/main.c new file mode 100644 index 0000000000000..3ebd6282aa958 --- /dev/null +++ b/lldb/test/API/functionalities/progress_reporting/main.c @@ -0,0 +1,11 @@ +int bar(int b) { return b * b; } + +int foo(int f) { + int b = bar(f); // break here + return b; +} + +int main() { + int f = foo(42); + return f; +} From c12d49c4e286fa108d4d69f1c6d2b8d691993ffd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 18 Feb 2022 14:54:10 -0800 Subject: [PATCH 313/748] [ELF] Remove .strtab deduplication D118577: the 0.1~1.1% .strtab size reduction does not justify the 3~6% link time increase. Just remove it even for -O2. release/14.x has D118577 and the release note mentioned that this may be removed. Fix https://github.com/ClangBuiltLinux/linux/issues/1578 caused by D118577 (empty string not in stringMap). --- lld/ELF/SyntheticSections.cpp | 5 ++--- lld/test/ELF/strtab-dedup.s | 33 -------------------------------- lld/test/ELF/strtab-nodedup.s | 36 +++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 36 deletions(-) delete mode 100644 lld/test/ELF/strtab-dedup.s create mode 100644 lld/test/ELF/strtab-nodedup.s diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 0cc2cfb62b2ca..37b6877d699d1 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1230,6 +1230,7 @@ StringTableSection::StringTableSection(StringRef name, bool dynamic) dynamic(dynamic) { // ELF string tables start with a NUL byte. strings.push_back(""); + stringMap.try_emplace(CachedHashStringRef(""), 0); size = 1; } @@ -2156,9 +2157,7 @@ void SymbolTableBaseSection::sortSymTabSymbols() { void SymbolTableBaseSection::addSymbol(Symbol *b) { // Adding a local symbol to a .dynsym is a bug. 
assert(this->type != SHT_DYNSYM || !b->isLocal()); - - bool hashIt = b->isLocal() && config->optimize >= 2; - symbols.push_back({b, strTabSec.addString(b->getName(), hashIt)}); + symbols.push_back({b, strTabSec.addString(b->getName(), false)}); } size_t SymbolTableBaseSection::getSymbolIndex(Symbol *sym) { diff --git a/lld/test/ELF/strtab-dedup.s b/lld/test/ELF/strtab-dedup.s deleted file mode 100644 index e7c36a4e2489b..0000000000000 --- a/lld/test/ELF/strtab-dedup.s +++ /dev/null @@ -1,33 +0,0 @@ -# REQUIRES: x86 -# RUN: split-file %s %t -# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/a.s -o %t/a.o -# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/b.s -o %t/b.o - -## By default local symbol names are not deduplicated. -# RUN: ld.lld %t/a.o %t/b.o -o %t/a -# RUN: llvm-readelf -p .strtab %t/a | FileCheck %s --check-prefix=NODEDUP - -# NODEDUP: [ 1] local -# NODEDUP-NEXT: [ 7] local -# NODEDUP-NEXT: [ d] foo -# NODEDUP-EMPTY: - -## -O2 deduplicates local symbol names. -# RUN: ld.lld -O2 %t/a.o %t/b.o -o %t/a -# RUN: llvm-readelf -p .strtab %t/a | FileCheck %s --check-prefix=DEDUP - -# DEDUP: [ 1] local -# DEDUP-NEXT: [ 7] foo -# DEDUP-EMPTY: - -#--- a.s -.global foo -foo: -local: - ret - -#--- b.s -.weak foo -foo: -local: - ret diff --git a/lld/test/ELF/strtab-nodedup.s b/lld/test/ELF/strtab-nodedup.s new file mode 100644 index 0000000000000..b20e738de0679 --- /dev/null +++ b/lld/test/ELF/strtab-nodedup.s @@ -0,0 +1,36 @@ +# REQUIRES: x86 +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/a.s -o %t/a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/b.s -o %t/b.o + +## Non-empty local symbol names are not deduplicated. This helps parallel +## .symtab write. We used to perform deduplication at -O2. 
+# RUN: ld.lld %t/a.o %t/b.o -o %t/a +# RUN: llvm-readelf -p .strtab %t/a | FileCheck %s --check-prefix=NODEDUP +# RUN: ld.lld -r -O2 %t/a.o %t/b.o -o %t/a.ro +# RUN: llvm-readelf -p .strtab %t/a.ro | FileCheck %s --check-prefix=NODEDUP + +# NODEDUP: [ 1] local +# NODEDUP-NEXT: [ 7] local +# NODEDUP-NEXT: [ d] foo +# NODEDUP-EMPTY: + +# RUN: llvm-readelf -s %t/a.ro | FileCheck %s --check-prefix=SYMTAB + +# SYMTAB: 0: {{0+}} 0 NOTYPE LOCAL DEFAULT UND +# SYMTAB-NEXT: NOTYPE LOCAL DEFAULT [[#]] local +# SYMTAB-NEXT: SECTION LOCAL DEFAULT [[#]] .text +# SYMTAB-NEXT: NOTYPE LOCAL DEFAULT [[#]] local +# SYMTAB-NEXT: NOTYPE GLOBAL DEFAULT [[#]] foo + +#--- a.s +.global foo +foo: +local: + ret + +#--- b.s +.weak foo +foo: +local: + ret From 9b9a084af06accf851cc7c718bde5765170d7c2d Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Fri, 18 Feb 2022 13:43:56 -0800 Subject: [PATCH 314/748] [mlir][sparse][pytaco] test with 3-dim tensor and scalar Reviewed By: bixia Differential Revision: https://reviews.llvm.org/D120163 --- .../Dialect/SparseTensor/taco/test_Tensor.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 mlir/test/Integration/Dialect/SparseTensor/taco/test_Tensor.py diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/test_Tensor.py b/mlir/test/Integration/Dialect/SparseTensor/taco/test_Tensor.py new file mode 100644 index 0000000000000..08710cd429c89 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/taco/test_Tensor.py @@ -0,0 +1,49 @@ +# RUN: SUPPORTLIB=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext %PYTHON %s | FileCheck %s + +import filecmp +import numpy as np +import os +import sys +import tempfile + +_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_PATH) + +from tools import mlir_pytaco_api as pt +from tools import testing_utils as utils + +i, j, k = pt.get_index_vars(3) + +# Set up scalar and sparse tensors. 
+alpha = pt.tensor(42.0) +S = pt.tensor([8, 8, 8], + pt.format([pt.compressed, pt.compressed, pt.compressed])) +X = pt.tensor([8, 8, 8], + pt.format([pt.compressed, pt.compressed, pt.compressed])) +S.insert([0, 0, 0], 2.0) +S.insert([1, 1, 1], 3.0) +S.insert([4, 4, 4], 4.0) +S.insert([7, 7, 7], 5.0) + +# TODO: make this work: +# X[i, j, k] = alpha[0] * S[i, j, k] +X[i, j, k] = S[i, j, k] + +expected = """; extended FROSTT format +3 4 +8 8 8 +1 1 1 2 +2 2 2 3 +5 5 5 4 +8 8 8 5 +""" + +# Force evaluation of the kernel by writing out X. +with tempfile.TemporaryDirectory() as test_dir: + x_file = os.path.join(test_dir, 'X.tns') + pt.write(x_file, X) + # + # CHECK: Compare result True + # + x_data = utils.file_as_string(x_file) + print(f'Compare result {x_data == expected}') From 34a9642af8b4d29545d41824412bd3a5abee3f24 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 15:34:56 -0800 Subject: [PATCH 315/748] Revert "[instsimplify] Simplify HaveNonOverlappingStorage per review suggestion on D120133 [NFC]" This reverts commit 3a6be124cc01191ec52192017791bb04a6c7295a. This appears to have caused a stage2 build failure: https://lab.llvm.org/buildbot/#/builders/168/builds/4813 Will investigate further on Monday and recommit. --- llvm/lib/Analysis/InstructionSimplify.cpp | 26 ++++++++++------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 7475b995cbd86..b3459b5ffb013 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2555,24 +2555,20 @@ static bool HaveNonOverlappingStorage(const Value *V1, const Value *V2) { // // So, we'll assume that two non-empty allocas have different addresses // for now. 
- - auto isByValArgOrGlobalVarOrAlloca = [](const Value *V) { - if (const Argument *A = dyn_cast(V)) - return A->hasByValAttr(); - return isa(V) || isa(V); + auto isByValArg = [](const Value *V) { + const Argument *A = dyn_cast(V); + return A && A->hasByValAttr(); }; - if (!isByValArgOrGlobalVarOrAlloca(V1) || - !isByValArgOrGlobalVarOrAlloca(V2)) - return false; + // Byval args are backed by store which does not overlap with each other, + // allocas, or globals. + if (isByValArg(V1)) + return isa(V2) || isa(V2) || isByValArg(V2); + if (isByValArg(V2)) + return isa(V1) || isa(V1) || isByValArg(V1); - // Both sides being globals shouldn't reach here - as the resulting compare - // is a constantexpr - but we want to guard against it to be safe. The - // semantics of globals are complicated by e.g. unnamed_addr. The assumption - // in this code is that while two globals could end up overlapping, they'll - // never overlap with any alloca or byval, and thus we can still reason about - // *one* global and one *non* global as disjoint storage. - return !isa(V1) || !isa(V2); + return isa(V1) && + (isa(V2) || isa(V2)); } // A significant optimization not implemented here is assuming that alloca From 86b5e256628ae49193ad9962626a73bafeda2883 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 18 Feb 2022 13:06:54 -0800 Subject: [PATCH 316/748] [SelectionDAG][X86] Support f16 in getReciprocalOpName. If the "reciprocal-estimates" attribute is present and it doesn't contain "all", "none", or "default", we previously crashed on f16 operations. This patch addes an 'h' suffix' to prevent the crash. I've added simple tests that just enable the estimate for all vec-sqrt and one test case that explicitly tests the new 'h' suffix to override the default steps. There may be some frontend change needed to, but I haven't checked that yet. 
Differential Revision: https://reviews.llvm.org/D120158 --- llvm/lib/CodeGen/TargetLoweringBase.cpp | 4 ++- .../test/CodeGen/X86/avx512fp16-intrinsics.ll | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 3a7e82c9038c1..700c11a66904f 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2072,9 +2072,11 @@ static std::string getReciprocalOpName(bool IsSqrt, EVT VT) { Name += IsSqrt ? "sqrt" : "div"; - // TODO: Handle "half" or other float types? + // TODO: Handle other float types? if (VT.getScalarType() == MVT::f64) { Name += "d"; + } else if (VT.getScalarType() == MVT::f16) { + Name += "h"; } else { assert(VT.getScalarType() == MVT::f32 && "Unexpected FP type for reciprocal estimate"); diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll index 6a5c3e243209a..61b483329a4d9 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -35,6 +35,32 @@ define <32 x half> @test_sqrt_ph_512_fast(<32 x half> %a0, <32 x half> %a1) { ret <32 x half> %2 } +define <32 x half> @test_sqrt_ph_512_fast_estimate_attribute(<32 x half> %a0, <32 x half> %a1) "reciprocal-estimates"="vec-sqrt" { +; CHECK-LABEL: test_sqrt_ph_512_fast_estimate_attribute: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtph %zmm0, %zmm0 +; CHECK-NEXT: vmulph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %2 = fdiv fast <32 x half> %a1, %1 + ret <32 x half> %2 +} + +define <32 x half> @test_sqrt_ph_512_fast_estimate_attribute_2(<32 x half> %a0, <32 x half> %a1) "reciprocal-estimates"="vec-sqrth:1" { +; CHECK-LABEL: test_sqrt_ph_512_fast_estimate_attribute_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtph %zmm0, %zmm2 +; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: 
vfmadd213ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm0 +; CHECK-NEXT: vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm2 +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmulph %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %2 = fdiv fast <32 x half> %a1, %1 + ret <32 x half> %2 +} + define <32 x half> @test_mask_sqrt_ph_512(<32 x half> %a0, <32 x half> %passthru, i32 %mask) { ; CHECK-LABEL: test_mask_sqrt_ph_512: ; CHECK: # %bb.0: From 04f815c26f7c7b7932c0f80fda8fcb5fa5814bca Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 18 Feb 2022 15:27:38 -0800 Subject: [PATCH 317/748] [SelectionDAGBuilder] Remove LegalTypes=false from a call to getShiftAmountConstant. getShiftAmountTy will return MVT::i32 if the shift amount coming from the target's getScalarShiftAmountTy can't reprsent all possible values. That should eliminate the need to use the pointer type which is what we do when LegalTypes is false. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D120165 --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 78da827c96f74..3e2dd9ec74e09 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -566,7 +566,7 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, unsigned RoundBits = RoundParts * PartBits; unsigned OddParts = NumParts - RoundParts; SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val, - DAG.getShiftAmountConstant(RoundBits, ValueVT, DL, /*LegalTypes*/false)); + DAG.getShiftAmountConstant(RoundBits, ValueVT, DL)); getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V, CallConv); From 0d59a54cea2875b38c420947897457506a150960 Mon Sep 17 00:00:00 2001 
From: Craig Topper Date: Fri, 18 Feb 2022 15:39:50 -0800 Subject: [PATCH 318/748] Revert "[SelectionDAG][X86] Support f16 in getReciprocalOpName." This reverts commit 86b5e256628ae49193ad9962626a73bafeda2883. This wasn't supposed to be commited yet --- llvm/lib/CodeGen/TargetLoweringBase.cpp | 4 +-- .../test/CodeGen/X86/avx512fp16-intrinsics.ll | 26 ------------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 700c11a66904f..3a7e82c9038c1 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2072,11 +2072,9 @@ static std::string getReciprocalOpName(bool IsSqrt, EVT VT) { Name += IsSqrt ? "sqrt" : "div"; - // TODO: Handle other float types? + // TODO: Handle "half" or other float types? if (VT.getScalarType() == MVT::f64) { Name += "d"; - } else if (VT.getScalarType() == MVT::f16) { - Name += "h"; } else { assert(VT.getScalarType() == MVT::f32 && "Unexpected FP type for reciprocal estimate"); diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll index 61b483329a4d9..6a5c3e243209a 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -35,32 +35,6 @@ define <32 x half> @test_sqrt_ph_512_fast(<32 x half> %a0, <32 x half> %a1) { ret <32 x half> %2 } -define <32 x half> @test_sqrt_ph_512_fast_estimate_attribute(<32 x half> %a0, <32 x half> %a1) "reciprocal-estimates"="vec-sqrt" { -; CHECK-LABEL: test_sqrt_ph_512_fast_estimate_attribute: -; CHECK: # %bb.0: -; CHECK-NEXT: vrsqrtph %zmm0, %zmm0 -; CHECK-NEXT: vmulph %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) - %2 = fdiv fast <32 x half> %a1, %1 - ret <32 x half> %2 -} - -define <32 x half> @test_sqrt_ph_512_fast_estimate_attribute_2(<32 x half> %a0, <32 x half> %a1) 
"reciprocal-estimates"="vec-sqrth:1" { -; CHECK-LABEL: test_sqrt_ph_512_fast_estimate_attribute_2: -; CHECK: # %bb.0: -; CHECK-NEXT: vrsqrtph %zmm0, %zmm2 -; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0 -; CHECK-NEXT: vfmadd213ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm0 -; CHECK-NEXT: vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm2 -; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vmulph %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) - %2 = fdiv fast <32 x half> %a1, %1 - ret <32 x half> %2 -} - define <32 x half> @test_mask_sqrt_ph_512(<32 x half> %a0, <32 x half> %passthru, i32 %mask) { ; CHECK-LABEL: test_mask_sqrt_ph_512: ; CHECK: # %bb.0: From 70aa11187e5c7ecca327356569dbb2e56f06cbe0 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Thu, 17 Feb 2022 07:06:50 -0800 Subject: [PATCH 319/748] [lld][WebAssembly] Convert a bunch more tests to asm. NFC Differential Revision: https://reviews.llvm.org/D120060 --- ...{archive-export.ll => archive-export.test} | 0 lld/test/wasm/call-indirect.ll | 162 ----------------- lld/test/wasm/call-indirect.s | 165 ++++++++++++++++++ lld/test/wasm/driver.ll | 41 ----- lld/test/wasm/driver.s | 39 +++++ lld/test/wasm/entry.ll | 38 ---- lld/test/wasm/entry.s | 36 ++++ lld/test/wasm/fatal-warnings.ll | 17 -- lld/test/wasm/fatal-warnings.s | 19 ++ lld/test/wasm/function-imports.ll | 43 ----- lld/test/wasm/function-imports.s | 42 +++++ lld/test/wasm/stack-pointer.ll | 67 ------- lld/test/wasm/stack-pointer.s | 70 ++++++++ lld/test/wasm/trace-symbol.ll | 28 --- lld/test/wasm/trace-symbol.s | 28 +++ 15 files changed, 399 insertions(+), 396 deletions(-) rename lld/test/wasm/{archive-export.ll => archive-export.test} (100%) delete mode 100644 lld/test/wasm/call-indirect.ll create mode 100644 lld/test/wasm/call-indirect.s delete mode 100644 lld/test/wasm/driver.ll create mode 100644 lld/test/wasm/driver.s delete mode 100644 lld/test/wasm/entry.ll create mode 100644 
lld/test/wasm/entry.s delete mode 100644 lld/test/wasm/fatal-warnings.ll create mode 100644 lld/test/wasm/fatal-warnings.s delete mode 100644 lld/test/wasm/function-imports.ll create mode 100644 lld/test/wasm/function-imports.s delete mode 100644 lld/test/wasm/stack-pointer.ll create mode 100644 lld/test/wasm/stack-pointer.s delete mode 100644 lld/test/wasm/trace-symbol.ll create mode 100644 lld/test/wasm/trace-symbol.s diff --git a/lld/test/wasm/archive-export.ll b/lld/test/wasm/archive-export.test similarity index 100% rename from lld/test/wasm/archive-export.ll rename to lld/test/wasm/archive-export.test diff --git a/lld/test/wasm/call-indirect.ll b/lld/test/wasm/call-indirect.ll deleted file mode 100644 index 31e2c75864554..0000000000000 --- a/lld/test/wasm/call-indirect.ll +++ /dev/null @@ -1,162 +0,0 @@ -; RUN: llc -filetype=obj %s -o %t.o -; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-indirect.s -o %t2.o -; RUN: wasm-ld --export-dynamic -o %t.wasm %t2.o %t.o -; RUN: obj2yaml %t.wasm | FileCheck %s - -; bitcode generated from the following C code: -; int foo(void) { return 1; } -; int (*indirect_func)(void) = &foo; -; void _start(void) { indirect_func(); } - -target triple = "wasm32-unknown-unknown" - -@indirect_func = local_unnamed_addr global i32 ()* @foo, align 4 - -; Function Attrs: norecurse nounwind readnone -define i32 @foo() #0 { -entry: - ret i32 2 -} - -; Function Attrs: nounwind -define void @_start() local_unnamed_addr #1 { -entry: - %0 = load i32 ()*, i32 ()** @indirect_func, align 4 - %call = call i32 %0() #2 - ret void -} - -; Indirect function call where no function actually has this type. -; Ensures that the type entry is still created in this case. 
-define void @call_ptr(i64 (i64)* %arg) { - %1 = call i64 %arg(i64 1) - ret void -} - -; CHECK: !WASM -; CHECK-NEXT: FileHeader: -; CHECK-NEXT: Version: 0x1 -; CHECK-NEXT: Sections: -; CHECK-NEXT: - Type: TYPE -; CHECK-NEXT: Signatures: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: ReturnTypes: -; CHECK-NEXT: - I64 -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: ReturnTypes: -; CHECK-NEXT: - I32 -; CHECK-NEXT: - Index: 2 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: - I64 -; CHECK-NEXT: ReturnTypes: -; CHECK-NEXT: - I64 -; CHECK-NEXT: - Index: 3 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: ReturnTypes: [] -; CHECK-NEXT: - Index: 4 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: - I32 -; CHECK-NEXT: ReturnTypes: [] -; CHECK-NEXT: - Type: FUNCTION -; CHECK-NEXT: FunctionTypes: [ 0, 3, 1, 3, 4 ] -; CHECK-NEXT: - Type: TABLE -; CHECK-NEXT: Tables: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: ElemType: FUNCREF -; CHECK-NEXT: Limits: -; CHECK-NEXT: Flags: [ HAS_MAX ] -; CHECK-NEXT: Minimum: 0x3 -; CHECK-NEXT: Maximum: 0x3 -; CHECK-NEXT: - Type: MEMORY -; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x2 -; CHECK-NEXT: - Type: GLOBAL -; CHECK-NEXT: Globals: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Type: I32 -; CHECK-NEXT: Mutable: true -; CHECK-NEXT: InitExpr: -; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 66576 -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: Type: I32 -; CHECK-NEXT: Mutable: false -; CHECK-NEXT: InitExpr: -; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1032 -; CHECK-NEXT: - Type: EXPORT -; CHECK-NEXT: Exports: -; CHECK-NEXT: - Name: memory -; CHECK-NEXT: Kind: MEMORY -; CHECK-NEXT: Index: 0 -; CHECK-NEXT: - Name: bar -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: Index: 0 -; CHECK-NEXT: - Name: call_bar_indirect -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: Index: 1 -; CHECK-NEXT: - Name: foo -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: Index: 2 -; CHECK-NEXT: - Name: _start -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: 
Index: 3 -; CHECK-NEXT: - Name: indirect_func -; CHECK-NEXT: Kind: GLOBAL -; CHECK-NEXT: Index: 1 -; CHECK-NEXT: - Name: call_ptr -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: Index: 4 -; CHECK-NEXT: - Type: ELEM -; CHECK-NEXT: Segments: -; CHECK-NEXT: - Offset: -; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1 -; CHECK-NEXT: Functions: [ 0, 2 ] -; CHECK-NEXT: - Type: CODE -; CHECK-NEXT: Functions: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 42010B -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 410028028088808000118080808000001A410028028488808000118180808000001A0B -; CHECK-NEXT: - Index: 2 -; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 41020B -; CHECK-NEXT: - Index: 3 -; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 410028028888808000118180808000001A0B -; CHECK-NEXT: - Index: 4 -; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 42012000118280808000001A0B -; CHECK-NEXT: - Type: DATA -; CHECK-NEXT: Segments: -; CHECK-NEXT: - SectionOffset: 7 -; CHECK-NEXT: InitFlags: 0 -; CHECK-NEXT: Offset: -; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 1024 -; CHECK-NEXT: Content: '010000000200000002000000' -; CHECK-NEXT: - Type: CUSTOM -; CHECK-NEXT: Name: name -; CHECK-NEXT: FunctionNames: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: bar -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: Name: call_bar_indirect -; CHECK-NEXT: - Index: 2 -; CHECK-NEXT: Name: foo -; CHECK-NEXT: - Index: 3 -; CHECK-NEXT: Name: _start -; CHECK-NEXT: - Index: 4 -; CHECK-NEXT: Name: call_ptr -; CHECK-NEXT: GlobalNames: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: __stack_pointer -; CHECK-NEXT: DataSegmentNames: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: .data -; CHECK-NEXT: ... 
diff --git a/lld/test/wasm/call-indirect.s b/lld/test/wasm/call-indirect.s new file mode 100644 index 0000000000000..7bf39a9f5aec9 --- /dev/null +++ b/lld/test/wasm/call-indirect.s @@ -0,0 +1,165 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/call-indirect.s -o %t2.o +# RUN: wasm-ld --export-dynamic -o %t.wasm %t2.o %t.o +# RUN: obj2yaml %t.wasm | FileCheck %s + +.globl foo +foo: + .functype foo () -> (i32) + i32.const 2 + end_function + +.globl _start +_start: + .functype _start () -> () + i32.const 0 + i32.load indirect_func + call_indirect () -> (i32) + drop + end_function + +# Indirect function call where no function actually has this type. +# Ensures that the type entry is still created in this case. +.section .text,"",@ +.globl call_ptr +call_ptr: + .functype call_ptr (i32) -> () + i64.const 1 + local.get 0 + call_indirect (i64) -> (i64) + drop + end_function + +.globl indirect_func +.section .data.indirect_func,"",@ +indirect_func: + .int32 foo + .size indirect_func, 4 + +# CHECK: !WASM +# CHECK-NEXT: FileHeader: +# CHECK-NEXT: Version: 0x1 +# CHECK-NEXT: Sections: +# CHECK-NEXT: - Type: TYPE +# CHECK-NEXT: Signatures: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: ReturnTypes: +# CHECK-NEXT: - I64 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: ReturnTypes: +# CHECK-NEXT: - I32 +# CHECK-NEXT: - Index: 2 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: - I64 +# CHECK-NEXT: ReturnTypes: +# CHECK-NEXT: - I64 +# CHECK-NEXT: - Index: 3 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: ReturnTypes: [] +# CHECK-NEXT: - Index: 4 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: - I32 +# CHECK-NEXT: ReturnTypes: [] +# CHECK-NEXT: - Type: FUNCTION +# CHECK-NEXT: FunctionTypes: [ 0, 3, 1, 3, 4 ] +# CHECK-NEXT: - Type: TABLE +# CHECK-NEXT: Tables: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: ElemType: FUNCREF +# CHECK-NEXT: Limits: +# CHECK-NEXT: Flags: [ 
HAS_MAX ] +# CHECK-NEXT: Minimum: 0x3 +# CHECK-NEXT: Maximum: 0x3 +# CHECK-NEXT: - Type: MEMORY +# CHECK-NEXT: Memories: +# CHECK-NEXT: - Minimum: 0x2 +# CHECK-NEXT: - Type: GLOBAL +# CHECK-NEXT: Globals: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Type: I32 +# CHECK-NEXT: Mutable: true +# CHECK-NEXT: InitExpr: +# CHECK-NEXT: Opcode: I32_CONST +# CHECK-NEXT: Value: 66576 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Type: I32 +# CHECK-NEXT: Mutable: false +# CHECK-NEXT: InitExpr: +# CHECK-NEXT: Opcode: I32_CONST +# CHECK-NEXT: Value: 1032 +# CHECK-NEXT: - Type: EXPORT +# CHECK-NEXT: Exports: +# CHECK-NEXT: - Name: memory +# CHECK-NEXT: Kind: MEMORY +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: bar +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: call_bar_indirect +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: - Name: foo +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 2 +# CHECK-NEXT: - Name: _start +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 3 +# CHECK-NEXT: - Name: indirect_func +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: - Name: call_ptr +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 4 +# CHECK-NEXT: - Type: ELEM +# CHECK-NEXT: Segments: +# CHECK-NEXT: - Offset: +# CHECK-NEXT: Opcode: I32_CONST +# CHECK-NEXT: Value: 1 +# CHECK-NEXT: Functions: [ 0, 2 ] +# CHECK-NEXT: - Type: CODE +# CHECK-NEXT: Functions: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Locals: +# CHECK-NEXT: Body: 42010B +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Locals: +# CHECK-NEXT: Body: 410028028088808000118080808000001A410028028488808000118180808000001A0B +# CHECK-NEXT: - Index: 2 +# CHECK-NEXT: Locals: +# CHECK-NEXT: Body: 41020B +# CHECK-NEXT: - Index: 3 +# CHECK-NEXT: Locals: +# CHECK-NEXT: Body: 410028028888808000118180808000001A0B +# CHECK-NEXT: - Index: 4 +# CHECK-NEXT: Locals: +# CHECK-NEXT: Body: 42012000118280808000001A0B +# CHECK-NEXT: - Type: DATA +# CHECK-NEXT: Segments: +# CHECK-NEXT: - SectionOffset: 7 
+# CHECK-NEXT: InitFlags: 0 +# CHECK-NEXT: Offset: +# CHECK-NEXT: Opcode: I32_CONST +# CHECK-NEXT: Value: 1024 +# CHECK-NEXT: Content: '010000000200000002000000' +# CHECK-NEXT: - Type: CUSTOM +# CHECK-NEXT: Name: name +# CHECK-NEXT: FunctionNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: bar +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Name: call_bar_indirect +# CHECK-NEXT: - Index: 2 +# CHECK-NEXT: Name: foo +# CHECK-NEXT: - Index: 3 +# CHECK-NEXT: Name: _start +# CHECK-NEXT: - Index: 4 +# CHECK-NEXT: Name: call_ptr +# CHECK-NEXT: GlobalNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: __stack_pointer +# CHECK-NEXT: DataSegmentNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: .data +# CHECK-NEXT: ... diff --git a/lld/test/wasm/driver.ll b/lld/test/wasm/driver.ll deleted file mode 100644 index e5aecf8a49740..0000000000000 --- a/lld/test/wasm/driver.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -filetype=obj %s -o %t.o - -target triple = "wasm32-unknown-unknown" - -define hidden void @_start() local_unnamed_addr #0 { -entry: - ret void -} - -; RUN: not wasm-ld -o %t.exe 2>&1 | FileCheck -check-prefix=IN %s -; IN: error: no input files - -; RUN: not wasm-ld %t.o 2>&1 | FileCheck -check-prefix=OUT %s -; OUT: error: no output file specified - -; RUN: not wasm-ld 2>&1 | FileCheck -check-prefix=BOTH %s -; BOTH: error: no input files -; BOTH-NOT: error: no output file specified - -; RUN: not wasm-ld --export-table --import-table %t.o 2>&1 \ -; RUN: | FileCheck -check-prefix=TABLE %s -; TABLE: error: --import-table and --export-table may not be used together - -; RUN: not wasm-ld --relocatable --shared-memory %t.o 2>&1 \ -; RUN: | FileCheck -check-prefix=SHARED-MEM %s -; SHARED-MEM: error: -r and --shared-memory may not be used together - -; RUN: wasm-ld %t.o -z foo -o /dev/null 2>&1 | FileCheck -check-prefix=ERR10 %s -; RUN: wasm-ld %t.o -z foo -o /dev/null --version 2>&1 | FileCheck -check-prefix=ERR10 %s -; ERR10: warning: unknown -z value: foo - -;; Check we 
report "unknown -z value" error even with -v. -; RUN: wasm-ld %t.o -z foo -o /dev/null -v 2>&1 | FileCheck -check-prefix=ERR10 %s - -;; Note: in GNU ld, --fatal-warning still leads to a warning. -; RUN: not wasm-ld %t.o -z foo --fatal-warnings 2>&1 | FileCheck --check-prefix=ERR10-FATAL %s -; ERR10-FATAL: error: unknown -z value: foo - -;; stack-size without an = is also an error -; RUN: not wasm-ld %t.o -z stack-size 2>&1 | FileCheck -check-prefix=ERR11 %s -; ERR11: unknown -z value: stack-size diff --git a/lld/test/wasm/driver.s b/lld/test/wasm/driver.s new file mode 100644 index 0000000000000..46d59e362a6a8 --- /dev/null +++ b/lld/test/wasm/driver.s @@ -0,0 +1,39 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s + +.globl _start +_start: + .functype _start () -> () + end_function + +# RUN: not wasm-ld -o %t.exe 2>&1 | FileCheck -check-prefix=IN %s +# IN: error: no input files + +# RUN: not wasm-ld %t.o 2>&1 | FileCheck -check-prefix=OUT %s +# OUT: error: no output file specified + +# RUN: not wasm-ld 2>&1 | FileCheck -check-prefix=BOTH %s +# BOTH: error: no input files +# BOTH-NOT: error: no output file specified + +# RUN: not wasm-ld --export-table --import-table %t.o 2>&1 \ +# RUN: | FileCheck -check-prefix=TABLE %s +# TABLE: error: --import-table and --export-table may not be used together + +# RUN: not wasm-ld --relocatable --shared-memory %t.o 2>&1 \ +# RUN: | FileCheck -check-prefix=SHARED-MEM %s +# SHARED-MEM: error: -r and --shared-memory may not be used together + +# RUN: wasm-ld %t.o -z foo -o /dev/null 2>&1 | FileCheck -check-prefix=ERR10 %s +# RUN: wasm-ld %t.o -z foo -o /dev/null --version 2>&1 | FileCheck -check-prefix=ERR10 %s +# ERR10: warning: unknown -z value: foo + +## Check we report "unknown -z value" error even with -v. +# RUN: wasm-ld %t.o -z foo -o /dev/null -v 2>&1 | FileCheck -check-prefix=ERR10 %s + +## Note: in GNU ld, --fatal-warning still leads to a warning. 
+# RUN: not wasm-ld %t.o -z foo --fatal-warnings 2>&1 | FileCheck --check-prefix=ERR10-FATAL %s +# ERR10-FATAL: error: unknown -z value: foo + +## stack-size without an = is also an error +# RUN: not wasm-ld %t.o -z stack-size 2>&1 | FileCheck -check-prefix=ERR11 %s +# ERR11: unknown -z value: stack-size diff --git a/lld/test/wasm/entry.ll b/lld/test/wasm/entry.ll deleted file mode 100644 index 21779a01bfec7..0000000000000 --- a/lld/test/wasm/entry.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc -filetype=obj %s -o %t.o - -target triple = "wasm32-unknown-unknown" - -define hidden void @entry() local_unnamed_addr #0 { -entry: - ret void -} - -; RUN: wasm-ld -e entry -o %t1.wasm %t.o -; RUN: obj2yaml %t1.wasm | FileCheck %s -; RUN: wasm-ld --entry=entry -o %t2.wasm %t.o -; RUN: obj2yaml %t2.wasm | FileCheck %s - -; CHECK: - Type: EXPORT -; CHECK-NEXT: Exports: -; CHECK-NEXT: - Name: memory -; CHECK-NEXT: Kind: MEMORY -; CHECK-NEXT: Index: 0 -; CHECK-NEXT: - Name: entry -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: Index: 0 -; CHECK-NEXT: - Type: - -; The __wasm_call_ctors is somewhat special since its created by the linker. 
-; Make sure we can use it as the entry point if we choose -; RUN: wasm-ld --entry=__wasm_call_ctors -o %t3.wasm %t.o -; RUN: obj2yaml %t3.wasm | FileCheck %s -check-prefix=CHECK-CTOR - -; CHECK-CTOR: - Type: EXPORT -; CHECK-CTOR-NEXT: Exports: -; CHECK-CTOR-NEXT: - Name: memory -; CHECK-CTOR-NEXT: Kind: MEMORY -; CHECK-CTOR-NEXT: Index: 0 -; CHECK-CTOR-NEXT: - Name: __wasm_call_ctors -; CHECK-CTOR-NEXT: Kind: FUNCTION -; CHECK-CTOR-NEXT: Index: 0 -; CHECK-CTOR-NEXT: - Type: diff --git a/lld/test/wasm/entry.s b/lld/test/wasm/entry.s new file mode 100644 index 0000000000000..9275249492aac --- /dev/null +++ b/lld/test/wasm/entry.s @@ -0,0 +1,36 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s + +.globl entry +entry: + .functype entry () -> () + end_function + +# RUN: wasm-ld -e entry -o %t1.wasm %t.o +# RUN: obj2yaml %t1.wasm | FileCheck %s +# RUN: wasm-ld --entry=entry -o %t2.wasm %t.o +# RUN: obj2yaml %t2.wasm | FileCheck %s + +# CHECK: - Type: EXPORT +# CHECK-NEXT: Exports: +# CHECK-NEXT: - Name: memory +# CHECK-NEXT: Kind: MEMORY +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Name: entry +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Index: 0 +# CHECK-NEXT: - Type: + +# The __wasm_call_ctors is somewhat special since its created by the linker. 
+# Make sure we can use it as the entry point if we choose +# RUN: wasm-ld --entry=__wasm_call_ctors -o %t3.wasm %t.o +# RUN: obj2yaml %t3.wasm | FileCheck %s -check-prefix=CHECK-CTOR + +# CHECK-CTOR: - Type: EXPORT +# CHECK-CTOR-NEXT: Exports: +# CHECK-CTOR-NEXT: - Name: memory +# CHECK-CTOR-NEXT: Kind: MEMORY +# CHECK-CTOR-NEXT: Index: 0 +# CHECK-CTOR-NEXT: - Name: __wasm_call_ctors +# CHECK-CTOR-NEXT: Kind: FUNCTION +# CHECK-CTOR-NEXT: Index: 0 +# CHECK-CTOR-NEXT: - Type: diff --git a/lld/test/wasm/fatal-warnings.ll b/lld/test/wasm/fatal-warnings.ll deleted file mode 100644 index 01a0137a2f454..0000000000000 --- a/lld/test/wasm/fatal-warnings.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc -filetype=obj %s -o %t.main.o -; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o -; RUN: wasm-ld -o %t.wasm %t.main.o %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-WARN -; RUN: not wasm-ld --fatal-warnings -o %t.wasm %t.main.o %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-FATAL - -; CHECK-WARN: warning: function signature mismatch: ret32 -; CHECK-FATAL: error: function signature mismatch: ret32 - -target triple = "wasm32-unknown-unknown" - -define hidden void @_start() local_unnamed_addr #0 { -entry: - %call = tail call i32 @ret32(i32 1, i64 2, i32 3) #2 - ret void -} - -declare i32 @ret32(i32, i64, i32) local_unnamed_addr #1 diff --git a/lld/test/wasm/fatal-warnings.s b/lld/test/wasm/fatal-warnings.s new file mode 100644 index 0000000000000..a534195b0478b --- /dev/null +++ b/lld/test/wasm/fatal-warnings.s @@ -0,0 +1,19 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o +# RUN: wasm-ld -o %t.wasm %t.o %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-WARN +# RUN: not wasm-ld --fatal-warnings -o %t.wasm %t.o %t.ret32.o 2>&1 | FileCheck %s -check-prefix=CHECK-FATAL + +# CHECK-WARN: warning: function signature 
mismatch: ret32 +# CHECK-FATAL: error: function signature mismatch: ret32 + +.functype ret32 (f32, i64, i32) -> (i32) + +.globl _start +_start: + .functype _start () -> () + f32.const 1.0 + i64.const 2 + i32.const 3 + call ret32 + drop + end_function diff --git a/lld/test/wasm/function-imports.ll b/lld/test/wasm/function-imports.ll deleted file mode 100644 index 4b18532fc2207..0000000000000 --- a/lld/test/wasm/function-imports.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o -; RUN: llc -filetype=obj %s -o %t.o -; RUN: wasm-ld -o %t.wasm %t.ret32.o %t.o -; RUN: obj2yaml %t.wasm | FileCheck %s - -target triple = "wasm32-unknown-unknown" - -; Function Attrs: nounwind -define hidden void @_start() local_unnamed_addr #0 { -entry: - %call = tail call i32 @ret32(float 0.000000e+00) #2 - ret void -} - -declare i32 @ret32(float) local_unnamed_addr #1 - -; CHECK: Sections: -; CHECK: - Type: TYPE -; CHECK-NEXT: Signatures: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: - F32 -; CHECK-NEXT: ReturnTypes: -; CHECK-NEXT: - I32 -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: ReturnTypes: [] -; CHECK-NEXT: - Type: FUNCTION -; CHECK-NEXT: FunctionTypes: [ 0, 1 ] -; CHECK: - Type: CODE -; CHECK-NEXT: Functions: -; CHECK: - Index: 0 -; CHECK: - Index: 1 -; CHECK: Name: name -; CHECK-NEXT: FunctionNames: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: ret32 -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: Name: _start -; CHECK-NEXT: GlobalNames: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: __stack_pointer -; CHECK-NEXT: ... 
diff --git a/lld/test/wasm/function-imports.s b/lld/test/wasm/function-imports.s new file mode 100644 index 0000000000000..825747849fff2 --- /dev/null +++ b/lld/test/wasm/function-imports.s @@ -0,0 +1,42 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld -o %t.wasm %t.ret32.o %t.o +# RUN: obj2yaml %t.wasm | FileCheck %s + +.functype ret32 (f32) -> (i32) + +.globl _start +_start: + .functype _start () -> () + f32.const 0.000000e+00 + call ret32 + drop + end_function + +# CHECK: Sections: +# CHECK: - Type: TYPE +# CHECK-NEXT: Signatures: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: - F32 +# CHECK-NEXT: ReturnTypes: +# CHECK-NEXT: - I32 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: ReturnTypes: [] +# CHECK-NEXT: - Type: FUNCTION +# CHECK-NEXT: FunctionTypes: [ 0, 1 ] +# CHECK: - Type: CODE +# CHECK-NEXT: Functions: +# CHECK: - Index: 0 +# CHECK: - Index: 1 +# CHECK: Name: name +# CHECK-NEXT: FunctionNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: ret32 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Name: _start +# CHECK-NEXT: GlobalNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: __stack_pointer +# CHECK-NEXT: ... 
diff --git a/lld/test/wasm/stack-pointer.ll b/lld/test/wasm/stack-pointer.ll deleted file mode 100644 index 38693d252ea7f..0000000000000 --- a/lld/test/wasm/stack-pointer.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -filetype=obj %s -o %t.o -; RUN: wasm-ld --relocatable -o %t.wasm %t.o -; RUN: obj2yaml %t.wasm | FileCheck %s - -target triple = "wasm32-unknown-unknown" - -; Function Attrs: nounwind -define i32 @_start() local_unnamed_addr { -entry: - %retval = alloca i32, align 4 - ret i32 0 -} - -; CHECK: --- !WASM -; CHECK-NEXT: FileHeader: -; CHECK-NEXT: Version: 0x1 -; CHECK-NEXT: Sections: -; CHECK-NEXT: - Type: TYPE -; CHECK-NEXT: Signatures: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: ParamTypes: -; CHECK-NEXT: ReturnTypes: -; CHECK-NEXT: - I32 -; CHECK-NEXT: - Type: IMPORT -; CHECK-NEXT: Imports: -; CHECK-NEXT: - Module: env -; CHECK-NEXT: Field: __stack_pointer -; CHECK-NEXT: Kind: GLOBAL -; CHECK-NEXT: GlobalType: I32 -; CHECK-NEXT: GlobalMutable: true -; CHECK-NEXT: - Type: FUNCTION -; CHECK-NEXT: FunctionTypes: [ 0 ] -; CHECK-NEXT: - Type: MEMORY -; CHECK-NEXT: Memories: -; CHECK-NEXT: - Minimum: 0x0 -; CHECK-NEXT: - Type: CODE -; CHECK-NEXT: Relocations: -; CHECK-NEXT: - Type: R_WASM_GLOBAL_INDEX_LEB -; CHECK-NEXT: Index: 1 -; CHECK-NEXT: Offset: 0x4 -; CHECK-NEXT: Functions: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Locals: -; CHECK-NEXT: Body: 23808080800041106B1A41000B -; CHECK-NEXT: - Type: CUSTOM -; CHECK-NEXT: Name: linking -; CHECK-NEXT: Version: 2 -; CHECK-NEXT: SymbolTable: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: Name: _start -; CHECK-NEXT: Flags: [ ] -; CHECK-NEXT: Function: 0 -; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: Kind: GLOBAL -; CHECK-NEXT: Name: __stack_pointer -; CHECK-NEXT: Flags: [ UNDEFINED ] -; CHECK-NEXT: Global: 0 -; CHECK-NEXT: - Type: CUSTOM -; CHECK-NEXT: Name: name -; CHECK-NEXT: FunctionNames: -; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: _start -; CHECK-NEXT: GlobalNames: -; CHECK-NEXT: - Index: 0 
-; CHECK-NEXT: Name: __stack_pointer -; CHECK-NEXT: ... diff --git a/lld/test/wasm/stack-pointer.s b/lld/test/wasm/stack-pointer.s new file mode 100644 index 0000000000000..902ac493d3dec --- /dev/null +++ b/lld/test/wasm/stack-pointer.s @@ -0,0 +1,70 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --relocatable -o %t.wasm %t.o +# RUN: obj2yaml %t.wasm | FileCheck %s + +.globaltype __stack_pointer, i32 + +.globl _start +_start: + .functype _start () -> (i32) + global.get __stack_pointer + i32.const 16 + i32.sub + drop + i32.const 0 + end_function + +# CHECK: --- !WASM +# CHECK-NEXT: FileHeader: +# CHECK-NEXT: Version: 0x1 +# CHECK-NEXT: Sections: +# CHECK-NEXT: - Type: TYPE +# CHECK-NEXT: Signatures: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: ParamTypes: +# CHECK-NEXT: ReturnTypes: +# CHECK-NEXT: - I32 +# CHECK-NEXT: - Type: IMPORT +# CHECK-NEXT: Imports: +# CHECK-NEXT: - Module: env +# CHECK-NEXT: Field: __stack_pointer +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: GlobalType: I32 +# CHECK-NEXT: GlobalMutable: true +# CHECK-NEXT: - Type: FUNCTION +# CHECK-NEXT: FunctionTypes: [ 0 ] +# CHECK-NEXT: - Type: MEMORY +# CHECK-NEXT: Memories: +# CHECK-NEXT: - Minimum: 0x0 +# CHECK-NEXT: - Type: CODE +# CHECK-NEXT: Relocations: +# CHECK-NEXT: - Type: R_WASM_GLOBAL_INDEX_LEB +# CHECK-NEXT: Index: 1 +# CHECK-NEXT: Offset: 0x4 +# CHECK-NEXT: Functions: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Locals: +# CHECK-NEXT: Body: 23808080800041106B1A41000B +# CHECK-NEXT: - Type: CUSTOM +# CHECK-NEXT: Name: linking +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: SymbolTable: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Kind: FUNCTION +# CHECK-NEXT: Name: _start +# CHECK-NEXT: Flags: [ ] +# CHECK-NEXT: Function: 0 +# CHECK-NEXT: - Index: 1 +# CHECK-NEXT: Kind: GLOBAL +# CHECK-NEXT: Name: __stack_pointer +# CHECK-NEXT: Flags: [ UNDEFINED ] +# CHECK-NEXT: Global: 0 +# CHECK-NEXT: - Type: CUSTOM +# CHECK-NEXT: Name: name +# CHECK-NEXT: FunctionNames: +# 
CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: _start +# CHECK-NEXT: GlobalNames: +# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: Name: __stack_pointer +# CHECK-NEXT: ... diff --git a/lld/test/wasm/trace-symbol.ll b/lld/test/wasm/trace-symbol.ll deleted file mode 100644 index 25154004d6b0e..0000000000000 --- a/lld/test/wasm/trace-symbol.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o -; RUN: llc -filetype=obj -o %t.start.o %s -; RUN: wasm-ld -o %t.wasm %t.start.o %t.ret32.o -y ret32 -y _start | FileCheck %s -check-prefix=BOTH -; RUN: wasm-ld -o %t.wasm %t.ret32.o %t.start.o -y ret32 -y _start | FileCheck %s -check-prefix=REVERSED - -; check alias -; RUN: wasm-ld -o %t.wasm %t.start.o %t.ret32.o -trace-symbol=_start | FileCheck %s -check-prefixes=JUST-START - -target triple = "wasm32-unknown-unknown" - -declare i32 @ret32(float %arg) - -define void @_start() { -entry: - %call1 = call i32 @ret32(float 0.0) - ret void -} - -; BOTH: start.o: definition of _start -; BOTH-NEXT: start.o: reference to ret32 -; BOTH-NEXT: ret32.o: definition of ret32 - -; REVERSED: ret32.o: definition of ret32 -; REVERSED-NEXT: start.o: definition of _start -; REVERSED-NEXT: start.o: reference to ret32 - -; JUST-START: start.o: definition of _start -; JUST-START-NOT: ret32 diff --git a/lld/test/wasm/trace-symbol.s b/lld/test/wasm/trace-symbol.s new file mode 100644 index 0000000000000..88e5c6f5829e3 --- /dev/null +++ b/lld/test/wasm/trace-symbol.s @@ -0,0 +1,28 @@ +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/ret32.s -o %t.ret32.o +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.start.o %s +# RUN: wasm-ld -o %t.wasm %t.start.o %t.ret32.o -y ret32 -y _start | FileCheck %s -check-prefix=BOTH +# RUN: wasm-ld -o %t.wasm %t.ret32.o %t.start.o -y ret32 -y _start | FileCheck %s -check-prefix=REVERSED + +# check alias +# RUN: wasm-ld -o %t.wasm %t.start.o %t.ret32.o -trace-symbol=_start | 
FileCheck %s -check-prefixes=JUST-START + +.functype ret32 (f32) -> (i32) + +.globl _start +_start: + .functype _start () -> () + f32.const 0.0 + call ret32 + drop + end_function + +# BOTH: start.o: definition of _start +# BOTH-NEXT: start.o: reference to ret32 +# BOTH-NEXT: ret32.o: definition of ret32 + +# REVERSED: ret32.o: definition of ret32 +# REVERSED-NEXT: start.o: definition of _start +# REVERSED-NEXT: start.o: reference to ret32 + +# JUST-START: start.o: definition of _start +# JUST-START-NOT: ret32 From 3d728ef0ba281a72ff92e2cbc7326fc922ade13f Mon Sep 17 00:00:00 2001 From: Gulfem Savrun Yeniceri Date: Sat, 19 Feb 2022 00:59:23 +0000 Subject: [PATCH 320/748] [scudo] Add missing include After https://reviews.llvm.org/D119667, is no longer transitively included from various headers. This patch adds the include into scudo. Differential Revision: https://reviews.llvm.org/D120171 --- compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index 723679228cbab..a7df3d90e7df5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -12,6 +12,7 @@ #include "allocator_config.h" #include "secondary.h" +#include #include #include #include From 4961bb477d0526a7b965609cd42d44caa8413d45 Mon Sep 17 00:00:00 2001 From: Gulfem Savrun Yeniceri Date: Sat, 19 Feb 2022 01:05:38 +0000 Subject: [PATCH 321/748] [gwp_asan] Add missing include After https://reviews.llvm.org/D119667, is no longer transitively included from various headers. This patch adds the include into gwp_asan. 
Differential Revision: https://reviews.llvm.org/D120172 --- compiler-rt/lib/gwp_asan/tests/iterate.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/gwp_asan/tests/iterate.cpp b/compiler-rt/lib/gwp_asan/tests/iterate.cpp index 2b8635d5b36da..49953f33abf89 100644 --- a/compiler-rt/lib/gwp_asan/tests/iterate.cpp +++ b/compiler-rt/lib/gwp_asan/tests/iterate.cpp @@ -8,6 +8,7 @@ #include "gwp_asan/tests/harness.h" +#include #include #include From efb383266d04c70b8adf4ffb3f4872b36bc4653f Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Sat, 19 Feb 2022 09:21:02 +0800 Subject: [PATCH 322/748] [LoongArch] Fix atomic instructions operands sequence According to https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#atomic-memory-access-instructions, the operands sequence of am* instructions should be "rd, rk, rj" but not "rd, rj, rk". Sorry for this typo in initial patches. Reviewed By: xen0n, MaskRay Differential Revision: https://reviews.llvm.org/D120107 --- .../Target/LoongArch/LoongArchInstrInfo.td | 4 +- llvm/test/CodeGen/LoongArch/3r.mir | 72 +++++++++---------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 4d207ebdea9ad..ee73e81126f8f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -138,8 +138,8 @@ class STORE_2RI14 op, string opstr> let mayLoad = 1, mayStore = 1 in class AM_3R op, string opstr> - : Fmt3R; + : Fmt3R; let mayLoad = 1 in class LLBase op, string opstr> diff --git a/llvm/test/CodeGen/LoongArch/3r.mir b/llvm/test/CodeGen/LoongArch/3r.mir index 19f0446a7d685..a1b97d5637b22 100644 --- a/llvm/test/CodeGen/LoongArch/3r.mir +++ b/llvm/test/CodeGen/LoongArch/3r.mir @@ -429,7 +429,7 @@ body: | ... 
--- # CHECK-LABEL: test_AMSWAP_DB_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amswap_db.w $a0, $a1, $a2 name: test_AMSWAP_DB_W body: | @@ -438,7 +438,7 @@ body: | ... --- # CHECK-LABEL: test_AMSWAP_DB_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amswap_db.d $a0, $a1, $a2 name: test_AMSWAP_DB_D body: | @@ -447,7 +447,7 @@ body: | ... --- # CHECK-LABEL: test_AMADD_DB_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amadd_db.w $a0, $a1, $a2 name: test_AMADD_DB_W body: | @@ -456,7 +456,7 @@ body: | ... --- # CHECK-LABEL: test_AMADD_DB_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amadd_db.d $a0, $a1, $a2 name: test_AMADD_DB_D body: | @@ -465,7 +465,7 @@ body: | ... --- # CHECK-LABEL: test_AMAND_DB_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amand_db.w $a0, $a1, $a2 name: test_AMAND_DB_W body: | @@ -474,7 +474,7 @@ body: | ... --- # CHECK-LABEL: test_AMAND_DB_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amand_db.d $a0, $a1, $a2 name: test_AMAND_DB_D body: | @@ -483,7 +483,7 @@ body: | ... 
--- # CHECK-LABEL: test_AMOR_DB_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amor_db.w $a0, $a1, $a2 name: test_AMOR_DB_W body: | @@ -492,7 +492,7 @@ body: | ... --- # CHECK-LABEL: test_AMOR_DB_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amor_db.d $a0, $a1, $a2 name: test_AMOR_DB_D body: | @@ -501,7 +501,7 @@ body: | ... --- # CHECK-LABEL: test_AMXOR_DB_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amxor_db.w $a0, $a1, $a2 name: test_AMXOR_DB_W body: | @@ -510,7 +510,7 @@ body: | ... --- # CHECK-LABEL: test_AMXOR_DB_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amxor_db.d $a0, $a1, $a2 name: test_AMXOR_DB_D body: | @@ -519,7 +519,7 @@ body: | ... --- # CHECK-LABEL: test_AMMAX_DB_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax_db.w $a0, $a1, $a2 name: test_AMMAX_DB_W body: | @@ -528,7 +528,7 @@ body: | ... --- # CHECK-LABEL: test_AMMAX_DB_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax_db.d $a0, $a1, $a2 name: test_AMMAX_DB_D body: | @@ -537,7 +537,7 @@ body: | ... 
--- # CHECK-LABEL: test_AMMIN_DB_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin_db.w $a0, $a1, $a2 name: test_AMMIN_DB_W body: | @@ -546,7 +546,7 @@ body: | ... --- # CHECK-LABEL: test_AMMIN_DB_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin_db.d $a0, $a1, $a2 name: test_AMMIN_DB_D body: | @@ -555,7 +555,7 @@ body: | ... --- # CHECK-LABEL: test_AMMAX_DB_WU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax_db.wu $a0, $a1, $a2 name: test_AMMAX_DB_WU body: | @@ -564,7 +564,7 @@ body: | ... --- # CHECK-LABEL: test_AMMAX_DB_DU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax_db.du $a0, $a1, $a2 name: test_AMMAX_DB_DU body: | @@ -573,7 +573,7 @@ body: | ... --- # CHECK-LABEL: test_AMMIN_DB_WU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin_db.wu $a0, $a1, $a2 name: test_AMMIN_DB_WU body: | @@ -582,7 +582,7 @@ body: | ... --- # CHECK-LABEL: test_AMMIN_DB_DU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin_db.du $a0, $a1, $a2 name: test_AMMIN_DB_DU body: | @@ -591,7 +591,7 @@ body: | ... 
--- # CHECK-LABEL: test_AMSWAP_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amswap.w $a0, $a1, $a2 name: test_AMSWAP_W body: | @@ -600,7 +600,7 @@ body: | ... --- # CHECK-LABEL: test_AMSWAP_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amswap.d $a0, $a1, $a2 name: test_AMSWAP_D body: | @@ -609,7 +609,7 @@ body: | ... --- # CHECK-LABEL: test_AMADD_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amadd.w $a0, $a1, $a2 name: test_AMADD_W body: | @@ -618,7 +618,7 @@ body: | ... --- # CHECK-LABEL: test_AMADD_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amadd.d $a0, $a1, $a2 name: test_AMADD_D body: | @@ -627,7 +627,7 @@ body: | ... --- # CHECK-LABEL: test_AMAND_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amand.w $a0, $a1, $a2 name: test_AMAND_W body: | @@ -636,7 +636,7 @@ body: | ... --- # CHECK-LABEL: test_AMAND_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amand.d $a0, $a1, $a2 name: test_AMAND_D body: | @@ -645,7 +645,7 @@ body: | ... --- # CHECK-LABEL: test_AMOR_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amor.w $a0, $a1, $a2 name: test_AMOR_W body: | @@ -654,7 +654,7 @@ body: | ... 
--- # CHECK-LABEL: test_AMOR_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amor.d $a0, $a1, $a2 name: test_AMOR_D body: | @@ -663,7 +663,7 @@ body: | ... --- # CHECK-LABEL: test_AMXOR_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amxor.w $a0, $a1, $a2 name: test_AMXOR_W body: | @@ -672,7 +672,7 @@ body: | ... --- # CHECK-LABEL: test_AMXOR_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: amxor.d $a0, $a1, $a2 name: test_AMXOR_D body: | @@ -681,7 +681,7 @@ body: | ... --- # CHECK-LABEL: test_AMMAX_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax.w $a0, $a1, $a2 name: test_AMMAX_W body: | @@ -690,7 +690,7 @@ body: | ... --- # CHECK-LABEL: test_AMMAX_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax.d $a0, $a1, $a2 name: test_AMMAX_D body: | @@ -699,7 +699,7 @@ body: | ... --- # CHECK-LABEL: test_AMMIN_W: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin.w $a0, $a1, $a2 name: test_AMMIN_W body: | @@ -708,7 +708,7 @@ body: | ... --- # CHECK-LABEL: test_AMMIN_D: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin.d $a0, $a1, $a2 name: test_AMMIN_D body: | @@ -717,7 +717,7 @@ body: | ... 
--- # CHECK-LABEL: test_AMMAX_WU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax.wu $a0, $a1, $a2 name: test_AMMAX_WU body: | @@ -726,7 +726,7 @@ body: | ... --- # CHECK-LABEL: test_AMMAX_DU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 0 1 1 1 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammax.du $a0, $a1, $a2 name: test_AMMAX_DU body: | @@ -735,7 +735,7 @@ body: | ... --- # CHECK-LABEL: test_AMMIN_WU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin.wu $a0, $a1, $a2 name: test_AMMIN_WU body: | @@ -744,7 +744,7 @@ body: | ... --- # CHECK-LABEL: test_AMMIN_DU: -# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 +# CHECK-ENC: 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 # CHECK-ASM: ammin.du $a0, $a1, $a2 name: test_AMMIN_DU body: | From 8e7247a377974d9a4026386dd4c23d2e3cbccd28 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 18 Feb 2022 17:05:38 -0800 Subject: [PATCH 323/748] [SelectionDAG] Fix off by one error in range check in DAGTypeLegalizer::ExpandShiftByConstant. The code was considering shifts by an about larger than the number of bits in the original VT to be out of range. Shifts exactly equal to the original bit width are also out of range. I don't know how to test this. DAGCombiner should usually fold this away. I just noticed while looking for something else in this code. The llvm-cov report shows that we don't have coverage for out of range shifts here. 
Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D120170 --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8c7b90b6cd336..12dcb7a9b9991 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2468,7 +2468,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt, EVT ShTy = N->getOperand(1).getValueType(); if (N->getOpcode() == ISD::SHL) { - if (Amt.ugt(VTBits)) { + if (Amt.uge(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getConstant(0, DL, NVT); @@ -2489,7 +2489,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt, } if (N->getOpcode() == ISD::SRL) { - if (Amt.ugt(VTBits)) { + if (Amt.uge(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getNode(ISD::SRL, DL, @@ -2510,7 +2510,7 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt, } assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); - if (Amt.ugt(VTBits)) { + if (Amt.uge(VTBits)) { Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else if (Amt.ugt(NVTBits)) { From 6f9d557e0835d884cdc343eef4ec4e86e96aca62 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 18:49:24 -0800 Subject: [PATCH 324/748] [instcombine] Cleanup foldAllocaCmp slightly [NFC] --- .../Transforms/InstCombine/InstCombineCompares.cpp | 14 ++++++-------- .../Transforms/InstCombine/InstCombineInternal.h | 3 +-- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 46af7d8468922..13540a77b511c 100644 --- 
a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1006,8 +1006,7 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI, - const AllocaInst *Alloca, - const Value *Other) { + const AllocaInst *Alloca) { assert(ICI.isEquality() && "Cannot fold non-equality comparison."); // It would be tempting to fold away comparisons between allocas and any @@ -1076,10 +1075,9 @@ Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI, } } - Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); - return replaceInstUsesWith( - ICI, - ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); + auto *Res = ConstantInt::get(ICI.getType(), + !CmpInst::isTrueWhenEqual(ICI.getPredicate())); + return replaceInstUsesWith(ICI, Res); } /// Fold "icmp pred (X+C), X". @@ -6061,10 +6059,10 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Op0->getType()->isPointerTy() && I.isEquality()) { assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); if (auto *Alloca = dyn_cast(getUnderlyingObject(Op0))) - if (Instruction *New = foldAllocaCmp(I, Alloca, Op1)) + if (Instruction *New = foldAllocaCmp(I, Alloca)) return New; if (auto *Alloca = dyn_cast(getUnderlyingObject(Op1))) - if (Instruction *New = foldAllocaCmp(I, Alloca, Op0)) + if (Instruction *New = foldAllocaCmp(I, Alloca)) return New; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 674d20461daff..e590a301fefc5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -650,8 +650,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); - Instruction *foldAllocaCmp(ICmpInst 
&ICI, const AllocaInst *Alloca, - const Value *Other); + Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca); Instruction *foldCmpLoadFromIndexedGlobal(LoadInst *LI, GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, From 357b18e2821c7be7fb0ae6cbde3f8cade8195d93 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Feb 2022 19:01:55 -0800 Subject: [PATCH 325/748] [instcombine] Add/cleanup attributes in a test --- llvm/test/Transforms/InstCombine/compare-unescaped.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/Transforms/InstCombine/compare-unescaped.ll b/llvm/test/Transforms/InstCombine/compare-unescaped.ll index 9f91ccbcc8e44..fefe036b0e7c1 100644 --- a/llvm/test/Transforms/InstCombine/compare-unescaped.ll +++ b/llvm/test/Transforms/InstCombine/compare-unescaped.ll @@ -3,7 +3,7 @@ @gp = global i32* null, align 8 -declare i8* @malloc(i64) #1 +declare noalias i8* @malloc(i64) define i1 @compare_global_trivialeq() { ; CHECK-LABEL: @compare_global_trivialeq( From 39151717dbb494463cda59fe5d776870816790ce Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Sat, 19 Feb 2022 10:00:03 +0530 Subject: [PATCH 326/748] [mlir][OpenMP] Added assemblyFormat for ParallelOp This patch adds assemblyFormat for omp.parallel operation. Some existing functions have been altered to fit the custom directive in assemblyFormat. This has led to their callsites to get modified too, but those will be removed in later patches, when other operations get their assemblyFormat. All operations were not changed in one patch for ease of review. 
Reviewed By: Mogball Differential Revision: https://reviews.llvm.org/D120157 --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 12 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 119 ++++++++---------- mlir/test/Dialect/OpenMP/invalid.mlir | 20 +-- mlir/test/Dialect/OpenMP/ops.mlir | 10 +- 4 files changed, 75 insertions(+), 86 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index ec535edf81d9f..6ed13e6d8ff2c 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -97,7 +97,17 @@ def ParallelOp : OpenMP_Op<"parallel", [ let builders = [ OpBuilder<(ins CArg<"ArrayRef", "{}">:$attributes)> ]; - let hasCustomAssemblyFormat = 1; + let assemblyFormat = [{ + oilist( `if` `(` $if_expr_var `:` type($if_expr_var) `)` + | `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)` + | `allocate` `(` + custom( + $allocate_vars, type($allocate_vars), + $allocators_vars, type($allocators_vars) + ) `)` + | `proc_bind` `(` custom($proc_bind_val) `)` + ) $region attr-dict + }]; let hasVerifier = 1; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index bc3b595483d78..babd71e85bd09 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -89,35 +89,53 @@ static ParseResult parseAllocateAndAllocator( SmallVectorImpl &operandsAllocator, SmallVectorImpl &typesAllocator) { - return parser.parseCommaSeparatedList( - OpAsmParser::Delimiter::Paren, [&]() -> ParseResult { - OpAsmParser::OperandType operand; - Type type; - if (parser.parseOperand(operand) || parser.parseColonType(type)) - return failure(); - operandsAllocator.push_back(operand); - typesAllocator.push_back(type); - if (parser.parseArrow()) - return failure(); - if (parser.parseOperand(operand) || parser.parseColonType(type)) - return failure(); + return 
parser.parseCommaSeparatedList([&]() -> ParseResult { + OpAsmParser::OperandType operand; + Type type; + if (parser.parseOperand(operand) || parser.parseColonType(type)) + return failure(); + operandsAllocator.push_back(operand); + typesAllocator.push_back(type); + if (parser.parseArrow()) + return failure(); + if (parser.parseOperand(operand) || parser.parseColonType(type)) + return failure(); - operandsAllocate.push_back(operand); - typesAllocate.push_back(type); - return success(); - }); + operandsAllocate.push_back(operand); + typesAllocate.push_back(type); + return success(); + }); } /// Print allocate clause -static void printAllocateAndAllocator(OpAsmPrinter &p, +static void printAllocateAndAllocator(OpAsmPrinter &p, Operation *op, OperandRange varsAllocate, - OperandRange varsAllocator) { - p << "allocate("; + TypeRange typesAllocate, + OperandRange varsAllocator, + TypeRange typesAllocator) { for (unsigned i = 0; i < varsAllocate.size(); ++i) { - std::string separator = i == varsAllocate.size() - 1 ? ") " : ", "; - p << varsAllocator[i] << " : " << varsAllocator[i].getType() << " -> "; - p << varsAllocate[i] << " : " << varsAllocate[i].getType() << separator; + std::string separator = i == varsAllocate.size() - 1 ? 
"" : ", "; + p << varsAllocator[i] << " : " << typesAllocator[i] << " -> "; + p << varsAllocate[i] << " : " << typesAllocate[i] << separator; + } +} + +ParseResult parseProcBindKind(OpAsmParser &parser, + omp::ClauseProcBindKindAttr &procBindAttr) { + StringRef procBindStr; + if (parser.parseKeyword(&procBindStr)) + return failure(); + if (auto procBindVal = symbolizeClauseProcBindKind(procBindStr)) { + procBindAttr = + ClauseProcBindKindAttr::get(parser.getContext(), *procBindVal); + return success(); } + return failure(); +} + +void printProcBindKind(OpAsmPrinter &p, Operation *op, + omp::ClauseProcBindKindAttr procBindAttr) { + p << stringifyClauseProcBindKind(procBindAttr.getValue()); } LogicalResult ParallelOp::verify() { @@ -127,24 +145,6 @@ LogicalResult ParallelOp::verify() { return success(); } -void ParallelOp::print(OpAsmPrinter &p) { - p << " "; - if (auto ifCond = if_expr_var()) - p << "if(" << ifCond << " : " << ifCond.getType() << ") "; - - if (auto threads = num_threads_var()) - p << "num_threads(" << threads << " : " << threads.getType() << ") "; - - if (!allocate_vars().empty()) - printAllocateAndAllocator(p, allocate_vars(), allocators_vars()); - - if (auto bind = proc_bind_val()) - p << "proc_bind(" << stringifyClauseProcBindKind(*bind) << ") "; - - p << ' '; - p.printRegion(getRegion()); -} - //===----------------------------------------------------------------------===// // Parser and printer for Linear Clause //===----------------------------------------------------------------------===// @@ -626,9 +626,10 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, return failure(); clauseSegments[pos[threadLimitClause]] = 1; } else if (clauseKeyword == "allocate") { - if (checkAllowed(allocateClause) || + if (checkAllowed(allocateClause) || parser.parseLParen() || parseAllocateAndAllocator(parser, allocates, allocateTypes, - allocators, allocatorTypes)) + allocators, allocatorTypes) || + parser.parseRParen()) return 
failure(); clauseSegments[pos[allocateClause]] = allocates.size(); clauseSegments[pos[allocateClause] + 1] = allocators.size(); @@ -803,32 +804,6 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, return success(); } -/// Parses a parallel operation. -/// -/// operation ::= `omp.parallel` clause-list -/// clause-list ::= clause | clause clause-list -/// clause ::= if | num-threads | allocate | proc-bind -/// -ParseResult ParallelOp::parse(OpAsmParser &parser, OperationState &result) { - SmallVector clauses = {ifClause, numThreadsClause, allocateClause, - procBindClause}; - - SmallVector segments; - - if (failed(parseClauses(parser, result, clauses, segments))) - return failure(); - - result.addAttribute("operand_segment_sizes", - parser.getBuilder().getI32VectorAttr(segments)); - - Region *body = result.addRegion(); - SmallVector regionArgs; - SmallVector regionArgTypes; - if (parser.parseRegion(*body, regionArgs, regionArgTypes)) - return failure(); - return success(); -} - //===----------------------------------------------------------------------===// // Parser, printer and verifier for SectionsOp //===----------------------------------------------------------------------===// @@ -863,8 +838,12 @@ void SectionsOp::print(OpAsmPrinter &p) { if (!reduction_vars().empty()) printReductionVarList(p, reductions(), reduction_vars()); - if (!allocate_vars().empty()) - printAllocateAndAllocator(p, allocate_vars(), allocators_vars()); + if (!allocate_vars().empty()) { + printAllocateAndAllocator(p << "allocate(", *this, allocate_vars(), + allocate_vars().getTypes(), allocators_vars(), + allocators_vars().getTypes()); + p << ")"; + } if (nowait()) p << "nowait"; diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 6646410183c74..8a5d50dd0fb96 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt -split-input-file 
-verify-diagnostics %s func @unknown_clause() { - // expected-error@+1 {{invalid is not a valid clause}} + // expected-error@+1 {{expected '{' to begin a region}} omp.parallel invalid { } @@ -11,7 +11,7 @@ func @unknown_clause() { // ----- func @if_once(%n : i1) { - // expected-error@+1 {{at most one if clause can appear on the omp.parallel operation}} + // expected-error@+1 {{`if` clause can appear at most once in the expansion of the oilist directive}} omp.parallel if(%n : i1) if(%n : i1) { } @@ -21,7 +21,7 @@ func @if_once(%n : i1) { // ----- func @num_threads_once(%n : si32) { - // expected-error@+1 {{at most one num_threads clause can appear on the omp.parallel operation}} + // expected-error@+1 {{`num_threads` clause can appear at most once in the expansion of the oilist directive}} omp.parallel num_threads(%n : si32) num_threads(%n : si32) { } @@ -31,7 +31,7 @@ func @num_threads_once(%n : si32) { // ----- func @nowait_not_allowed(%n : memref) { - // expected-error@+1 {{nowait is not a valid clause for the omp.parallel operation}} + // expected-error@+1 {{expected '{' to begin a region}} omp.parallel nowait {} return } @@ -39,7 +39,7 @@ func @nowait_not_allowed(%n : memref) { // ----- func @linear_not_allowed(%data_var : memref, %linear_var : i32) { - // expected-error@+1 {{linear is not a valid clause for the omp.parallel operation}} + // expected-error@+1 {{expected '{' to begin a region}} omp.parallel linear(%data_var = %linear_var : memref) {} return } @@ -47,7 +47,7 @@ func @linear_not_allowed(%data_var : memref, %linear_var : i32) { // ----- func @schedule_not_allowed() { - // expected-error@+1 {{schedule is not a valid clause for the omp.parallel operation}} + // expected-error@+1 {{expected '{' to begin a region}} omp.parallel schedule(static) {} return } @@ -55,7 +55,7 @@ func @schedule_not_allowed() { // ----- func @collapse_not_allowed() { - // expected-error@+1 {{collapse is not a valid clause for the omp.parallel operation}} + // 
expected-error@+1 {{expected '{' to begin a region}} omp.parallel collapse(3) {} return } @@ -63,7 +63,7 @@ func @collapse_not_allowed() { // ----- func @order_not_allowed() { - // expected-error@+1 {{order is not a valid clause for the omp.parallel operation}} + // expected-error@+1 {{expected '{' to begin a region}} omp.parallel order(concurrent) {} return } @@ -71,14 +71,14 @@ func @order_not_allowed() { // ----- func @ordered_not_allowed() { - // expected-error@+1 {{ordered is not a valid clause for the omp.parallel operation}} + // expected-error@+1 {{expected '{' to begin a region}} omp.parallel ordered(2) {} } // ----- func @proc_bind_once() { - // expected-error@+1 {{at most one proc_bind clause can appear on the omp.parallel operation}} + // expected-error@+1 {{`proc_bind` clause can appear at most once in the expansion of the oilist directive}} omp.parallel proc_bind(close) proc_bind(spread) { } diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 573b036f5746a..cbb8b1f550da4 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -59,7 +59,7 @@ func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : si32) // CHECK: omp.parallel num_threads(%{{.*}} : si32) allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel"(%num_threads, %data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[0,1,1,1]>: vector<4xi32>} : (si32, memref, memref) -> () + }) {num_threads, allocate, operand_segment_sizes = dense<[0,1,1,1]>: vector<4xi32>} : (si32, memref, memref) -> () // CHECK: omp.barrier omp.barrier @@ -68,22 +68,22 @@ func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : si32) // CHECK: omp.parallel if(%{{.*}}) allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel"(%if_cond, %data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[1,0,1,1]> : vector<4xi32>} : (i1, memref, memref) -> () + }) {if, allocate, 
operand_segment_sizes = dense<[1,0,1,1]> : vector<4xi32>} : (i1, memref, memref) -> () // test without allocate // CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) "omp.parallel"(%if_cond, %num_threads) ({ omp.terminator - }) {operand_segment_sizes = dense<[1,1,0,0]> : vector<4xi32>} : (i1, si32) -> () + }) {if, num_threads, operand_segment_sizes = dense<[1,1,0,0]> : vector<4xi32>} : (i1, si32) -> () omp.terminator - }) {operand_segment_sizes = dense<[1,1,1,1]> : vector<4xi32>, proc_bind_val = #omp<"procbindkind spread">} : (i1, si32, memref, memref) -> () + }) {if, num_threads, allocate, operand_segment_sizes = dense<[1,1,1,1]> : vector<4xi32>, proc_bind_val = #omp<"procbindkind spread">} : (i1, si32, memref, memref) -> () // test with multiple parameters for single variadic argument // CHECK: omp.parallel allocate(%{{.*}} : memref -> %{{.*}} : memref) "omp.parallel" (%data_var, %data_var) ({ omp.terminator - }) {operand_segment_sizes = dense<[0,0,1,1]> : vector<4xi32>} : (memref, memref) -> () + }) {allocate, operand_segment_sizes = dense<[0,0,1,1]> : vector<4xi32>} : (memref, memref) -> () return } From 1df8efae56b590a58123499d2bb8ffcd1f36fc40 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 18 Feb 2022 21:55:48 -0800 Subject: [PATCH 327/748] [SelectionDAG][X86] Support f16 in getReciprocalOpName. If the "reciprocal-estimates" attribute is present and it doesn't contain "all", "none", or "default", we previously crashed on f16 operations. This patch addes an 'h' suffix' to prevent the crash. I've added simple tests that just enable the estimate for all vec-sqrt and one test case that explicitly tests the new 'h' suffix to override the default steps. There may be some frontend change needed to, but I haven't checked that yet. 
Reviewed By: pengfei Differential Revision: https://reviews.llvm.org/D120158 --- llvm/lib/CodeGen/TargetLoweringBase.cpp | 4 ++- .../test/CodeGen/X86/avx512fp16-intrinsics.ll | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 3a7e82c9038c1..700c11a66904f 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2072,9 +2072,11 @@ static std::string getReciprocalOpName(bool IsSqrt, EVT VT) { Name += IsSqrt ? "sqrt" : "div"; - // TODO: Handle "half" or other float types? + // TODO: Handle other float types? if (VT.getScalarType() == MVT::f64) { Name += "d"; + } else if (VT.getScalarType() == MVT::f16) { + Name += "h"; } else { assert(VT.getScalarType() == MVT::f32 && "Unexpected FP type for reciprocal estimate"); diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll index 6a5c3e243209a..61b483329a4d9 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -35,6 +35,32 @@ define <32 x half> @test_sqrt_ph_512_fast(<32 x half> %a0, <32 x half> %a1) { ret <32 x half> %2 } +define <32 x half> @test_sqrt_ph_512_fast_estimate_attribute(<32 x half> %a0, <32 x half> %a1) "reciprocal-estimates"="vec-sqrt" { +; CHECK-LABEL: test_sqrt_ph_512_fast_estimate_attribute: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtph %zmm0, %zmm0 +; CHECK-NEXT: vmulph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %2 = fdiv fast <32 x half> %a1, %1 + ret <32 x half> %2 +} + +define <32 x half> @test_sqrt_ph_512_fast_estimate_attribute_2(<32 x half> %a0, <32 x half> %a1) "reciprocal-estimates"="vec-sqrth:1" { +; CHECK-LABEL: test_sqrt_ph_512_fast_estimate_attribute_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtph %zmm0, %zmm2 +; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0 +; 
CHECK-NEXT: vfmadd213ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm0 +; CHECK-NEXT: vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm2 +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmulph %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %2 = fdiv fast <32 x half> %a1, %1 + ret <32 x half> %2 +} + define <32 x half> @test_mask_sqrt_ph_512(<32 x half> %a0, <32 x half> %passthru, i32 %mask) { ; CHECK-LABEL: test_mask_sqrt_ph_512: ; CHECK: # %bb.0: From 5489969550a28a5ef63b6e242469b6eb96b2fbbd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 18 Feb 2022 22:38:13 -0800 Subject: [PATCH 328/748] [RISCV] Add IsRV32 to the isel pattern for ZIP_RV32/UNZIP_RV32. NFC I think the i32 in the pattern prevents this from matching on RV64, but using IsRV32 is safer. Add tests for RV64 to make sure we don't print zip or unzip because we incorrectly picked ZIP_RV32/UNZIP_RV32. --- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 2 ++ llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 5ccd24b077f3a..3d93b41320aec 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -852,7 +852,9 @@ let Predicates = [HasStdExtZbpOrZbkb] in { // We treat brev8 as a separate instruction, so match it directly. We also // use this for brev8 when lowering bitreverse with Zbkb. def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>; +} +let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in { // We treat zip and unzip as separate instructions, so match it directly. 
def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>; def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>; diff --git a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll index 88e5054aaefe3..50b2a1b322c92 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll @@ -100,6 +100,15 @@ define signext i32 @shfli32(i32 signext %a) nounwind { ret i32 %tmp } +define signext i32 @zip_w(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zip_w: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 15) + ret i32 %tmp +} + declare i32 @llvm.riscv.unshfl.i32(i32 %a, i32 %b) define signext i32 @unshfl32(i32 signext %a, i32 signext %b) nounwind { @@ -132,6 +141,15 @@ define signext i32 @unshfli32(i32 signext %a) nounwind { ret i32 %tmp } +define signext i32 @unzip_w(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzip_w: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 15) + ret i32 %tmp +} + declare i64 @llvm.riscv.grev.i64(i64 %a, i64 %b) define i64 @grev64(i64 %a, i64 %b) nounwind { From 17d5ba5bc744cda4b1a668cc3f95418a8039fad0 Mon Sep 17 00:00:00 2001 From: Zakk Chen Date: Fri, 18 Feb 2022 22:26:01 -0800 Subject: [PATCH 329/748] [RISCV][NFC] Remove unused multiclass def. 
--- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 4e762b63d8013..46a6f8d2af678 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -3499,23 +3499,6 @@ multiclass VPatBinaryMaskOut; } -multiclass VPatConversion -{ - def : VPatUnaryNoMask; - def : VPatUnaryMask; -} - multiclass VPatConversionTA Date: Sat, 19 Feb 2022 13:23:27 +0100 Subject: [PATCH 330/748] [CodeGen] Make ShapeT::operator== const. NFC. Otherwise it becomes asymmetric in the types it accepts. --- llvm/include/llvm/CodeGen/TileShapeInfo.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h index 4e574bd96cca4..1b5f902139fbf 100644 --- a/llvm/include/llvm/CodeGen/TileShapeInfo.h +++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h @@ -38,7 +38,7 @@ class ShapeT { ShapeT() : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape), ColImm(InvalidImmShape) {} - bool operator==(const ShapeT &Shape) { + bool operator==(const ShapeT &Shape) const { MachineOperand *R = Shape.Row; MachineOperand *C = Shape.Col; if (!R || !C) @@ -52,7 +52,7 @@ class ShapeT { return false; } - bool operator!=(const ShapeT &Shape) { return !(*this == Shape); } + bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); } MachineOperand *getRow() const { return Row; } From bc53ebbda8fb9727784a0429730b7902fed593e6 Mon Sep 17 00:00:00 2001 From: fourdim Date: Sat, 19 Feb 2022 20:36:31 +0800 Subject: [PATCH 331/748] [docs] HowToCrossCompileLLVM.rst: update cmake options This patch updates the cmake options suggested when cross compiling. This should fix [#52819](https://github.com/llvm/llvm-project/issues/52819). 
Brad King (Member of CMake) says: The linked [CMAKE_CROSSCOMPILING](https://cmake.org/cmake/help/v3.22/variable/CMAKE_CROSSCOMPILING.html) documentation says: This variable will be set to true by CMake if the `CMAKE_SYSTEM_NAME` variable has been set manually (i.e. in a toolchain file or as a cache entry from the cmake command line). It is not meant to be set by project code or toolchain files. It is always set automatically. Don't put `set(CMAKE_CROSSCOMPILING ON)` anywhere in your code. `CMAKE_CROSSCOMPILING` indicates only whether `CMAKE_SYSTEM_NAME` was set by the user/project/toolchain-file instead of by CMake. In LLVM project, `CMAKE_CROSSCOMPILING` is used to determine whether to execute some tests on the host machine. LLVM needs to use another method for that. `CMAKE_CROSSCOMPILING` is not a reliable indicator of whether produced binaries will run on the host, and does not claim so in its documentation. If one sets `CMAKE_SYSTEM_NAME` to Linux in a toolchain file, and builds on a Linux host, that doesn't mean the target architecture or minimum glibc version is the same. Reviewed By: rengolin Differential Revision: https://reviews.llvm.org/D119804 --- llvm/docs/HowToCrossCompileLLVM.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/docs/HowToCrossCompileLLVM.rst b/llvm/docs/HowToCrossCompileLLVM.rst index e1ad8e5f5f4ff..0df4ba96517e6 100644 --- a/llvm/docs/HowToCrossCompileLLVM.rst +++ b/llvm/docs/HowToCrossCompileLLVM.rst @@ -40,7 +40,7 @@ see :doc:`CMake`. The CMake options you need to add are: - * ``-DCMAKE_CROSSCOMPILING=True`` + * ``-DCMAKE_SYSTEM_NAME=`` * ``-DCMAKE_INSTALL_PREFIX=`` * ``-DLLVM_TABLEGEN=/llvm-tblgen`` * ``-DCLANG_TABLEGEN=/clang-tblgen`` @@ -48,6 +48,8 @@ The CMake options you need to add are: * ``-DLLVM_TARGET_ARCH=ARM`` * ``-DLLVM_TARGETS_TO_BUILD=ARM`` +Note: ``CMAKE_CROSSCOMPILING`` is always set automatically when ``CMAKE_SYSTEM_NAME`` is set. Don't put ``-DCMAKE_CROSSCOMPILING=TRUE`` in your options. 
+ If you're compiling with GCC, you can use architecture options for your target, and the compiler driver will detect everything that it needs: From f60d101b00aea4b6c45efd932cf03eae865ae48f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 13:00:59 +0000 Subject: [PATCH 332/748] Fix Wdocumentation unknown parameter warning --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 185e18e884fa7..757a5b042a02a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13405,7 +13405,6 @@ static bool IsSVECntIntrinsic(SDValue S) { /// operations need a bit more inspection to get this information. /// /// \param Extend The SDNode from the DAG that represents the extend operation -/// \param DAG The SelectionDAG hosting the \p Extend node /// /// \returns The type representing the \p Extend source type, or \p MVT::Other /// if no valid type can be determined From a54b56ecf2e7c35e0bb3e61585a2c27c252069c8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 13:06:09 +0000 Subject: [PATCH 333/748] Fix Wdocumentation unknown parameter warning --- clang/lib/Parse/ParseTemplate.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index f875e3bf43e81..0d8ab6ad2fbcd 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -1233,8 +1233,6 @@ bool Parser::ParseGreaterThanInTemplateList(SourceLocation LAngleLoc, /// token that forms the template-id. Otherwise, we will leave the /// last token in the stream (e.g., so that it can be replaced with an /// annotation token). -/// -/// \param NameHint is not required, and merely affects code completion. 
bool Parser::ParseTemplateIdAfterTemplateName(bool ConsumeLastToken, SourceLocation &LAngleLoc, TemplateArgList &TemplateArgs, From 4a01ec404633e5a15320ccd439835fb53b469af6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 13:17:10 +0000 Subject: [PATCH 334/748] Fix Wdocumentation missing code snippet warnings --- llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h index 46b995cee840a..0e2d55c0182e0 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h @@ -20,28 +20,28 @@ enum Fixups { /// fixup_ve_srel32 - 32-bit fixup corresponding to foo for relative branch fixup_ve_srel32, - /// fixup_ve_hi32 - 32-bit fixup corresponding to foo@hi + /// fixup_ve_hi32 - 32-bit fixup corresponding to foo\@hi fixup_ve_hi32, - /// fixup_ve_lo32 - 32-bit fixup corresponding to foo@lo + /// fixup_ve_lo32 - 32-bit fixup corresponding to foo\@lo fixup_ve_lo32, - /// fixup_ve_pc_hi32 - 32-bit fixup corresponding to foo@pc_hi + /// fixup_ve_pc_hi32 - 32-bit fixup corresponding to foo\@pc_hi fixup_ve_pc_hi32, - /// fixup_ve_pc_lo32 - 32-bit fixup corresponding to foo@pc_lo + /// fixup_ve_pc_lo32 - 32-bit fixup corresponding to foo\@pc_lo fixup_ve_pc_lo32, - /// fixup_ve_got_hi32 - 32-bit fixup corresponding to foo@got_hi + /// fixup_ve_got_hi32 - 32-bit fixup corresponding to foo\@got_hi fixup_ve_got_hi32, - /// fixup_ve_got_lo32 - 32-bit fixup corresponding to foo@got_lo + /// fixup_ve_got_lo32 - 32-bit fixup corresponding to foo\@got_lo fixup_ve_got_lo32, - /// fixup_ve_gotoff_hi32 - 32-bit fixup corresponding to foo@gotoff_hi + /// fixup_ve_gotoff_hi32 - 32-bit fixup corresponding to foo\@gotoff_hi fixup_ve_gotoff_hi32, - /// fixup_ve_gotoff_lo32 - 32-bit fixup corresponding to foo@gotoff_lo + /// fixup_ve_gotoff_lo32 - 32-bit fixup 
corresponding to foo\@gotoff_lo fixup_ve_gotoff_lo32, /// fixup_ve_plt_hi32/lo32 From b8801ba0503936bd42e6d16e291bf66209323723 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 19 Feb 2022 14:38:57 +0000 Subject: [PATCH 335/748] [AArch64] Common patterns between UMULL and int_aarch64_neon_umull We have some duplicate patterns between the AArch64ISD::UMULL (/SMULL) and the int_aarch64_neon_umull (/smull) intrinsics. They did not replicate all the patterns though, leaving some gaps on instructions like umlal2 from codegen. This commons all the patterns by converting all int_aarch64_neon_umull intrinsics to UMULL nodes and removing the duplicate for umull/smull intrinsics, so that all instructions go through the same tablegen pattern. This improves some of the longer-than-legal mla patterns, helping them replace ext with umlal2. Differential Revision: https://reviews.llvm.org/D119887 --- .../Target/AArch64/AArch64ISelLowering.cpp | 7 ++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 93 ++++--------------- llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll | 64 +++++-------- 3 files changed, 48 insertions(+), 116 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 757a5b042a02a..d4f9906e687f9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -15447,7 +15447,11 @@ static SDValue performIntrinsicCombine(SDNode *N, return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: + return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_umull: + return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_pmull: case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); @@ -18131,6 
+18135,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performVecReduceAddCombine(N, DCI.DAG, Subtarget); case AArch64ISD::UADDV: return performUADDVCombine(N, DAG); + case AArch64ISD::SMULL: + case AArch64ISD::UMULL: + return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 664f670d741c0..509fd05806211 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5105,10 +5105,10 @@ defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>; defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", @@ -5126,10 +5126,10 @@ defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : 
SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>; defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", @@ -5164,74 +5164,15 @@ multiclass Neon_mul_acc_widen_patterns; } -defm : Neon_mul_acc_widen_patterns; -defm : Neon_mul_acc_widen_patterns; -defm : Neon_mul_acc_widen_patterns; -defm : Neon_mul_acc_widen_patterns; -// Additional patterns for SMULL and UMULL -multiclass Neon_mul_widen_patterns { - def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))), - (INST8B V64:$Rn, V64:$Rm)>; - def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))), - (INST4H V64:$Rn, V64:$Rm)>; - def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))), - (INST2S V64:$Rn, V64:$Rm)>; -} - -defm : Neon_mul_widen_patterns; -defm : Neon_mul_widen_patterns; - -// Patterns for smull2/umull2. 
-multiclass Neon_mul_high_patterns { - def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn), - (extract_high_v16i8 V128:$Rm))), - (INST8B V128:$Rn, V128:$Rm)>; - def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 V128:$Rm))), - (INST4H V128:$Rn, V128:$Rm)>; - def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 V128:$Rm))), - (INST2S V128:$Rn, V128:$Rm)>; -} - -defm : Neon_mul_high_patterns; -defm : Neon_mul_high_patterns; - -// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL -multiclass Neon_mulacc_widen_patterns { - def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))), - (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>; - def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))), - (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>; - def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))), - (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>; -} - -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, - SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, - UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, - SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; -defm : Neon_mulacc_widen_patterns< - TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, - UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; - // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; @@ -6404,11 +6345,10 @@ defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>; defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add 
node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", - int_aarch64_neon_smull>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>; defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", @@ -6419,11 +6359,10 @@ defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", int_aarch64_neon_sqrdmlsh>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", - TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", - int_aarch64_neon_umull>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", AArch64umull>; // A scalar sqdmull with the second operand being a vector lane can be // handled directly with the indexed instruction encoding. 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll index b591438b7ceef..1c52b359156f6 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll @@ -71,12 +71,10 @@ entry: define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) { ; CHECK-LABEL: mla_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: umlal2 v3.8h, v0.16b, v1.16b ; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b -; CHECK-NEXT: umlal v3.8h, v4.8b, v5.8b -; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %ea = zext <16 x i8> %a to <16 x i16> @@ -91,18 +89,14 @@ define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll v6.8h, v0.8b, #0 ; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll v7.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 -; CHECK-NEXT: ext v17.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8 -; CHECK-NEXT: ext v19.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: umlal v4.4s, v0.4h, v1.4h -; CHECK-NEXT: umlal v2.4s, v6.4h, v7.4h -; CHECK-NEXT: umlal v3.4s, v16.4h, v18.4h -; CHECK-NEXT: umlal v5.4s, v17.4h, v19.4h -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ushll2 v7.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: umlal2 v5.4s, v0.8h, v7.8h +; CHECK-NEXT: umlal2 v3.4s, v6.8h, v1.8h +; CHECK-NEXT: umlal v2.4s, v6.4h, v1.4h +; CHECK-NEXT: umlal v4.4s, v0.4h, v7.4h ; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: mov v2.16b, v4.16b ; CHECK-NEXT: mov v3.16b, v5.16b ; CHECK-NEXT: ret @@ -117,43 +111,35 @@ entry: define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) { ; CHECK-LABEL: mla_i64: ; CHECK: // %bb.0: // %entry 
+; CHECK-NEXT: mov v17.16b, v7.16b +; CHECK-NEXT: mov v16.16b, v6.16b +; CHECK-NEXT: ldp q6, q7, [sp] ; CHECK-NEXT: ushll v18.8h, v0.8b, #0 ; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-NEXT: ushll v25.8h, v1.8b, #0 +; CHECK-NEXT: ushll v21.8h, v1.8b, #0 ; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 ; CHECK-NEXT: ushll v19.4s, v18.4h, #0 ; CHECK-NEXT: ushll v20.4s, v0.4h, #0 ; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0 -; CHECK-NEXT: ushll v26.4s, v25.4h, #0 -; CHECK-NEXT: ushll v27.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0 -; CHECK-NEXT: mov v16.16b, v7.16b -; CHECK-NEXT: mov v17.16b, v6.16b -; CHECK-NEXT: ldp q6, q7, [sp] +; CHECK-NEXT: ushll v22.4s, v21.4h, #0 +; CHECK-NEXT: ushll v23.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v21.4s, v21.8h, #0 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ext v21.16b, v19.16b, v19.16b, #8 -; CHECK-NEXT: ext v22.16b, v20.16b, v20.16b, #8 -; CHECK-NEXT: ext v23.16b, v18.16b, v18.16b, #8 -; CHECK-NEXT: ext v28.16b, v26.16b, v26.16b, #8 -; CHECK-NEXT: ext v29.16b, v27.16b, v27.16b, #8 -; CHECK-NEXT: ext v30.16b, v25.16b, v25.16b, #8 -; CHECK-NEXT: ext v24.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v31.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: umlal v4.2d, v18.2s, v25.2s -; CHECK-NEXT: umlal v17.2d, v20.2s, v27.2s -; CHECK-NEXT: umlal v2.2d, v19.2s, v26.2s -; CHECK-NEXT: umlal v3.2d, v21.2s, v28.2s -; CHECK-NEXT: umlal v5.2d, v23.2s, v30.2s -; CHECK-NEXT: umlal v16.2d, v22.2s, v29.2s +; CHECK-NEXT: umlal2 v5.2d, v18.4s, v21.4s +; CHECK-NEXT: umlal2 v17.2d, v20.4s, v23.4s +; CHECK-NEXT: umlal2 v3.2d, v19.4s, v22.4s +; CHECK-NEXT: umlal v2.2d, v19.2s, v22.2s +; CHECK-NEXT: umlal v4.2d, v18.2s, v21.2s +; CHECK-NEXT: umlal v16.2d, v20.2s, v23.2s +; CHECK-NEXT: umlal2 v7.2d, v0.4s, v1.4s ; CHECK-NEXT: umlal v6.2d, v0.2s, v1.2s -; CHECK-NEXT: umlal v7.2d, v24.2s, v31.2s ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: mov v1.16b, v3.16b ; CHECK-NEXT: mov v2.16b, v4.16b ; CHECK-NEXT: mov v3.16b, v5.16b 
-; CHECK-NEXT: mov v4.16b, v17.16b -; CHECK-NEXT: mov v5.16b, v16.16b +; CHECK-NEXT: mov v4.16b, v16.16b +; CHECK-NEXT: mov v5.16b, v17.16b ; CHECK-NEXT: ret entry: %ea = zext <16 x i8> %a to <16 x i64> From c69af70f02f200c1c443cbd8f43b1bc9fb59cced Mon Sep 17 00:00:00 2001 From: Micah Weston Date: Sat, 19 Feb 2022 15:35:53 +0000 Subject: [PATCH 336/748] [AArch64] Adds SUBS and ADDS instructions to the MIPeepholeOpt. Implements ADDS/SUBS 24-bit immediate optimization using the MIPeepholeOpt pass. This follows the pattern: Optimize ([adds|subs] r, imm) -> ([ADDS|SUBS] ([ADD|SUB] r, #imm0, lsl #12), #imm1), if imm == (imm0<<12)+imm1. and both imm0 and imm1 are non-zero 12-bit unsigned integers. Optimize ([adds|subs] r, imm) -> ([SUBS|ADDS] ([SUB|ADD] r, #imm0, lsl #12), #imm1), if imm == -(imm0<<12)-imm1, and both imm0 and imm1 are non-zero 12-bit unsigned integers. The SplitAndOpcFunc type had to change the return type to an Opcode pair so that the first add/sub is the regular instruction and the second is the flag setting instruction. This required updating the code in the AND case. Testing: I ran a two stage bootstrap with this code. Using the second stage compiler, I verified that the negation of an ADDS to SUBS or vice versa is a valid optimization. Example V == -0x111111. 
Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D118663 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 42 +-- llvm/lib/Target/AArch64/AArch64InstrInfo.h | 27 ++ .../Target/AArch64/AArch64MIPeepholeOpt.cpp | 150 +++++++-- llvm/test/CodeGen/AArch64/addsub.ll | 290 +++++++++++++++++- .../AArch64/arm64-instruction-mix-remarks.ll | 15 +- 5 files changed, 452 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index e80a9ae7c0eea..84469dd257cab 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1547,27 +1547,6 @@ findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { } } -namespace { - -struct UsedNZCV { - bool N = false; - bool Z = false; - bool C = false; - bool V = false; - - UsedNZCV() = default; - - UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { - this->N |= UsedFlags.N; - this->Z |= UsedFlags.Z; - this->C |= UsedFlags.C; - this->V |= UsedFlags.V; - return *this; - } -}; - -} // end anonymous namespace - /// Find a condition code used by the instruction. /// Returns AArch64CC::Invalid if either the instruction does not use condition /// codes or we don't optimize CmpInstr in the presence of such instructions. @@ -1622,15 +1601,15 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { return UsedFlags; } -/// \returns Conditions flags used after \p CmpInstr in its MachineBB if they -/// are not containing C or V flags and NZCV flags are not alive in successors -/// of the same \p CmpInstr and \p MI parent. \returns None otherwise. +/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV +/// flags are not alive in successors of the same \p CmpInstr and \p MI parent. +/// \returns None otherwise. /// /// Collect instructions using that flags in \p CCUseInstrs if provided. 
-static Optional -examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, - const TargetRegisterInfo &TRI, - SmallVectorImpl *CCUseInstrs = nullptr) { +Optional +llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, + const TargetRegisterInfo &TRI, + SmallVectorImpl *CCUseInstrs) { MachineBasicBlock *CmpParent = CmpInstr.getParent(); if (MI.getParent() != CmpParent) return None; @@ -1652,8 +1631,6 @@ examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) break; } - if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V) - return None; return NZCVUsedAfterCmp; } @@ -1684,7 +1661,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) return false; - if (!examineCFlagsUse(MI, CmpInstr, TRI)) + Optional NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); + if (!NZVCUsed || NZVCUsed->C || NZVCUsed->V) return false; AccessKind AccessToCheck = AK_Write; @@ -1773,7 +1751,7 @@ static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); // Condition flags are not used in CmpInstr basic block successors and only // Z or N flags allowed to be used after CmpInstr within its basic block - if (!NZCVUsedAfterCmp) + if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) return false; // Z or N flag used after CmpInstr must correspond to the flag used in MI if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index b522230496d25..55b1813f0b301 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -362,6 +362,33 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { const MachineRegisterInfo *MRI) const; }; +struct UsedNZCV { + bool N = false; + bool Z = false; + bool C = false; + bool V = false; + + UsedNZCV() = 
default; + + UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV +/// flags are not alive in successors of the same \p CmpInstr and \p MI parent. +/// \returns None otherwise. +/// +/// Collect instructions using that flags in \p CCUseInstrs if provided. +Optional +examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, + const TargetRegisterInfo &TRI, + SmallVectorImpl *CCUseInstrs = nullptr); + /// Return true if there is an instruction /after/ \p DefMI and before \p UseMI /// which either reads or clobbers NZCV. bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 1fc5617b49f66..bfee78d751517 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -60,12 +60,13 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { MachineLoopInfo *MLI; MachineRegisterInfo *MRI; + using OpcodePair = std::pair; template using SplitAndOpcFunc = - std::function(T, unsigned, T &, T &)>; + std::function(T, unsigned, T &, T &)>; using BuildMIFunc = - std::function; + std::function; /// For instructions where an immediate operand could be split into two /// separate immediate instructions, use the splitTwoPartImm two handle the @@ -93,6 +94,10 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI, SmallSetVector &ToBeRemoved); template + bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI, + SmallSetVector &ToBeRemoved); + + template bool visitAND(unsigned Opc, MachineInstr &MI, SmallSetVector &ToBeRemoved); bool visitORR(MachineInstr &MI, @@ -171,20 +176,20 @@ bool 
AArch64MIPeepholeOpt::visitAND( return splitTwoPartImm( MI, ToBeRemoved, - [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional { + [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional { if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1)) - return Opc; + return std::make_pair(Opc, Opc); return None; }, - [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, unsigned Imm1, Register SrcReg, Register NewTmpReg, Register NewDstReg) { DebugLoc DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) .addReg(SrcReg) .addImm(Imm0); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) .addReg(NewTmpReg) .addImm(Imm1); }); @@ -273,23 +278,64 @@ bool AArch64MIPeepholeOpt::visitADDSUB( return splitTwoPartImm( MI, ToBeRemoved, [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0, - T &Imm1) -> Optional { + T &Imm1) -> Optional { if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) - return PosOpc; + return std::make_pair(PosOpc, PosOpc); if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) - return NegOpc; + return std::make_pair(NegOpc, NegOpc); return None; }, - [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, + unsigned Imm1, Register SrcReg, Register NewTmpReg, + Register NewDstReg) { + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) + .addReg(SrcReg) + .addImm(Imm0) + .addImm(12); + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) + .addReg(NewTmpReg) + .addImm(Imm1) + .addImm(0); + }); +} + +template +bool AArch64MIPeepholeOpt::visitADDSSUBS( + OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI, + SmallSetVector &ToBeRemoved) { + // Try the same 
transformation as ADDSUB but with additional requirement + // that the condition code usages are only for Equal and Not Equal + return splitTwoPartImm( + MI, ToBeRemoved, + [PosOpcs, NegOpcs, &MI, &TRI = TRI, &MRI = MRI]( + T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional { + OpcodePair OP; + if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) + OP = PosOpcs; + else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) + OP = NegOpcs; + else + return None; + // Check conditional uses last since it is expensive for scanning + // proceeding instructions + MachineInstr &SrcMI = *MRI->getUniqueVRegDef(MI.getOperand(1).getReg()); + Optional NZCVUsed = examineCFlagsUse(SrcMI, MI, *TRI); + if (!NZCVUsed || NZCVUsed->C || NZCVUsed->V) + return None; + return OP; + }, + [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0, unsigned Imm1, Register SrcReg, Register NewTmpReg, Register NewDstReg) { DebugLoc DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg) .addReg(SrcReg) .addImm(Imm0) .addImm(12); - BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg) .addReg(NewTmpReg) .addImm(Imm1) .addImm(0); @@ -357,33 +403,57 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm( // number since it was sign extended when we assign to the 64-bit Imm. if (SubregToRegMI) Imm &= 0xFFFFFFFF; - unsigned Opcode; + OpcodePair Opcode; if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1)) Opcode = R.getValue(); else return false; - // Create new ADD/SUB MIs. + // Create new MIs using the first and second opcodes. Opcodes might differ for + // flag setting operations that should only set flags on second instruction. 
+ // NewTmpReg = Opcode.first SrcReg Imm0 + // NewDstReg = Opcode.second NewTmpReg Imm1 + + // Determine register classes for destinations and register operands MachineFunction *MF = MI.getMF(); - const TargetRegisterClass *RC = - TII->getRegClass(TII->get(Opcode), 0, TRI, *MF); - const TargetRegisterClass *ORC = - TII->getRegClass(TII->get(Opcode), 1, TRI, *MF); + const TargetRegisterClass *FirstInstrDstRC = + TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF); + const TargetRegisterClass *FirstInstrOperandRC = + TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF); + const TargetRegisterClass *SecondInstrDstRC = + (Opcode.first == Opcode.second) + ? FirstInstrDstRC + : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF); + const TargetRegisterClass *SecondInstrOperandRC = + (Opcode.first == Opcode.second) + ? FirstInstrOperandRC + : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF); + + // Get old registers destinations and new register destinations Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - Register NewTmpReg = MRI->createVirtualRegister(RC); - Register NewDstReg = MRI->createVirtualRegister(RC); - - MRI->constrainRegClass(SrcReg, RC); - MRI->constrainRegClass(NewTmpReg, ORC); - MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg)); - + Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC); + // In the situation that DstReg is not Virtual (likely WZR or XZR), we want to + // reuse that same destination register. + Register NewDstReg = DstReg.isVirtual() + ? 
MRI->createVirtualRegister(SecondInstrDstRC) + : DstReg; + + // Constrain registers based on their new uses + MRI->constrainRegClass(SrcReg, FirstInstrOperandRC); + MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC); + if (DstReg != NewDstReg) + MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg)); + + // Call the delegating operation to build the instruction BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg); - MRI->replaceRegWith(DstReg, NewDstReg); // replaceRegWith changes MI's definition register. Keep it for SSA form until - // deleting MI. - MI.getOperand(0).setReg(DstReg); + // deleting MI. Only if we made a new destination register. + if (DstReg != NewDstReg) { + MRI->replaceRegWith(DstReg, NewDstReg); + MI.getOperand(0).setReg(DstReg); + } // Record the MIs need to be removed. ToBeRemoved.insert(&MI); @@ -439,6 +509,26 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { Changed = visitADDSUB(AArch64::SUBXri, AArch64::ADDXri, MI, ToBeRemoved); break; + case AArch64::ADDSWrr: + Changed = visitADDSSUBS({AArch64::ADDWri, AArch64::ADDSWri}, + {AArch64::SUBWri, AArch64::SUBSWri}, + MI, ToBeRemoved); + break; + case AArch64::SUBSWrr: + Changed = visitADDSSUBS({AArch64::SUBWri, AArch64::SUBSWri}, + {AArch64::ADDWri, AArch64::ADDSWri}, + MI, ToBeRemoved); + break; + case AArch64::ADDSXrr: + Changed = visitADDSSUBS({AArch64::ADDXri, AArch64::ADDSXri}, + {AArch64::SUBXri, AArch64::SUBSXri}, + MI, ToBeRemoved); + break; + case AArch64::SUBSXrr: + Changed = visitADDSSUBS({AArch64::SUBXri, AArch64::SUBSXri}, + {AArch64::ADDXri, AArch64::ADDSXri}, + MI, ToBeRemoved); + break; } } } diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll index b95c15ac6d073..dd36bf6e8d35b 100644 --- a/llvm/test/CodeGen/AArch64/addsub.ll +++ b/llvm/test/CodeGen/AArch64/addsub.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs < %s 
-mtriple=aarch64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu -verify-machineinstrs | FileCheck %s ; Note that this should be refactored (for efficiency if nothing else) ; when the PCS is implemented so we don't have to worry about the @@ -406,4 +406,290 @@ define i64 @addl_0x80000000(i64 %a) { ret i64 %b } -; TODO: adds/subs +; ADDS and SUBS Optimizations +; Checks with all types first, then checks that only EQ and NE optimize +define i1 @eq_i(i32 %0) { +; CHECK-LABEL: eq_i: +; CHECK: // %bb.0: +; CHECK-NEXT: sub w8, w0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmp w8, #273 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = icmp eq i32 %0, 1118481 + ret i1 %2 +} + +define i1 @eq_l(i64 %0) { +; CHECK-LABEL: eq_l: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmp x8, #273 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = icmp eq i64 %0, 1118481 + ret i1 %2 +} + +define i1 @ne_i(i32 %0) { +; CHECK-LABEL: ne_i: +; CHECK: // %bb.0: +; CHECK-NEXT: sub w8, w0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmp w8, #273 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %2 = icmp ne i32 %0, 1118481 + ret i1 %2 +} + +define i1 @ne_l(i64 %0) { +; CHECK-LABEL: ne_l: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmp x8, #273 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %2 = icmp ne i64 %0, 1118481 + ret i1 %2 +} + +define i1 @eq_in(i32 %0) { +; CHECK-LABEL: eq_in: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmn w8, #273 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = icmp eq i32 %0, -1118481 + ret i1 %2 +} + +define i1 @eq_ln(i64 %0) { +; CHECK-LABEL: eq_ln: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmn x8, #273 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = icmp eq i64 %0, -1118481 + ret i1 %2 +} + +define i1 @ne_in(i32 %0) { +; CHECK-LABEL: ne_in: 
+; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmn w8, #273 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %2 = icmp ne i32 %0, -1118481 + ret i1 %2 +} + +define i1 @ne_ln(i64 %0) { +; CHECK-LABEL: ne_ln: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, #273, lsl #12 // =1118208 +; CHECK-NEXT: cmn x8, #273 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %2 = icmp ne i64 %0, -1118481 + ret i1 %2 +} + +define i1 @reject_eq(i32 %0) { +; CHECK-LABEL: reject_eq: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #51712 +; CHECK-NEXT: movk w8, #15258, lsl #16 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %2 = icmp eq i32 %0, 1000000000 + ret i1 %2 +} + +define i1 @reject_non_eqne_csinc(i32 %0) { +; CHECK-LABEL: reject_non_eqne_csinc: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4369 +; CHECK-NEXT: movk w8, #17, lsl #16 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: ret + %2 = icmp ult i32 %0, 1118481 + ret i1 %2 +} + +define i32 @accept_csel(i32 %0) { +; CHECK-LABEL: accept_csel: +; CHECK: // %bb.0: +; CHECK-NEXT: sub w9, w0, #273, lsl #12 // =1118208 +; CHECK-NEXT: mov w8, #17 +; CHECK-NEXT: cmp w9, #273 +; CHECK-NEXT: mov w9, #11 +; CHECK-NEXT: csel w0, w9, w8, eq +; CHECK-NEXT: ret + %2 = icmp eq i32 %0, 1118481 + %3 = select i1 %2, i32 11, i32 17 + ret i32 %3 +} + +define i32 @reject_non_eqne_csel(i32 %0) { +; CHECK-LABEL: reject_non_eqne_csel: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4369 +; CHECK-NEXT: mov w9, #11 +; CHECK-NEXT: movk w8, #17, lsl #16 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: mov w8, #17 +; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: ret + %2 = icmp ult i32 %0, 1118481 + %3 = select i1 %2, i32 11, i32 17 + ret i32 %3 +} + +declare void @fooy() + +define void @accept_branch(i32 %0) { +; CHECK-LABEL: accept_branch: +; CHECK: // %bb.0: +; CHECK-NEXT: sub w8, w0, #291, lsl #12 // =1191936 +; CHECK-NEXT: cmp w8, #1110 +; CHECK-NEXT: b.eq .LBB32_2 +; CHECK-NEXT: // %bb.1: 
+; CHECK-NEXT: ret +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: b fooy + %2 = icmp ne i32 %0, 1193046 + br i1 %2, label %4, label %3 +3: ; preds = %1 + tail call void @fooy() + br label %4 +4: ; preds = %3, %1 + ret void +} + +define void @reject_non_eqne_branch(i32 %0) { +; CHECK-LABEL: reject_non_eqne_branch: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #13398 +; CHECK-NEXT: movk w8, #18, lsl #16 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: b.le .LBB33_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: b fooy + %2 = icmp sgt i32 %0, 1193046 + br i1 %2, label %4, label %3 +3: ; preds = %1 + tail call void @fooy() + br label %4 +4: ; preds = %3, %1 + ret void +} + +define i32 @reject_multiple_usages(i32 %0) { +; CHECK-LABEL: reject_multiple_usages: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4369 +; CHECK-NEXT: mov w9, #3 +; CHECK-NEXT: movk w8, #17, lsl #16 +; CHECK-NEXT: mov w10, #17 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: mov w8, #9 +; CHECK-NEXT: mov w11, #12 +; CHECK-NEXT: csel w8, w8, w9, eq +; CHECK-NEXT: csel w9, w11, w10, hi +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #53312 +; CHECK-NEXT: movk w9, #2, lsl #16 +; CHECK-NEXT: cmp w0, w9 +; CHECK-NEXT: mov w9, #26304 +; CHECK-NEXT: movk w9, #1433, lsl #16 +; CHECK-NEXT: csel w0, w8, w9, hi +; CHECK-NEXT: ret + %2 = icmp eq i32 %0, 1118481 + %3 = icmp ugt i32 %0, 1118481 + %4 = select i1 %2, i32 9, i32 3 + %5 = select i1 %3, i32 12, i32 17 + %6 = add i32 %4, %5 + %7 = icmp ugt i32 %0, 184384 + %8 = select i1 %7, i32 %6, i32 93939392 + ret i32 %8 +} + +; Unique case found in ClangBuiltLinux where the DstReg is not Virtual and +; caused an assertion failure +define dso_local i32 @neigh_periodic_work_tbl_1() { +; CHECK-LABEL: neigh_periodic_work_tbl_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, neigh_periodic_work_tbl_1 +; CHECK-NEXT: add x8, x8, :lo12:neigh_periodic_work_tbl_1 +; CHECK-NEXT: add x8, x8, #18, lsl #12 // =73728 +; CHECK-NEXT: cmn x8, #1272 +; 
CHECK-NEXT: b.pl .LBB35_2 +; CHECK-NEXT: .LBB35_1: // %for.cond +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: b .LBB35_1 +; CHECK-NEXT: .LBB35_2: // %if.end +; CHECK-NEXT: ret +entry: + br i1 icmp slt (i64 add (i64 ptrtoint (i32 ()* @neigh_periodic_work_tbl_1 to i64), i64 75000), i64 0), label %for.cond, label %if.end +for.cond: ; preds = %entry, %for.cond + br label %for.cond +if.end: ; preds = %entry + ret i32 undef +} + +@jiffies = dso_local local_unnamed_addr global i32 0, align 4 +@primary_crng = dso_local local_unnamed_addr global i32 0, align 4 +@input_pool = dso_local global i32 0, align 4 +declare dso_local i32 @crng_reseed(...) local_unnamed_addr +; Function Attrs: nounwind uwtable +define dso_local i32 @_extract_crng_crng() { +; CHECK-LABEL: _extract_crng_crng: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: adrp x8, _extract_crng_crng +; CHECK-NEXT: add x8, x8, :lo12:_extract_crng_crng +; CHECK-NEXT: tbnz x8, #63, .LBB36_2 +; CHECK-NEXT: // %bb.1: // %lor.lhs.false +; CHECK-NEXT: adrp x9, jiffies +; CHECK-NEXT: ldrsw x9, [x9, :lo12:jiffies] +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: add x8, x8, #18, lsl #12 // =73728 +; CHECK-NEXT: cmn x8, #1272 +; CHECK-NEXT: b.pl .LBB36_3 +; CHECK-NEXT: .LBB36_2: // %if.then +; CHECK-NEXT: adrp x8, primary_crng +; CHECK-NEXT: adrp x9, input_pool +; CHECK-NEXT: add x9, x9, :lo12:input_pool +; CHECK-NEXT: ldr w8, [x8, :lo12:primary_crng] +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: csel x0, xzr, x9, eq +; CHECK-NEXT: bl crng_reseed +; CHECK-NEXT: .LBB36_3: // %if.end +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + br i1 icmp slt (i32 ()* @_extract_crng_crng, i32 ()* null), label %if.then, label %lor.lhs.false +lor.lhs.false: ; preds = %entry + %0 = load i32, i32* @jiffies, align 4 + %idx.ext = sext i32 %0 to i64 + %idx.neg = sub nsw 
i64 0, %idx.ext + %add.ptr = getelementptr i8, i8* getelementptr (i8, i8* bitcast (i32 ()* @_extract_crng_crng to i8*), i64 75000), i64 %idx.neg + %cmp = icmp slt i8* %add.ptr, null + br i1 %cmp, label %if.then, label %if.end +if.then: ; preds = %lor.lhs.false, %entry + %1 = load i32, i32* @primary_crng, align 4 + %tobool.not = icmp eq i32 %1, 0 + %cond = select i1 %tobool.not, i32* null, i32* @input_pool + %call = tail call i32 bitcast (i32 (...)* @crng_reseed to i32 (i32*)*)(i32* noundef %cond) + br label %if.end +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 undef +} diff --git a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll index f5a4a45adbc0e..0c2cf1778722e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll +++ b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll @@ -12,8 +12,8 @@ ; YAML: - INST_add: '2' ; YAML: - INST_b.: '1' ; YAML: - INST_ldr: '1' -; YAML: - INST_movk: '1' -; YAML: - INST_movz: '1' +; YAML: - INST_orr: '1' +; YAML: - INST_sub: '1' ; YAML: - INST_subs: '1' ; YAML: Name: InstructionMix @@ -27,13 +27,12 @@ define i32 @foo(i32* %ptr, i32 %x, i64 %y) !dbg !3 { ; CHECK-LABEL: foo: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldr w10, [x0] +; CHECK-NEXT: ldr w9, [x0] ; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov w9, #16959 -; CHECK-NEXT: movk w9, #15, lsl #16 -; CHECK-NEXT: add w0, w10, w1 -; CHECK-NEXT: add x10, x0, x2 -; CHECK-NEXT: cmp x10, x9 +; CHECK-NEXT: add w0, w9, w1 +; CHECK-NEXT: add x9, x0, x2 +; CHECK-NEXT: sub x9, x9, #244, lsl #12 ; =999424 +; CHECK-NEXT: cmp x9, #575 ; CHECK-NEXT: b.eq LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %else ; CHECK-NEXT: mul w9, w0, w1 From f6ee45e94391ef8cee67e2a4ad6d61c614985de9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 15:36:17 +0000 Subject: [PATCH 337/748] [X86] Add some add reduction tests for values that are zero in the upper bits This is an extension of some of the tests 
mentioned in Issue #42019 - we might be able to use PSADBW to add+zext 4 x bytes to i64 that can then be reduced --- .../CodeGen/X86/vector-reduce-add-mask.ll | 1362 +++++++++++++++++ 1 file changed, 1362 insertions(+) create mode 100644 llvm/test/CodeGen/X86/vector-reduce-add-mask.ll diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll new file mode 100644 index 0000000000000..2a83c0235db6a --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -0,0 +1,1362 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512 + +; +; vXi64 +; + +define i64 @test_v2i64_v2i32(<2 x i64> %a0) { +; SSE2-LABEL: test_v2i64_v2i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i64_v2i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX-LABEL: test_v2i64_v2i32: 
+; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq + %1 = and <2 x i64> %a0, + %2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %1) + ret i64 %2 +} + +define i64 @test_v4i64_v4i16(<4 x i64> %a0) { +; SSE2-LABEL: test_v4i64_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i64_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v4i64_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %xmm1, 
%xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <4 x i64> %a0, + %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1) + ret i64 %2 +} + +define i64 @test_v8i64_v8i8(<8 x i64> %a0) { +; SSE2-LABEL: test_v8i64_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psrlq $60, %xmm2 +; SSE2-NEXT: psrlq $60, %xmm0 +; SSE2-NEXT: psrlq $60, %xmm3 +; SSE2-NEXT: psrlq $60, %xmm1 +; SSE2-NEXT: paddq %xmm3, %xmm1 +; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i64_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: psrlq $60, %xmm2 +; SSE41-NEXT: psrlq $60, %xmm0 +; SSE41-NEXT: psrlq $60, %xmm3 +; SSE41-NEXT: psrlq $60, %xmm1 +; SSE41-NEXT: paddq %xmm3, %xmm1 +; SSE41-NEXT: paddq %xmm2, %xmm1 +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlq $60, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlq $60, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq 
%xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $60, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = lshr <8 x i64> %a0, + %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1) + ret i64 %2 +} + +define i64 @test_v16i64_v16i8(<16 x i64> %a0) { +; SSE2-LABEL: test_v16i64_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [1,1] +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: paddq %xmm7, %xmm3 +; SSE2-NEXT: paddq %xmm5, %xmm3 +; SSE2-NEXT: paddq %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: paddq %xmm6, %xmm2 +; SSE2-NEXT: paddq %xmm4, %xmm2 +; SSE2-NEXT: paddq %xmm3, %xmm2 +; SSE2-NEXT: paddq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i64_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [1,1] +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: paddq %xmm7, %xmm3 +; SSE41-NEXT: paddq %xmm5, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: 
paddq %xmm6, %xmm2 +; SSE41-NEXT: paddq %xmm4, %xmm2 +; SSE41-NEXT: paddq %xmm3, %xmm2 +; SSE41-NEXT: paddq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE41-NEXT: paddq %xmm2, %xmm0 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_v16i64_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,1,1,1] +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i64_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i64_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpandq %zmm2, 
%zmm0, %zmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i64> %a0, + %2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %1) + ret i64 %2 +} + +; +; vXi32 +; + +define i32 @test_v2i32_v2i16(<2 x i32> %a0) { +; SSE2-LABEL: test_v2i32_v2i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i32_v2i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v2i32_v2i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v2i32_v2i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v2i32_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i32_v2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = and <2 x i32> %a0, + %2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %1) + ret i32 %2 +} + +define i32 @test_v4i32(<4 x i32> %a0) { +; SSE2-LABEL: test_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: psrld $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: psrld $31, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v4i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpsrld $31, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpsrld $31, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq + %1 = lshr <4 x i32> %a0, + %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) + ret i32 %2 +} + +define i32 @test_v8i32_v8i8(<8 x i32> %a0) { +; SSE2-LABEL: test_v8i32_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i32_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v8i32_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i32_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, 
%xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <8 x i32> %a0, + %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) + ret i32 %2 +} + +define i32 @test_v16i32_v16i8(<16 x i32> %a0) { +; SSE2-LABEL: test_v16i32_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i32_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm2 +; 
SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: paddd %xmm3, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v16i32_v16i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i32_v16i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i32> %a0, + %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) + ret i32 %2 +} + +define i32 @test_v32i32_v32i8(<32 x i32> %a0) { +; SSE2-LABEL: test_v32i32_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; 
SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i32_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: paddd %xmm7, %xmm3 +; SSE41-NEXT: paddd %xmm5, %xmm3 +; SSE41-NEXT: paddd %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: paddd %xmm6, %xmm2 +; SSE41-NEXT: paddd %xmm4, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE41-NEXT: paddd %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v32i32_v32i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v32i32_v32i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v32i32_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i32_v32i8: +; AVX512: # %bb.0: +; 
AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpandd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpandd %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <32 x i32> %a0, + %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) + ret i32 %2 +} + +; +; vXi16 +; + +define i16 @test_v2i16_v2i8(<2 x i16> %a0) { +; SSE2-LABEL: test_v2i16_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i16_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v2i16_v2i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v2i16_v2i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, 
%xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v2i16_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v2i16_v2i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = and <2 x i16> %a0, + %2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %1) + ret i16 %2 +} + +define i16 @test_v4i16_v4i8(<4 x i16> %a0) { +; SSE2-LABEL: test_v4i16_v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i16_v4i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq 
+; +; AVX1-SLOW-LABEL: test_v4i16_v4i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v4i16_v4i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v4i16_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq + %1 = lshr <4 x i16> %a0, + %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1) + ret i16 %2 +} + +define i16 @test_v8i16_v8i8(<8 x i16> %a0) { +; SSE2-LABEL: test_v8i16_v8i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: 
movd %xmm1, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i16_v8i8: +; SSE41: # %bb.0: +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v8i16_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i16_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i16_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; 
AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq + %1 = and <8 x i16> %a0, + %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) + ret i16 %2 +} + +define i16 @test_v16i16_v16i8(<16 x i16> %a0) { +; SSE2-LABEL: test_v16i16_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,4,8,16,32,64] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i16_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,4,8,16,32,64] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v16i16_v16i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; 
AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i16_v16i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <16 x i16> %a0, + %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) + ret i16 %2 +} + +define i16 @test_v32i16_v32i8(<32 x i16> %a0) { +; SSE2-LABEL: test_v32i16_v32i8: +; SSE2: # %bb.0: +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: paddw %xmm3, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i16_v32i8: +; SSE41: # %bb.0: +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm3 +; SSE41-NEXT: psrlw $8, %xmm1 +; SSE41-NEXT: paddw %xmm3, %xmm1 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v32i16_v32i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpsrlw $8, %xmm1, %xmm2 +; AVX1-SLOW-NEXT: vpsrlw $8, %xmm0, %xmm3 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; 
AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm3, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v32i16_v32i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpsrlw $8, %xmm1, %xmm2 +; AVX1-FAST-NEXT: vpsrlw $8, %xmm0, %xmm3 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-FAST-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm3, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v32i16_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; 
AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i16_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = lshr <32 x i16> %a0, + %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1) + ret i16 %2 +} + +define i16 @test_v64i16_v64i8(<64 x i16> %a0) { +; SSE2-LABEL: test_v64i16_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: paddw %xmm7, %xmm3 +; SSE2-NEXT: paddw %xmm5, %xmm3 +; SSE2-NEXT: paddw %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: paddw %xmm6, %xmm2 +; SSE2-NEXT: paddw %xmm4, %xmm2 +; SSE2-NEXT: paddw %xmm3, %xmm2 +; SSE2-NEXT: paddw %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed 
$ax killed $eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i16_v64i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127] +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm3 +; SSE41-NEXT: paddw %xmm7, %xmm3 +; SSE41-NEXT: paddw %xmm5, %xmm3 +; SSE41-NEXT: paddw %xmm1, %xmm3 +; SSE41-NEXT: pand %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: paddw %xmm6, %xmm2 +; SSE41-NEXT: paddw %xmm4, %xmm2 +; SSE41-NEXT: paddw %xmm3, %xmm2 +; SSE41-NEXT: paddw %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE41-NEXT: paddw %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax +; SSE41-NEXT: # kill: def $ax killed $ax killed $eax +; SSE41-NEXT: retq +; +; AVX1-SLOW-LABEL: test_v64i16_v64i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-SLOW-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: 
vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v64i16_v64i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v64i16_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm3, 
%ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v64i16_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = and <64 x i16> %a0, + %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1) + ret i16 %2 +} + +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) + +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 
@llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) + +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) +declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) + +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) +declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) From ebeb191b65feec74d54cec574e6bcae9f269cf8b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 17:37:07 +0000 Subject: [PATCH 338/748] [X86] Add bswap(shl()) test Test based off issues #51391 and #53867 - we're going to end up needing InstCombine + DAG variants of this fold as DAG can create BSWAP nodes as part of load folding --- llvm/test/CodeGen/X86/combine-bswap.ll | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll index 4fbb3bf98171f..2d262935f8137 100644 --- a/llvm/test/CodeGen/X86/combine-bswap.ll +++ b/llvm/test/CodeGen/X86/combine-bswap.ll @@ -6,6 +6,7 @@ ; actual output is massive at the moment as llvm.bswap is not yet legal. 
declare i32 @llvm.bswap.i32(i32) readnone +declare i64 @llvm.bswap.i64(i64) readnone declare i32 @llvm.bswap.v4i32(i32) readnone ; fold (bswap undef) -> undef @@ -82,3 +83,24 @@ define void @demand_one_loaded_byte(i64* %xp, i32* %yp) { store i32 %r, i32* %yp, align 4 ret void } + +define i64 @test_bswap_shift(i16 %0) { +; X86-LABEL: test_bswap_shift: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_bswap_shift: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shlq $48, %rax +; X64-NEXT: bswapq %rax +; X64-NEXT: retq + %2 = zext i16 %0 to i64 + %3 = shl i64 %2, 48 + %4 = call i64 @llvm.bswap.i64(i64 %3) + ret i64 %4 +} From d5304d44a583657cc58e8d321ddc13dee09a690f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 19:16:56 +0000 Subject: [PATCH 339/748] [X86] Extend bswap+shl tests Different shift amounts and multiuse tests --- llvm/test/CodeGen/X86/combine-bswap.ll | 104 +++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 7 deletions(-) diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll index 2d262935f8137..c20f54d3e3582 100644 --- a/llvm/test/CodeGen/X86/combine-bswap.ll +++ b/llvm/test/CodeGen/X86/combine-bswap.ll @@ -84,8 +84,8 @@ define void @demand_one_loaded_byte(i64* %xp, i32* %yp) { ret void } -define i64 @test_bswap_shift(i16 %0) { -; X86-LABEL: test_bswap_shift: +define i64 @test_bswap64_shift48_zext(i16 %a0) { +; X86-LABEL: test_bswap64_shift48_zext: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $16, %eax @@ -93,14 +93,104 @@ define i64 @test_bswap_shift(i16 %0) { ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_bswap_shift: +; X64-LABEL: test_bswap64_shift48_zext: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shlq $48, %rax ; X64-NEXT: bswapq %rax ; X64-NEXT: retq - %2 = zext i16 %0 to i64 - %3 = 
shl i64 %2, 48 - %4 = call i64 @llvm.bswap.i64(i64 %3) - ret i64 %4 + %z = zext i16 %a0 to i64 + %s = shl i64 %z, 48 + %b = call i64 @llvm.bswap.i64(i64 %s) + ret i64 %b +} + +define i64 @test_bswap64_shift48(i64 %a0) { +; X86-LABEL: test_bswap64_shift48: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_bswap64_shift48: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $48, %rax +; X64-NEXT: bswapq %rax +; X64-NEXT: retq + %s = shl i64 %a0, 48 + %b = call i64 @llvm.bswap.i64(i64 %s) + ret i64 %b +} + +define i32 @test_bswap32_shift17(i32 %a0) { +; X86-LABEL: test_bswap32_shift17: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $17, %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: retl +; +; X64-LABEL: test_bswap32_shift17: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $17, %eax +; X64-NEXT: bswapl %eax +; X64-NEXT: retq + %s = shl i32 %a0, 17 + %b = call i32 @llvm.bswap.i32(i32 %s) + ret i32 %b +} + +; negative test +define i64 @test_bswap64_shift17(i64 %a0) { +; X86-LABEL: test_bswap64_shift17: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl $17, %edx, %eax +; X86-NEXT: shll $17, %edx +; X86-NEXT: bswapl %eax +; X86-NEXT: bswapl %edx +; X86-NEXT: retl +; +; X64-LABEL: test_bswap64_shift17: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $17, %rax +; X64-NEXT: bswapq %rax +; X64-NEXT: retq + %s = shl i64 %a0, 17 + %b = call i64 @llvm.bswap.i64(i64 %s) + ret i64 %b +} + +; negative test +define i64 @test_bswap64_shift48_multiuse(i64 %a0, i64* %a1) { +; X86-LABEL: test_bswap64_shift48_multiuse: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, 
(%ecx) +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_bswap64_shift48_multiuse: +; X64: # %bb.0: +; X64-NEXT: shlq $48, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: bswapq %rax +; X64-NEXT: orq %rax, %rdi +; X64-NEXT: movq %rdi, (%rsi) +; X64-NEXT: retq + %s = shl i64 %a0, 48 + %b = call i64 @llvm.bswap.i64(i64 %s) + %a = add i64 %s, %b + store i64 %a, i64* %a1 + ret i64 %b } From dc4f9f0368cd56484d5e33364c06739be5ae1f1d Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sat, 19 Feb 2022 20:20:52 +0100 Subject: [PATCH 340/748] [ADT] Just use a union in IntervalMap IntervalMap has seen type-punned arrays, AlignedCharArrayUnion and std::aligned_union_t, with varying degrees of buggyness. Plain unions have become quite powerful, so just try that instead. --- llvm/include/llvm/ADT/IntervalMap.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/ADT/IntervalMap.h b/llvm/include/llvm/ADT/IntervalMap.h index 368ed46f98d23..2da72aec77d5d 100644 --- a/llvm/include/llvm/ADT/IntervalMap.h +++ b/llvm/include/llvm/ADT/IntervalMap.h @@ -106,8 +106,6 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/bit.h" -#include "llvm/Support/AlignOf.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/RecyclingAllocator.h" #include @@ -969,7 +967,10 @@ class IntervalMap { private: // The root data is either a RootLeaf or a RootBranchData instance. - AlignedCharArrayUnion data; + union { + RootLeaf leaf; + RootBranchData branchData; + }; // Tree height. // 0: Leaves in root. @@ -983,25 +984,22 @@ class IntervalMap { // Allocator used for creating external nodes. Allocator &allocator; - /// Represent data as a node type without breaking aliasing rules. 
- template T &dataAs() const { return *bit_cast(&data); } - const RootLeaf &rootLeaf() const { assert(!branched() && "Cannot acces leaf data in branched root"); - return dataAs(); + return leaf; } RootLeaf &rootLeaf() { assert(!branched() && "Cannot acces leaf data in branched root"); - return dataAs(); + return leaf; } - RootBranchData &rootBranchData() const { + const RootBranchData &rootBranchData() const { assert(branched() && "Cannot access branch data in non-branched root"); - return dataAs(); + return branchData; } RootBranchData &rootBranchData() { assert(branched() && "Cannot access branch data in non-branched root"); - return dataAs(); + return branchData; } const RootBranch &rootBranch() const { return rootBranchData().node; } @@ -1042,8 +1040,6 @@ class IntervalMap { public: explicit IntervalMap(Allocator &a) : height(0), rootSize(0), allocator(a) { - assert((uintptr_t(&data) & (alignof(RootLeaf) - 1)) == 0 && - "Insufficient alignment"); new(&rootLeaf()) RootLeaf(); } From ab069f37e80fe6d51c1ed32a931866e99a05e2b7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Feb 2022 22:04:57 +0000 Subject: [PATCH 341/748] [X86] combineArithReduction - pull out repeated getVectorNumElements() calls --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9e765a90e8bcd..e2d37ee917f5e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42963,6 +42963,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, return SDValue(); SDLoc DL(ExtElt); + unsigned NumElts = VecVT.getVectorNumElements(); // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits. 
auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) { @@ -42984,7 +42985,6 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, // vXi8 mul reduction - promote to vXi16 mul reduction. if (Opc == ISD::MUL) { - unsigned NumElts = VecVT.getVectorNumElements(); if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts)) return SDValue(); if (VecVT.getSizeInBits() >= 128) { @@ -43027,8 +43027,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, } // Must be a >=128-bit vector with pow2 elements. - if ((VecVT.getSizeInBits() % 128) != 0 || - !isPowerOf2_32(VecVT.getVectorNumElements())) + if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts)) return SDValue(); // vXi8 add reduction - sum lo/hi halves then use PSADBW. From dfa9716dd7a63943ba887bc002d726bffc3490ef Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 19:43:53 +0000 Subject: [PATCH 342/748] [GISel] Fix dead code warning in getRuleRangeForIdentifier emitted method. NFC. 
Break the if-else chain as every block returns, and remove the return at the end of the function as the else block means this was never hit --- llvm/utils/TableGen/GICombinerEmitter.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/utils/TableGen/GICombinerEmitter.cpp b/llvm/utils/TableGen/GICombinerEmitter.cpp index 0dea1ef00e4bd..3ab44ae528449 100644 --- a/llvm/utils/TableGen/GICombinerEmitter.cpp +++ b/llvm/utils/TableGen/GICombinerEmitter.cpp @@ -939,15 +939,14 @@ void GICombinerEmitter::run(raw_ostream &OS) { << " report_fatal_error(\"Beginning of range should be before " "end of range\");\n" << " return {{*First, *Last + 1}};\n" - << " } else if (RangePair.first == \"*\") {\n" + << " }\n" + << " if (RangePair.first == \"*\") {\n" << " return {{0, " << Rules.size() << "}};\n" - << " } else {\n" - << " const auto I = getRuleIdxForIdentifier(RangePair.first);\n" - << " if (!I.hasValue())\n" - << " return None;\n" - << " return {{*I, *I + 1}};\n" << " }\n" - << " return None;\n" + << " const auto I = getRuleIdxForIdentifier(RangePair.first);\n" + << " if (!I.hasValue())\n" + << " return None;\n" + << " return {{*I, *I + 1}};\n" << "}\n\n"; for (bool Enabled : {true, false}) { From 05cd79d59900ac57a94a0699594163689606b336 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sat, 19 Feb 2022 20:55:20 +0100 Subject: [PATCH 343/748] [lldb] Fix some accidental IntervalMap copies I made that type non-copyable in some cases in dc4f9f0368cd --- lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index a1ff6adea1202..f9929aed06ecf 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -1679,8 +1679,8 @@ class VMAddressProvider { ObjectFile::Type ObjectType; addr_t NextVMAddress = 0; 
VMMap::Allocator Alloc; - VMMap Segments = VMMap(Alloc); - VMMap Sections = VMMap(Alloc); + VMMap Segments{Alloc}; + VMMap Sections{Alloc}; lldb_private::Log *Log = GetLog(LLDBLog::Modules); size_t SegmentCount = 0; std::string SegmentName; From 55e0b388d06d8e9983b54ac9f9f2707d91ce0bda Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 19 Feb 2022 20:19:48 +0000 Subject: [PATCH 344/748] [X86] vector-reduce-add-mask.ll - add missing AVX512BW/BWVL check prefixes --- .../CodeGen/X86/vector-reduce-add-mask.ll | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 2a83c0235db6a..27abcf095946d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -4,8 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL ; ; vXi64 @@ -860,6 +860,31 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: retq +; +; AVX512BW-LABEL: test_v4i16_v4i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed 
$xmm0 def $zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v4i16_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512BWVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BWVL-NEXT: retq %1 = lshr <4 x i16> %a0, %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1) ret i16 %2 From 24bfa243551034bf949772146bdf27b14ce9674a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 19 Feb 2022 12:03:28 -0800 Subject: [PATCH 345/748] [SelectionDAGBuilder] Simplify visitShift. NFC This code was detecting whether the value returned by getShiftAmountTy can represent all shift amounts. If not, it would use MVT::i32 as a placeholder. getShiftAmountTy was updated last year to return i32 if the type returned by the target couldn't represent all values. This means the MVT::i32 case here is dead and can the logic can be simplified. 
Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D120164 --- .../SelectionDAG/SelectionDAGBuilder.cpp | 24 ++++--------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 3e2dd9ec74e09..8dc6e98483aa7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3150,26 +3150,12 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) { EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy( Op1.getValueType(), DAG.getDataLayout()); - // Coerce the shift amount to the right type if we can. + // Coerce the shift amount to the right type if we can. This exposes the + // truncate or zext to optimization early. if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) { - unsigned ShiftSize = ShiftTy.getSizeInBits(); - unsigned Op2Size = Op2.getValueSizeInBits(); - SDLoc DL = getCurSDLoc(); - - // If the operand is smaller than the shift count type, promote it. - if (ShiftSize > Op2Size) - Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2); - - // If the operand is larger than the shift count type but the shift - // count type has enough bits to represent any shift value, truncate - // it now. This is a common case and it exposes the truncate to - // optimization early. - else if (ShiftSize >= Log2_32_Ceil(Op1.getValueSizeInBits())) - Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2); - // Otherwise we'll need to temporarily settle for some other convenient - // type. Type legalization will make adjustments once the shiftee is split. 
- else - Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32); + assert(ShiftTy.getSizeInBits() >= Log2_32_Ceil(Op1.getValueSizeInBits()) && + "Unexpected shift type"); + Op2 = DAG.getZExtOrTrunc(Op2, getCurSDLoc(), ShiftTy); } bool nuw = false; From 4ec00fb3eafa885da6d305ebdf1361d4be54dedf Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sun, 20 Feb 2022 05:49:33 +0900 Subject: [PATCH 346/748] [mlir][bufferize] Add a way for ops to fail the analysis Add `BufferizableOpInterface::verifyAnalysis`. Ops can implement this method to check for expected invariants and limitations. The purpose of this change is to introduce a modular way of checking assertions such as `assertScfForAliasingProperties`. Differential Revision: https://reviews.llvm.org/D120189 --- .../IR/BufferizableOpInterface.td | 17 +++++ .../Dialect/SCF/BufferizableOpInterfaceImpl.h | 10 --- .../Transforms/OneShotAnalysis.cpp | 13 ++++ .../Transforms/ComprehensiveBufferizePass.cpp | 3 - .../BufferizableOpInterfaceImpl.cpp | 66 +++++++++---------- ...omprehensive-module-bufferize-invalid.mlir | 6 +- ...omprehensive-module-bufferize-partial.mlir | 29 -------- .../Linalg/TestComprehensiveBufferize.cpp | 4 -- 8 files changed, 65 insertions(+), 83 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td index f6c51dae92eaf..ac26d327d2e31 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td @@ -290,6 +290,23 @@ def BufferizableOpInterface : OpInterface<"BufferizableOpInterface"> { /*defaultImplementation=*/[{ return false; }] + >, + InterfaceMethod< + /*desc=*/[{ + Return `failure` if this op does not pass the analysis. This method + is run during One-Shot Bufferize (after all post-analysis steps). If + the op does not pass the analysis, bufferization is aborted. 
+ + This method can be used to check expected invariants and limitations + of the current bufferization implementation. + }], + /*retType=*/"LogicalResult", + /*methodName=*/"verifyAnalysis", + /*args=*/(ins "const BufferizationState &":$state), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return success(); + }] > ]; diff --git a/mlir/include/mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h b/mlir/include/mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h index dfeb9514409fb..08c6ca2ee0d29 100644 --- a/mlir/include/mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h +++ b/mlir/include/mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h @@ -20,16 +20,6 @@ class BufferizationAliasInfo; } // namespace bufferization namespace scf { -/// Assert that yielded values of an scf.for op are aliasing their corresponding -/// bbArgs. This is required because the i-th OpResult of an scf.for op is -/// currently assumed to alias with the i-th iter_arg (in the absence of -/// conflicts). -LogicalResult -assertScfForAliasingProperties(Operation *op, - bufferization::BufferizationState &state, - bufferization::BufferizationAliasInfo &aliasInfo, - SmallVector &newOps); - void registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry); } // namespace scf } // namespace mlir diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp index 78e3ac8aba7c3..6232e9ae7cba0 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -778,6 +778,19 @@ LogicalResult bufferization::analyzeOp(Operation *op, return failure(); } + // Analysis verification: After setting up alias/equivalence sets, each op + // can check for expected invariants/limitations and fail the analysis if + // necessary. 
+ bool passedAnalysis = true; + op->walk([&](Operation *op) { + if (BufferizableOpInterface bufferizableOp = + options.dynCastBufferizableOp(op)) + if (failed(bufferizableOp.verifyAnalysis(state))) + passedAnalysis = false; + }); + if (!passedAnalysis) + return failure(); + // Annotate operations if we only want to report the analysis. if (options.testAnalysisOnly) annotateOpsWithBufferizationMarkers(op, aliasInfo, state); diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp index cd71264064168..b4ac512463cb5 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp @@ -105,9 +105,6 @@ void LinalgComprehensiveModuleBufferize::runOnOperation() { opt = *options; } - // Only certain scf.for ops are supported by the analysis. - opt.addPostAnalysisStep(scf::assertScfForAliasingProperties); - ModuleOp moduleOp = getOperation(); applyEnablingTransformations(moduleOp); diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp index 83a70e8dcf3af..d4dd3489841c6 100644 --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -385,6 +385,37 @@ struct ForOpInterface return success(); } + + /// Assert that yielded values of an scf.for op are aliasing with their + /// corresponding bbArgs. This is required because the i-th OpResult of an + /// scf.for op is currently assumed to alias with the i-th iter_arg (in the + /// absence of conflicts). 
+ LogicalResult verifyAnalysis(Operation *op, + const BufferizationState &state) const { + auto forOp = cast(op); + auto yieldOp = + cast(forOp.getLoopBody().front().getTerminator()); + for (OpOperand &operand : yieldOp->getOpOperands()) { + auto tensorType = operand.get().getType().dyn_cast(); + if (!tensorType) + continue; + + OpOperand &forOperand = forOp.getOpOperandForResult( + forOp->getResult(operand.getOperandNumber())); + auto bbArg = forOp.getRegionIterArgForOpOperand(forOperand); + // Note: This is overly strict. We should check for aliasing bufferized + // values. But we don't have a "must-alias" analysis yet. + if (!state.areEquivalentBufferizedValues(operand.get(), bbArg)) + // TODO: this could get resolved with copies but it can also turn into + // swaps so we need to be careful about order of copies. + return yieldOp->emitError() + << "Yield operand #" << operand.getOperandNumber() + << " does not bufferize to a buffer that is aliasing the " + "matching" + << " enclosing scf::for operand"; + } + return success(); + } }; /// Bufferization of scf.yield. Bufferized as part of their enclosing ops, so @@ -434,41 +465,6 @@ struct YieldOpInterface } // namespace scf } // namespace mlir -LogicalResult mlir::scf::assertScfForAliasingProperties( - Operation *op, BufferizationState &state, BufferizationAliasInfo &aliasInfo, - SmallVector &newOps) { - LogicalResult status = success(); - - op->walk([&](scf::ForOp forOp) { - auto yieldOp = - cast(forOp.getLoopBody().front().getTerminator()); - for (OpOperand &operand : yieldOp->getOpOperands()) { - auto tensorType = operand.get().getType().dyn_cast(); - if (!tensorType) - continue; - - OpOperand &forOperand = forOp.getOpOperandForResult( - forOp->getResult(operand.getOperandNumber())); - auto bbArg = forOp.getRegionIterArgForOpOperand(forOperand); - // Note: This is overly strict. We should check for aliasing bufferized - // values. But we don't have a "must-alias" analysis yet. 
- if (!aliasInfo.areEquivalentBufferizedValues(operand.get(), bbArg)) { - // TODO: this could get resolved with copies but it can also turn into - // swaps so we need to be careful about order of copies. - status = - yieldOp->emitError() - << "Yield operand #" << operand.getOperandNumber() - << " does not bufferize to a buffer that is aliasing the matching" - << " enclosing scf::for operand"; - return WalkResult::interrupt(); - } - } - return WalkResult::advance(); - }); - - return status; -} - void mlir::scf::registerBufferizableOpInterfaceExternalModels( DialectRegistry ®istry) { registry.addOpInterface(); diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir index d1791da1646bf..2adf2aadc2d93 100644 --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir @@ -87,7 +87,7 @@ func @scf_for(%A : tensor, %B : tensor {linalg.inplaceable = true}, %C : tensor<4xf32>, %lb : index, %ub : index, %step : index) - -> (tensor, tensor) + -> (f32, f32) { %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) -> (tensor, tensor) @@ -102,7 +102,9 @@ func @scf_for(%A : tensor, scf.yield %ttB, %ttA : tensor, tensor } - return %r0#0, %r0#1: tensor, tensor + %f0 = tensor.extract %r0#0[%step] : tensor + %f1 = tensor.extract %r0#1[%step] : tensor + return %f0, %f1: f32, f32 } // ----- diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir index ea1251fc080b2..0ea8b59adb9ef 100644 --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir @@ -179,35 +179,6 @@ func @simple_tensor_test(%t1 : tensor, %f : f32) -> tensor { // ----- -// CHECK-SCF-LABEL: func @simple_scf_for( -// CHECK-SCF-SAME: 
%[[t1:.*]]: tensor -func @simple_scf_for( - %t1: tensor, %sz: index, %step: index, %f: f32) -> tensor { - %c0 = arith.constant 0 : index - - // CHECK-SCF: %[[t1_memref:.*]] = bufferization.to_memref %[[t1]] - // CHECK-SCF: %[[alloc:.*]] = memref.alloc - // CHECK-SCF: %[[casted:.*]] = memref.cast %[[alloc]] - // CHECK-SCF: memref.copy %[[t1_memref]], %[[alloc]] - // CHECK-SCF: %[[scf_for:.*]] = scf.for %[[iv:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[arg0:.*]] = %[[casted]]) -> ({{.*}}) { - %0 = scf.for %iv = %c0 to %sz step %step iter_args(%arg0 = %t1) -> tensor { - // CHECK-SCF: %[[arg0_tensor:.*]] = bufferization.to_tensor %[[arg0]] - // CHECK-SCF: %[[insert:.*]] = tensor.insert %{{.*}} into %[[arg0_tensor]] - %1 = tensor.insert %f into %arg0[%iv] : tensor - - // CHECK-SCF: %[[insert_memref:.*]] = bufferization.to_memref %[[insert]] - // CHECK-SCF: scf.yield %[[insert_memref]] - scf.yield %1 : tensor - } - // CHECK-SCF: } - - // CHECK-SCF: %[[scf_for_tensor:.*]] = bufferization.to_tensor %[[scf_for]] - // CHECK-SCF: return %[[scf_for_tensor]] - return %0 : tensor -} - -// ----- - // CHECK-SCF-LABEL: func @simple_scf_if( // CHECK-SCF-SAME: %[[t1:.*]]: tensor {linalg.inplaceable = true}, %[[c:.*]]: i1, %[[pos:.*]]: index func @simple_scf_if(%t1: tensor {linalg.inplaceable = true}, %c: i1, %pos: index, %f: f32) diff --git a/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp b/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp index f0b6b0e669ec4..9eb68343eaadf 100644 --- a/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp @@ -102,10 +102,6 @@ struct TestComprehensiveFunctionBufferize void TestComprehensiveFunctionBufferize::runOnOperation() { auto options = std::make_unique(); - - if (!allowReturnMemref) - options->addPostAnalysisStep(scf::assertScfForAliasingProperties); - options->allowReturnMemref = allowReturnMemref; options->allowUnknownOps = allowUnknownOps; 
options->testAnalysisOnly = testAnalysisOnly; From c141d158e5d330c751966ed9814161eadcae086b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 19 Feb 2022 21:05:32 +0000 Subject: [PATCH 347/748] [VectorCombine] Remove redundant checks (NFC). The removed conditions are already checked by the if above. Fixes #53761. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 620d388199e0f..e938ca6803bfd 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1019,12 +1019,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return false; NumInstChecked++; } - } - - if (!LastCheckedInst) - LastCheckedInst = UI; - else if (LastCheckedInst->comesBefore(UI)) LastCheckedInst = UI; + } auto ScalarIdx = canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC, DT); if (!ScalarIdx.isSafe()) { From 8e7995884a6525b44fe2f71318883ba2fec2d972 Mon Sep 17 00:00:00 2001 From: Arjun P Date: Sat, 19 Feb 2022 20:06:14 +0000 Subject: [PATCH 348/748] [MLIR][Presburger] Introduce MaybeOptimum type to represent computed optima This allows to differentiate between the cases where the optimum does not exist due to being unbounded and due to the polytope being empty. 
Reviewed By: Groverkss Differential Revision: https://reviews.llvm.org/D120127 --- .../Analysis/Presburger/IntegerPolyhedron.h | 3 +- .../mlir/Analysis/Presburger/Simplex.h | 42 +++-- mlir/include/mlir/Analysis/Presburger/Utils.h | 61 +++++++ .../Analysis/Presburger/IntegerPolyhedron.cpp | 17 +- mlir/lib/Analysis/Presburger/Simplex.cpp | 151 +++++++++++------- .../Presburger/IntegerPolyhedronTest.cpp | 15 +- .../Analysis/Presburger/SimplexTest.cpp | 10 +- 7 files changed, 210 insertions(+), 89 deletions(-) diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h index 5a1d6df84f736..1a786d89f27b8 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h @@ -209,7 +209,8 @@ class IntegerPolyhedron : public PresburgerLocalSpace { /// constraints. Returns an empty optional if the polyhedron is empty or if /// the lexmin is unbounded. Symbols are not supported and will result in /// assert-failure. - Optional> getRationalLexMin() const; + presburger_utils::MaybeOptimum> + getRationalLexMin() const; /// Swap the posA^th identifier with the posB^th identifier. virtual void swapId(unsigned posA, unsigned posB); diff --git a/mlir/include/mlir/Analysis/Presburger/Simplex.h b/mlir/include/mlir/Analysis/Presburger/Simplex.h index 646598f01a788..4f4abc1579cd2 100644 --- a/mlir/include/mlir/Analysis/Presburger/Simplex.h +++ b/mlir/include/mlir/Analysis/Presburger/Simplex.h @@ -18,6 +18,7 @@ #include "mlir/Analysis/Presburger/Fraction.h" #include "mlir/Analysis/Presburger/IntegerPolyhedron.h" #include "mlir/Analysis/Presburger/Matrix.h" +#include "mlir/Analysis/Presburger/Utils.h" #include "mlir/Support/LogicalResult.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" @@ -202,14 +203,6 @@ class SimplexBase { /// Add all the constraints from the given IntegerPolyhedron. 
void intersectIntegerPolyhedron(const IntegerPolyhedron &poly); - /// Returns the current sample point, which may contain non-integer (rational) - /// coordinates. Returns an empty optional when the tableau is empty. - /// - /// Also returns empty when the big M parameter is used and a variable - /// has a non-zero big M coefficient, meaning its value is infinite or - /// unbounded. - Optional> getRationalSample() const; - /// Print the tableau's internal state. void print(raw_ostream &os) const; void dump() const; @@ -441,9 +434,18 @@ class LexSimplex : public SimplexBase { unsigned getSnapshot() { return SimplexBase::getSnapshotBasis(); } /// Return the lexicographically minimum rational solution to the constraints. - Optional> getRationalLexMin(); + presburger_utils::MaybeOptimum> getRationalLexMin(); protected: + /// Returns the current sample point, which may contain non-integer (rational) + /// coordinates. Returns an empty optimum when the tableau is empty. + /// + /// Returns an unbounded optimum when the big M parameter is used and a + /// variable has a non-zero big M coefficient, meaning its value is infinite + /// or unbounded. + presburger_utils::MaybeOptimum> + getRationalSample() const; + /// Undo the addition of the last constraint. This is only called while /// rolling back. void undoLastConstraint() final; @@ -510,15 +512,16 @@ class Simplex : public SimplexBase { /// /// Returns a Fraction denoting the optimum, or a null value if no optimum /// exists, i.e., if the expression is unbounded in this direction. - Optional computeRowOptimum(Direction direction, unsigned row); + presburger_utils::MaybeOptimum + computeRowOptimum(Direction direction, unsigned row); /// Compute the maximum or minimum value of the given expression, depending on /// direction. Should not be called when the Simplex is empty. /// /// Returns a Fraction denoting the optimum, or a null value if no optimum /// exists, i.e., if the expression is unbounded in this direction. 
- Optional computeOptimum(Direction direction, - ArrayRef coeffs); + presburger_utils::MaybeOptimum + computeOptimum(Direction direction, ArrayRef coeffs); /// Returns whether the perpendicular of the specified constraint is a /// is a direction along which the polytope is bounded. @@ -537,10 +540,10 @@ class Simplex : public SimplexBase { void detectRedundant(); /// Returns a (min, max) pair denoting the minimum and maximum integer values - /// of the given expression. If either of the values is unbounded, an empty - /// optional is returned in its place. If the result has min > max then no - /// integer value exists. - std::pair, Optional> + /// of the given expression. If no integer value exists, both results will be + /// of kind Empty. + std::pair, + presburger_utils::MaybeOptimum> computeIntegerBounds(ArrayRef coeffs); /// Returns true if the polytope is unbounded, i.e., extends to infinity in @@ -569,6 +572,10 @@ class Simplex : public SimplexBase { /// None. Optional> getSamplePointIfIntegral() const; + /// Returns the current sample point, which may contain non-integer (rational) + /// coordinates. Returns an empty optional when the tableau is empty. + Optional> getRationalSample() const; + private: friend class GBRSimplex; @@ -610,7 +617,8 @@ class Simplex : public SimplexBase { /// /// Returns a Fraction denoting the optimum, or a null value if no optimum /// exists, i.e., if the expression is unbounded in this direction. - Optional computeOptimum(Direction direction, Unknown &u); + presburger_utils::MaybeOptimum computeOptimum(Direction direction, + Unknown &u); /// Mark the specified unknown redundant. This operation is added to the undo /// log and will be undone by rollbacks. 
The specified unknown must be in row diff --git a/mlir/include/mlir/Analysis/Presburger/Utils.h b/mlir/include/mlir/Analysis/Presburger/Utils.h index 10a6fd771035a..8d366f34d1509 100644 --- a/mlir/include/mlir/Analysis/Presburger/Utils.h +++ b/mlir/include/mlir/Analysis/Presburger/Utils.h @@ -22,6 +22,67 @@ class IntegerPolyhedron; namespace presburger_utils { +/// This class represents the result of operations optimizing something subject +/// to some constraints. If the constraints were not satisfiable the, kind will +/// be Empty. If the optimum is unbounded, the kind is Unbounded, and if the +/// optimum is bounded, the kind will be Bounded and `optimum` holds the optimal +/// value. +enum class OptimumKind { Empty, Unbounded, Bounded }; +template +class MaybeOptimum { +public: +private: + OptimumKind kind = OptimumKind::Empty; + T optimum; + +public: + MaybeOptimum() = default; + MaybeOptimum(OptimumKind kind) : kind(kind) { + assert(kind != OptimumKind::Bounded && + "Bounded optima should be constructed by specifying the optimum!"); + } + MaybeOptimum(const T &optimum) + : kind(OptimumKind::Bounded), optimum(optimum) {} + + OptimumKind getKind() const { return kind; } + bool isBounded() const { return kind == OptimumKind::Bounded; } + bool isUnbounded() const { return kind == OptimumKind::Unbounded; } + bool isEmpty() const { return kind == OptimumKind::Empty; } + + Optional getOptimumIfBounded() const { return optimum; } + const T &getBoundedOptimum() const { + assert(kind == OptimumKind::Bounded && + "This should be called only for bounded optima"); + return optimum; + } + T &getBoundedOptimum() { + assert(kind == OptimumKind::Bounded && + "This should be called only for bounded optima"); + return optimum; + } + const T &operator*() const { return getBoundedOptimum(); } + T &operator*() { return getBoundedOptimum(); } + const T *operator->() const { return &getBoundedOptimum(); } + T *operator->() { return &getBoundedOptimum(); } + bool operator==(const 
MaybeOptimum &other) const { + if (kind != other.kind) + return false; + if (kind != OptimumKind::Bounded) + return true; + return optimum == other.optimum; + } + + // Given f that takes a T and returns a U, convert this `MaybeOptimum` to + // a `MaybeOptimum` by applying `f` to the bounded optimum if it exists, or + // returning a MaybeOptimum of the same kind otherwise. + template + auto map(const Function &f) const & -> MaybeOptimum { + if (kind == OptimumKind::Bounded) + return f(optimum); + return kind; + } +}; + /// `ReprKind` enum is used to set the constraint type in `MaybeLocalRepr`. enum class ReprKind { Inequality, Equality, None }; diff --git a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp index d7d1b47d3b09b..ce0f339967a52 100644 --- a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp @@ -72,14 +72,14 @@ bool IntegerPolyhedron::isSubsetOf(const IntegerPolyhedron &other) const { return PresburgerSet(*this).isSubsetOf(PresburgerSet(other)); } -Optional> +MaybeOptimum> IntegerPolyhedron::getRationalLexMin() const { assert(getNumSymbolIds() == 0 && "Symbols are not supported!"); - Optional> maybeLexMin = + MaybeOptimum> maybeLexMin = LexSimplex(*this).getRationalLexMin(); - if (!maybeLexMin) - return {}; + if (!maybeLexMin.isBounded()) + return maybeLexMin; // The Simplex returns the lexmin over all the variables including locals. But // locals are not actually part of the space and should not be returned in the @@ -1032,20 +1032,23 @@ Optional IntegerPolyhedron::computeVolume() const { bool hasUnboundedId = false; for (unsigned i = 0, e = getNumDimAndSymbolIds(); i < e; ++i) { dim[i] = 1; - Optional min, max; + MaybeOptimum min, max; std::tie(min, max) = simplex.computeIntegerBounds(dim); dim[i] = 0; + assert((!min.isEmpty() && !max.isEmpty()) && + "Polytope should be rationally non-empty!"); + // One of the dimensions is unbounded. 
Note this fact. We will return // unbounded if none of the other dimensions makes the volume zero. - if (!min || !max) { + if (min.isUnbounded() || max.isUnbounded()) { hasUnboundedId = true; continue; } // In this case there are no valid integer points and the volume is // definitely zero. - if (*min > *max) + if (min.getBoundedOptimum() > max.getBoundedOptimum()) return 0; count *= (*max - *min + 1); diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp index 4d9123602c69f..5ba213ac9e3b4 100644 --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -12,6 +12,8 @@ #include "llvm/ADT/Optional.h" namespace mlir { + +using namespace presburger_utils; using Direction = Simplex::Direction; const int nullIndex = std::numeric_limits::max(); @@ -157,7 +159,7 @@ Direction flippedDirection(Direction direction) { } } // namespace -Optional> LexSimplex::getRationalLexMin() { +MaybeOptimum> LexSimplex::getRationalLexMin() { restoreRationalConsistency(); return getRationalSample(); } @@ -786,14 +788,14 @@ void SimplexBase::intersectIntegerPolyhedron(const IntegerPolyhedron &poly) { addEquality(poly.getEquality(i)); } -Optional Simplex::computeRowOptimum(Direction direction, - unsigned row) { +MaybeOptimum Simplex::computeRowOptimum(Direction direction, + unsigned row) { // Keep trying to find a pivot for the row in the specified direction. while (Optional maybePivot = findPivot(row, direction)) { // If findPivot returns a pivot involving the row itself, then the optimum // is unbounded, so we return None. if (maybePivot->row == row) - return {}; + return OptimumKind::Unbounded; pivot(*maybePivot); } @@ -805,34 +807,36 @@ Optional Simplex::computeRowOptimum(Direction direction, /// Compute the optimum of the specified expression in the specified direction, /// or None if it is unbounded. 
-Optional Simplex::computeOptimum(Direction direction, - ArrayRef coeffs) { - assert(!empty && "Simplex should not be empty"); - +MaybeOptimum Simplex::computeOptimum(Direction direction, + ArrayRef coeffs) { + if (empty) + return OptimumKind::Empty; unsigned snapshot = getSnapshot(); unsigned conIndex = addRow(coeffs); unsigned row = con[conIndex].pos; - Optional optimum = computeRowOptimum(direction, row); + MaybeOptimum optimum = computeRowOptimum(direction, row); rollback(snapshot); return optimum; } -Optional Simplex::computeOptimum(Direction direction, Unknown &u) { - assert(!empty && "Simplex should not be empty!"); +MaybeOptimum Simplex::computeOptimum(Direction direction, + Unknown &u) { + if (empty) + return OptimumKind::Empty; if (u.orientation == Orientation::Column) { unsigned column = u.pos; Optional pivotRow = findPivotRow({}, direction, column); // If no pivot is returned, the constraint is unbounded in the specified // direction. if (!pivotRow) - return {}; + return OptimumKind::Unbounded; pivot(*pivotRow, column); } unsigned row = u.pos; - Optional optimum = computeRowOptimum(direction, row); + MaybeOptimum optimum = computeRowOptimum(direction, row); if (u.restricted && direction == Direction::Down && - (!optimum || *optimum < Fraction(0, 1))) { + (optimum.isUnbounded() || *optimum < Fraction(0, 1))) { if (failed(restoreRow(u))) llvm_unreachable("Could not restore row!"); } @@ -844,7 +848,7 @@ bool Simplex::isBoundedAlongConstraint(unsigned constraintIndex) { "in an empty set."); // The constraint's perpendicular is already bounded below, since it is a // constraint. If it is also bounded above, we can return true. 
- return computeOptimum(Direction::Up, con[constraintIndex]).hasValue(); + return computeOptimum(Direction::Up, con[constraintIndex]).isBounded(); } /// Redundant constraints are those that are in row orientation and lie in @@ -895,8 +899,8 @@ void Simplex::detectRedundant() { } unsigned row = u.pos; - Optional minimum = computeRowOptimum(Direction::Down, row); - if (!minimum || *minimum < Fraction(0, 1)) { + MaybeOptimum minimum = computeRowOptimum(Direction::Down, row); + if (minimum.isUnbounded() || *minimum < Fraction(0, 1)) { // Constraint is unbounded below or can attain negative sample values and // hence is not redundant. if (failed(restoreRow(u))) @@ -916,12 +920,10 @@ bool Simplex::isUnbounded() { for (unsigned i = 0; i < var.size(); ++i) { dir[i] = 1; - Optional maybeMax = computeOptimum(Direction::Up, dir); - if (!maybeMax) + if (computeOptimum(Direction::Up, dir).isUnbounded()) return true; - Optional maybeMin = computeOptimum(Direction::Down, dir); - if (!maybeMin) + if (computeOptimum(Direction::Down, dir).isUnbounded()) return true; dir[i] = 0; @@ -1010,7 +1012,7 @@ Simplex Simplex::makeProduct(const Simplex &a, const Simplex &b) { return result; } -Optional> SimplexBase::getRationalSample() const { +Optional> Simplex::getRationalSample() const { if (empty) return {}; @@ -1022,20 +1024,41 @@ Optional> SimplexBase::getRationalSample() const { // If the variable is in column position, its sample value is zero. sample.emplace_back(0, 1); } else { + // If the variable is in row position, its sample value is the + // entry in the constant column divided by the denominator. int64_t denom = tableau(u.pos, 0); + sample.emplace_back(tableau(u.pos, 1), denom); + } + } + return sample; +} - // When the big M parameter is being used, each variable x is represented - // as M + x, so its sample value is finite only if it is of the form - // 1*M + c. If the coefficient of M is not one then the sample value is - // infinite, and we return an empty optional. 
- if (usingBigM) - if (tableau(u.pos, 2) != denom) - return {}; +MaybeOptimum> LexSimplex::getRationalSample() const { + if (empty) + return OptimumKind::Empty; - // Otherwise, If the variable is in row position, its sample value is the - // entry in the constant column divided by the denominator. - sample.emplace_back(tableau(u.pos, 1), denom); + SmallVector sample; + sample.reserve(var.size()); + // Push the sample value for each variable into the vector. + for (const Unknown &u : var) { + // When the big M parameter is being used, each variable x is represented + // as M + x, so its sample value is finite if and only if it is of the + // form 1*M + c. If the coefficient of M is not one then the sample value + // is infinite, and we return an empty optional. + + if (u.orientation == Orientation::Column) { + // If the variable is in column position, the sample value of M + x is + // zero, so x = -M which is unbounded. + return OptimumKind::Unbounded; } + + // If the variable is in row position, its sample value is the + // entry in the constant column divided by the denominator. + int64_t denom = tableau(u.pos, 0); + if (usingBigM) + if (tableau(u.pos, 2) != denom) + return OptimumKind::Unbounded; + sample.emplace_back(tableau(u.pos, 1), denom); } return sample; } @@ -1088,9 +1111,9 @@ class GBRSimplex { } /// Compute max(dotProduct(dir, x - y)). 
Fraction computeWidth(ArrayRef dir) { - Optional maybeWidth = + MaybeOptimum maybeWidth = simplex.computeOptimum(Direction::Up, getCoeffsForDirection(dir)); - assert(maybeWidth.hasValue() && "Width should not be unbounded!"); + assert(maybeWidth.isBounded() && "Width should be bounded!"); return *maybeWidth; } @@ -1108,9 +1131,9 @@ class GBRSimplex { unsigned snap = simplex.getSnapshot(); unsigned conIndex = simplex.addRow(getCoeffsForDirection(dir)); unsigned row = simplex.con[conIndex].pos; - Optional maybeWidth = + MaybeOptimum maybeWidth = simplex.computeRowOptimum(Simplex::Direction::Up, row); - assert(maybeWidth.hasValue() && "Width should not be unbounded!"); + assert(maybeWidth.isBounded() && "Width should be bounded!"); dualDenom = simplex.tableau(row, 0); dual.clear(); @@ -1456,16 +1479,32 @@ Optional> Simplex::findIntegerSample() { llvm::to_vector<8>(basis.getRow(level)); basisCoeffs.push_back(0); - Optional minRoundedUp, maxRoundedDown; + MaybeOptimum minRoundedUp, maxRoundedDown; std::tie(minRoundedUp, maxRoundedDown) = computeIntegerBounds(basisCoeffs); + // We don't have any integer values in the range. + // Pop the stack and return up a level. + if (minRoundedUp.isEmpty() || maxRoundedDown.isEmpty()) { + assert((minRoundedUp.isEmpty() && maxRoundedDown.isEmpty()) && + "If one bound is empty, both should be."); + snapshotStack.pop_back(); + nextValueStack.pop_back(); + upperBoundStack.pop_back(); + level--; + continue; + } + + // We already checked the empty case above. + assert((minRoundedUp.isBounded() && maxRoundedDown.isBounded()) && + "Polyhedron should be bounded!"); + // Heuristic: if the sample point is integral at this point, just return // it. 
if (auto maybeSample = getSamplePointIfIntegral()) return *maybeSample; - if (minRoundedUp < maxRoundedDown) { + if (*minRoundedUp < *maxRoundedDown) { reduceBasis(basis, level); basisCoeffs = llvm::to_vector<8>(basis.getRow(level)); basisCoeffs.push_back(0); @@ -1515,18 +1554,12 @@ Optional> Simplex::findIntegerSample() { /// Compute the minimum and maximum integer values the expression can take. We /// compute each separately. -std::pair, Optional> +std::pair, MaybeOptimum> Simplex::computeIntegerBounds(ArrayRef coeffs) { - Optional minRoundedUp; - if (Optional maybeMin = - computeOptimum(Simplex::Direction::Down, coeffs)) - minRoundedUp = ceil(*maybeMin); - - Optional maxRoundedDown; - if (Optional maybeMax = - computeOptimum(Simplex::Direction::Up, coeffs)) - maxRoundedDown = floor(*maybeMax); - + MaybeOptimum minRoundedUp( + computeOptimum(Simplex::Direction::Down, coeffs).map(ceil)); + MaybeOptimum maxRoundedDown( + computeOptimum(Simplex::Direction::Up, coeffs).map(floor)); return {minRoundedUp, maxRoundedDown}; } @@ -1586,8 +1619,12 @@ bool Simplex::isRationalSubsetOf(const IntegerPolyhedron &poly) { /// or equal to zero, the polytope entirely lies in the half-space defined by /// `coeffs >= 0`. bool Simplex::isRedundantInequality(ArrayRef coeffs) { - Optional minimum = computeOptimum(Direction::Down, coeffs); - return minimum && *minimum >= Fraction(0, 1); + assert(!empty && + "It is not meaningful to ask about redundancy in an empty set!"); + MaybeOptimum minimum = computeOptimum(Direction::Down, coeffs); + assert(!minimum.isEmpty() && + "Optima should be non-empty for a non-empty set"); + return minimum.isBounded() && *minimum >= Fraction(0, 1); } /// Check whether the equality given by `coeffs == 0` is redundant given @@ -1595,10 +1632,14 @@ bool Simplex::isRedundantInequality(ArrayRef coeffs) { /// always zero under the existing constraints. `coeffs` is always zero /// when the minimum and maximum value that `coeffs` can take are both zero. 
bool Simplex::isRedundantEquality(ArrayRef coeffs) { - Optional minimum = computeOptimum(Direction::Down, coeffs); - Optional maximum = computeOptimum(Direction::Up, coeffs); - return minimum && maximum && *maximum == Fraction(0, 1) && - *minimum == Fraction(0, 1); + assert(!empty && + "It is not meaningful to ask about redundancy in an empty set!"); + MaybeOptimum minimum = computeOptimum(Direction::Down, coeffs); + MaybeOptimum maximum = computeOptimum(Direction::Up, coeffs); + assert((!minimum.isEmpty() && !maximum.isEmpty()) && + "Optima should be non-empty for a non-empty set"); + return minimum.isBounded() && maximum.isBounded() && + *maximum == Fraction(0, 1) && *minimum == Fraction(0, 1); } } // namespace mlir diff --git a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp index 933467f191d4c..d7e9b967136b5 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp @@ -16,7 +16,7 @@ #include namespace mlir { - +using namespace presburger_utils; using testing::ElementsAre; enum class TestFunction { Sample, Empty }; @@ -1057,12 +1057,14 @@ TEST(IntegerPolyhedronTest, negativeDividends) { void expectRationalLexMin(const IntegerPolyhedron &poly, ArrayRef min) { auto lexMin = poly.getRationalLexMin(); - ASSERT_TRUE(lexMin.hasValue()); + ASSERT_TRUE(lexMin.isBounded()); EXPECT_EQ(ArrayRef(*lexMin), min); } -void expectNoRationalLexMin(const IntegerPolyhedron &poly) { - EXPECT_FALSE(poly.getRationalLexMin().hasValue()); +void expectNoRationalLexMin(OptimumKind kind, const IntegerPolyhedron &poly) { + ASSERT_NE(kind, OptimumKind::Bounded) + << "Use expectRationalLexMin for bounded min"; + EXPECT_EQ(poly.getRationalLexMin().getKind(), kind); } TEST(IntegerPolyhedronTest, getRationalLexMin) { @@ -1118,6 +1120,7 @@ TEST(IntegerPolyhedronTest, getRationalLexMin) { // Same as above with one constraint removed, making 
the lexmin unbounded. expectNoRationalLexMin( + OptimumKind::Unbounded, parsePoly("(x, y, z, w) : (3*x + 2*y + 10 >= 0, -4*x + 7*y + 10 >= 0," "-3*y + 10 >= 0, 3*z + 2*w - 9*x - 12*y >= 0," "-4*z + 7*w + - 9*x - 9*y - 10>= 0)", @@ -1125,12 +1128,14 @@ TEST(IntegerPolyhedronTest, getRationalLexMin) { // Again, the lexmin is unbounded. expectNoRationalLexMin( + OptimumKind::Unbounded, parsePoly("(x, y, z) : (2*x + 5*y + 8*z - 10 >= 0," "2*x + 10*y + 8*z - 10 >= 0, 2*x + 5*y + 10*z - 10 >= 0)", &context)); // The set is empty. - expectNoRationalLexMin(parsePoly("(x) : (2*x >= 0, -x - 1 >= 0)", &context)); + expectNoRationalLexMin(OptimumKind::Empty, + parsePoly("(x) : (2*x >= 0, -x - 1 >= 0)", &context)); } static void diff --git a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp index 81ed73afc49e3..eb403adb87e0a 100644 --- a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp +++ b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp @@ -15,6 +15,7 @@ #include namespace mlir { +using namespace presburger_utils; /// Take a snapshot, add constraints making the set empty, and rollback. /// The set should not be empty after rolling back. We add additional @@ -406,9 +407,9 @@ TEST(Simplextest, pivotRedundantRegressionTest) { // After the rollback, the only remaining constraint is x <= -1. // The maximum value of x should be -1. 
simplex.rollback(snapshot); - Optional maxX = + MaybeOptimum maxX = simplex.computeOptimum(Simplex::Direction::Up, {1, 0, 0}); - EXPECT_TRUE(maxX.hasValue() && *maxX == Fraction(-1, 1)); + EXPECT_TRUE(maxX.isBounded() && *maxX == Fraction(-1, 1)); } TEST(SimplexTest, addInequality_already_redundant) { @@ -440,8 +441,9 @@ TEST(SimplexTest, appendVariable) { EXPECT_EQ(simplex.getNumVariables(), 2u); EXPECT_EQ(simplex.getNumConstraints(), 2u); - EXPECT_EQ(simplex.computeIntegerBounds({0, 1, 0}), - std::make_pair(Optional(yMin), Optional(yMax))); + EXPECT_EQ( + simplex.computeIntegerBounds({0, 1, 0}), + std::make_pair(MaybeOptimum(yMin), MaybeOptimum(yMax))); simplex.rollback(snapshot1); EXPECT_EQ(simplex.getNumVariables(), 1u); From fa0f90bc55ed536e1488648255278ce9029cfa59 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Thu, 17 Feb 2022 10:42:15 -0500 Subject: [PATCH 349/748] [HIP] Support linking archive of bundled bitcode HIP programs compiled with -c -fgpu-rdc generate clang-offload-bundler bundles which contain bitcode for different GPUs. Such files can be archived to an archive file which can be linked with HIP programs with -fgpu-rdc. This patch adds support of linking archive of bundled bitcode. When an archive of bundled bitcode is passed to clang by -l, for each GPU specified through --offload-arch, clang extracts bitcode from the archive and creates a new archive for that GPU and passes it to lld.
Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D120070 Fixes: SWDEV-321741, SWDEV-315773 --- clang/lib/Driver/ToolChains/HIPAMD.cpp | 8 +++++++ clang/test/Driver/clang-offload-bundler.c | 22 +++++++++++++++++++ clang/test/Driver/hip-link-bundle-archive.hip | 14 ++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 clang/test/Driver/hip-link-bundle-archive.hip diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index 6d553791b394b..4a952530993e8 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -121,6 +121,14 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, for (auto Input : Inputs) LldArgs.push_back(Input.getFilename()); + // Look for archive of bundled bitcode in arguments, and add temporary files + // for the extracted archive of bitcode to inputs. + auto TargetID = Args.getLastArgValue(options::OPT_mcpu_EQ); + AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, LldArgs, "amdgcn", + TargetID, + /*IsBitCodeSDL=*/true, + /*PostClangLink=*/false); + const char *Lld = Args.MakeArgString(getToolChain().GetProgramPath("lld")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Lld, LldArgs, Inputs, Output)); diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c index a307af4473a1d..eab4dbc7e3be0 100644 --- a/clang/test/Driver/clang-offload-bundler.c +++ b/clang/test/Driver/clang-offload-bundler.c @@ -365,6 +365,28 @@ // CKLST2-NOT: openmp-powerpc64le-ibm-linux-gnu // CKLST2-NOT: openmp-x86_64-pc-linux-gnu +// +// Check unbundling archive for HIP. +// +// When the input to clang-offload-bundler is an archive of bundled bitcodes, +// for each target, clang-offload-bundler extracts the bitcode from each +// bundle and archives them. Therefore for each target, the output is an +// archive of unbundled bitcodes. 
+// +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -inputs=%t.tgt1,%t.tgt2 -outputs=%T/hip_bundle1.bc +// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -inputs=%t.tgt1,%t.tgt2 -outputs=%T/hip_bundle2.bc +// RUN: llvm-ar cr %T/hip_archive.a %T/hip_bundle1.bc %T/hip_bundle2.bc +// RUN: clang-offload-bundler -unbundle -type=a -targets=hip-amdgcn-amd-amdhsa--gfx900,hip-amdgcn-amd-amdhsa--gfx906 \ +// RUN: -outputs=%T/hip_900.a,%T/hip_906.a -inputs=%T/hip_archive.a +// RUN: llvm-ar t %T/hip_900.a | FileCheck -check-prefix=HIP-AR-900 %s +// RUN: llvm-ar t %T/hip_906.a | FileCheck -check-prefix=HIP-AR-906 %s +// HIP-AR-900-DAG: hip_bundle1-hip-amdgcn-amd-amdhsa--gfx900 +// HIP-AR-900-DAG: hip_bundle2-hip-amdgcn-amd-amdhsa--gfx900 +// HIP-AR-906-DAG: hip_bundle1-hip-amdgcn-amd-amdhsa--gfx906 +// HIP-AR-906-DAG: hip_bundle2-hip-amdgcn-amd-amdhsa--gfx906 + // // Check bundling without host target is allowed for HIP. // diff --git a/clang/test/Driver/hip-link-bundle-archive.hip b/clang/test/Driver/hip-link-bundle-archive.hip new file mode 100644 index 0000000000000..4b97844faf46c --- /dev/null +++ b/clang/test/Driver/hip-link-bundle-archive.hip @@ -0,0 +1,14 @@ +// REQUIRES: clang-driver, x86-registered-target, amdgpu-registered-target + +// RUN: touch %T/libhipBundled.a + +// Check clang unbundle the archive and link them by lld. 
+ +// RUN: %clang -### --offload-arch=gfx906 --offload-arch=gfx1030 \ +// RUN: -nogpulib %s -fgpu-rdc -L%T -lhipBundled \ +// RUN: 2>&1 | FileCheck -check-prefix=CHECK %s + +// CHECK: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-inputs={{.*}}libhipBundled.a" "-targets=hip-amdgcn-amd-amdhsa-gfx1030" "-outputs=[[A1030:.*\.a]]" "-allow-missing-bundles" +// CHECK: "{{.*}}lld" {{.*}}"-plugin-opt=mcpu=gfx1030" {{.*}} "[[A1030]]" +// CHECK: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-inputs={{.*}}libhipBundled.a" "-targets=hip-amdgcn-amd-amdhsa-gfx906" "-outputs=[[A906:.*\.a]]" "-allow-missing-bundles" +// CHECK: "{{.*}}lld" {{.*}}"-plugin-opt=mcpu=gfx906" {{.*}} "[[A906]]" From b09e63bad1e53eccd18b9920d10b489b47bd7634 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Sun, 20 Feb 2022 00:53:09 -0800 Subject: [PATCH 350/748] [AArch64][GlobalISel] Implement combines for boolean G_SELECT->bitwise ops. Differential Revision: https://reviews.llvm.org/D117160 --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 3 + .../include/llvm/Target/GlobalISel/Combine.td | 10 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 57 ++++++ .../AArch64/GlobalISel/combine-select.mir | 182 +++++++++++++++++- llvm/test/CodeGen/AArch64/arm64-xaluo.ll | 52 ++--- 5 files changed, 278 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 7e754b8b2ffdf..05e6da5ff1e7c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -727,6 +727,9 @@ class CombinerHelper { bool matchCombineFSubFpExtFNegFMulToFMadOrFMA(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Fold boolean selects to logical operations. 
+ bool matchSelectToLogical(MachineInstr &MI, BuildFnTy &MatchInfo); + private: /// Given a non-indexed load or store instruction \p MI, find an offset that /// can be usefully and legally folded into it as a post-indexing operation. diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index e89c50e467935..66f7463445300 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -284,6 +284,13 @@ def select_constant_cmp: GICombineRule< (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }]) >; +def select_to_logical : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SELECT):$root, + [{ return Helper.matchSelectToLogical(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }]) +>; + // Fold x op 0 -> x def right_identity_zero: GICombineRule< (defs root:$root), @@ -907,7 +914,8 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend, def phi_combines : GICombineGroup<[extend_through_phis]>; -def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp]>; +def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp, + select_to_logical]>; def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, mul_by_neg_one]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 83fde833f6473..5644eea5f07a3 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5539,6 +5539,63 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA( return false; } +bool CombinerHelper::matchSelectToLogical(MachineInstr &MI, + BuildFnTy &MatchInfo) { + GSelect &Sel = cast(MI); + Register DstReg = Sel.getReg(0); + Register Cond = Sel.getCondReg(); + Register TrueReg = 
Sel.getTrueReg(); + Register FalseReg = Sel.getFalseReg(); + + auto *TrueDef = getDefIgnoringCopies(TrueReg, MRI); + auto *FalseDef = getDefIgnoringCopies(FalseReg, MRI); + + const LLT CondTy = MRI.getType(Cond); + const LLT OpTy = MRI.getType(TrueReg); + if (CondTy != OpTy || OpTy.getScalarSizeInBits() != 1) + return false; + + // We have a boolean select. + + // select Cond, Cond, F --> or Cond, F + // select Cond, 1, F --> or Cond, F + auto MaybeCstTrue = isConstantOrConstantSplatVector(*TrueDef, MRI); + if (Cond == TrueReg || (MaybeCstTrue && MaybeCstTrue->isOne())) { + MatchInfo = [=](MachineIRBuilder &MIB) { + MIB.buildOr(DstReg, Cond, FalseReg); + }; + return true; + } + + // select Cond, T, Cond --> and Cond, T + // select Cond, T, 0 --> and Cond, T + auto MaybeCstFalse = isConstantOrConstantSplatVector(*FalseDef, MRI); + if (Cond == FalseReg || (MaybeCstFalse && MaybeCstFalse->isZero())) { + MatchInfo = [=](MachineIRBuilder &MIB) { + MIB.buildAnd(DstReg, Cond, TrueReg); + }; + return true; + } + + // select Cond, T, 1 --> or (not Cond), T + if (MaybeCstFalse && MaybeCstFalse->isOne()) { + MatchInfo = [=](MachineIRBuilder &MIB) { + MIB.buildOr(DstReg, MIB.buildNot(OpTy, Cond), TrueReg); + }; + return true; + } + + // select Cond, 0, F --> and (not Cond), F + if (MaybeCstTrue && MaybeCstTrue->isZero()) { + MatchInfo = [=](MachineIRBuilder &MIB) { + MIB.buildAnd(DstReg, MIB.buildNot(OpTy, Cond), FalseReg); + }; + return true; + } + return false; +} + + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir index 6f2e8121c4bcd..4447716f8a69c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc 
-run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown- --aarch64postlegalizercombinerhelper-only-enable-rule="select_to_logical" %s -o - | FileCheck %s # RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +# REQUIRES: asserts --- # select (c, x, x) -> x name: test_combine_select_same_res @@ -92,3 +93,182 @@ body: | %3:_(<4 x s32>) = G_SELECT %condvec, %0, %1 $q0 = COPY %3(<4 x s32>) ... +--- +# select Cond, Cond, F --> or Cond, F +name: bool_cond_cond_false +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: bool_cond_cond_false + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %sel:_(s1) = G_SELECT %c, %c, %f + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
+--- +# select Cond, 1, F --> or Cond, F +name: bool_cond_one_false +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: bool_cond_one_false + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %one:_(s1) = G_CONSTANT i1 1 + %sel:_(s1) = G_SELECT %c, %one, %f + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select Cond, 1, F --> or Cond, F +name: bool_cond_one_false_vector +body: | + bb.1: + liveins: $d0, $d1, $d2 + ; CHECK-LABEL: name: bool_cond_one_false_vector + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d2 + ; CHECK-NEXT: %c:_(<2 x s1>) = G_TRUNC [[COPY]](<2 x s32>) + ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[COPY1]](<2 x s32>) + ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, %f + ; CHECK-NEXT: %ext:_(<2 x s32>) = G_ANYEXT %sel(<2 x s1>) + ; CHECK-NEXT: $d0 = COPY %ext(<2 x s32>) + %0:_(<2 x s32>) = COPY $d0 + %1:_(<2 x s32>) = COPY $d1 + %2:_(<2 x s32>) = COPY $d2 + %c:_(<2 x s1>) = G_TRUNC %0 + %t:_(<2 x s1>) = G_TRUNC %1 + %f:_(<2 x s1>) = G_TRUNC %2 + %one:_(s1) = G_CONSTANT i1 1 + %one_vec:_(<2 x s1>) = G_BUILD_VECTOR %one, %one + %sel:_(<2 x s1>) = G_SELECT %c, %one_vec, %f + %ext:_(<2 x s32>) = G_ANYEXT %sel + $d0 = COPY %ext(<2 x s32>) +... 
+--- +# select Cond, T, Cond --> and Cond, T +name: bool_cond_true_cond +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: bool_cond_true_cond + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %one:_(s1) = G_CONSTANT i1 1 + %sel:_(s1) = G_SELECT %c, %t, %c + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select Cond, T, 0 --> and Cond, T +name: bool_cond_true_zero +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: bool_cond_true_zero + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %zero:_(s1) = G_CONSTANT i1 0 + %sel:_(s1) = G_SELECT %c, %t, %zero + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
+--- +# select Cond, T, 1 --> or (not Cond), T +name: bool_cond_true_one +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: bool_cond_true_one + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one + ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], %t + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %one:_(s1) = G_CONSTANT i1 1 + %sel:_(s1) = G_SELECT %c, %t, %one + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... +--- +# select Cond, 0, F --> and (not Cond), F +name: bool_cond_zero_false +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; CHECK-LABEL: name: bool_cond_zero_false + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]] + ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], %f + ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) + ; CHECK-NEXT: $w0 = COPY %ext(s32) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %c:_(s1) = G_TRUNC %0 + %t:_(s1) = G_TRUNC %1 + %f:_(s1) = G_TRUNC %2 + %zero:_(s1) = G_CONSTANT i1 0 + %sel:_(s1) = G_SELECT %c, %zero, %f + %ext:_(s32) = G_ANYEXT %sel + $w0 = COPY %ext(s32) +... 
diff --git a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll index 0ce5b8ab8e400..05b444a6110e0 100644 --- a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll +++ b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll @@ -2159,8 +2159,8 @@ define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmn w0, w1 ; GISEL-NEXT: cset w8, vs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) @@ -2195,8 +2195,8 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmn x0, x1 ; GISEL-NEXT: cset w8, vs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) @@ -2231,8 +2231,8 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmn w0, w1 ; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) @@ -2267,8 +2267,8 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmn x0, x1 ; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) @@ -2303,8 +2303,8 @@ define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmp w0, w1 ; GISEL-NEXT: cset w8, vs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = 
call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) @@ -2339,8 +2339,8 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmp x0, x1 ; GISEL-NEXT: cset w8, vs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) @@ -2375,8 +2375,8 @@ define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmp w0, w1 ; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) @@ -2411,8 +2411,8 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmp x0, x1 ; GISEL-NEXT: cset w8, lo -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) @@ -2451,7 +2451,8 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { ; GISEL-NEXT: mul w9, w0, w1 ; GISEL-NEXT: asr x8, x8, #32 ; GISEL-NEXT: cmp w8, w9, asr #31 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: cset w8, ne +; GISEL-NEXT: eor w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) @@ -2491,7 +2492,8 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; GISEL-NEXT: mul x8, x0, x1 ; GISEL-NEXT: smulh x9, x0, x1 ; GISEL-NEXT: cmp x9, x8, asr #63 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: cset w8, ne +; GISEL-NEXT: eor w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) @@ -2526,8 +2528,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmn x0, x0 ; GISEL-NEXT: cset w8, vs -; 
GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 2) @@ -2565,7 +2567,8 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { ; GISEL-NEXT: umull x8, w0, w1 ; GISEL-NEXT: lsr x8, x8, #32 ; GISEL-NEXT: cmp w8, #0 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: cset w8, ne +; GISEL-NEXT: eor w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) @@ -2602,7 +2605,8 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: umulh x8, x0, x1 ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: cset w8, ne +; GISEL-NEXT: eor w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) @@ -2637,8 +2641,8 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: cmn x0, x0 ; GISEL-NEXT: cset w8, hs -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: cset w0, eq +; GISEL-NEXT: eor w8, w8, #0x1 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2) From 2a46450849de6904fc64f9a65303b20ca7fc9dbd Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Sun, 20 Feb 2022 01:13:34 -0800 Subject: [PATCH 351/748] [AArch64][GlobalISel] Optimize conjunctions of compares to conditional compares. This is a partial port of the same optimization from AArch64ISelLowering, although the original handles more cases when generating regular compares instead of this one which just does it when selecting G_SELECTs. For more detailed comments see the original comments for emitConditionalComparison() in AArch64ISelLowering. Gives minor code size improvements. 
Differential Revision: https://reviews.llvm.org/D117166 --- .../CodeGen/GlobalISel/GenericMachineInstrs.h | 32 ++ .../GISel/AArch64InstructionSelector.cpp | 381 +++++++++++++++++- llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 260 ++++-------- 3 files changed, 479 insertions(+), 194 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 7103656365b1b..58fe48200e732 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H #define LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H +#include "llvm/IR/Instructions.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -226,6 +227,37 @@ class GSelect : public GenericMachineInstr { } }; +/// Represent a G_ICMP or G_FCMP. +class GAnyCmp : public GenericMachineInstr { +public: + CmpInst::Predicate getCond() const { + return static_cast(getOperand(1).getPredicate()); + } + Register getLHSReg() const { return getReg(2); } + Register getRHSReg() const { return getReg(3); } + + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_ICMP || + MI->getOpcode() == TargetOpcode::G_FCMP; + } +}; + +/// Represent a G_ICMP. +class GICmp : public GAnyCmp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_ICMP; + } +}; + +/// Represent a G_FCMP. 
+class GFCmp : public GAnyCmp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_FCMP; + } +}; + } // namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 8a79d2426c8f0..0b065398ccee5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" @@ -63,6 +64,7 @@ namespace { #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET + class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, @@ -294,6 +296,20 @@ class AArch64InstructionSelector : public InstructionSelector { emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; + /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). + /// In some cases this is even possible with OR operations in the expression. 
+ MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, + MachineIRBuilder &MIB) const; + MachineInstr *emitConditionalComparison(Register LHS, Register RHS, + CmpInst::Predicate CC, + AArch64CC::CondCode Predicate, + AArch64CC::CondCode OutCC, + MachineIRBuilder &MIB) const; + MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, + bool Negate, Register CCOp, + AArch64CC::CondCode Predicate, + MachineIRBuilder &MIB) const; + /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. /// \p IsNegative is true if the test should be "not zero". /// This will also optimize the test bit instruction when possible. @@ -425,7 +441,8 @@ class AArch64InstructionSelector : public InstructionSelector { void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); // Optimization methods. - bool tryOptSelect(MachineInstr &MI); + bool tryOptSelect(GSelect &Sel); + bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; @@ -1310,6 +1327,90 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { } } +/// changeFPCCToAArch64CC - Convert an IR fp condition code to an AArch64 CC. 
+static void changeFPCCToAArch64CC(CmpInst::Predicate CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case CmpInst::FCMP_OEQ: + CondCode = AArch64CC::EQ; + break; + case CmpInst::FCMP_OGT: + CondCode = AArch64CC::GT; + break; + case CmpInst::FCMP_OGE: + CondCode = AArch64CC::GE; + break; + case CmpInst::FCMP_OLT: + CondCode = AArch64CC::MI; + break; + case CmpInst::FCMP_OLE: + CondCode = AArch64CC::LS; + break; + case CmpInst::FCMP_ONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case CmpInst::FCMP_ORD: + CondCode = AArch64CC::VC; + break; + case CmpInst::FCMP_UNO: + CondCode = AArch64CC::VS; + break; + case CmpInst::FCMP_UEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case CmpInst::FCMP_UGT: + CondCode = AArch64CC::HI; + break; + case CmpInst::FCMP_UGE: + CondCode = AArch64CC::PL; + break; + case CmpInst::FCMP_ULT: + CondCode = AArch64CC::LT; + break; + case CmpInst::FCMP_ULE: + CondCode = AArch64CC::LE; + break; + case CmpInst::FCMP_UNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// Convert an IR fp condition code to an AArch64 CC. +/// This differs from changeFPCCToAArch64CC in that it returns cond codes that +/// should be AND'ed instead of OR'ed. 
+static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + assert(CondCode2 == AArch64CC::AL); + break; + case CmpInst::FCMP_ONE: + // (a one b) + // == ((a olt b) || (a ogt b)) + // == ((a ord b) && (a une b)) + CondCode = AArch64CC::VC; + CondCode2 = AArch64CC::NE; + break; + case CmpInst::FCMP_UEQ: + // (a ueq b) + // == ((a uno b) || (a oeq b)) + // == ((a ule b) && (a uge b)) + CondCode = AArch64CC::PL; + CondCode2 = AArch64CC::LE; + break; + } +} + /// Return a register which can be used as a bit to test in a TB(N)Z. static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, MachineRegisterInfo &MRI) { @@ -3292,17 +3393,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_SELECT: { - if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { + auto &Sel = cast(I); + if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) { LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty << ", expected: " << LLT::scalar(1) << '\n'); return false; } - const Register CondReg = I.getOperand(1).getReg(); - const Register TReg = I.getOperand(2).getReg(); - const Register FReg = I.getOperand(3).getReg(); + const Register CondReg = Sel.getCondReg(); + const Register TReg = Sel.getTrueReg(); + const Register FReg = Sel.getFalseReg(); - if (tryOptSelect(I)) + if (tryOptSelect(Sel)) return true; // Make sure to use an unused vreg instead of wzr, so that the peephole @@ -3311,9 +3413,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB)) + if 
(!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) return false; - I.eraseFromParent(); + Sel.eraseFromParent(); return true; } case TargetOpcode::G_ICMP: { @@ -4702,7 +4804,263 @@ AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, } } -bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { +/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be +/// expressed as a conjunction. +/// \param CanNegate Set to true if we can negate the whole sub-tree just by +/// changing the conditions on the CMP tests. +/// (this means we can call emitConjunctionRec() with +/// Negate==true on this sub-tree) +/// \param MustBeFirst Set to true if this subtree needs to be negated and we +/// cannot do the negation naturally. We are required to +/// emit the subtree first in this case. +/// \param WillNegate Is true if are called when the result of this +/// subexpression must be negated. This happens when the +/// outer expression is an OR. We can use this fact to know +/// that we have a double negation (or (or ...) ...) that +/// can be implemented for free. +static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, + bool WillNegate, MachineRegisterInfo &MRI, + unsigned Depth = 0) { + if (!MRI.hasOneNonDBGUse(Val)) + return false; + MachineInstr *ValDef = MRI.getVRegDef(Val); + unsigned Opcode = ValDef->getOpcode(); + if (Opcode == TargetOpcode::G_TRUNC) { + // Look through a trunc. + Val = ValDef->getOperand(1).getReg(); + ValDef = MRI.getVRegDef(Val); + Opcode = ValDef->getOpcode(); + } + if (isa(ValDef)) { + CanNegate = true; + MustBeFirst = false; + return true; + } + // Protect against exponential runtime and stack overflow. 
+ if (Depth > 6) + return false; + if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { + bool IsOR = Opcode == TargetOpcode::G_OR; + Register O0 = ValDef->getOperand(1).getReg(); + Register O1 = ValDef->getOperand(2).getReg(); + bool CanNegateL; + bool MustBeFirstL; + if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) + return false; + bool CanNegateR; + bool MustBeFirstR; + if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) + return false; + + if (MustBeFirstL && MustBeFirstR) + return false; + + if (IsOR) { + // For an OR expression we need to be able to naturally negate at least + // one side or we cannot do the transformation at all. + if (!CanNegateL && !CanNegateR) + return false; + // If we the result of the OR will be negated and we can naturally negate + // the leafs, then this sub-tree as a whole negates naturally. + CanNegate = WillNegate && CanNegateL && CanNegateR; + // If we cannot naturally negate the whole sub-tree, then this must be + // emitted first. + MustBeFirst = !CanNegate; + } else { + assert(Opcode == TargetOpcode::G_AND && "Must be G_AND"); + // We cannot naturally negate an AND operation. + CanNegate = false; + MustBeFirst = MustBeFirstL || MustBeFirstR; + } + return true; + } + return false; +} + +MachineInstr *AArch64InstructionSelector::emitConditionalComparison( + Register LHS, Register RHS, CmpInst::Predicate CC, + AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, + MachineIRBuilder &MIB) const { + // TODO: emit CMN as an optimization. + auto &MRI = *MIB.getMRI(); + LLT OpTy = MRI.getType(LHS); + assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); + unsigned CCmpOpc; + if (CmpInst::isIntPredicate(CC)) { + CCmpOpc = OpTy.getSizeInBits() == 32 ? 
AArch64::CCMPWr : AArch64::CCMPXr; + } else { + switch (OpTy.getSizeInBits()) { + case 16: + CCmpOpc = AArch64::FCCMPHrr; + break; + case 32: + CCmpOpc = AArch64::FCCMPSrr; + break; + case 64: + CCmpOpc = AArch64::FCCMPDrr; + break; + default: + return nullptr; + } + } + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + auto CCmp = + MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate); + constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); + return &*CCmp; +} + +MachineInstr *AArch64InstructionSelector::emitConjunctionRec( + Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, + AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { + // We're at a tree leaf, produce a conditional comparison operation. + auto &MRI = *MIB.getMRI(); + MachineInstr *ValDef = MRI.getVRegDef(Val); + unsigned Opcode = ValDef->getOpcode(); + if (Opcode == TargetOpcode::G_TRUNC) { + // Look through a trunc. + Val = ValDef->getOperand(1).getReg(); + ValDef = MRI.getVRegDef(Val); + Opcode = ValDef->getOpcode(); + } + if (auto *Cmp = dyn_cast(ValDef)) { + Register LHS = Cmp->getLHSReg(); + Register RHS = Cmp->getRHSReg(); + CmpInst::Predicate CC = Cmp->getCond(); + if (Negate) + CC = CmpInst::getInversePredicate(CC); + // We only handle integer compares for now. + if (isa(Cmp)) { + OutCC = changeICMPPredToAArch64CC(CC); + } else { + // Handle special FP cases. + AArch64CC::CondCode ExtraCC; + changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); + // Some floating point conditions can't be tested with a single condition + // code. Construct an additional comparison in this case. 
+ if (ExtraCC != AArch64CC::AL) { + MachineInstr *ExtraCmp; + if (!CCOp) + ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); + else + ExtraCmp = + emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); + CCOp = ExtraCmp->getOperand(0).getReg(); + Predicate = ExtraCC; + } + } + + // Produce a normal comparison if we are first in the chain + if (!CCOp) { + auto Dst = MRI.cloneVirtualRegister(LHS); + if (isa(Cmp)) + return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); + return emitFPCompare(Cmp->getOperand(2).getReg(), + Cmp->getOperand(3).getReg(), MIB); + } + // Otherwise produce a ccmp. + return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); + } + assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); + + bool IsOR = Opcode == TargetOpcode::G_OR; + + Register LHS = ValDef->getOperand(1).getReg(); + bool CanNegateL; + bool MustBeFirstL; + bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); + assert(ValidL && "Valid conjunction/disjunction tree"); + (void)ValidL; + + Register RHS = ValDef->getOperand(2).getReg(); + bool CanNegateR; + bool MustBeFirstR; + bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); + assert(ValidR && "Valid conjunction/disjunction tree"); + (void)ValidR; + + // Swap sub-tree that must come first to the right side. + if (MustBeFirstL) { + assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); + std::swap(LHS, RHS); + std::swap(CanNegateL, CanNegateR); + std::swap(MustBeFirstL, MustBeFirstR); + } + + bool NegateR; + bool NegateAfterR; + bool NegateL; + bool NegateAfterAll; + if (Opcode == TargetOpcode::G_OR) { + // Swap the sub-tree that we can negate naturally to the left. 
+ if (!CanNegateL) { + assert(CanNegateR && "at least one side must be negatable"); + assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); + assert(!Negate); + std::swap(LHS, RHS); + NegateR = false; + NegateAfterR = true; + } else { + // Negate the left sub-tree if possible, otherwise negate the result. + NegateR = CanNegateR; + NegateAfterR = !CanNegateR; + } + NegateL = true; + NegateAfterAll = !Negate; + } else { + assert(Opcode == TargetOpcode::G_AND && + "Valid conjunction/disjunction tree"); + assert(!Negate && "Valid conjunction/disjunction tree"); + + NegateL = false; + NegateR = false; + NegateAfterR = false; + NegateAfterAll = false; + } + + // Emit sub-trees. + AArch64CC::CondCode RHSCC; + MachineInstr *CmpR = + emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); + if (NegateAfterR) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + MachineInstr *CmpL = emitConjunctionRec( + LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); + if (NegateAfterAll) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +MachineInstr *AArch64InstructionSelector::emitConjunction( + Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { + bool DummyCanNegate; + bool DummyMustBeFirst; + if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, + *MIB.getMRI())) + return nullptr; + return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); +} + +bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, + MachineInstr &CondMI) { + MachineRegisterInfo &MRI = *MIB.getMRI(); + AArch64CC::CondCode AArch64CC; + MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); + if (!ConjMI) + return false; + auto CSel = + MIB.buildInstr(MRI.getType(SelI.getReg(0)).getSizeInBits() == 32 + ? 
AArch64::CSELWr + : AArch64::CSELXr, + {SelI.getReg(0)}, {SelI.getTrueReg(), SelI.getFalseReg()}) + .addImm(AArch64CC); + constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI); + SelI.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { MachineRegisterInfo &MRI = *MIB.getMRI(); // We want to recognize this pattern: // @@ -4755,8 +5113,11 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { return false; unsigned CondOpc = CondDef->getOpcode(); - if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) + if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { + if (tryOptSelectConjunction(I, *CondDef)) + return true; return false; + } AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index f81ed69b137f6..58bf419715519 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -569,14 +569,10 @@ define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_and: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: mov w9, #5 -; GISEL-NEXT: cmp w9, w1 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel x0, x2, x3, ne +; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: cmp w8, w1 +; GISEL-NEXT: ccmp w0, w1, #0, ne +; GISEL-NEXT: csel x0, x2, x3, lt ; GISEL-NEXT: ret %1 = icmp slt i32 %w0, %w1 %2 = icmp ne i32 5, %w1 @@ -595,14 +591,10 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_or: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: mov w9, #5 -; GISEL-NEXT: cmp w9, w1 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel x0, x2, x3, ne +; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: cmp w8, w1 +; 
GISEL-NEXT: ccmp w0, w1, #8, eq +; GISEL-NEXT: csel x0, x2, x3, lt ; GISEL-NEXT: ret %1 = icmp slt i32 %w0, %w1 %2 = icmp ne i32 5, %w1 @@ -623,17 +615,13 @@ define i64 @gccbug(i64 %x0, i64 %x1) { ; ; GISEL-LABEL: gccbug: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp x1, #0 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: mov w9, #2 +; GISEL-NEXT: mov w8, #2 +; GISEL-NEXT: mov w9, #4 +; GISEL-NEXT: mov w10, #1 ; GISEL-NEXT: cmp x0, #2 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: cmp x0, #4 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w11, w10 -; GISEL-NEXT: and w8, w10, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csinc x0, x9, xzr, ne +; GISEL-NEXT: ccmp x0, x9, #4, ne +; GISEL-NEXT: ccmp x1, xzr, #0, eq +; GISEL-NEXT: csel x0, x8, x10, eq ; GISEL-NEXT: ret %cmp0 = icmp eq i64 %x1, 0 %cmp1 = icmp eq i64 %x0, 2 @@ -658,19 +646,13 @@ define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) { ; ; GISEL-LABEL: select_ororand: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w1, #13 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: cmp w2, #2 -; GISEL-NEXT: cset w10, lt +; GISEL-NEXT: mov w8, #13 +; GISEL-NEXT: mov w9, #2 ; GISEL-NEXT: cmp w3, #4 -; GISEL-NEXT: cset w11, gt -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: and w9, w10, w11 -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w3, wzr, ne +; GISEL-NEXT: ccmp w2, w9, #0, gt +; GISEL-NEXT: ccmp w1, w8, #2, ge +; GISEL-NEXT: ccmp w0, wzr, #4, ls +; GISEL-NEXT: csel w0, w3, wzr, eq ; GISEL-NEXT: ret %c0 = icmp eq i32 %w0, 0 %c1 = icmp ugt i32 %w1, 13 @@ -694,16 +676,10 @@ define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) { ; ; GISEL-LABEL: select_andor: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq ; GISEL-NEXT: cmp w1, w2 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: orr w9, w10, w9 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; 
GISEL-NEXT: ccmp w0, wzr, #4, lt +; GISEL-NEXT: ccmp w0, w1, #0, eq +; GISEL-NEXT: csel w0, w0, w1, eq ; GISEL-NEXT: ret %c0 = icmp eq i32 %v1, %v2 %c1 = icmp sge i32 %v2, %v3 @@ -872,14 +848,9 @@ define i32 @select_and_olt_one(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_olt_one: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #4, mi +; GISEL-NEXT: fccmp d2, d3, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vc ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp one double %v2, %v3 @@ -900,14 +871,9 @@ define i32 @select_and_one_olt(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_one_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #1, ne +; GISEL-NEXT: fccmp d2, d3, #0, vc +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp one double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -928,14 +894,9 @@ define i32 @select_and_olt_ueq(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_olt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: cset w10, vs -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #0, mi +; GISEL-NEXT: fccmp d2, d3, #8, le +; GISEL-NEXT: csel w0, w0, w1, pl ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -956,14 +917,9 @@ 
define i32 @select_and_ueq_olt(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_ueq_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cset w9, vs -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #8, le +; GISEL-NEXT: fccmp d2, d3, #0, pl +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp ueq double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -984,14 +940,9 @@ define i32 @select_or_olt_one(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_olt_one: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #0, pl +; GISEL-NEXT: fccmp d2, d3, #8, le +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp one double %v2, %v3 @@ -1012,14 +963,9 @@ define i32 @select_or_one_olt(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_one_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #8, le +; GISEL-NEXT: fccmp d2, d3, #8, pl +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp one double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -1040,14 +986,9 @@ define i32 @select_or_olt_ueq(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_olt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: 
fcmp d2, d3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: cset w10, vs -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #4, pl +; GISEL-NEXT: fccmp d2, d3, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vs ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -1068,14 +1009,9 @@ define i32 @select_or_ueq_olt(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_ueq_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cset w9, vs -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #1, ne +; GISEL-NEXT: fccmp d2, d3, #8, vc +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp ueq double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -1097,17 +1033,10 @@ define i32 @select_or_olt_ogt_ueq(double %v0, double %v1, double %v2, double %v3 ; GISEL-LABEL: select_or_olt_ogt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: fcmp d4, d5 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: cset w11, vs -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: orr w8, w10, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #0, pl +; GISEL-NEXT: fccmp d4, d5, #4, le +; GISEL-NEXT: fccmp d4, d5, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vs ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ogt double %v2, %v3 @@ -1131,17 +1060,10 @@ define i32 @select_or_olt_ueq_ogt(double %v0, double %v1, double %v2, double %v3 ; GISEL-LABEL: select_or_olt_ueq_ogt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: cset 
w10, vs -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: fcmp d4, d5 -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: orr w8, w10, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #4, pl +; GISEL-NEXT: fccmp d2, d3, #1, ne +; GISEL-NEXT: fccmp d4, d5, #0, vc +; GISEL-NEXT: csel w0, w0, w1, gt ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -1170,15 +1092,11 @@ define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: fcvt s1, h1 +; GISEL-NEXT: fcvt s2, h2 +; GISEL-NEXT: fcvt s3, h3 ; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcvt s0, h2 -; GISEL-NEXT: fcvt s1, h3 -; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp s2, s3, #8, mi +; GISEL-NEXT: csel w0, w0, w1, ge ; GISEL-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp oge half %v2, %v3 @@ -1204,17 +1122,12 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: fcvt s1, h1 +; GISEL-NEXT: fcvt s2, h2 +; GISEL-NEXT: fcvt s3, h3 ; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcvt s0, h2 -; GISEL-NEXT: fcvt s1, h3 -; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp s2, s3, #4, mi +; GISEL-NEXT: fccmp s2, s3, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vc ; GISEL-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp one half %v2, %v3 @@ -1294,18 +1207,11 @@ define i32 @deep_or(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; 
GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: cmp w2, #15 -; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: mov w8, #15 ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: and w9, w10, w9 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: ccmp w2, w8, #4, ne +; GISEL-NEXT: ccmp w1, wzr, #4, eq +; GISEL-NEXT: ccmp w0, wzr, #4, ne ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 @@ -1333,18 +1239,11 @@ define i32 @deep_or1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or1: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: cmp w2, #15 -; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: mov w8, #15 ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: and w8, w8, w10 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: ccmp w2, w8, #4, ne +; GISEL-NEXT: ccmp w0, wzr, #4, eq +; GISEL-NEXT: ccmp w1, wzr, #4, ne ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 @@ -1372,18 +1271,11 @@ define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or2: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: cmp w2, #15 -; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: mov w8, #15 ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: and w8, w8, w10 -; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: ccmp w2, w8, #4, ne +; GISEL-NEXT: ccmp w1, wzr, #4, eq +; GISEL-NEXT: ccmp w0, wzr, #4, ne ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 From 8a3f9a584ad43369cf6a034dc875ebfca76d9033 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Thu, 11 Feb 2021 00:15:56 
+0000 Subject: [PATCH 352/748] [C++20][Modules][1/8] Track valid import state. In C++20 modules imports must be together and at the start of the module. Rather than growing more ad-hoc flags to test state, this keeps track of the phase of a valid module TU (first decl, global module frag, module, private module frag). If the phasing is broken (with some diagnostic) the pattern does not conform to a valid C++20 module, and we set the state accordingly. We can thus issue diagnostics when imports appear in the wrong places and decouple the C++20 modules state from other module variants (modules-ts and clang modules). Additionally, we attempt to diagnose wrong imports before trying to find the module where possible (the latter will generally emit an unhelpful diagnostic about the module not being available). Although this generally simplifies the handling of C++20 module import diagnostics, the motivation was that, in particular, it allows detecting invalid imports like: import module A; int some_decl(); import module B; where being in a module purview is insufficient to identify them.
Differential Revision: https://reviews.llvm.org/D118893 --- .../clang/Basic/DiagnosticParseKinds.td | 4 + clang/include/clang/Parse/Parser.h | 14 +- clang/include/clang/Sema/Sema.h | 15 +- clang/lib/Interpreter/IncrementalParser.cpp | 5 +- clang/lib/Parse/ParseAST.cpp | 5 +- clang/lib/Parse/ParseObjc.cpp | 3 +- clang/lib/Parse/Parser.cpp | 91 ++++++++++-- clang/lib/Sema/SemaModule.cpp | 46 ++++-- .../Modules/cxx20-import-diagnostics-a.cpp | 140 ++++++++++++++++++ 9 files changed, 286 insertions(+), 37 deletions(-) create mode 100644 clang/test/Modules/cxx20-import-diagnostics-a.cpp diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e23810f402365..f21e841bcdd38 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1539,6 +1539,10 @@ def err_private_module_fragment_expected_semi : Error< def err_missing_before_module_end : Error<"expected %0 at end of module">; def err_unsupported_module_partition : Error< "sorry, module partitions are not yet supported">; +def err_import_not_allowed_here : Error< + "imports must immediately follow the module declaration">; +def err_import_in_wrong_fragment : Error< + "module%select{| partition}0 imports cannot be in the %select{global|private}1 module fragment">; def err_export_empty : Error<"export declaration cannot be empty">; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 981800a7e2356..08d492a7ec721 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -464,14 +464,17 @@ class Parser : public CodeCompletionHandler { void Initialize(); /// Parse the first top-level declaration in a translation unit. - bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result); + bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState); /// ParseTopLevelDecl - Parse one top-level declaration. 
Returns true if /// the EOF was encountered. - bool ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl = false); + bool ParseTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState); bool ParseTopLevelDecl() { DeclGroupPtrTy Result; - return ParseTopLevelDecl(Result); + Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; + return ParseTopLevelDecl(Result, IS); } /// ConsumeToken - Consume the current 'peek token' and lex the next one. @@ -3491,8 +3494,9 @@ class Parser : public CodeCompletionHandler { //===--------------------------------------------------------------------===// // Modules - DeclGroupPtrTy ParseModuleDecl(bool IsFirstDecl); - Decl *ParseModuleImport(SourceLocation AtLoc); + DeclGroupPtrTy ParseModuleDecl(Sema::ModuleImportState &ImportState); + Decl *ParseModuleImport(SourceLocation AtLoc, + Sema::ModuleImportState &ImportState); bool parseMisplacedModuleImport(); bool tryParseMisplacedModuleImport() { tok::TokenKind Kind = Tok.getKind(); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index c1e846c55dee7..dfa12ad40b72a 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2949,11 +2949,24 @@ class Sema final { Implementation, ///< 'module X;' }; + /// An enumeration to represent the transition of states in parsing module + /// fragments and imports. If we are not parsing a C++20 TU, or we find + /// an error in state transition, the state is set to NotACXX20Module. + enum class ModuleImportState { + FirstDecl, ///< Parsing the first decl in a TU. + GlobalFragment, ///< after 'module;' but before 'module X;' + ImportAllowed, ///< after 'module X;' but before any non-import decl. + ImportFinished, ///< after any non-import decl. + PrivateFragment, ///< after 'module :private;'. + NotACXX20Module ///< Not a C++20 TU, or an invalid state was found. 
+ }; + /// The parser has processed a module-declaration that begins the definition /// of a module interface or implementation. DeclGroupPtrTy ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, ModuleDeclKind MDK, - ModuleIdPath Path, bool IsFirstDecl); + ModuleIdPath Path, + ModuleImportState &ImportState); /// The parser has processed a global-module-fragment declaration that begins /// the definition of the global module fragment of the current module unit. diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp index 4ade8b8bb0741..0f1ef3233a2a1 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -164,8 +164,9 @@ IncrementalParser::ParseOrWrapTopLevelDecl() { } Parser::DeclGroupPtrTy ADecl; - for (bool AtEOF = P->ParseFirstTopLevelDecl(ADecl); !AtEOF; - AtEOF = P->ParseTopLevelDecl(ADecl)) { + Sema::ModuleImportState ImportState; + for (bool AtEOF = P->ParseFirstTopLevelDecl(ADecl, ImportState); !AtEOF; + AtEOF = P->ParseTopLevelDecl(ADecl, ImportState)) { // If we got a null return and something *was* parsed, ignore it. This // is due to a top-level semicolon, an action override, or a parse error // skipping something. diff --git a/clang/lib/Parse/ParseAST.cpp b/clang/lib/Parse/ParseAST.cpp index 01510e8caf3b7..fd79ed3ca158b 100644 --- a/clang/lib/Parse/ParseAST.cpp +++ b/clang/lib/Parse/ParseAST.cpp @@ -154,8 +154,9 @@ void clang::ParseAST(Sema &S, bool PrintStats, bool SkipFunctionBodies) { llvm::TimeTraceScope TimeScope("Frontend"); P.Initialize(); Parser::DeclGroupPtrTy ADecl; - for (bool AtEOF = P.ParseFirstTopLevelDecl(ADecl); !AtEOF; - AtEOF = P.ParseTopLevelDecl(ADecl)) { + Sema::ModuleImportState ImportState; + for (bool AtEOF = P.ParseFirstTopLevelDecl(ADecl, ImportState); !AtEOF; + AtEOF = P.ParseTopLevelDecl(ADecl, ImportState)) { // If we got a null return and something *was* parsed, ignore it. 
This // is due to a top-level semicolon, an action override, or a parse error // skipping something. diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index f493ac9b92caf..08f131ed0d874 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -79,7 +79,8 @@ Parser::ParseObjCAtDirectives(ParsedAttributesWithRange &Attrs) { break; case tok::objc_import: if (getLangOpts().Modules || getLangOpts().DebuggerSupport) { - SingleDecl = ParseModuleImport(AtLoc); + Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; + SingleDecl = ParseModuleImport(AtLoc, IS); break; } Diag(AtLoc, diag::err_atimport); diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index ffa1e0f027f1d..87500a0405531 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -581,15 +581,20 @@ void Parser::DestroyTemplateIds() { /// top-level-declaration-seq[opt] private-module-fragment[opt] /// /// Note that in C, it is an error if there is no first declaration. -bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result) { +bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState) { Actions.ActOnStartOfTranslationUnit(); + // For C++20 modules, a module decl must be the first in the TU. We also + // need to track module imports. + ImportState = Sema::ModuleImportState::FirstDecl; + bool NoTopLevelDecls = ParseTopLevelDecl(Result, ImportState); + // C11 6.9p1 says translation units must have at least one top-level // declaration. C++ doesn't have this restriction. We also don't want to // complain if we have a precompiled header, although technically if the PCH // is empty we should still emit the (pedantic) diagnostic. // If the main file is a header, we're only pretending it's a TU; don't warn. 
- bool NoTopLevelDecls = ParseTopLevelDecl(Result, true); if (NoTopLevelDecls && !Actions.getASTContext().getExternalSource() && !getLangOpts().CPlusPlus && !getLangOpts().IsHeaderFile) Diag(diag::ext_empty_translation_unit); @@ -603,7 +608,8 @@ bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result) { /// top-level-declaration: /// declaration /// [C++20] module-import-declaration -bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { +bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState) { DestroyTemplateIdAnnotationsRAIIObj CleanupRAII(*this); // Skip over the EOF token, flagging end of previous input for incremental @@ -647,13 +653,12 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { case tok::kw_module: module_decl: - Result = ParseModuleDecl(IsFirstDecl); + Result = ParseModuleDecl(ImportState); return false; - // tok::kw_import is handled by ParseExternalDeclaration. (Under the Modules - // TS, an import can occur within an export block.) 
+ case tok::kw_import: import_decl: { - Decl *ImportDecl = ParseModuleImport(SourceLocation()); + Decl *ImportDecl = ParseModuleImport(SourceLocation(), ImportState); Result = Actions.ConvertDeclToDeclGroup(ImportDecl); return false; } @@ -669,12 +674,14 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { Actions.ActOnModuleBegin(Tok.getLocation(), reinterpret_cast( Tok.getAnnotationValue())); ConsumeAnnotationToken(); + ImportState = Sema::ModuleImportState::NotACXX20Module; return false; case tok::annot_module_end: Actions.ActOnModuleEnd(Tok.getLocation(), reinterpret_cast( Tok.getAnnotationValue())); ConsumeAnnotationToken(); + ImportState = Sema::ModuleImportState::NotACXX20Module; return false; case tok::eof: @@ -718,6 +725,16 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { MaybeParseCXX11Attributes(attrs); Result = ParseExternalDeclaration(attrs); + // An empty Result might mean a line with ';' or some parsing error, ignore + // it. + if (Result) { + if (ImportState == Sema::ModuleImportState::FirstDecl) + // First decl was not modular. + ImportState = Sema::ModuleImportState::NotACXX20Module; + else if (ImportState == Sema::ModuleImportState::ImportAllowed) + // Non-imports disallow further imports. + ImportState = Sema::ModuleImportState::ImportFinished; + } return false; } @@ -887,11 +904,17 @@ Parser::ParseExternalDeclaration(ParsedAttributesWithRange &attrs, getCurScope(), CurParsedObjCImpl ? 
Sema::PCC_ObjCImplementation : Sema::PCC_Namespace); return nullptr; - case tok::kw_import: - SingleDecl = ParseModuleImport(SourceLocation()); - break; + case tok::kw_import: { + Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; + if (getLangOpts().CPlusPlusModules) { + llvm_unreachable("not expecting a c++20 import here"); + ProhibitAttributes(attrs); + } + SingleDecl = ParseModuleImport(SourceLocation(), IS); + } break; case tok::kw_export: if (getLangOpts().CPlusPlusModules || getLangOpts().ModulesTS) { + ProhibitAttributes(attrs); SingleDecl = ParseExportDeclaration(); break; } @@ -2291,7 +2314,8 @@ void Parser::ParseMicrosoftIfExistsExternalDeclaration() { /// attribute-specifier-seq[opt] ';' /// private-module-fragment: [C++2a] /// 'module' ':' 'private' ';' top-level-declaration-seq[opt] -Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { +Parser::DeclGroupPtrTy +Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { SourceLocation StartLoc = Tok.getLocation(); Sema::ModuleDeclKind MDK = TryConsumeToken(tok::kw_export) @@ -2311,7 +2335,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { // Parse a global-module-fragment, if present. 
if (getLangOpts().CPlusPlusModules && Tok.is(tok::semi)) { SourceLocation SemiLoc = ConsumeToken(); - if (!IsFirstDecl) { + if (ImportState != Sema::ModuleImportState::FirstDecl) { Diag(StartLoc, diag::err_global_module_introducer_not_at_start) << SourceRange(StartLoc, SemiLoc); return nullptr; @@ -2320,6 +2344,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { Diag(StartLoc, diag::err_module_fragment_exported) << /*global*/0 << FixItHint::CreateRemoval(StartLoc); } + ImportState = Sema::ModuleImportState::GlobalFragment; return Actions.ActOnGlobalModuleFragmentDecl(ModuleLoc); } @@ -2334,6 +2359,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { SourceLocation PrivateLoc = ConsumeToken(); DiagnoseAndSkipCXX11Attributes(); ExpectAndConsumeSemi(diag::err_private_module_fragment_expected_semi); + ImportState = Sema::ModuleImportState::PrivateFragment; return Actions.ActOnPrivateModuleFragmentDecl(ModuleLoc, PrivateLoc); } @@ -2361,7 +2387,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { ExpectAndConsumeSemi(diag::err_module_expected_semi); - return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, IsFirstDecl); + return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, ImportState); } /// Parse a module import declaration. This is essentially the same for @@ -2379,7 +2405,8 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { /// attribute-specifier-seq[opt] ';' /// 'export'[opt] 'import' header-name /// attribute-specifier-seq[opt] ';' -Decl *Parser::ParseModuleImport(SourceLocation AtLoc) { +Decl *Parser::ParseModuleImport(SourceLocation AtLoc, + Sema::ModuleImportState &ImportState) { SourceLocation StartLoc = AtLoc.isInvalid() ? Tok.getLocation() : AtLoc; SourceLocation ExportLoc; @@ -2428,6 +2455,42 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc) { return nullptr; } + // Diagnose mis-imports. 
+ bool SeenError = true; + switch (ImportState) { + case Sema::ModuleImportState::ImportAllowed: + SeenError = false; + break; + case Sema::ModuleImportState::FirstDecl: + case Sema::ModuleImportState::NotACXX20Module: + // TODO: These cases will be an error when partitions are implemented. + SeenError = false; + break; + case Sema::ModuleImportState::GlobalFragment: + // We can only have pre-processor directives in the global module + // fragment. We can, however have a header unit import here. + if (!HeaderUnit) + // We do not have partition support yet, so first arg is 0. + Diag(ImportLoc, diag::err_import_in_wrong_fragment) << 0 << 0; + else + SeenError = false; + break; + case Sema::ModuleImportState::ImportFinished: + if (getLangOpts().CPlusPlusModules) + Diag(ImportLoc, diag::err_import_not_allowed_here); + else + SeenError = false; + break; + case Sema::ModuleImportState::PrivateFragment: + // We do not have partition support yet, so first arg is 0. + Diag(ImportLoc, diag::err_import_in_wrong_fragment) << 0 << 1; + break; + } + if (SeenError) { + ExpectAndConsumeSemi(diag::err_module_expected_semi); + return nullptr; + } + DeclResult Import; if (HeaderUnit) Import = diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 85e58640044dc..9bed3cb769f70 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -80,12 +80,20 @@ Sema::ActOnGlobalModuleFragmentDecl(SourceLocation ModuleLoc) { return nullptr; } -Sema::DeclGroupPtrTy -Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, - ModuleDeclKind MDK, ModuleIdPath Path, bool IsFirstDecl) { +Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, + SourceLocation ModuleLoc, + ModuleDeclKind MDK, + ModuleIdPath Path, + ModuleImportState &ImportState) { assert((getLangOpts().ModulesTS || getLangOpts().CPlusPlusModules) && "should only have module decl in Modules TS or C++20"); + bool IsFirstDecl = ImportState == 
ModuleImportState::FirstDecl; + bool SeenGMF = ImportState == ModuleImportState::GlobalFragment; + // If any of the steps here fail, we count that as invalidating C++20 + // module state; + ImportState = ModuleImportState::NotACXX20Module; + // A module implementation unit requires that we are not compiling a module // of any kind. A module interface unit requires that we are not compiling a // module map. @@ -134,9 +142,13 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, ModuleScopes.back().Module->Kind == Module::GlobalModuleFragment) GlobalModuleFragment = ModuleScopes.back().Module; + assert((!getLangOpts().CPlusPlusModules || + SeenGMF == (bool)GlobalModuleFragment) && + "mismatched global module state"); + // In C++20, the module-declaration must be the first declaration if there // is no global module fragment. - if (getLangOpts().CPlusPlusModules && !IsFirstDecl && !GlobalModuleFragment) { + if (getLangOpts().CPlusPlusModules && !IsFirstDecl && !SeenGMF) { Diag(ModuleLoc, diag::err_module_decl_not_at_start); SourceLocation BeginLoc = ModuleScopes.empty() @@ -231,6 +243,10 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, TU->setModuleOwnershipKind(Decl::ModuleOwnershipKind::ModulePrivate); TU->setLocalOwningModule(Mod); + // We are in the module purview, but before any other (non import) + // statements, so imports are allowed. + ImportState = ModuleImportState::ImportAllowed; + // FIXME: Create a ModuleDecl. return nullptr; } @@ -301,10 +317,10 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, SourceLocation ExportLoc, SourceLocation ImportLoc, ModuleIdPath Path) { - // Flatten the module path for a Modules TS module name. + // Flatten the module path for a C++20 or Modules TS module name. 
std::pair ModuleNameLoc; - if (getLangOpts().ModulesTS) { - std::string ModuleName; + std::string ModuleName; + if (getLangOpts().CPlusPlusModules || getLangOpts().ModulesTS) { for (auto &Piece : Path) { if (!ModuleName.empty()) ModuleName += "."; @@ -314,6 +330,14 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, Path = ModuleIdPath(ModuleNameLoc); } + // Diagnose self-import before attempting a load. + if (getLangOpts().CPlusPlusModules && isCurrentModulePurview() && + getCurrentModule()->Name == ModuleName) { + Diag(ImportLoc, diag::err_module_self_import) + << ModuleName << getLangOpts().CurrentModule; + return true; + } + Module *Mod = getModuleLoader().loadModule(ImportLoc, Path, Module::AllVisible, /*IsInclusionDirective=*/false); @@ -342,11 +366,9 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, // FIXME: we should support importing a submodule within a different submodule // of the same top-level module. Until we do, make it an error rather than // silently ignoring the import. - // Import-from-implementation is valid in the Modules TS. FIXME: Should we - // warn on a redundant import of the current module? - // FIXME: Import of a module from an implementation partition of the same - // module is permitted. - if (Mod->getTopLevelModuleName() == getLangOpts().CurrentModule && + // FIXME: Should we warn on a redundant import of the current module? + if (!getLangOpts().CPlusPlusModules && + Mod->getTopLevelModuleName() == getLangOpts().CurrentModule && (getLangOpts().isCompilingModule() || !getLangOpts().ModulesTS)) { Diag(ImportLoc, getLangOpts().isCompilingModule() ? 
diag::err_module_self_import diff --git a/clang/test/Modules/cxx20-import-diagnostics-a.cpp b/clang/test/Modules/cxx20-import-diagnostics-a.cpp new file mode 100644 index 0000000000000..fd4085bcb4713 --- /dev/null +++ b/clang/test/Modules/cxx20-import-diagnostics-a.cpp @@ -0,0 +1,140 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=0 -x c++ %s \ +// RUN: -o %t/B.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=1 -x c++ %s \ +// RUN: -o %t/C.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=2 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/AOK1.pcm + +// RUN: %clang_cc1 -std=c++20 -S -D TU=3 -x c++ %s \ +// RUN: -fmodule-file=%t/AOK1.pcm -o %t/tu_3.s -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=4 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/BC.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -S -D TU=5 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/tu_5.s -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=6 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -o %t/D.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=7 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -o %t/D.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -S -D TU=8 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -o %t/tu_8.s -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=9 -x c++ %s \ +// RUN: -o %t/B.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -emit-obj -D TU=10 -x c++ %s \ +// RUN: -fmodule-file=%t/C.pcm -o %t/impl.o + +// Test diagnostics for incorrect module import sequences. 
+ +#if TU == 0 + +export module B; + +int foo (); + +// expected-no-diagnostics + +#elif TU == 1 + +export module C; + +int bar (); + +// expected-no-diagnostics + +#elif TU == 2 + +export module AOK1; + +import B; +export import C; + +export int theAnswer (); + +// expected-no-diagnostics + +#elif TU == 3 + +module; + +module AOK1; + +export import C; // expected-error {{export declaration can only be used within a module interface unit}} + +int theAnswer () { return 42; } + +#elif TU == 4 + +export module BC; + +export import B; + +int foo () { return 10; } + +import C; // expected-error {{imports must immediately follow the module declaration}} + +#elif TU == 5 + +module B; // implicitly imports B. + +int foo () { return 10; } + +import C; // expected-error {{imports must immediately follow the module declaration}} + +#elif TU == 6 + +module; +// We can only have preprocessor commands here, which could include an include +// translated header unit. However those are identified specifically by the +// preprocessor; non-preprocessed user code should not contain an import here. +import B; // expected-error {{module imports cannot be in the global module fragment}} + +export module D; + +int delta (); + +#elif TU == 7 + +export module D; + +int delta (); + +module :private; + +import B; // expected-error {{module imports cannot be in the private module fragment}} + +#elif TU == 8 + +module B; + +import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}} + +#elif TU == 9 + +export module B; + +import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}} + +#elif TU == 10 + +int x; + +import C; + +int baz() { return 6174; } + +// expected-no-diagnostics + +#else +#error "no MODE set" +#endif From a2ce8df49b019898f5e84862db39ae41a1d08fa7 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Sat, 19 Feb 2022 20:21:45 +0300 Subject: [PATCH 353/748] [ArgPromotion] auto-update test checks. 
Rename %tmp => %temp IR values to avoid update warning. --- .../2008-02-01-ReturnAttrs.ll | 20 +++--- .../2008-07-02-array-indexing.ll | 8 +-- .../aggregate-promote-dead-gep.ll | 10 +-- .../ArgumentPromotion/aggregate-promote.ll | 16 ++--- .../Transforms/ArgumentPromotion/attrs.ll | 34 ++++----- .../Transforms/ArgumentPromotion/basictest.ll | 10 +-- .../Transforms/ArgumentPromotion/byval-2.ll | 34 ++++----- .../Transforms/ArgumentPromotion/byval.ll | 72 ++++++++++--------- .../Transforms/ArgumentPromotion/chained.ll | 10 +-- .../ArgumentPromotion/control-flow.ll | 6 +- .../ArgumentPromotion/control-flow2.ll | 12 ++-- llvm/test/Transforms/ArgumentPromotion/dbg.ll | 20 +++--- .../Transforms/ArgumentPromotion/inalloca.ll | 4 +- .../ArgumentPromotion/invalidation.ll | 12 ++-- .../Transforms/ArgumentPromotion/metadata.ll | 2 +- .../Transforms/ArgumentPromotion/musttail.ll | 18 ++--- .../ArgumentPromotion/naked_functions.ll | 4 +- .../nonzero-address-spaces.ll | 4 +- .../Transforms/ArgumentPromotion/pr27568.ll | 4 +- .../Transforms/ArgumentPromotion/pr32917.ll | 12 ++-- .../ArgumentPromotion/pr42028-recursion.ll | 32 ++++----- .../Transforms/ArgumentPromotion/profile.ll | 8 +-- .../ArgumentPromotion/reserve-tbaa.ll | 22 +++--- .../test/Transforms/ArgumentPromotion/sret.ll | 22 +++--- .../Transforms/ArgumentPromotion/variadic.ll | 6 +- 25 files changed, 204 insertions(+), 198 deletions(-) diff --git a/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll b/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll index 7c6980ec0d3ad..d2d5e38d18bd0 100644 --- a/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll +++ b/llvm/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll @@ -3,28 +3,28 @@ define internal i32 @deref(i32* %x) nounwind { ; CHECK-LABEL: define {{[^@]+}}@deref -; CHECK-SAME: (i32 [[X_VAL:%.*]]) +; CHECK-SAME: (i32 [[X_0_VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 [[X_VAL]] +; CHECK-NEXT: 
ret i32 [[X_0_VAL]] ; entry: - %tmp2 = load i32, i32* %x, align 4 - ret i32 %tmp2 + %temp2 = load i32, i32* %x, align 4 + ret i32 %temp2 } define i32 @f(i32 %x) { ; CHECK-LABEL: define {{[^@]+}}@f -; CHECK-SAME: (i32 [[X:%.*]]) +; CHECK-SAME: (i32 [[X:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32 +; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[X]], i32* [[X_ADDR]], align 4 ; CHECK-NEXT: [[X_ADDR_VAL:%.*]] = load i32, i32* [[X_ADDR]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @deref(i32 [[X_ADDR_VAL]]) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[TEMP1:%.*]] = call i32 @deref(i32 [[X_ADDR_VAL]]) +; CHECK-NEXT: ret i32 [[TEMP1]] ; entry: %x_addr = alloca i32 store i32 %x, i32* %x_addr, align 4 - %tmp1 = call i32 @deref( i32* %x_addr ) nounwind - ret i32 %tmp1 + %temp1 = call i32 @deref( i32* %x_addr ) nounwind + ret i32 %temp1 } diff --git a/llvm/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll b/llvm/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll index 2a6cccb72c775..6970062f2afec 100644 --- a/llvm/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll +++ b/llvm/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll @@ -6,15 +6,15 @@ ; because there is a load of %A in the entry block define internal i32 @callee(i1 %C, i32* %A) { ; CHECK-LABEL: define {{[^@]+}}@callee -; CHECK-SAME: (i1 [[C:%.*]], i32* [[A:%.*]]) +; CHECK-SAME: (i1 [[C:%.*]], i32* [[A:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A]] +; CHECK-NEXT: [[A_0:%.*]] = load i32, i32* [[A]], align 4 ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CHECK: T: ; CHECK-NEXT: ret i32 [[A_0]] ; CHECK: F: ; CHECK-NEXT: [[A_2:%.*]] = getelementptr i32, i32* [[A]], i32 2 -; CHECK-NEXT: [[R:%.*]] = load i32, i32* [[A_2]] +; CHECK-NEXT: [[R:%.*]] = load i32, i32* [[A_2]], align 4 ; CHECK-NEXT: ret i32 [[R]] ; entry: @@ -33,7 +33,7 @@ F: } define i32 @foo() { -; 
CHECK-LABEL: define {{[^@]+}}@foo() +; CHECK-LABEL: define {{[^@]+}}@foo() { ; CHECK-NEXT: [[X:%.*]] = call i32 @callee(i1 false, i32* null) ; CHECK-NEXT: ret i32 [[X]] ; diff --git a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll index aa319680a80ee..b7b43ee5d2547 100644 --- a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll +++ b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote-dead-gep.ll @@ -6,9 +6,9 @@ define internal i32 @test(%T* %p) { ; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (i32 [[P_0_3_VAL:%.*]]) { +; CHECK-SAME: (i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = add i32 [[P_0_3_VAL]], 10 +; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], 10 ; CHECK-NEXT: ret i32 [[V]] ; entry: @@ -22,9 +22,9 @@ entry: define i32 @caller() { ; CHECK-LABEL: define {{[^@]+}}@caller() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[G_IDX:%.*]] = getelementptr [[T:%.*]], %T* @G, i64 0, i32 3 -; CHECK-NEXT: [[G_IDX_VAL:%.*]] = load i32, i32* [[G_IDX]], align 4 -; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_IDX_VAL]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[T:%.*]], %T* @G, i64 0, i32 3 +; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll index 0dd72882741e6..6efa9345aab42 100644 --- a/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll +++ b/llvm/test/Transforms/ArgumentPromotion/aggregate-promote.ll @@ -6,9 +6,9 @@ define internal i32 @test(%T* %p) { ; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (i32 [[P_0_2_VAL:%.*]], i32 [[P_0_3_VAL:%.*]]) +; CHECK-SAME: (i32 [[P_8_VAL:%.*]], i32 [[P_12_VAL:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V:%.*]] = add i32 [[P_0_3_VAL]], 
[[P_0_2_VAL]] +; CHECK-NEXT: [[V:%.*]] = add i32 [[P_12_VAL]], [[P_8_VAL]] ; CHECK-NEXT: ret i32 [[V]] ; entry: @@ -21,13 +21,13 @@ entry: } define i32 @caller() { -; CHECK-LABEL: define {{[^@]+}}@caller() +; CHECK-LABEL: define {{[^@]+}}@caller() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[G_IDX:%.*]] = getelementptr [[T:%.*]], %T* @G, i64 0, i32 2 -; CHECK-NEXT: [[G_IDX_VAL:%.*]] = load i32, i32* [[G_IDX]] -; CHECK-NEXT: [[G_IDX1:%.*]] = getelementptr [[T]], %T* @G, i64 0, i32 3 -; CHECK-NEXT: [[G_IDX1_VAL:%.*]] = load i32, i32* [[G_IDX1]] -; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_IDX_VAL]], i32 [[G_IDX1_VAL]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[T:%.*]], %T* @G, i64 0, i32 2 +; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[T]], %T* @G, i64 0, i32 3 +; CHECK-NEXT: [[G_VAL1:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[V:%.*]] = call i32 @test(i32 [[G_VAL]], i32 [[G_VAL1]]) ; CHECK-NEXT: ret i32 [[V]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/ArgumentPromotion/attrs.ll index 4a875735b4964..3365199d95535 100644 --- a/llvm/test/Transforms/ArgumentPromotion/attrs.ll +++ b/llvm/test/Transforms/ArgumentPromotion/attrs.ll @@ -6,26 +6,26 @@ ; Don't drop 'byval' on %X here. 
define internal void @f(%struct.ss* byval(%struct.ss) align 4 %b, i32* byval(i32) align 4 %X, i32 %i) nounwind { ; CHECK-LABEL: define {{[^@]+}}@f -; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]], i32 [[I:%.*]]) [[ATTR0:#.*]] { +; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 4 ; CHECK-NEXT: [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[B_0]], i32* [[DOT0]], align 4 ; CHECK-NEXT: [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1 ; CHECK-NEXT: store i64 [[B_1]], i64* [[DOT1]], align 4 -; CHECK-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 4 +; CHECK-NEXT: [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4 +; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[TEMP1]], 1 +; CHECK-NEXT: store i32 [[TEMP2]], i32* [[TEMP]], align 4 ; CHECK-NEXT: store i32 0, i32* [[X]], align 4 ; CHECK-NEXT: ret void ; entry: - %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp, align 4 - %tmp2 = add i32 %tmp1, 1 - store i32 %tmp2, i32* %tmp, align 4 + %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 + %temp1 = load i32, i32* %temp, align 4 + %temp2 = add i32 %temp1, 1 + store i32 %temp2, i32* %temp, align 4 store i32 0, i32* %X ret void @@ -37,10 +37,10 @@ define i32 @test(i32* %X) { ; CHECK-SAME: (i32* [[X:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 -; 
CHECK-NEXT: store i32 1, i32* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 -; CHECK-NEXT: store i64 2, i64* [[TMP4]], align 4 +; CHECK-NEXT: [[TEMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 +; CHECK-NEXT: store i32 1, i32* [[TEMP1]], align 8 +; CHECK-NEXT: [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 +; CHECK-NEXT: store i64 2, i64* [[TEMP4]], align 4 ; CHECK-NEXT: [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 ; CHECK-NEXT: [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4 ; CHECK-NEXT: [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 @@ -50,10 +50,10 @@ define i32 @test(i32* %X) { ; entry: %S = alloca %struct.ss - %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0 - store i32 1, i32* %tmp1, align 8 - %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1 - store i64 2, i64* %tmp4, align 4 + %temp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0 + store i32 1, i32* %temp1, align 8 + %temp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1 + store i64 2, i64* %temp4, align 4 call void @f( %struct.ss* byval(%struct.ss) align 4 %S, i32* byval(i32) align 4 %X, i32 zeroext 0) diff --git a/llvm/test/Transforms/ArgumentPromotion/basictest.ll b/llvm/test/Transforms/ArgumentPromotion/basictest.ll index 43f27b9208ee7..1c3710b836913 100644 --- a/llvm/test/Transforms/ArgumentPromotion/basictest.ll +++ b/llvm/test/Transforms/ArgumentPromotion/basictest.ll @@ -4,8 +4,8 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define internal i32 @test(i32* %X, i32* %Y) { ; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]]) -; CHECK-NEXT: [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]] +; CHECK-SAME: (i32 [[X_0_VAL:%.*]], i32 [[Y_0_VAL:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = add i32 [[X_0_VAL]], [[Y_0_VAL]] ; 
CHECK-NEXT: ret i32 [[C]] ; %A = load i32, i32* %X @@ -16,8 +16,8 @@ define internal i32 @test(i32* %X, i32* %Y) { define internal i32 @caller(i32* %B) { ; CHECK-LABEL: define {{[^@]+}}@caller -; CHECK-SAME: (i32 [[B_VAL1:%.*]]) -; CHECK-NEXT: [[C:%.*]] = call i32 @test(i32 1, i32 [[B_VAL1]]) +; CHECK-SAME: (i32 [[B_0_VAL:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = call i32 @test(i32 1, i32 [[B_0_VAL]]) ; CHECK-NEXT: ret i32 [[C]] ; %A = alloca i32 @@ -27,7 +27,7 @@ define internal i32 @caller(i32* %B) { } define i32 @callercaller() { -; CHECK-LABEL: define {{[^@]+}}@callercaller() +; CHECK-LABEL: define {{[^@]+}}@callercaller() { ; CHECK-NEXT: [[X:%.*]] = call i32 @caller(i32 2) ; CHECK-NEXT: ret i32 [[X]] ; diff --git a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll index 24382544a59d0..42b7d6d31905d 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll @@ -8,25 +8,25 @@ define internal void @f(%struct.ss* byval(%struct.ss) align 8 %b, i32* byval(i32) align 4 %X) nounwind { ; CHECK-LABEL: define {{[^@]+}}@f -; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]]) [[ATTR0:#.*]] { +; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval(i32) align 4 [[X:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 ; CHECK-NEXT: [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[B_0]], i32* [[DOT0]], align 8 ; CHECK-NEXT: [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1 ; CHECK-NEXT: store i64 [[B_1]], i64* [[DOT1]], align 4 -; CHECK-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 4 +; CHECK-NEXT: 
[[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4 +; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[TEMP1]], 1 +; CHECK-NEXT: store i32 [[TEMP2]], i32* [[TEMP]], align 4 ; CHECK-NEXT: store i32 0, i32* [[X]], align 4 ; CHECK-NEXT: ret void ; entry: - %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp, align 4 - %tmp2 = add i32 %tmp1, 1 - store i32 %tmp2, i32* %tmp, align 4 + %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 + %temp1 = load i32, i32* %temp, align 4 + %temp2 = add i32 %temp1, 1 + store i32 %temp2, i32* %temp, align 4 store i32 0, i32* %X ret void @@ -37,10 +37,10 @@ define i32 @test(i32* %X) { ; CHECK-SAME: (i32* [[X:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 -; CHECK-NEXT: store i32 1, i32* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 -; CHECK-NEXT: store i64 2, i64* [[TMP4]], align 4 +; CHECK-NEXT: [[TEMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 +; CHECK-NEXT: store i32 1, i32* [[TEMP1]], align 8 +; CHECK-NEXT: [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 +; CHECK-NEXT: store i64 2, i64* [[TEMP4]], align 4 ; CHECK-NEXT: [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 ; CHECK-NEXT: [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 8 ; CHECK-NEXT: [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 @@ -50,10 +50,10 @@ define i32 @test(i32* %X) { ; entry: %S = alloca %struct.ss, align 8 - %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0 - store i32 1, i32* %tmp1, align 8 - %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1 - store i64 2, i64* %tmp4, align 4 + %temp1 = getelementptr 
%struct.ss, %struct.ss* %S, i32 0, i32 0 + store i32 1, i32* %temp1, align 8 + %temp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1 + store i64 2, i64* %temp4, align 4 call void @f( %struct.ss* byval(%struct.ss) align 8 %S, i32* byval(i32) align 4 %X) ret i32 0 } diff --git a/llvm/test/Transforms/ArgumentPromotion/byval.ll b/llvm/test/Transforms/ArgumentPromotion/byval.ll index 45988351d9ee6..2416345400c3d 100644 --- a/llvm/test/Transforms/ArgumentPromotion/byval.ll +++ b/llvm/test/Transforms/ArgumentPromotion/byval.ll @@ -7,48 +7,48 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define internal void @f(%struct.ss* byval(%struct.ss) align 4 %b) nounwind { ; CHECK-LABEL: define {{[^@]+}}@f -; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) [[ATTR0:#.*]] { +; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 4 ; CHECK-NEXT: [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[B_0]], i32* [[DOT0]], align 4 ; CHECK-NEXT: [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1 ; CHECK-NEXT: store i64 [[B_1]], i64* [[DOT1]], align 4 -; CHECK-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 4 +; CHECK-NEXT: [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4 +; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[TEMP1]], 1 +; CHECK-NEXT: store i32 [[TEMP2]], i32* [[TEMP]], align 4 ; CHECK-NEXT: ret void ; entry: - %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp, align 4 - %tmp2 = add i32 %tmp1, 1 - store i32 %tmp2, i32* %tmp, align 4 + %temp = 
getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 + %temp1 = load i32, i32* %temp, align 4 + %temp2 = add i32 %temp1, 1 + store i32 %temp2, i32* %temp, align 4 ret void } define internal void @g(%struct.ss* byval(%struct.ss) align 32 %b) nounwind { ; CHECK-LABEL: define {{[^@]+}}@g -; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) [[ATTR0]] { +; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 32 ; CHECK-NEXT: [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 ; CHECK-NEXT: store i32 [[B_0]], i32* [[DOT0]], align 32 ; CHECK-NEXT: [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1 ; CHECK-NEXT: store i64 [[B_1]], i64* [[DOT1]], align 4 -; CHECK-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP]], align 4 +; CHECK-NEXT: [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4 +; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[TEMP1]], 1 +; CHECK-NEXT: store i32 [[TEMP2]], i32* [[TEMP]], align 4 ; CHECK-NEXT: ret void ; entry: - %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp, align 4 - %tmp2 = add i32 %tmp1, 1 - store i32 %tmp2, i32* %tmp, align 4 + %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 + %temp1 = load i32, i32* %temp, align 4 + %temp2 = add i32 %temp1, 1 + store i32 %temp2, i32* %temp, align 4 ret void } @@ -59,25 +59,31 @@ entry: ; just delete this test.) 
define internal void @h(%struct.ss* byval(%struct.ss) %b) nounwind { ; CHECK-LABEL: define {{[^@]+}}@h -; CHECK-SAME: (%struct.ss* byval(%struct.ss) %b) +; CHECK-SAME: (%struct.ss* byval([[STRUCT_SS:%.*]]) [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP1:%.*]] = load i32, i32* [[TEMP]], align 4 +; CHECK-NEXT: [[TEMP2:%.*]] = add i32 [[TEMP1]], 1 +; CHECK-NEXT: store i32 [[TEMP2]], i32* [[TEMP]], align 4 +; CHECK-NEXT: ret void ; entry: - %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp, align 4 - %tmp2 = add i32 %tmp1, 1 - store i32 %tmp2, i32* %tmp, align 4 + %temp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 + %temp1 = load i32, i32* %temp, align 4 + %temp2 = add i32 %temp1, 1 + store i32 %temp2, i32* %temp, align 4 ret void } define i32 @main() nounwind { ; CHECK-LABEL: define {{[^@]+}}@main -; CHECK-SAME: () [[ATTR0]] { +; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 32 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 -; CHECK-NEXT: store i32 1, i32* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 -; CHECK-NEXT: store i64 2, i64* [[TMP4]], align 4 +; CHECK-NEXT: [[TEMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 +; CHECK-NEXT: store i32 1, i32* [[TEMP1]], align 8 +; CHECK-NEXT: [[TEMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 +; CHECK-NEXT: store i64 2, i64* [[TEMP4]], align 4 ; CHECK-NEXT: [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0 ; CHECK-NEXT: [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4 ; CHECK-NEXT: [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 @@ -88,15 +94,15 @@ define i32 @main() nounwind { ; CHECK-NEXT: 
[[S_12:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 ; CHECK-NEXT: [[S_12_VAL:%.*]] = load i64, i64* [[S_12]], align 4 ; CHECK-NEXT: call void @g(i32 [[S_01_VAL]], i64 [[S_12_VAL]]) -; CHECK-NEXT: call void @h(%struct.ss* byval(%struct.ss) %S) +; CHECK-NEXT: call void @h(%struct.ss* byval([[STRUCT_SS]]) [[S]]) ; CHECK-NEXT: ret i32 0 ; entry: %S = alloca %struct.ss, align 32 - %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0 - store i32 1, i32* %tmp1, align 8 - %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1 - store i64 2, i64* %tmp4, align 4 + %temp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0 + store i32 1, i32* %temp1, align 8 + %temp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1 + store i64 2, i64* %temp4, align 4 call void @f(%struct.ss* byval(%struct.ss) align 4 %S) nounwind call void @g(%struct.ss* byval(%struct.ss) align 32 %S) nounwind call void @h(%struct.ss* byval(%struct.ss) %S) nounwind diff --git a/llvm/test/Transforms/ArgumentPromotion/chained.ll b/llvm/test/Transforms/ArgumentPromotion/chained.ll index 5939f36dec51c..60441567ae6f4 100644 --- a/llvm/test/Transforms/ArgumentPromotion/chained.ll +++ b/llvm/test/Transforms/ArgumentPromotion/chained.ll @@ -6,9 +6,9 @@ define internal i32 @test(i32** %x) { ; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (i32 [[X_VAL_VAL:%.*]]) +; CHECK-SAME: (i32 [[X_0_VAL_0_VAL:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 [[X_VAL_VAL]] +; CHECK-NEXT: ret i32 [[X_0_VAL_0_VAL]] ; entry: %y = load i32*, i32** %x @@ -17,10 +17,10 @@ entry: } define i32 @caller() { -; CHECK-LABEL: define {{[^@]+}}@caller() +; CHECK-LABEL: define {{[^@]+}}@caller() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[G2_VAL:%.*]] = load i32*, i32** @G2 -; CHECK-NEXT: [[G2_VAL_VAL:%.*]] = load i32, i32* [[G2_VAL]] +; CHECK-NEXT: [[G2_VAL:%.*]] = load i32*, i32** @G2, align 8 +; CHECK-NEXT: [[G2_VAL_VAL:%.*]] = load i32, i32* [[G2_VAL]], align 4 ; CHECK-NEXT: [[X:%.*]] = 
call i32 @test(i32 [[G2_VAL_VAL]]) ; CHECK-NEXT: ret i32 [[X]] ; diff --git a/llvm/test/Transforms/ArgumentPromotion/control-flow.ll b/llvm/test/Transforms/ArgumentPromotion/control-flow.ll index 620b4c8a7153d..d8c149721a68f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/control-flow.ll +++ b/llvm/test/Transforms/ArgumentPromotion/control-flow.ll @@ -4,13 +4,13 @@ ; Don't promote around control flow. define internal i32 @callee(i1 %C, i32* %P) { ; CHECK-LABEL: define {{[^@]+}}@callee -; CHECK-SAME: (i1 [[C:%.*]], i32* [[P:%.*]]) +; CHECK-SAME: (i1 [[C:%.*]], i32* [[P:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CHECK: T: ; CHECK-NEXT: ret i32 17 ; CHECK: F: -; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: ret i32 [[X]] ; entry: @@ -25,7 +25,7 @@ F: } define i32 @foo() { -; CHECK-LABEL: define {{[^@]+}}@foo() +; CHECK-LABEL: define {{[^@]+}}@foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X:%.*]] = call i32 @callee(i1 true, i32* null) ; CHECK-NEXT: ret i32 [[X]] diff --git a/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll b/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll index ef52fe59b6648..33d2d0339218a 100644 --- a/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll +++ b/llvm/test/Transforms/ArgumentPromotion/control-flow2.ll @@ -5,12 +5,12 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define internal i32 @callee(i1 %C, i32* %P) { ; CHECK-LABEL: define {{[^@]+}}@callee -; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_VAL:%.*]]) +; CHECK-SAME: (i1 [[C:%.*]], i32 [[P_0_VAL:%.*]]) { ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CHECK: T: ; CHECK-NEXT: ret i32 17 ; CHECK: F: -; CHECK-NEXT: ret i32 [[P_VAL]] +; CHECK-NEXT: ret i32 [[P_0_VAL]] ; br i1 %C, label %T, label %F @@ -23,10 +23,10 @@ F: ; preds = %0 } define i32 @foo() { -; CHECK-LABEL: define {{[^@]+}}@foo() -; CHECK-NEXT: 
[[A:%.*]] = alloca i32 -; CHECK-NEXT: store i32 17, i32* [[A]] -; CHECK-NEXT: [[A_VAL:%.*]] = load i32, i32* [[A]] +; CHECK-LABEL: define {{[^@]+}}@foo() { +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 17, i32* [[A]], align 4 +; CHECK-NEXT: [[A_VAL:%.*]] = load i32, i32* [[A]], align 4 ; CHECK-NEXT: [[X:%.*]] = call i32 @callee(i1 false, i32 [[A_VAL]]) ; CHECK-NEXT: ret i32 [[X]] ; diff --git a/llvm/test/Transforms/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/ArgumentPromotion/dbg.ll index 67f204e2953a3..7720d750b66ba 100644 --- a/llvm/test/Transforms/ArgumentPromotion/dbg.ll +++ b/llvm/test/Transforms/ArgumentPromotion/dbg.ll @@ -5,8 +5,8 @@ declare void @sink(i32) define internal void @test(i32** %X) !dbg !2 { ; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (i32 [[X_VAL_VAL:%.*]]) [[DBG3:!dbg !.*]] { -; CHECK-NEXT: call void @sink(i32 [[X_VAL_VAL]]) +; CHECK-SAME: (i32 [[X_0_VAL_0_VAL:%.*]]) !dbg [[DBG3:![0-9]+]] { +; CHECK-NEXT: call void @sink(i32 [[X_0_VAL_0_VAL]]) ; CHECK-NEXT: ret void ; %1 = load i32*, i32** %X, align 8 @@ -33,14 +33,14 @@ define internal void @test_byval(%struct.pair* byval(%struct.pair) align 4 %P) { define void @caller(i32** %Y, %struct.pair* %P) { ; CHECK-LABEL: define {{[^@]+}}@caller ; CHECK-SAME: (i32** [[Y:%.*]], %struct.pair* [[P:%.*]]) { -; CHECK-NEXT: [[Y_VAL:%.*]] = load i32*, i32** [[Y]], align 8, [[DBG4:!dbg !.*]] -; CHECK-NEXT: [[Y_VAL_VAL:%.*]] = load i32, i32* [[Y_VAL]], align 8, [[DBG4]] -; CHECK-NEXT: call void @test(i32 [[Y_VAL_VAL]]), [[DBG4]] -; CHECK-NEXT: [[P_0:%.*]] = getelementptr [[STRUCT_PAIR:%.*]], %struct.pair* [[P]], i32 0, i32 0, [[DBG5:!dbg !.*]] -; CHECK-NEXT: [[P_0_VAL:%.*]] = load i32, i32* [[P_0]], align 4, [[DBG5]] -; CHECK-NEXT: [[P_1:%.*]] = getelementptr [[STRUCT_PAIR]], %struct.pair* [[P]], i32 0, i32 1, [[DBG5]] -; CHECK-NEXT: [[P_1_VAL:%.*]] = load i32, i32* [[P_1]], align 4, [[DBG5]] -; CHECK-NEXT: call void @test_byval(i32 [[P_0_VAL]], i32 [[P_1_VAL]]), [[DBG5]] 
+; CHECK-NEXT: [[Y_VAL:%.*]] = load i32*, i32** [[Y]], align 8, !dbg [[DBG4:![0-9]+]] +; CHECK-NEXT: [[Y_VAL_VAL:%.*]] = load i32, i32* [[Y_VAL]], align 8, !dbg [[DBG4]] +; CHECK-NEXT: call void @test(i32 [[Y_VAL_VAL]]), !dbg [[DBG4]] +; CHECK-NEXT: [[P_0:%.*]] = getelementptr [[STRUCT_PAIR:%.*]], %struct.pair* [[P]], i32 0, i32 0, !dbg [[DBG5:![0-9]+]] +; CHECK-NEXT: [[P_0_VAL:%.*]] = load i32, i32* [[P_0]], align 4, !dbg [[DBG5]] +; CHECK-NEXT: [[P_1:%.*]] = getelementptr [[STRUCT_PAIR]], %struct.pair* [[P]], i32 0, i32 1, !dbg [[DBG5]] +; CHECK-NEXT: [[P_1_VAL:%.*]] = load i32, i32* [[P_1]], align 4, !dbg [[DBG5]] +; CHECK-NEXT: call void @test_byval(i32 [[P_0_VAL]], i32 [[P_1_VAL]]), !dbg [[DBG5]] ; CHECK-NEXT: ret void ; call void @test(i32** %Y), !dbg !1 diff --git a/llvm/test/Transforms/ArgumentPromotion/inalloca.ll b/llvm/test/Transforms/ArgumentPromotion/inalloca.ll index 3a57b281d7897..c82fa9bc3e15e 100644 --- a/llvm/test/Transforms/ArgumentPromotion/inalloca.ll +++ b/llvm/test/Transforms/ArgumentPromotion/inalloca.ll @@ -8,9 +8,9 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 ; Argpromote + sroa should change this to passing the two integers by value. 
define internal i32 @f(%struct.ss* inalloca(%struct.ss) %s) { ; CHECK-LABEL: define {{[^@]+}}@f -; CHECK-SAME: (i32 [[S_0_0_VAL:%.*]], i32 [[S_0_1_VAL:%.*]]) unnamed_addr { +; CHECK-SAME: (i32 [[S_0_VAL:%.*]], i32 [[S_4_VAL:%.*]]) unnamed_addr { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = add i32 [[S_0_0_VAL]], [[S_0_1_VAL]] +; CHECK-NEXT: [[R:%.*]] = add i32 [[S_0_VAL]], [[S_4_VAL]] ; CHECK-NEXT: ret i32 [[R]] ; entry: diff --git a/llvm/test/Transforms/ArgumentPromotion/invalidation.ll b/llvm/test/Transforms/ArgumentPromotion/invalidation.ll index d86c93b28ac30..669ada1ad273f 100644 --- a/llvm/test/Transforms/ArgumentPromotion/invalidation.ll +++ b/llvm/test/Transforms/ArgumentPromotion/invalidation.ll @@ -13,9 +13,9 @@ define internal i32 @a(i32* %x) { ; CHECK-LABEL: define {{[^@]+}}@a -; CHECK-SAME: (i32 [[X_VAL:%.*]]) +; CHECK-SAME: (i32 [[X_0_VAL:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 [[X_VAL]] +; CHECK-NEXT: ret i32 [[X_0_VAL]] ; entry: %v = load i32, i32* %x @@ -23,9 +23,9 @@ entry: } define i32 @b() { -; CHECK-LABEL: define {{[^@]+}}@b() +; CHECK-LABEL: define {{[^@]+}}@b() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* @G +; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* @G, align 4 ; CHECK-NEXT: [[V:%.*]] = call i32 @a(i32 [[G_VAL]]) ; CHECK-NEXT: ret i32 [[V]] ; @@ -35,9 +35,9 @@ entry: } define i32 @c() { -; CHECK-LABEL: define {{[^@]+}}@c() +; CHECK-LABEL: define {{[^@]+}}@c() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* @G +; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* @G, align 4 ; CHECK-NEXT: [[V1:%.*]] = call i32 @a(i32 [[G_VAL]]) ; CHECK-NEXT: [[V2:%.*]] = call i32 @b() ; CHECK-NEXT: [[RESULT:%.*]] = add i32 [[V1]], [[V2]] diff --git a/llvm/test/Transforms/ArgumentPromotion/metadata.ll b/llvm/test/Transforms/ArgumentPromotion/metadata.ll index 92e9d0327967b..c98049b0fa9f0 100644 --- a/llvm/test/Transforms/ArgumentPromotion/metadata.ll +++ 
b/llvm/test/Transforms/ArgumentPromotion/metadata.ll @@ -68,7 +68,7 @@ else: define void @caller_conditional(i1 %c, i32** %p) { ; CHECK-LABEL: define {{[^@]+}}@caller_conditional ; CHECK-SAME: (i1 [[C:%.*]], i32** [[P:%.*]]) { -; CHECK-NEXT: [[P_VAL:%.*]] = load i32*, i32** [[P]], align 8{{$}} +; CHECK-NEXT: [[P_VAL:%.*]] = load i32*, i32** [[P]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i32* @callee_conditional(i1 [[C]], i32* [[P_VAL]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/ArgumentPromotion/musttail.ll b/llvm/test/Transforms/ArgumentPromotion/musttail.ll index cbe2c04ac7a54..e1624f358cc80 100644 --- a/llvm/test/Transforms/ArgumentPromotion/musttail.ll +++ b/llvm/test/Transforms/ArgumentPromotion/musttail.ll @@ -8,11 +8,11 @@ define internal i32 @test(%T* %p) { ; CHECK-LABEL: define {{[^@]+}}@test -; CHECK-SAME: (%T* [[P:%.*]]) +; CHECK-SAME: (%T* [[P:%.*]]) { ; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr [[T:%.*]], %T* [[P]], i64 0, i32 3 ; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr [[T]], %T* [[P]], i64 0, i32 2 -; CHECK-NEXT: [[A:%.*]] = load i32, i32* [[A_GEP]] -; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[B_GEP]] +; CHECK-NEXT: [[A:%.*]] = load i32, i32* [[A_GEP]], align 4 +; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[B_GEP]], align 4 ; CHECK-NEXT: [[V:%.*]] = add i32 [[A]], [[B]] ; CHECK-NEXT: ret i32 [[V]] ; @@ -26,7 +26,7 @@ define internal i32 @test(%T* %p) { define i32 @caller(%T* %p) { ; CHECK-LABEL: define {{[^@]+}}@caller -; CHECK-SAME: (%T* [[P:%.*]]) +; CHECK-SAME: (%T* [[P:%.*]]) { ; CHECK-NEXT: [[V:%.*]] = musttail call i32 @test(%T* [[P]]) ; CHECK-NEXT: ret i32 [[V]] ; @@ -38,7 +38,7 @@ define i32 @caller(%T* %p) { define i32 @foo(%T* %p, i32 %v) { ; CHECK-LABEL: define {{[^@]+}}@foo -; CHECK-SAME: (%T* [[P:%.*]], i32 [[V:%.*]]) +; CHECK-SAME: (%T* [[P:%.*]], i32 [[V:%.*]]) { ; CHECK-NEXT: ret i32 0 ; ret i32 0 @@ -46,11 +46,11 @@ define i32 @foo(%T* %p, i32 %v) { define internal i32 @test2(%T* %p, i32 %p2) { ; CHECK-LABEL: define 
{{[^@]+}}@test2 -; CHECK-SAME: (%T* [[P:%.*]], i32 [[P2:%.*]]) +; CHECK-SAME: (%T* [[P:%.*]], i32 [[P2:%.*]]) { ; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr [[T:%.*]], %T* [[P]], i64 0, i32 3 ; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr [[T]], %T* [[P]], i64 0, i32 2 -; CHECK-NEXT: [[A:%.*]] = load i32, i32* [[A_GEP]] -; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[B_GEP]] +; CHECK-NEXT: [[A:%.*]] = load i32, i32* [[A_GEP]], align 4 +; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[B_GEP]], align 4 ; CHECK-NEXT: [[V:%.*]] = add i32 [[A]], [[B]] ; CHECK-NEXT: [[CA:%.*]] = musttail call i32 @foo(%T* undef, i32 [[V]]) ; CHECK-NEXT: ret i32 [[CA]] @@ -66,7 +66,7 @@ define internal i32 @test2(%T* %p, i32 %p2) { define i32 @caller2(%T* %g) { ; CHECK-LABEL: define {{[^@]+}}@caller2 -; CHECK-SAME: (%T* [[G:%.*]]) +; CHECK-SAME: (%T* [[G:%.*]]) { ; CHECK-NEXT: [[V:%.*]] = call i32 @test2(%T* [[G]], i32 0) ; CHECK-NEXT: ret i32 [[V]] ; diff --git a/llvm/test/Transforms/ArgumentPromotion/naked_functions.ll b/llvm/test/Transforms/ArgumentPromotion/naked_functions.ll index d74ceab1d805d..0973a31803a66 100644 --- a/llvm/test/Transforms/ArgumentPromotion/naked_functions.ll +++ b/llvm/test/Transforms/ArgumentPromotion/naked_functions.ll @@ -6,7 +6,7 @@ @g = common global i32 0, align 4 define i32 @bar() { -; CHECK-LABEL: define {{[^@]+}}@bar() +; CHECK-LABEL: define {{[^@]+}}@bar() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo(i32* @g) ; CHECK-NEXT: ret i32 [[CALL]] @@ -18,7 +18,7 @@ entry: define internal i32 @foo(i32*) #0 { ; CHECK-LABEL: define {{[^@]+}}@foo -; CHECK-SAME: (i32* [[TMP0:%.*]]) +; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call void asm sideeffect "ldr r0, [r0] \0Abx lr \0A", ""() diff --git a/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll b/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll index 106fe6c47fe8e..4d1cda363ab32 
100644 --- a/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll +++ b/llvm/test/Transforms/ArgumentPromotion/nonzero-address-spaces.ll @@ -9,7 +9,7 @@ target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8" @g = common global i32 0, align 4 define i32 @bar() { -; CHECK-LABEL: define {{[^@]+}}@bar() addrspace(1) +; CHECK-LABEL: define {{[^@]+}}@bar() addrspace(1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = call addrspace(1) i32 @foo() ; CHECK-NEXT: ret i32 [[CALL]] @@ -21,7 +21,7 @@ entry: } define internal i32 @foo(i32*) { -; CHECK-LABEL: define {{[^@]+}}@foo() addrspace(1) +; CHECK-LABEL: define {{[^@]+}}@foo() addrspace(1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 ; CHECK-NEXT: call addrspace(0) void asm sideeffect "ldr r0, [r0] \0Abx lr \0A", ""() diff --git a/llvm/test/Transforms/ArgumentPromotion/pr27568.ll b/llvm/test/Transforms/ArgumentPromotion/pr27568.ll index 738c39289f801..69e7e9c0b05e9 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr27568.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr27568.ll @@ -4,7 +4,7 @@ target triple = "x86_64-pc-windows-msvc" define internal void @callee(i8*) { -; CHECK-LABEL: define {{[^@]+}}@callee() +; CHECK-LABEL: define {{[^@]+}}@callee() { ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @thunk() ; CHECK-NEXT: ret void @@ -15,7 +15,7 @@ entry: } define void @test1() personality i32 (...)* @__CxxFrameHandler3 { -; CHECK-LABEL: define {{[^@]+}}@test1() personality i32 (...)* @__CxxFrameHandler3 +; CHECK-LABEL: define {{[^@]+}}@test1() personality i32 (...)* @__CxxFrameHandler3 { ; CHECK-NEXT: entry: ; CHECK-NEXT: invoke void @thunk() ; CHECK-NEXT: to label [[OUT:%.*]] unwind label [[CPAD:%.*]] diff --git a/llvm/test/Transforms/ArgumentPromotion/pr32917.ll b/llvm/test/Transforms/ArgumentPromotion/pr32917.ll index 34c025072f898..d880175d94248 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr32917.ll +++ 
b/llvm/test/Transforms/ArgumentPromotion/pr32917.ll @@ -6,13 +6,13 @@ @a = common local_unnamed_addr global i32 0, align 4 define i32 @fn2() local_unnamed_addr { -; CHECK-LABEL: define {{[^@]+}}@fn2() local_unnamed_addr +; CHECK-LABEL: define {{[^@]+}}@fn2() local_unnamed_addr { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @b, align 4 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to i32* -; CHECK-NEXT: [[DOTIDX:%.*]] = getelementptr i32, i32* [[TMP3]], i64 -1 -; CHECK-NEXT: [[DOTIDX_VAL:%.*]] = load i32, i32* [[DOTIDX]], align 4 -; CHECK-NEXT: call fastcc void @fn1(i32 [[DOTIDX_VAL]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP3]], i64 -1 +; CHECK-NEXT: [[DOTVAL:%.*]] = load i32, i32* [[TMP4]], align 4 +; CHECK-NEXT: call fastcc void @fn1(i32 [[DOTVAL]]) ; CHECK-NEXT: ret i32 undef ; %1 = load i32, i32* @b, align 4 @@ -24,8 +24,8 @@ define i32 @fn2() local_unnamed_addr { define internal fastcc void @fn1(i32* nocapture readonly) unnamed_addr { ; CHECK-LABEL: define {{[^@]+}}@fn1 -; CHECK-SAME: (i32 [[DOT18446744073709551615_VAL:%.*]]) unnamed_addr -; CHECK-NEXT: store i32 [[DOT18446744073709551615_VAL]], i32* @a, align 4 +; CHECK-SAME: (i32 [[DOT_4_VAL:%.*]]) unnamed_addr { +; CHECK-NEXT: store i32 [[DOT_4_VAL]], i32* @a, align 4 ; CHECK-NEXT: ret void ; %2 = getelementptr inbounds i32, i32* %0, i64 -1 diff --git a/llvm/test/Transforms/ArgumentPromotion/pr42028-recursion.ll b/llvm/test/Transforms/ArgumentPromotion/pr42028-recursion.ll index 6f90573b0565f..e37ad1819a9f2 100644 --- a/llvm/test/Transforms/ArgumentPromotion/pr42028-recursion.ll +++ b/llvm/test/Transforms/ArgumentPromotion/pr42028-recursion.ll @@ -9,15 +9,15 @@ define i32 @test_inf_promote_caller(i32 %arg) { ; CHECK-LABEL: define {{[^@]+}}@test_inf_promote_caller ; CHECK-SAME: (i32 [[ARG:%.*]]) { ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = alloca [[S:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = alloca [[S]], align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TMP]], %S* [[TMP1]]) +; CHECK-NEXT: [[TEMP:%.*]] = alloca [[S:%.*]], align 8 +; CHECK-NEXT: [[TEMP1:%.*]] = alloca [[S]], align 8 +; CHECK-NEXT: [[TEMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TEMP]], %S* [[TEMP1]]) ; CHECK-NEXT: ret i32 0 ; bb: - %tmp = alloca %S - %tmp1 = alloca %S - %tmp2 = call i32 @test_inf_promote_callee(%S* %tmp, %S* %tmp1) + %temp = alloca %S + %temp1 = alloca %S + %temp2 = call i32 @test_inf_promote_callee(%S* %temp, %S* %temp1) ret i32 0 } @@ -25,19 +25,19 @@ define internal i32 @test_inf_promote_callee(%S* %arg, %S* %arg1) { ; CHECK-LABEL: define {{[^@]+}}@test_inf_promote_callee ; CHECK-SAME: (%S* [[ARG:%.*]], %S* [[ARG1:%.*]]) { ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = load %S*, %S** [[TMP]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = load %S*, %S** [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @test_inf_promote_callee2(%S* [[TMP4]], %S* [[TMP2]]) +; CHECK-NEXT: [[TEMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP2:%.*]] = load %S*, %S** [[TEMP]], align 8 +; CHECK-NEXT: [[TEMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP4:%.*]] = load %S*, %S** [[TEMP3]], align 8 +; CHECK-NEXT: [[TEMP5:%.*]] = call i32 @test_inf_promote_callee2(%S* [[TEMP4]], %S* [[TEMP2]]) ; CHECK-NEXT: ret i32 0 ; bb: - %tmp = getelementptr %S, %S* %arg1, i32 0, i32 0 - %tmp2 = load %S*, %S** %tmp - %tmp3 = getelementptr %S, %S* %arg, i32 0, i32 0 - %tmp4 = load %S*, %S** %tmp3 - %tmp5 = call i32 @test_inf_promote_callee2(%S* %tmp4, %S* %tmp2) + %temp = getelementptr %S, %S* %arg1, i32 0, i32 0 + %temp2 = load %S*, %S** %temp + %temp3 = getelementptr %S, %S* %arg, i32 0, i32 0 + %temp4 = load %S*, %S** %temp3 + %temp5 = call i32 @test_inf_promote_callee2(%S* 
%temp4, %S* %temp2) ret i32 0 } diff --git a/llvm/test/Transforms/ArgumentPromotion/profile.ll b/llvm/test/Transforms/ArgumentPromotion/profile.ll index 941eafad1af3e..04fd580fe074d 100644 --- a/llvm/test/Transforms/ArgumentPromotion/profile.ll +++ b/llvm/test/Transforms/ArgumentPromotion/profile.ll @@ -5,8 +5,8 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 ; Checks if !prof metadata is corret in deadargelim. define void @caller() #0 { -; CHECK-LABEL: define {{[^@]+}}@caller() -; CHECK-NEXT: call void @promote_i32_ptr(i32 42), !prof !0 +; CHECK-LABEL: define {{[^@]+}}@caller() { +; CHECK-NEXT: call void @promote_i32_ptr(i32 42), !prof [[PROF0:![0-9]+]] ; CHECK-NEXT: ret void ; %x = alloca i32 @@ -17,8 +17,8 @@ define void @caller() #0 { define internal void @promote_i32_ptr(i32* %xp) !prof !1 { ; CHECK-LABEL: define {{[^@]+}}@promote_i32_ptr -; CHECK-SAME: (i32 [[XP_VAL:%.*]]) !prof !1 -; CHECK-NEXT: call void @use_i32(i32 [[XP_VAL]]) +; CHECK-SAME: (i32 [[XP_0_VAL:%.*]]) !prof [[PROF1:![0-9]+]] { +; CHECK-NEXT: call void @use_i32(i32 [[XP_0_VAL]]) ; CHECK-NEXT: ret void ; %x = load i32, i32* %xp diff --git a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll index 4f83127548f70..d43b8d5ea6bd9 100644 --- a/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll +++ b/llvm/test/Transforms/ArgumentPromotion/reserve-tbaa.ll @@ -15,11 +15,11 @@ define internal fastcc void @fn(i32* nocapture readonly %p1, i64* nocapture readonly %p2) { ; CHECK-LABEL: define {{[^@]+}}@fn -; CHECK-SAME: (i32 [[P1_VAL:%.*]], i64 [[P2_VAL:%.*]]) +; CHECK-SAME: (i32 [[P1_0_VAL:%.*]], i64 [[P2_0_VAL:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[P2_VAL]] to i32 -; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[P1_VAL]] to i8 -; CHECK-NEXT: store i8 [[CONV1]], i8* @d, align 1, !tbaa !0 +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[P2_0_VAL]] to i32 +; CHECK-NEXT: [[CONV1:%.*]] = 
trunc i32 [[P1_0_VAL]] to i8 +; CHECK-NEXT: store i8 [[CONV1]], i8* @d, align 1, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -32,14 +32,14 @@ entry: } define i32 @main() { -; CHECK-LABEL: define {{[^@]+}}@main() +; CHECK-LABEL: define {{[^@]+}}@main() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32**, i32*** @e, align 8, !tbaa !3 -; CHECK-NEXT: store i32* @g, i32** [[TMP0]], align 8, !tbaa !3 -; CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** @a, align 8, !tbaa !3 -; CHECK-NEXT: store i32 1, i32* [[TMP1]], align 4, !tbaa !5 -; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* @g, align 4, !tbaa !5 -; CHECK-NEXT: [[C_VAL:%.*]] = load i64, i64* @c, align 8, !tbaa !7 +; CHECK-NEXT: [[TMP0:%.*]] = load i32**, i32*** @e, align 8, !tbaa [[TBAA3:![0-9]+]] +; CHECK-NEXT: store i32* @g, i32** [[TMP0]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** @a, align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store i32 1, i32* [[TMP1]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[G_VAL:%.*]] = load i32, i32* @g, align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[C_VAL:%.*]] = load i64, i64* @c, align 8, !tbaa [[TBAA7:![0-9]+]] ; CHECK-NEXT: call fastcc void @fn(i32 [[G_VAL]], i64 [[C_VAL]]) ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/ArgumentPromotion/sret.ll b/llvm/test/Transforms/ArgumentPromotion/sret.ll index 61ce260767d77..6255db0bfd30e 100644 --- a/llvm/test/Transforms/ArgumentPromotion/sret.ll +++ b/llvm/test/Transforms/ArgumentPromotion/sret.ll @@ -6,9 +6,9 @@ target triple = "x86_64-pc-windows-msvc" define internal void @add({i32, i32}* %this, i32* sret(i32) %r) { ; CHECK-LABEL: define {{[^@]+}}@add -; CHECK-SAME: (i32 [[THIS_0_0_VAL:%.*]], i32 [[THIS_0_1_VAL:%.*]], i32* noalias [[R:%.*]]) -; CHECK-NEXT: [[AB:%.*]] = add i32 [[THIS_0_0_VAL]], [[THIS_0_1_VAL]] -; CHECK-NEXT: store i32 [[AB]], i32* [[R]] +; CHECK-SAME: (i32 [[THIS_0_VAL:%.*]], i32 [[THIS_4_VAL:%.*]], i32* noalias [[R:%.*]]) { +; CHECK-NEXT: [[AB:%.*]] = add i32 
[[THIS_0_VAL]], [[THIS_4_VAL]] +; CHECK-NEXT: store i32 [[AB]], i32* [[R]], align 4 ; CHECK-NEXT: ret void ; %ap = getelementptr {i32, i32}, {i32, i32}* %this, i32 0, i32 0 @@ -21,14 +21,14 @@ define internal void @add({i32, i32}* %this, i32* sret(i32) %r) { } define void @f() { -; CHECK-LABEL: define {{[^@]+}}@f() -; CHECK-NEXT: [[R:%.*]] = alloca i32 -; CHECK-NEXT: [[PAIR:%.*]] = alloca { i32, i32 } -; CHECK-NEXT: [[PAIR_IDX:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[PAIR]], i64 0, i32 0 -; CHECK-NEXT: [[PAIR_IDX_VAL:%.*]] = load i32, i32* [[PAIR_IDX]] -; CHECK-NEXT: [[PAIR_IDX1:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[PAIR]], i64 0, i32 1 -; CHECK-NEXT: [[PAIR_IDX1_VAL:%.*]] = load i32, i32* [[PAIR_IDX1]] -; CHECK-NEXT: call void @add(i32 [[PAIR_IDX_VAL]], i32 [[PAIR_IDX1_VAL]], i32* noalias [[R]]) +; CHECK-LABEL: define {{[^@]+}}@f() { +; CHECK-NEXT: [[R:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[PAIR:%.*]] = alloca { i32, i32 }, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[PAIR]], i64 0, i32 0 +; CHECK-NEXT: [[PAIR_VAL:%.*]] = load i32, i32* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[PAIR]], i64 0, i32 1 +; CHECK-NEXT: [[PAIR_VAL1:%.*]] = load i32, i32* [[TMP2]], align 4 +; CHECK-NEXT: call void @add(i32 [[PAIR_VAL]], i32 [[PAIR_VAL1]], i32* noalias [[R]]) ; CHECK-NEXT: ret void ; %r = alloca i32 diff --git a/llvm/test/Transforms/ArgumentPromotion/variadic.ll b/llvm/test/Transforms/ArgumentPromotion/variadic.ll index 4fbc5e38ba1fd..9d9e72bd57203 100644 --- a/llvm/test/Transforms/ArgumentPromotion/variadic.ll +++ b/llvm/test/Transforms/ArgumentPromotion/variadic.ll @@ -16,9 +16,9 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 { ; CHECK-LABEL: define {{[^@]+}}@main -; CHECK-SAME: (i32 [[ARGC:%.*]], i8** nocapture readnone [[ARGV:%.*]]) +; CHECK-SAME: (i32 
[[ARGC:%.*]], i8** nocapture readnone [[ARGV:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void (i8*, i8*, i8*, i8*, i8*, ...) @callee_t0f(i8* undef, i8* undef, i8* undef, i8* undef, i8* undef, %struct.tt0* byval(%struct.tt0) align 8 @t45) +; CHECK-NEXT: tail call void (i8*, i8*, i8*, i8*, i8*, ...) @callee_t0f(i8* undef, i8* undef, i8* undef, i8* undef, i8* undef, %struct.tt0* byval([[STRUCT_TT0:%.*]]) align 8 @t45) ; CHECK-NEXT: ret i32 0 ; entry: @@ -29,7 +29,7 @@ entry: ; Function Attrs: nounwind uwtable define internal void @callee_t0f(i8* nocapture readnone %tp13, i8* nocapture readnone %tp14, i8* nocapture readnone %tp15, i8* nocapture readnone %tp16, i8* nocapture readnone %tp17, ...) { ; CHECK-LABEL: define {{[^@]+}}@callee_t0f -; CHECK-SAME: (i8* nocapture readnone [[TP13:%.*]], i8* nocapture readnone [[TP14:%.*]], i8* nocapture readnone [[TP15:%.*]], i8* nocapture readnone [[TP16:%.*]], i8* nocapture readnone [[TP17:%.*]], ...) +; CHECK-SAME: (i8* nocapture readnone [[TP13:%.*]], i8* nocapture readnone [[TP14:%.*]], i8* nocapture readnone [[TP15:%.*]], i8* nocapture readnone [[TP16:%.*]], i8* nocapture readnone [[TP17:%.*]], ...) { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void ; From 673879249d4d1c4e6d763a6db4a4812d721b41b6 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Sun, 20 Feb 2022 10:20:48 +0000 Subject: [PATCH 354/748] Revert "[C++20][Modules][1/8] Track valid import state." This reverts commit 8a3f9a584ad43369cf6a034dc875ebfca76d9033. need to investigate build failures that do not show on CI or local testing. 
--- .../clang/Basic/DiagnosticParseKinds.td | 4 - clang/include/clang/Parse/Parser.h | 14 +- clang/include/clang/Sema/Sema.h | 15 +- clang/lib/Interpreter/IncrementalParser.cpp | 5 +- clang/lib/Parse/ParseAST.cpp | 5 +- clang/lib/Parse/ParseObjc.cpp | 3 +- clang/lib/Parse/Parser.cpp | 91 ++---------- clang/lib/Sema/SemaModule.cpp | 46 ++---- .../Modules/cxx20-import-diagnostics-a.cpp | 140 ------------------ 9 files changed, 37 insertions(+), 286 deletions(-) delete mode 100644 clang/test/Modules/cxx20-import-diagnostics-a.cpp diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index f21e841bcdd38..e23810f402365 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1539,10 +1539,6 @@ def err_private_module_fragment_expected_semi : Error< def err_missing_before_module_end : Error<"expected %0 at end of module">; def err_unsupported_module_partition : Error< "sorry, module partitions are not yet supported">; -def err_import_not_allowed_here : Error< - "imports must immediately follow the module declaration">; -def err_import_in_wrong_fragment : Error< - "module%select{| partition}0 imports cannot be in the %select{global|private}1 module fragment">; def err_export_empty : Error<"export declaration cannot be empty">; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 08d492a7ec721..981800a7e2356 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -464,17 +464,14 @@ class Parser : public CodeCompletionHandler { void Initialize(); /// Parse the first top-level declaration in a translation unit. - bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, - Sema::ModuleImportState &ImportState); + bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result); /// ParseTopLevelDecl - Parse one top-level declaration. Returns true if /// the EOF was encountered. 
- bool ParseTopLevelDecl(DeclGroupPtrTy &Result, - Sema::ModuleImportState &ImportState); + bool ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl = false); bool ParseTopLevelDecl() { DeclGroupPtrTy Result; - Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; - return ParseTopLevelDecl(Result, IS); + return ParseTopLevelDecl(Result); } /// ConsumeToken - Consume the current 'peek token' and lex the next one. @@ -3494,9 +3491,8 @@ class Parser : public CodeCompletionHandler { //===--------------------------------------------------------------------===// // Modules - DeclGroupPtrTy ParseModuleDecl(Sema::ModuleImportState &ImportState); - Decl *ParseModuleImport(SourceLocation AtLoc, - Sema::ModuleImportState &ImportState); + DeclGroupPtrTy ParseModuleDecl(bool IsFirstDecl); + Decl *ParseModuleImport(SourceLocation AtLoc); bool parseMisplacedModuleImport(); bool tryParseMisplacedModuleImport() { tok::TokenKind Kind = Tok.getKind(); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index dfa12ad40b72a..c1e846c55dee7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2949,24 +2949,11 @@ class Sema final { Implementation, ///< 'module X;' }; - /// An enumeration to represent the transition of states in parsing module - /// fragments and imports. If we are not parsing a C++20 TU, or we find - /// an error in state transition, the state is set to NotACXX20Module. - enum class ModuleImportState { - FirstDecl, ///< Parsing the first decl in a TU. - GlobalFragment, ///< after 'module;' but before 'module X;' - ImportAllowed, ///< after 'module X;' but before any non-import decl. - ImportFinished, ///< after any non-import decl. - PrivateFragment, ///< after 'module :private;'. - NotACXX20Module ///< Not a C++20 TU, or an invalid state was found. - }; - /// The parser has processed a module-declaration that begins the definition /// of a module interface or implementation. 
DeclGroupPtrTy ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, ModuleDeclKind MDK, - ModuleIdPath Path, - ModuleImportState &ImportState); + ModuleIdPath Path, bool IsFirstDecl); /// The parser has processed a global-module-fragment declaration that begins /// the definition of the global module fragment of the current module unit. diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp index 0f1ef3233a2a1..4ade8b8bb0741 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -164,9 +164,8 @@ IncrementalParser::ParseOrWrapTopLevelDecl() { } Parser::DeclGroupPtrTy ADecl; - Sema::ModuleImportState ImportState; - for (bool AtEOF = P->ParseFirstTopLevelDecl(ADecl, ImportState); !AtEOF; - AtEOF = P->ParseTopLevelDecl(ADecl, ImportState)) { + for (bool AtEOF = P->ParseFirstTopLevelDecl(ADecl); !AtEOF; + AtEOF = P->ParseTopLevelDecl(ADecl)) { // If we got a null return and something *was* parsed, ignore it. This // is due to a top-level semicolon, an action override, or a parse error // skipping something. diff --git a/clang/lib/Parse/ParseAST.cpp b/clang/lib/Parse/ParseAST.cpp index fd79ed3ca158b..01510e8caf3b7 100644 --- a/clang/lib/Parse/ParseAST.cpp +++ b/clang/lib/Parse/ParseAST.cpp @@ -154,9 +154,8 @@ void clang::ParseAST(Sema &S, bool PrintStats, bool SkipFunctionBodies) { llvm::TimeTraceScope TimeScope("Frontend"); P.Initialize(); Parser::DeclGroupPtrTy ADecl; - Sema::ModuleImportState ImportState; - for (bool AtEOF = P.ParseFirstTopLevelDecl(ADecl, ImportState); !AtEOF; - AtEOF = P.ParseTopLevelDecl(ADecl, ImportState)) { + for (bool AtEOF = P.ParseFirstTopLevelDecl(ADecl); !AtEOF; + AtEOF = P.ParseTopLevelDecl(ADecl)) { // If we got a null return and something *was* parsed, ignore it. This // is due to a top-level semicolon, an action override, or a parse error // skipping something. 
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index 08f131ed0d874..f493ac9b92caf 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -79,8 +79,7 @@ Parser::ParseObjCAtDirectives(ParsedAttributesWithRange &Attrs) { break; case tok::objc_import: if (getLangOpts().Modules || getLangOpts().DebuggerSupport) { - Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; - SingleDecl = ParseModuleImport(AtLoc, IS); + SingleDecl = ParseModuleImport(AtLoc); break; } Diag(AtLoc, diag::err_atimport); diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 87500a0405531..ffa1e0f027f1d 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -581,20 +581,15 @@ void Parser::DestroyTemplateIds() { /// top-level-declaration-seq[opt] private-module-fragment[opt] /// /// Note that in C, it is an error if there is no first declaration. -bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, - Sema::ModuleImportState &ImportState) { +bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result) { Actions.ActOnStartOfTranslationUnit(); - // For C++20 modules, a module decl must be the first in the TU. We also - // need to track module imports. - ImportState = Sema::ModuleImportState::FirstDecl; - bool NoTopLevelDecls = ParseTopLevelDecl(Result, ImportState); - // C11 6.9p1 says translation units must have at least one top-level // declaration. C++ doesn't have this restriction. We also don't want to // complain if we have a precompiled header, although technically if the PCH // is empty we should still emit the (pedantic) diagnostic. // If the main file is a header, we're only pretending it's a TU; don't warn. 
+ bool NoTopLevelDecls = ParseTopLevelDecl(Result, true); if (NoTopLevelDecls && !Actions.getASTContext().getExternalSource() && !getLangOpts().CPlusPlus && !getLangOpts().IsHeaderFile) Diag(diag::ext_empty_translation_unit); @@ -608,8 +603,7 @@ bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, /// top-level-declaration: /// declaration /// [C++20] module-import-declaration -bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, - Sema::ModuleImportState &ImportState) { +bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { DestroyTemplateIdAnnotationsRAIIObj CleanupRAII(*this); // Skip over the EOF token, flagging end of previous input for incremental @@ -653,12 +647,13 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, case tok::kw_module: module_decl: - Result = ParseModuleDecl(ImportState); + Result = ParseModuleDecl(IsFirstDecl); return false; - case tok::kw_import: + // tok::kw_import is handled by ParseExternalDeclaration. (Under the Modules + // TS, an import can occur within an export block.) 
import_decl: { - Decl *ImportDecl = ParseModuleImport(SourceLocation(), ImportState); + Decl *ImportDecl = ParseModuleImport(SourceLocation()); Result = Actions.ConvertDeclToDeclGroup(ImportDecl); return false; } @@ -674,14 +669,12 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, Actions.ActOnModuleBegin(Tok.getLocation(), reinterpret_cast( Tok.getAnnotationValue())); ConsumeAnnotationToken(); - ImportState = Sema::ModuleImportState::NotACXX20Module; return false; case tok::annot_module_end: Actions.ActOnModuleEnd(Tok.getLocation(), reinterpret_cast( Tok.getAnnotationValue())); ConsumeAnnotationToken(); - ImportState = Sema::ModuleImportState::NotACXX20Module; return false; case tok::eof: @@ -725,16 +718,6 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, MaybeParseCXX11Attributes(attrs); Result = ParseExternalDeclaration(attrs); - // An empty Result might mean a line with ';' or some parsing error, ignore - // it. - if (Result) { - if (ImportState == Sema::ModuleImportState::FirstDecl) - // First decl was not modular. - ImportState = Sema::ModuleImportState::NotACXX20Module; - else if (ImportState == Sema::ModuleImportState::ImportAllowed) - // Non-imports disallow further imports. - ImportState = Sema::ModuleImportState::ImportFinished; - } return false; } @@ -904,17 +887,11 @@ Parser::ParseExternalDeclaration(ParsedAttributesWithRange &attrs, getCurScope(), CurParsedObjCImpl ? 
Sema::PCC_ObjCImplementation : Sema::PCC_Namespace); return nullptr; - case tok::kw_import: { - Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; - if (getLangOpts().CPlusPlusModules) { - llvm_unreachable("not expecting a c++20 import here"); - ProhibitAttributes(attrs); - } - SingleDecl = ParseModuleImport(SourceLocation(), IS); - } break; + case tok::kw_import: + SingleDecl = ParseModuleImport(SourceLocation()); + break; case tok::kw_export: if (getLangOpts().CPlusPlusModules || getLangOpts().ModulesTS) { - ProhibitAttributes(attrs); SingleDecl = ParseExportDeclaration(); break; } @@ -2314,8 +2291,7 @@ void Parser::ParseMicrosoftIfExistsExternalDeclaration() { /// attribute-specifier-seq[opt] ';' /// private-module-fragment: [C++2a] /// 'module' ':' 'private' ';' top-level-declaration-seq[opt] -Parser::DeclGroupPtrTy -Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { +Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { SourceLocation StartLoc = Tok.getLocation(); Sema::ModuleDeclKind MDK = TryConsumeToken(tok::kw_export) @@ -2335,7 +2311,7 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { // Parse a global-module-fragment, if present. 
if (getLangOpts().CPlusPlusModules && Tok.is(tok::semi)) { SourceLocation SemiLoc = ConsumeToken(); - if (ImportState != Sema::ModuleImportState::FirstDecl) { + if (!IsFirstDecl) { Diag(StartLoc, diag::err_global_module_introducer_not_at_start) << SourceRange(StartLoc, SemiLoc); return nullptr; @@ -2344,7 +2320,6 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { Diag(StartLoc, diag::err_module_fragment_exported) << /*global*/0 << FixItHint::CreateRemoval(StartLoc); } - ImportState = Sema::ModuleImportState::GlobalFragment; return Actions.ActOnGlobalModuleFragmentDecl(ModuleLoc); } @@ -2359,7 +2334,6 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { SourceLocation PrivateLoc = ConsumeToken(); DiagnoseAndSkipCXX11Attributes(); ExpectAndConsumeSemi(diag::err_private_module_fragment_expected_semi); - ImportState = Sema::ModuleImportState::PrivateFragment; return Actions.ActOnPrivateModuleFragmentDecl(ModuleLoc, PrivateLoc); } @@ -2387,7 +2361,7 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { ExpectAndConsumeSemi(diag::err_module_expected_semi); - return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, ImportState); + return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, IsFirstDecl); } /// Parse a module import declaration. This is essentially the same for @@ -2405,8 +2379,7 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { /// attribute-specifier-seq[opt] ';' /// 'export'[opt] 'import' header-name /// attribute-specifier-seq[opt] ';' -Decl *Parser::ParseModuleImport(SourceLocation AtLoc, - Sema::ModuleImportState &ImportState) { +Decl *Parser::ParseModuleImport(SourceLocation AtLoc) { SourceLocation StartLoc = AtLoc.isInvalid() ? Tok.getLocation() : AtLoc; SourceLocation ExportLoc; @@ -2455,42 +2428,6 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, return nullptr; } - // Diagnose mis-imports. 
- bool SeenError = true; - switch (ImportState) { - case Sema::ModuleImportState::ImportAllowed: - SeenError = false; - break; - case Sema::ModuleImportState::FirstDecl: - case Sema::ModuleImportState::NotACXX20Module: - // TODO: These cases will be an error when partitions are implemented. - SeenError = false; - break; - case Sema::ModuleImportState::GlobalFragment: - // We can only have pre-processor directives in the global module - // fragment. We can, however have a header unit import here. - if (!HeaderUnit) - // We do not have partition support yet, so first arg is 0. - Diag(ImportLoc, diag::err_import_in_wrong_fragment) << 0 << 0; - else - SeenError = false; - break; - case Sema::ModuleImportState::ImportFinished: - if (getLangOpts().CPlusPlusModules) - Diag(ImportLoc, diag::err_import_not_allowed_here); - else - SeenError = false; - break; - case Sema::ModuleImportState::PrivateFragment: - // We do not have partition support yet, so first arg is 0. - Diag(ImportLoc, diag::err_import_in_wrong_fragment) << 0 << 1; - break; - } - if (SeenError) { - ExpectAndConsumeSemi(diag::err_module_expected_semi); - return nullptr; - } - DeclResult Import; if (HeaderUnit) Import = diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 9bed3cb769f70..85e58640044dc 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -80,20 +80,12 @@ Sema::ActOnGlobalModuleFragmentDecl(SourceLocation ModuleLoc) { return nullptr; } -Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, - SourceLocation ModuleLoc, - ModuleDeclKind MDK, - ModuleIdPath Path, - ModuleImportState &ImportState) { +Sema::DeclGroupPtrTy +Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, + ModuleDeclKind MDK, ModuleIdPath Path, bool IsFirstDecl) { assert((getLangOpts().ModulesTS || getLangOpts().CPlusPlusModules) && "should only have module decl in Modules TS or C++20"); - bool IsFirstDecl = ImportState == 
ModuleImportState::FirstDecl; - bool SeenGMF = ImportState == ModuleImportState::GlobalFragment; - // If any of the steps here fail, we count that as invalidating C++20 - // module state; - ImportState = ModuleImportState::NotACXX20Module; - // A module implementation unit requires that we are not compiling a module // of any kind. A module interface unit requires that we are not compiling a // module map. @@ -142,13 +134,9 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, ModuleScopes.back().Module->Kind == Module::GlobalModuleFragment) GlobalModuleFragment = ModuleScopes.back().Module; - assert((!getLangOpts().CPlusPlusModules || - SeenGMF == (bool)GlobalModuleFragment) && - "mismatched global module state"); - // In C++20, the module-declaration must be the first declaration if there // is no global module fragment. - if (getLangOpts().CPlusPlusModules && !IsFirstDecl && !SeenGMF) { + if (getLangOpts().CPlusPlusModules && !IsFirstDecl && !GlobalModuleFragment) { Diag(ModuleLoc, diag::err_module_decl_not_at_start); SourceLocation BeginLoc = ModuleScopes.empty() @@ -243,10 +231,6 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, TU->setModuleOwnershipKind(Decl::ModuleOwnershipKind::ModulePrivate); TU->setLocalOwningModule(Mod); - // We are in the module purview, but before any other (non import) - // statements, so imports are allowed. - ImportState = ModuleImportState::ImportAllowed; - // FIXME: Create a ModuleDecl. return nullptr; } @@ -317,10 +301,10 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, SourceLocation ExportLoc, SourceLocation ImportLoc, ModuleIdPath Path) { - // Flatten the module path for a C++20 or Modules TS module name. + // Flatten the module path for a Modules TS module name. 
std::pair ModuleNameLoc; - std::string ModuleName; - if (getLangOpts().CPlusPlusModules || getLangOpts().ModulesTS) { + if (getLangOpts().ModulesTS) { + std::string ModuleName; for (auto &Piece : Path) { if (!ModuleName.empty()) ModuleName += "."; @@ -330,14 +314,6 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, Path = ModuleIdPath(ModuleNameLoc); } - // Diagnose self-import before attempting a load. - if (getLangOpts().CPlusPlusModules && isCurrentModulePurview() && - getCurrentModule()->Name == ModuleName) { - Diag(ImportLoc, diag::err_module_self_import) - << ModuleName << getLangOpts().CurrentModule; - return true; - } - Module *Mod = getModuleLoader().loadModule(ImportLoc, Path, Module::AllVisible, /*IsInclusionDirective=*/false); @@ -366,9 +342,11 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, // FIXME: we should support importing a submodule within a different submodule // of the same top-level module. Until we do, make it an error rather than // silently ignoring the import. - // FIXME: Should we warn on a redundant import of the current module? - if (!getLangOpts().CPlusPlusModules && - Mod->getTopLevelModuleName() == getLangOpts().CurrentModule && + // Import-from-implementation is valid in the Modules TS. FIXME: Should we + // warn on a redundant import of the current module? + // FIXME: Import of a module from an implementation partition of the same + // module is permitted. + if (Mod->getTopLevelModuleName() == getLangOpts().CurrentModule && (getLangOpts().isCompilingModule() || !getLangOpts().ModulesTS)) { Diag(ImportLoc, getLangOpts().isCompilingModule() ? 
diag::err_module_self_import diff --git a/clang/test/Modules/cxx20-import-diagnostics-a.cpp b/clang/test/Modules/cxx20-import-diagnostics-a.cpp deleted file mode 100644 index fd4085bcb4713..0000000000000 --- a/clang/test/Modules/cxx20-import-diagnostics-a.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// RUN: rm -rf %t -// RUN: mkdir -p %t - -// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=0 -x c++ %s \ -// RUN: -o %t/B.pcm - -// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=1 -x c++ %s \ -// RUN: -o %t/C.pcm - -// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=2 -x c++ %s \ -// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/AOK1.pcm - -// RUN: %clang_cc1 -std=c++20 -S -D TU=3 -x c++ %s \ -// RUN: -fmodule-file=%t/AOK1.pcm -o %t/tu_3.s -verify - -// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=4 -x c++ %s \ -// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/BC.pcm -verify - -// RUN: %clang_cc1 -std=c++20 -S -D TU=5 -x c++ %s \ -// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/tu_5.s -verify - -// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=6 -x c++ %s \ -// RUN: -fmodule-file=%t/B.pcm -o %t/D.pcm -verify - -// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=7 -x c++ %s \ -// RUN: -fmodule-file=%t/B.pcm -o %t/D.pcm -verify - -// RUN: %clang_cc1 -std=c++20 -S -D TU=8 -x c++ %s \ -// RUN: -fmodule-file=%t/B.pcm -o %t/tu_8.s -verify - -// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=9 -x c++ %s \ -// RUN: -o %t/B.pcm -verify - -// RUN: %clang_cc1 -std=c++20 -emit-obj -D TU=10 -x c++ %s \ -// RUN: -fmodule-file=%t/C.pcm -o %t/impl.o - -// Test diagnostics for incorrect module import sequences. 
- -#if TU == 0 - -export module B; - -int foo (); - -// expected-no-diagnostics - -#elif TU == 1 - -export module C; - -int bar (); - -// expected-no-diagnostics - -#elif TU == 2 - -export module AOK1; - -import B; -export import C; - -export int theAnswer (); - -// expected-no-diagnostics - -#elif TU == 3 - -module; - -module AOK1; - -export import C; // expected-error {{export declaration can only be used within a module interface unit}} - -int theAnswer () { return 42; } - -#elif TU == 4 - -export module BC; - -export import B; - -int foo () { return 10; } - -import C; // expected-error {{imports must immediately follow the module declaration}} - -#elif TU == 5 - -module B; // implicitly imports B. - -int foo () { return 10; } - -import C; // expected-error {{imports must immediately follow the module declaration}} - -#elif TU == 6 - -module; -// We can only have preprocessor commands here, which could include an include -// translated header unit. However those are identified specifically by the -// preprocessor; non-preprocessed user code should not contain an import here. -import B; // expected-error {{module imports cannot be in the global module fragment}} - -export module D; - -int delta (); - -#elif TU == 7 - -export module D; - -int delta (); - -module :private; - -import B; // expected-error {{module imports cannot be in the private module fragment}} - -#elif TU == 8 - -module B; - -import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}} - -#elif TU == 9 - -export module B; - -import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}} - -#elif TU == 10 - -int x; - -import C; - -int baz() { return 6174; } - -// expected-no-diagnostics - -#else -#error "no MODE set" -#endif From 51c0650f6ba8128fb07036b4be8512bb5f727c1a Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 20 Feb 2022 12:12:39 +0100 Subject: [PATCH 355/748] Unionize clang::DynTypedNodeList. NFC. 
--- clang/include/clang/AST/ParentMapContext.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/AST/ParentMapContext.h b/clang/include/clang/AST/ParentMapContext.h index 2edbc987850d2..3c2e2f9640ca3 100644 --- a/clang/include/clang/AST/ParentMapContext.h +++ b/clang/include/clang/AST/ParentMapContext.h @@ -90,29 +90,27 @@ class TraversalKindScope { /// Container for either a single DynTypedNode or for an ArrayRef to /// DynTypedNode. For use with ParentMap. class DynTypedNodeList { - llvm::AlignedCharArrayUnion> Storage; + union { + DynTypedNode SingleNode; + ArrayRef Nodes; + }; bool IsSingleNode; public: DynTypedNodeList(const DynTypedNode &N) : IsSingleNode(true) { - new (&Storage) DynTypedNode(N); + new (&SingleNode) DynTypedNode(N); } DynTypedNodeList(ArrayRef A) : IsSingleNode(false) { - new (&Storage) ArrayRef(A); + new (&Nodes) ArrayRef(A); } const DynTypedNode *begin() const { - if (!IsSingleNode) - return reinterpret_cast *>(&Storage) - ->begin(); - return reinterpret_cast(&Storage); + return !IsSingleNode ? Nodes.begin() : &SingleNode; } const DynTypedNode *end() const { - if (!IsSingleNode) - return reinterpret_cast *>(&Storage)->end(); - return reinterpret_cast(&Storage) + 1; + return !IsSingleNode ? 
Nodes.end() : &SingleNode + 1; } size_t size() const { return end() - begin(); } From 5c404049b5c757b17bf092ac2471712c986095ad Mon Sep 17 00:00:00 2001 From: Nuno Lopes Date: Sun, 20 Feb 2022 11:33:47 +0000 Subject: [PATCH 356/748] [docs] Add a note saying that the use of poison is preferred to the use of undef Plus fix a few wrong examples with undef --- llvm/docs/LangRef.rst | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 02d11362b48e9..ecb3903988eeb 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3889,6 +3889,13 @@ indicates that the user of the value may receive an unspecified bit-pattern. Undefined values may be of any type (other than '``label``' or '``void``') and be used anywhere a constant is permitted. +.. note:: + + A '``poison``' value (described in the next section) should be used instead of + '``undef``' whenever possible. Poison values are stronger than undef, and + enable more optimizations. Just the existence of '``undef``' blocks certain + optimizations (see the examples below). + Undefined values are useful because they indicate to the compiler that the program is well defined no matter what value is used. This gives the compiler more freedom to optimize. Here are some examples of @@ -3939,7 +3946,7 @@ allowing the '``or``' to be folded to -1. Safe: %A = %X (or %Y) %B = 42 (or %Y) - %C = %Y + %C = %Y (if %Y is provably not poison; unsafe otherwise) Unsafe: %A = undef %B = undef @@ -3951,7 +3958,8 @@ of the two operands. In the ``%A`` example, if ``%X`` and ``%Y`` were both known to have a clear low bit, then ``%A`` would have to have a cleared low bit. However, in the ``%C`` example, the optimizer is allowed to assume that the '``undef``' operand could be the same as -``%Y``, allowing the whole '``select``' to be eliminated. +``%Y`` if ``%Y`` is provably not '``poison``', allowing the whole '``select``' +to be eliminated.
This is because '``poison``' is stronger than '``undef``'. .. code-block:: llvm @@ -4013,12 +4021,13 @@ optimizer can assume that it occurs in dead code. a: store undef -> %X b: store %X -> undef Safe: - a: + a: (if the stored value in %X is provably not poison) b: unreachable A store *of* an undefined value can be assumed to not have any effect; we can assume that the value is overwritten with bits that happen to -match what was already there. However, a store *to* an undefined +match what was already there. This argument is only valid if the stored value +is provably not ``poison``. However, a store *to* an undefined location could clobber arbitrary memory, therefore, it has undefined behavior. @@ -4048,17 +4057,6 @@ it is undefined behavior. br %X, BB1, BB2 ; Well-defined (non-deterministic jump) -This is also consistent with the behavior of MemorySanitizer. -MemorySanitizer, detector of uses of uninitialized memory, -defines a branch with condition that depends on an undef value (or -certain other values, like e.g. a result of a load from heap-allocated -memory that has never been stored to) to have an externally visible -side effect. For this reason functions with *sanitize_memory* -attribute are not allowed to produce such branches "out of thin -air". More strictly, an optimization that inserts a conditional branch -is only valid if in all executions where the branch condition has at -least one undefined bit, the same branch condition is evaluated in the -input IR as well. .. 
_poisonvalues: From da23fc966be91cae9af0b5f0d213961bf96e5d46 Mon Sep 17 00:00:00 2001 From: Nuno Lopes Date: Sun, 20 Feb 2022 11:41:49 +0000 Subject: [PATCH 357/748] [docs] Simplify the description of poison values --- llvm/docs/LangRef.rst | 45 +++++-------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index ecb3903988eeb..df7065cecd253 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4071,49 +4071,14 @@ The string '``poison``' can be used anywhere a constant is expected, and operations such as :ref:`add ` with the ``nsw`` flag can produce a poison value. -Poison value behavior is defined in terms of value *dependence*: - -- Values other than :ref:`phi ` nodes, :ref:`select `, and - :ref:`freeze ` instructions depend on their operands. -- :ref:`Phi ` nodes depend on the operand corresponding to - their dynamic predecessor basic block. -- :ref:`Select ` instructions depend on their condition operand and - their selected operand. -- Function arguments depend on the corresponding actual argument values - in the dynamic callers of their functions. -- :ref:`Call ` instructions depend on the :ref:`ret ` - instructions that dynamically transfer control back to them. -- :ref:`Invoke ` instructions depend on the - :ref:`ret `, :ref:`resume `, or exception-throwing - call instructions that dynamically transfer control back to them. -- Non-volatile loads and stores depend on the most recent stores to all - of the referenced memory addresses, following the order in the IR - (including loads and stores implied by intrinsics such as - :ref:`@llvm.memcpy `.) -- An instruction with externally visible side effects depends on the - most recent preceding instruction with externally visible side - effects, following the order in the IR. (This includes :ref:`volatile - operations `.) 
-- An instruction *control-depends* on a :ref:`terminator - instruction ` if the terminator instruction has - multiple successors and the instruction is always executed when - control transfers to one of the successors, and may not be executed - when control is transferred to another. -- Additionally, an instruction also *control-depends* on a terminator - instruction if the set of instructions it otherwise depends on would - be different if the terminator had transferred control to a different - successor. -- Dependence is transitive. -- Vector elements may be independently poisoned. Therefore, transforms - on instructions such as shufflevector must be careful to propagate - poison across values or elements only as allowed by the original code. - -An instruction that *depends* on a poison value, produces a poison value -itself. A poison value may be relaxed into an -:ref:`undef value `, which takes an arbitrary bit-pattern. +Most instructions return '``poison``' when one of their arguments is +'``poison``'. A notable exception is the :ref:`select instruction `. Propagation of poison can be stopped with the :ref:`freeze instruction `. +It is correct to replace a poison value with an +:ref:`undef value ` or any value of the type. + This means that immediate undefined behavior occurs if a poison value is used as an instruction operand that has any values that trigger undefined behavior. Notably this includes (but is not limited to): From 29d2ae59e45f4e1e5a2896a89ef8bb7bba90cc3a Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Sun, 20 Feb 2022 14:57:06 +0300 Subject: [PATCH 358/748] [ArgPromotion] Regenerate test checks for dead-gep-no-promotion.ll with --function-signature option (otherwise filecheck gets confused).
--- .../ArgumentPromotion/dead-gep-no-promotion.ll | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/test/Transforms/ArgumentPromotion/dead-gep-no-promotion.ll b/llvm/test/Transforms/ArgumentPromotion/dead-gep-no-promotion.ll index 8d152ae42c70a..062407af89597 100644 --- a/llvm/test/Transforms/ArgumentPromotion/dead-gep-no-promotion.ll +++ b/llvm/test/Transforms/ArgumentPromotion/dead-gep-no-promotion.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes ; RUN: opt -passes=argpromotion -S %s | FileCheck %s @glob = external global i32* @@ -6,8 +6,9 @@ ; No arguments in @callee can be promoted, but it contains a dead GEP. Make ; sure it is not removed, as we do not perform any promotion. define i32 @caller(i32* %ptr) { -; CHECK-LABEL: @caller( -; CHECK-NEXT: call void @callee(i32* [[PTR:%.*]], i32* [[PTR]], i32* [[PTR]]) +; CHECK-LABEL: define {{[^@]+}}@caller +; CHECK-SAME: (i32* [[PTR:%.*]]) { +; CHECK-NEXT: call void @callee(i32* [[PTR]], i32* [[PTR]], i32* [[PTR]]) ; CHECK-NEXT: ret i32 0 ; call void @callee(i32* %ptr, i32* %ptr, i32* %ptr) @@ -15,10 +16,11 @@ define i32 @caller(i32* %ptr) { } define internal void @callee(i32* %arg, i32* %arg1, i32* %arg2) { -; CHECK-LABEL: define internal void @callee( -; CHECK-NEXT: call void @external_fn(i32* [[ARG:%.*]], i32* [[ARG1:%.*]]) +; CHECK-LABEL: define {{[^@]+}}@callee +; CHECK-SAME: (i32* [[ARG:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]]) { +; CHECK-NEXT: call void @external_fn(i32* [[ARG]], i32* [[ARG1]]) ; CHECK-NEXT: [[DEAD_GEP:%.*]] = getelementptr inbounds i32, i32* [[ARG1]], i32 17 -; CHECK-NEXT: store i32* [[ARG2:%.*]], i32** @glob, align 8 +; CHECK-NEXT: store i32* [[ARG2]], i32** @glob, align 8 ; CHECK-NEXT: ret void ; call void @external_fn(i32* %arg, i32* %arg1) From 8608650a7652fa154de493c760374ea19fa75cc1 
Mon Sep 17 00:00:00 2001 From: Nuno Lopes Date: Sun, 20 Feb 2022 11:58:46 +0000 Subject: [PATCH 359/748] [docs] Frontend perf tips: mention poison vs undef and noundef attribute --- llvm/docs/Frontend/PerformanceTips.rst | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/llvm/docs/Frontend/PerformanceTips.rst b/llvm/docs/Frontend/PerformanceTips.rst index 5889329e25fc7..e4a3bc034b471 100644 --- a/llvm/docs/Frontend/PerformanceTips.rst +++ b/llvm/docs/Frontend/PerformanceTips.rst @@ -170,8 +170,7 @@ Other Things to Consider comparison type. The GVN pass *will* optimize redundant equalities even if the type of comparison is inverted, but GVN only runs late in the pipeline. As a result, you may miss the opportunity to run other important - optimizations. Improvements to EarlyCSE to remove this issue are tracked in - Bug 23333. + optimizations. #. Avoid using arithmetic intrinsics unless you are *required* by your source language specification to emit a particular code sequence. The optimizer @@ -227,6 +226,12 @@ Describing Aliasing Properties #. Use inbounds on geps. This can help to disambiguate some aliasing queries. +Undefined Values +^^^^^^^^^^^^^^^^ + +#. Use poison values instead of undef values whenever possible. + +#. Tag function parameters with the noundef attribute whenever possible. Modeling Memory Effects ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -276,8 +281,8 @@ relatively common and are generally well received by the community. You will need to ensure that your proposal is sufficiently general so that it benefits others if you wish to contribute it upstream. -You should also consider describing the problem you're facing on `llvm-dev -`_ and asking for advice. +You should also consider describing the problem you're facing on `Discourse +`_ and asking for advice. It's entirely possible someone has encountered your problem before and can give good advice.
If there are multiple interested parties, that also increases the chances that a metadata extension would be well received by the @@ -290,8 +295,7 @@ If you run across a case that you feel deserves to be covered here, please send a patch to `llvm-commits `_ for review. -If you have questions on these items, please direct them to `llvm-dev -`_. The more relevant +If you have questions on these items, please ask them on `Discourse +`_. The more relevant context you are able to give to your question, the more likely it is to be answered. - From 52fcdc8d69d20b48fb5266b00f505dc89b19be9b Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 20 Feb 2022 14:06:58 +0100 Subject: [PATCH 360/748] Prune unused diagnostics. NFC. --- clang/include/clang/Basic/DiagnosticASTKinds.td | 2 -- .../include/clang/Basic/DiagnosticDriverKinds.td | 8 -------- clang/include/clang/Basic/DiagnosticSemaKinds.td | 16 ---------------- 3 files changed, 26 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index a89bdff1a10c2..56662bcd0cc25 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -439,8 +439,6 @@ def note_odr_tag_kind_here: Note< def note_odr_field : Note<"field %0 has type %1 here">; def note_odr_field_name : Note<"field has name %0 here">; def note_odr_missing_field : Note<"no corresponding field here">; -def note_odr_bit_field : Note<"bit-field %0 with type %1 and length %2 here">; -def note_odr_not_bit_field : Note<"field %0 is not a bit-field">; def note_odr_base : Note<"class has base type %0">; def note_odr_virtual_base : Note< "%select{non-virtual|virtual}0 derivation here">; diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index b688c121b1c07..276e83434d030 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ 
b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -29,8 +29,6 @@ def err_drv_invalid_arch_name : Error< "invalid arch name '%0'">; def err_drv_invalid_riscv_arch_name : Error< "invalid arch name '%0', %1">; -def err_drv_invalid_riscv_ext_arch_name : Error< - "invalid arch name '%0', %1 '%2'">; def warn_drv_invalid_arch_name_with_suggestion : Warning< "ignoring invalid /arch: argument '%0'; for %select{64|32}1-bit expected one of %2">, InGroup; @@ -302,7 +300,6 @@ def err_drv_optimization_remark_format : Error< "unknown remark serializer format: '%0'">; def err_drv_no_neon_modifier : Error<"[no]neon is not accepted as modifier, please use [no]simd instead">; def err_drv_invalid_omp_target : Error<"OpenMP target is invalid: '%0'">; -def err_drv_debug_no_new_runtime : Error<"OpenMP target device debugging enabled with incompatible runtime">; def err_drv_incompatible_omp_arch : Error<"OpenMP target architecture '%0' pointer size is incompatible with host '%1'">; def err_drv_omp_host_ir_file_not_found : Error< "provided host compiler IR file '%0' is required to generate code for OpenMP " @@ -326,10 +323,6 @@ def err_drv_unsupported_embed_bitcode : Error<"%0 is not supported with -fembed-bitcode">; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; -def err_drv_negative_columns : Error< - "invalid value '%1' in '%0', value must be 'none' or a positive integer">; -def err_drv_small_columns : Error< - "invalid value '%1' in '%0', value must be '%2' or greater">; def err_drv_invalid_malign_branch_EQ : Error< "invalid argument '%0' to -malign-branch=; each element must be one of: %1">; @@ -531,7 +524,6 @@ def warn_drv_ps4_sdk_dir : Warning< "environment variable SCE_ORBIS_SDK_DIR is set, but points to invalid or nonexistent directory '%0'">, InGroup; -def err_drv_unsupported_linker : Error<"unsupported value '%0' for -linker option">; def err_drv_defsym_invalid_format : Error<"defsym must be of the 
form: sym=value: %0">; def err_drv_defsym_invalid_symval : Error<"value is not an integer: %0">; def warn_drv_msvc_not_found : Warning< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 8af1bed7b67f1..1719db4871ff3 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -556,9 +556,6 @@ def err_using_decl_can_not_refer_to_class_member : Error< def warn_cxx17_compat_using_decl_class_member_enumerator : Warning< "member using declaration naming a non-member enumerator is incompatible " "with C++ standards before C++20">, InGroup, DefaultIgnore; -def ext_using_decl_class_member_enumerator : ExtWarn< - "member using declaration naming a non-member enumerator is " - "a C++20 extension">, InGroup; def err_using_enum_is_dependent : Error< "using-enum cannot name a dependent type">; def err_ambiguous_inherited_constructor : Error< @@ -1696,8 +1693,6 @@ def err_missing_exception_specification : Error< def ext_missing_exception_specification : ExtWarn< err_missing_exception_specification.Text>, InGroup>; -def err_noexcept_needs_constant_expression : Error< - "argument to noexcept specifier must be a constant expression">; def err_exception_spec_not_parsed : Error< "exception specification is not available until end of class definition">; def err_exception_spec_cycle : Error< @@ -3524,8 +3519,6 @@ def warn_attribute_not_on_decl : Warning< "%0 attribute ignored when parsing type">, InGroup; def err_base_specifier_attribute : Error< "%0 attribute cannot be applied to a base specifier">; -def err_invalid_attribute_on_virtual_function : Error< - "%0 attribute cannot be applied to virtual functions">; def warn_declspec_allocator_nonpointer : Warning< "ignoring __declspec(allocator) because the function return type %0 is not " "a pointer or reference type">, InGroup; @@ -4976,8 +4969,6 @@ def err_template_spec_unknown_kind : Error< "class template">; 
def note_specialized_entity : Note< "explicitly specialized declaration is here">; -def note_explicit_specialization_declared_here : Note< - "explicit specialization declared here">; def err_template_spec_decl_function_scope : Error< "explicit specialization of %0 in function scope">; def err_template_spec_decl_friend : Error< @@ -5086,8 +5077,6 @@ def err_partial_spec_ordering_ambiguous : Error< def note_partial_spec_match : Note<"partial specialization matches %0">; def err_partial_spec_redeclared : Error< "class template partial specialization %0 cannot be redeclared">; -def note_partial_specialization_declared_here : Note< - "explicit specialization declared here">; def note_prev_partial_spec_here : Note< "previous declaration of class template partial specialization %0 is here">; def err_partial_spec_fully_specialized : Error< @@ -7364,8 +7353,6 @@ def err_bad_dynamic_cast_not_polymorphic : Error<"%0 is not polymorphic">; // Other C++ expressions def err_need_header_before_typeid : Error< "you need to include before using the 'typeid' operator">; -def err_need_header_before_ms_uuidof : Error< - "you need to include before using the '__uuidof' operator">; def err_need_header_before_placement_new : Error< "no matching %0 function for non-allocating placement new expression; " "include ">; @@ -11319,9 +11306,6 @@ def err_multiversion_disallowed_other_attr "'%select{|target|cpu_specific|cpu_dispatch|target_clones}0' " "multiversioning cannot be combined" " with attribute %1">; -def err_multiversion_mismatched_attrs - : Error<"attributes on multiversioned functions must all match, attribute " - "%0 %select{is missing|has different arguments}1">; def err_multiversion_diff : Error< "multiversioned function declaration has a different %select{calling convention" "|return type|constexpr specification|inline specification|linkage|" From 147b97de906968aa1927c9d2828399518250831d Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 20 Feb 2022 13:20:34 +0000 Subject: 
[PATCH 361/748] [AArch64] Extra tests for larger umull/smull generation. NFC --- .../AArch64/aarch64-matrix-umull-smull.ll | 352 ++++++++++++++++++ 1 file changed, 352 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index c2b6ad43ccd6d..4f999edf3d571 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -184,3 +184,355 @@ vector.body: ; preds = %vector.header, %vec for.end12: ; preds = %vector.body ret void } + + +define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) { +; CHECK-LABEL: larger_smull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB3_8 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: sxth w8, w1 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: cmp w3, #15 +; CHECK-NEXT: b.hi .LBB3_3 +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov x10, xzr +; CHECK-NEXT: b .LBB3_6 +; CHECK-NEXT: .LBB3_3: // %vector.ph +; CHECK-NEXT: and x10, x9, #0xfffffff0 +; CHECK-NEXT: add x11, x2, #32 +; CHECK-NEXT: add x12, x0, #16 +; CHECK-NEXT: mov x13, x10 +; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: .LBB3_4: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldp q1, q2, [x12, #-16] +; CHECK-NEXT: subs x13, x13, #16 +; CHECK-NEXT: add x12, x12, #32 +; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s +; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s +; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: stp q1, q3, [x11, #-32] +; CHECK-NEXT: stp q2, q4, [x11], #64 +; CHECK-NEXT: b.ne .LBB3_4 +; CHECK-NEXT: // %bb.5: // %middle.block +; CHECK-NEXT: cmp x10, x9 +; CHECK-NEXT: b.eq .LBB3_8 +; 
CHECK-NEXT: .LBB3_6: // %for.body.preheader1 +; CHECK-NEXT: sub x9, x9, x10 +; CHECK-NEXT: add x11, x2, x10, lsl #2 +; CHECK-NEXT: add x10, x0, x10, lsl #1 +; CHECK-NEXT: .LBB3_7: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrsh w12, [x10], #2 +; CHECK-NEXT: subs x9, x9, #1 +; CHECK-NEXT: mul w12, w12, w8 +; CHECK-NEXT: str w12, [x11], #4 +; CHECK-NEXT: b.ne .LBB3_7 +; CHECK-NEXT: .LBB3_8: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %conv1 = sext i16 %y to i32 + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %min.iters.check = icmp ult i32 %n, 16 + br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 4294967280 + %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0 + %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i64 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = getelementptr inbounds i16, i16* %0, i64 8 + %3 = bitcast i16* %2 to <8 x i16>* + %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2 + %4 = sext <8 x i16> %wide.load to <8 x i32> + %5 = sext <8 x i16> %wide.load11 to <8 x i32> + %6 = mul nsw <8 x i32> %broadcast.splat, %4 + %7 = mul nsw <8 x i32> %broadcast.splat13, %5 + %8 = getelementptr inbounds i32, i32* %s, i64 %index + %9 = bitcast i32* %8 to <8 x i32>* + store <8 x 
i32> %6, <8 x i32>* %9, align 4 + %10 = getelementptr inbounds i32, i32* %8, i64 8 + %11 = bitcast i32* %10 to <8 x i32>* + store <8 x i32> %7, <8 x i32>* %11, align 4 + %index.next = add nuw i64 %index, 16 + %12 = icmp eq i64 %index.next, %n.vec + br i1 %12, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14 + +for.body.preheader14: ; preds = %for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void + +for.body: ; preds = %for.body.preheader14, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ] + %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv + %13 = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %13 to i32 + %mul = mul nsw i32 %conv, %conv1 + %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + + +define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) { +; CHECK-LABEL: larger_umull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB4_8 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: and w8, w1, #0xffff +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: cmp w3, #15 +; CHECK-NEXT: b.hi .LBB4_3 +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov x10, xzr +; CHECK-NEXT: b .LBB4_6 +; CHECK-NEXT: .LBB4_3: // %vector.ph +; CHECK-NEXT: and x10, x9, #0xfffffff0 +; CHECK-NEXT: add x11, x2, #32 +; CHECK-NEXT: add x12, x0, #16 +; CHECK-NEXT: mov x13, x10 
+; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: .LBB4_4: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldp q1, q2, [x12, #-16] +; CHECK-NEXT: subs x13, x13, #16 +; CHECK-NEXT: add x12, x12, #32 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s +; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s +; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: stp q1, q3, [x11, #-32] +; CHECK-NEXT: stp q2, q4, [x11], #64 +; CHECK-NEXT: b.ne .LBB4_4 +; CHECK-NEXT: // %bb.5: // %middle.block +; CHECK-NEXT: cmp x10, x9 +; CHECK-NEXT: b.eq .LBB4_8 +; CHECK-NEXT: .LBB4_6: // %for.body.preheader1 +; CHECK-NEXT: sub x9, x9, x10 +; CHECK-NEXT: add x11, x2, x10, lsl #2 +; CHECK-NEXT: add x10, x0, x10, lsl #1 +; CHECK-NEXT: .LBB4_7: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrh w12, [x10], #2 +; CHECK-NEXT: subs x9, x9, #1 +; CHECK-NEXT: mul w12, w12, w8 +; CHECK-NEXT: str w12, [x11], #4 +; CHECK-NEXT: b.ne .LBB4_7 +; CHECK-NEXT: .LBB4_8: // %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %conv1 = zext i16 %y to i32 + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %min.iters.check = icmp ult i32 %n, 16 + br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 4294967280 + %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0 + %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> 
zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %x, i64 %index + %1 = bitcast i16* %0 to <8 x i16>* + %wide.load = load <8 x i16>, <8 x i16>* %1, align 2 + %2 = getelementptr inbounds i16, i16* %0, i64 8 + %3 = bitcast i16* %2 to <8 x i16>* + %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2 + %4 = zext <8 x i16> %wide.load to <8 x i32> + %5 = zext <8 x i16> %wide.load11 to <8 x i32> + %6 = mul nuw <8 x i32> %broadcast.splat, %4 + %7 = mul nuw <8 x i32> %broadcast.splat13, %5 + %8 = getelementptr inbounds i32, i32* %s, i64 %index + %9 = bitcast i32* %8 to <8 x i32>* + store <8 x i32> %6, <8 x i32>* %9, align 4 + %10 = getelementptr inbounds i32, i32* %8, i64 8 + %11 = bitcast i32* %10 to <8 x i32>* + store <8 x i32> %7, <8 x i32>* %11, align 4 + %index.next = add nuw i64 %index, 16 + %12 = icmp eq i64 %index.next, %n.vec + br i1 %12, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14 + +for.body.preheader14: ; preds = %for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void + +for.body: ; preds = %for.body.preheader14, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ] + %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv + %13 = load i16, i16* %arrayidx, align 2 + %conv = zext i16 %13 to i32 + %mul = mul nuw i32 %conv, %conv1 + %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 
%wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + + +define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, i8 noundef %B, i32 noundef %n) { +; CHECK-LABEL: red_mla_dup_ext_u8_s8_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbz w2, .LBB5_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: sxtb w9, w1 +; CHECK-NEXT: mov w10, w2 +; CHECK-NEXT: cmp w2, #15 +; CHECK-NEXT: b.hi .LBB5_4 +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov x11, xzr +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: b .LBB5_7 +; CHECK-NEXT: .LBB5_3: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_4: // %vector.ph +; CHECK-NEXT: and x11, x10, #0xfffffff0 +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov x12, x11 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: dup v2.8h, w9 +; CHECK-NEXT: .LBB5_5: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldp d3, d4, [x8, #-8] +; CHECK-NEXT: subs x12, x12, #16 +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: mla v0.8h, v2.8h, v3.8h +; CHECK-NEXT: mla v1.8h, v2.8h, v4.8h +; CHECK-NEXT: b.ne .LBB5_5 +; CHECK-NEXT: // %bb.6: // %middle.block +; CHECK-NEXT: add v0.8h, v1.8h, v0.8h +; CHECK-NEXT: cmp x11, x10 +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: b.eq .LBB5_9 +; CHECK-NEXT: .LBB5_7: // %for.body.preheader1 +; CHECK-NEXT: sub x10, x10, x11 +; CHECK-NEXT: add x11, x0, x11 +; CHECK-NEXT: .LBB5_8: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrb w12, [x11], #1 +; CHECK-NEXT: subs x10, x10, #1 +; CHECK-NEXT: madd w8, w12, w9, w8 +; CHECK-NEXT: b.ne .LBB5_8 +; CHECK-NEXT: .LBB5_9: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %conv2 = sext i8 %B to i16 + %cmp10.not = icmp eq i32 %n, 0 + br i1 %cmp10.not, label 
%for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %min.iters.check = icmp ult i32 %n, 16 + br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 4294967280 + %broadcast.splatinsert = insertelement <8 x i16> poison, i16 %conv2, i64 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer + %broadcast.splatinsert15 = insertelement <8 x i16> poison, i16 %conv2, i64 0 + %broadcast.splat16 = shufflevector <8 x i16> %broadcast.splatinsert15, <8 x i16> poison, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ] + %vec.phi13 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ] + %0 = getelementptr inbounds i8, i8* %A, i64 %index + %1 = bitcast i8* %0 to <8 x i8>* + %wide.load = load <8 x i8>, <8 x i8>* %1, align 1 + %2 = getelementptr inbounds i8, i8* %0, i64 8 + %3 = bitcast i8* %2 to <8 x i8>* + %wide.load14 = load <8 x i8>, <8 x i8>* %3, align 1 + %4 = zext <8 x i8> %wide.load to <8 x i16> + %5 = zext <8 x i8> %wide.load14 to <8 x i16> + %6 = mul nsw <8 x i16> %broadcast.splat, %4 + %7 = mul nsw <8 x i16> %broadcast.splat16, %5 + %8 = add <8 x i16> %6, %vec.phi + %9 = add <8 x i16> %7, %vec.phi13 + %index.next = add nuw i64 %index, 16 + %10 = icmp eq i64 %index.next, %n.vec + br i1 %10, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %bin.rdx = add <8 x i16> %9, %8 + %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx) + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17 + +for.body.preheader17: ; preds = 
%for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %s.011.ph = phi i16 [ 0, %for.body.preheader ], [ %11, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + %s.0.lcssa = phi i16 [ 0, %entry ], [ %11, %middle.block ], [ %add, %for.body ] + ret i16 %s.0.lcssa + +for.body: ; preds = %for.body.preheader17, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ] + %s.011 = phi i16 [ %add, %for.body ], [ %s.011.ph, %for.body.preheader17 ] + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + %12 = load i8, i8* %arrayidx, align 1 + %13 = zext i8 %12 to i16 + %mul = mul nsw i16 %13, %conv2 + %add = add i16 %mul, %s.011 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) From 3a1d6a361c822173abd87ff47fd8613892fc747f Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 20 Feb 2022 14:44:47 +0100 Subject: [PATCH 362/748] [clangd] Remove uuidof warning. Clang never emits this one. 
--- clang-tools-extra/clangd/IncludeFixer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang-tools-extra/clangd/IncludeFixer.cpp b/clang-tools-extra/clangd/IncludeFixer.cpp index 1f0515c1df702..7994e5f499200 100644 --- a/clang-tools-extra/clangd/IncludeFixer.cpp +++ b/clang-tools-extra/clangd/IncludeFixer.cpp @@ -224,8 +224,6 @@ std::vector IncludeFixer::fix(DiagnosticsEngine::Level DiagLevel, return only(insertHeader("")); case diag::err_need_header_before_typeid: return only(insertHeader("")); - case diag::err_need_header_before_ms_uuidof: - return only(insertHeader("")); case diag::err_need_header_before_placement_new: case diag::err_implicit_coroutine_std_nothrow_type_not_found: return only(insertHeader("")); From ce0fdf116334506bd5c4609ab86111f8136a1408 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 20 Feb 2022 15:42:20 +0100 Subject: [PATCH 363/748] Put back err_drv_negative_columns/err_drv_small_columns for flang These are unused by Clang, but Flang references them. 
--- clang/include/clang/Basic/DiagnosticDriverKinds.td | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 276e83434d030..b608b8ec50682 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -323,6 +323,10 @@ def err_drv_unsupported_embed_bitcode : Error<"%0 is not supported with -fembed-bitcode">; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; +def err_drv_negative_columns : Error< + "invalid value '%1' in '%0', value must be 'none' or a positive integer">; +def err_drv_small_columns : Error< + "invalid value '%1' in '%0', value must be '%2' or greater">; def err_drv_invalid_malign_branch_EQ : Error< "invalid argument '%0' to -malign-branch=; each element must be one of: %1">; From 5c7ae10ceca09832efd5a4161cf468368665a51b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 20 Feb 2022 15:05:20 +0000 Subject: [PATCH 364/748] [LV] Add store to test to make sure the loop is not dead. Add an extra store to the test, to make sure the operations in the loop cannot be optimized away after D118051. --- .../first-order-recurrence-complex.ll | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll index 70b8c81737e71..e7393b85b552d 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -374,73 +374,74 @@ exit: } ; Sink %tmp38 after %tmp60, then it enable the loop vectorization. 
-define void @instruction_with_2_FOR_operands() { +define void @instruction_with_2_FOR_operands(float* noalias %A, float* noalias %B, float* noalias %C) { ; CHECK-LABEL: @instruction_with_2_FOR_operands( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 undef, i64 0) -; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* undef, align 4 +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[A:%.*]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> 
[[VECTOR_RECUR1]], <4 x float> [[BROADCAST_SPLAT]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* undef, align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[B:%.*]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP6]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1001, 1000 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT3]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT3]], i32 2 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT4:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x float> [[BROADCAST_SPLAT]], i32 2 ; CHECK-NEXT: br i1 [[CMP_N]], label [[BB74:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi float [ undef, [[BB:%.*]] ], [ 
[[VECTOR_RECUR_EXTRACT4]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ undef, [[BB]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi float [ 1.000000e+00, [[BB:%.*]] ], [ [[VECTOR_RECUR_EXTRACT4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ 0.000000e+00, [[BB]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] ; CHECK-NEXT: br label [[BB13:%.*]] ; CHECK: bb13: ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi float [ [[TMP60:%.*]], [[BB13]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[SCALAR_RECUR7:%.*]] = phi float [ [[TMP49:%.*]], [[BB13]] ], [ [[SCALAR_RECUR_INIT6]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB13]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[BB13]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[TMP38:%.*]] = fmul fast float [[SCALAR_RECUR]], [[SCALAR_RECUR7]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP49]] = load float, float* undef, align 4 -; CHECK-NEXT: [[TMP60]] = load float, float* undef, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i64 [[INDVARS_IV]], undef +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP49]] = load float, float* [[A]], align 4 +; CHECK-NEXT: [[TMP60]] = load float, float* [[B]], align 4 +; CHECK-NEXT: store float [[TMP38]], float* [[GEP]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i64 [[IV]], 1000 ; CHECK-NEXT: br i1 [[TMP12]], label [[BB13]], label [[BB74]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: bb74: ; CHECK-NEXT: ret void ; - - bb: br label %bb13 bb13: 
; preds = %bb13, %bb - %tmp37 = phi float [ %tmp60, %bb13 ], [ undef, %bb ] - %tmp27 = phi float [ %tmp49, %bb13 ], [ undef, %bb ] - %indvars.iv = phi i64 [ %indvars.iv.next, %bb13 ], [ 0, %bb ] + %tmp37 = phi float [ %tmp60, %bb13 ], [ 0.0, %bb ] + %tmp27 = phi float [ %tmp49, %bb13 ], [ 1.0, %bb ] + %iv = phi i64 [ %iv.next, %bb13 ], [ 0, %bb ] %tmp38 = fmul fast float %tmp37, %tmp27 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %tmp49 = load float, float* undef, align 4 - %tmp60 = load float, float* undef, align 4 - %tmp12 = icmp slt i64 %indvars.iv, undef + %iv.next = add nuw nsw i64 %iv, 1 + %gep = getelementptr inbounds float, float* %C, i64 %iv + %tmp49 = load float, float* %A, align 4 + %tmp60 = load float, float* %B, align 4 + store float %tmp38, float* %gep + %tmp12 = icmp slt i64 %iv, 1000 br i1 %tmp12, label %bb13, label %bb74 bb74: ; preds = %bb13 From 8ef3e895ad8ab1724e2b87cabad1dacdc7a397a3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 20 Feb 2022 15:59:23 +0000 Subject: [PATCH 365/748] [X86] combineX86ShufflesRecursively - add TODO not to generate temporary nodes Extension to PR45974, unless we actual combine the target shuffles we shouldn't be generating temporary nodes as they may interfere with the one use checks in the shuffle recursions --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e2d37ee917f5e..3e9b51dbcaa0f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38704,6 +38704,8 @@ static SDValue combineX86ShufflesRecursively( // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now? // Widen any subvector shuffle inputs we've collected. + // TODO: Remove this to avoid generating temporary nodes, we should only + // widen once combineX86ShuffleChain has found a match. 
if (any_of(Ops, [RootSizeInBits](SDValue Op) { return Op.getValueSizeInBits() < RootSizeInBits; })) { From 43d48ed22029e92d88c608c55c6c42490ec3a243 Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Mon, 21 Feb 2022 01:39:35 +0800 Subject: [PATCH 366/748] [PowerPC] Add option to disable perfect shuffle Perfect shuffle was introduced into PowerPC backend years ago, and only available in big-endian subtargets. This optimization has good effects in simple cases, but brings serious negative impact in large programs with many shuffle instructions sharing the same mask. Here introduces a temporary backend hidden option to control it until we implemented better way to fix the gap in vectorshuffle decomposition. Reviewed By: jsji Differential Revision: https://reviews.llvm.org/D120072 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 102 +++++++++++--------- 1 file changed, 55 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 35e3f4e697e2d..7910ba899993b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -126,6 +126,11 @@ static cl::opt EnableQuadwordAtomics( cl::desc("enable quadword lock-free atomic operations"), cl::init(false), cl::Hidden); +static cl::opt + DisablePerfectShuffle("ppc-disable-perfect-shuffle", + cl::desc("disable vector permute decomposition"), + cl::init(false), cl::Hidden); + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM"); @@ -10071,56 +10076,59 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // perfect shuffle table to emit an optimal matching sequence. 
ArrayRef PermMask = SVOp->getMask(); - unsigned PFIndexes[4]; - bool isFourElementShuffle = true; - for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number - unsigned EltNo = 8; // Start out undef. - for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. - if (PermMask[i*4+j] < 0) - continue; // Undef, ignore it. - - unsigned ByteSource = PermMask[i*4+j]; - if ((ByteSource & 3) != j) { - isFourElementShuffle = false; - break; - } + if (!DisablePerfectShuffle && !isLittleEndian) { + unsigned PFIndexes[4]; + bool isFourElementShuffle = true; + for (unsigned i = 0; i != 4 && isFourElementShuffle; + ++i) { // Element number + unsigned EltNo = 8; // Start out undef. + for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. + if (PermMask[i * 4 + j] < 0) + continue; // Undef, ignore it. + + unsigned ByteSource = PermMask[i * 4 + j]; + if ((ByteSource & 3) != j) { + isFourElementShuffle = false; + break; + } - if (EltNo == 8) { - EltNo = ByteSource/4; - } else if (EltNo != ByteSource/4) { - isFourElementShuffle = false; - break; + if (EltNo == 8) { + EltNo = ByteSource / 4; + } else if (EltNo != ByteSource / 4) { + isFourElementShuffle = false; + break; + } } + PFIndexes[i] = EltNo; + } + + // If this shuffle can be expressed as a shuffle of 4-byte elements, use the + // perfect shuffle vector to determine if it is cost effective to do this as + // discrete instructions, or whether we should use a vperm. + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle) { + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + // Determining when to avoid vperm is tricky. 
Many things affect the cost + // of vperm, particularly how many times the perm mask needs to be + // computed. For example, if the perm mask can be hoisted out of a loop or + // is already used (perhaps because there are multiple permutes with the + // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the + // permute mask out of the loop requires an extra register. + // + // As a compromise, we only emit discrete instructions if the shuffle can + // be generated in 3 or fewer operations. When we have loop information + // available, if this block is within a loop, we should avoid using vperm + // for 3-operation perms and use a constant pool load instead. + if (Cost < 3) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - PFIndexes[i] = EltNo; - } - - // If this shuffle can be expressed as a shuffle of 4-byte elements, use the - // perfect shuffle vector to determine if it is cost effective to do this as - // discrete instructions, or whether we should use a vperm. - // For now, we skip this for little endian until such time as we have a - // little-endian perfect shuffle table. - if (isFourElementShuffle && !isLittleEndian) { - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = - PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; - - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - // Determining when to avoid vperm is tricky. Many things affect the cost - // of vperm, particularly how many times the perm mask needs to be computed. - // For example, if the perm mask can be hoisted out of a loop or is already - // used (perhaps because there are multiple permutes with the same shuffle - // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of - // the loop requires an extra register. - // - // As a compromise, we only emit discrete instructions if the shuffle can be - // generated in 3 or fewer operations. 
When we have loop information - available, if this block is within a loop, we should avoid using vperm - for 3-operation perms and use a constant pool load instead. - if (Cost < 3) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant From 56bc87322ccca2ae786d493410d9801cf9c87a16 Mon Sep 17 00:00:00 2001 From: Michel Weber Date: Sun, 20 Feb 2022 23:19:09 +0530 Subject: [PATCH 367/748] [MLIR][Presburger] Inequality Typing in coalesce This patch adds typing of inequalities to the simplex. This is a central part of the coalesce algorithm and will be heavily used in later coalesce patches. Currently, only the three most basic types are supported with more to be introduced when they are needed. Reviewed By: arjunp Differential Revision: https://reviews.llvm.org/D119925 --- .../mlir/Analysis/Presburger/Simplex.h | 10 +++++ mlir/lib/Analysis/Presburger/Simplex.cpp | 39 +++++++++++++++---- .../Analysis/Presburger/SimplexTest.cpp | 22 +++++++++++ 3 files changed, 63 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Analysis/Presburger/Simplex.h b/mlir/include/mlir/Analysis/Presburger/Simplex.h index 4f4abc1579cd2..10600064710dc 100644 --- a/mlir/include/mlir/Analysis/Presburger/Simplex.h +++ b/mlir/include/mlir/Analysis/Presburger/Simplex.h @@ -558,6 +558,16 @@ class Simplex : public SimplexBase { /// otherwise. This should only be called for bounded sets. Optional> findIntegerSample(); + enum class IneqType { Redundant, Cut, Separate }; + + /// Returns the type of the inequality with coefficients `coeffs`. + /// + /// Possible types are: + /// Redundant The inequality is satisfied in the polytope + /// Cut The inequality is satisfied by some points, but not by others + /// Separate The inequality is not satisfied by any point + IneqType findIneqType(ArrayRef coeffs); + /// Check if the specified inequality already holds in the polytope. 
bool isRedundantInequality(ArrayRef coeffs); diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp index 5ba213ac9e3b4..285fa91f34a07 100644 --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -1605,7 +1605,7 @@ bool Simplex::isRationalSubsetOf(const IntegerPolyhedron &poly) { return true; for (unsigned i = 0, e = poly.getNumInequalities(); i < e; ++i) - if (!isRedundantInequality(poly.getInequality(i))) + if (findIneqType(poly.getInequality(i)) != IneqType::Redundant) return false; for (unsigned i = 0, e = poly.getNumEqualities(); i < e; ++i) @@ -1615,16 +1615,39 @@ bool Simplex::isRationalSubsetOf(const IntegerPolyhedron &poly) { return true; } -/// Computes the minimum value `coeffs` can take. If the value is greater than -/// or equal to zero, the polytope entirely lies in the half-space defined by -/// `coeffs >= 0`. +/// Returns the type of the inequality with coefficients `coeffs`. +/// Possible types are: +/// Redundant The inequality is satisfied by all points in the polytope +/// Cut The inequality is satisfied by some points, but not by others +/// Separate The inequality is not satisfied by any point +/// +/// Internally, this computes the minimum and the maximum the inequality with +/// coefficients `coeffs` can take. If the minimum is >= 0, the inequality holds +/// for all points in the polytope, so it is redundant. If the minimum is <= 0 +/// and the maximum is >= 0, the points in between the minimum and the +/// inequality do not satisfy it, the points in between the inequality and the +/// maximum satisfy it. Hence, it is a cut inequality. If both are < 0, no +/// points of the polytope satisfy the inequality, which means it is a separate +/// inequality. 
+Simplex::IneqType Simplex::findIneqType(ArrayRef coeffs) { + MaybeOptimum minimum = computeOptimum(Direction::Down, coeffs); + if (minimum.isBounded() && *minimum >= Fraction(0, 1)) { + return IneqType::Redundant; + } + MaybeOptimum maximum = computeOptimum(Direction::Up, coeffs); + if ((!minimum.isBounded() || *minimum <= Fraction(0, 1)) && + (!maximum.isBounded() || *maximum >= Fraction(0, 1))) { + return IneqType::Cut; + } + return IneqType::Separate; +} + +/// Checks whether the type of the inequality with coefficients `coeffs` +/// is Redundant. bool Simplex::isRedundantInequality(ArrayRef coeffs) { assert(!empty && "It is not meaningful to ask about redundancy in an empty set!"); - MaybeOptimum minimum = computeOptimum(Direction::Down, coeffs); - assert(!minimum.isEmpty() && - "Optima should be non-empty for a non-empty set"); - return minimum.isBounded() && *minimum >= Fraction(0, 1); + return findIneqType(coeffs) == IneqType::Redundant; } /// Check whether the equality given by `coeffs == 0` is redundant given diff --git a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp index eb403adb87e0a..fbe68070f39d6 100644 --- a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp +++ b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp @@ -464,6 +464,28 @@ TEST(SimplexTest, isRedundantInequality) { EXPECT_FALSE(simplex.isRedundantInequality({0, 1, -1})); // y >= 1. } +TEST(SimplexTest, ineqType) { + Simplex simplex(2); + simplex.addInequality({0, -1, 2}); // y <= 2. + simplex.addInequality({1, 0, 0}); // x >= 0. + simplex.addEquality({-1, 1, 0}); // y = x. + + EXPECT_TRUE(simplex.findIneqType({-1, 0, 2}) == + Simplex::IneqType::Redundant); // x <= 2. + EXPECT_TRUE(simplex.findIneqType({0, 1, 0}) == + Simplex::IneqType::Redundant); // y >= 0. + + EXPECT_TRUE(simplex.findIneqType({0, 1, -1}) == + Simplex::IneqType::Cut); // y >= 1. 
+ EXPECT_TRUE(simplex.findIneqType({-1, 0, 1}) == + Simplex::IneqType::Cut); // x <= 1. + EXPECT_TRUE(simplex.findIneqType({0, 1, -2}) == + Simplex::IneqType::Cut); // y >= 2. + + EXPECT_TRUE(simplex.findIneqType({-1, 0, -1}) == + Simplex::IneqType::Separate); // x <= -1. +} + TEST(SimplexTest, isRedundantEquality) { Simplex simplex(2); simplex.addInequality({0, -1, 2}); // y <= 2. From 35b92c1464ad5b79c83a1982849c64b6ca261377 Mon Sep 17 00:00:00 2001 From: Nuno Lopes Date: Sun, 20 Feb 2022 18:43:30 +0000 Subject: [PATCH 368/748] [docs] fix typo --- llvm/docs/Frontend/PerformanceTips.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/Frontend/PerformanceTips.rst b/llvm/docs/Frontend/PerformanceTips.rst index e4a3bc034b471..dfa3ccdd1ddd1 100644 --- a/llvm/docs/Frontend/PerformanceTips.rst +++ b/llvm/docs/Frontend/PerformanceTips.rst @@ -231,7 +231,7 @@ Undefined Values #. Use poison values instead of undef values whenever possible. -#. Tag function attributes with the noundef attribute whenever possible. +#. Tag function parameters with the noundef attribute whenever possible. 
Modeling Memory Effects ^^^^^^^^^^^^^^^^^^^^^^^^ From ec910751fe5c4a0e0fe4e232da0f8bed307bf4e8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 20 Feb 2022 18:50:28 +0000 Subject: [PATCH 369/748] [X86] combineX86ShufflesRecursively - attempt to fold ISD::EXTRACT_SUBVECTOR into a shuffle chain Peek through if we're extracting a non-zero'th subvector in an attempt to fold the extract into a lane-crossing shuffle This also exposes a failure to fold extract_subvector(movddup(x),c) -> movddup(extract_subvector(x,c)) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 47 +++++-- llvm/test/CodeGen/X86/avx512-hadd-hsub.ll | 5 +- .../copy-low-subvec-elt-to-high-subvec-elt.ll | 36 ++++-- .../CodeGen/X86/vector-shuffle-256-v16.ll | 32 ++++- .../CodeGen/X86/vector-shuffle-256-v32.ll | 34 ++++- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 120 +++++++++++++++--- .../CodeGen/X86/vector-shuffle-512-v32.ll | 4 +- 7 files changed, 220 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3e9b51dbcaa0f..c372919f44f70 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38424,16 +38424,27 @@ static SDValue combineX86ShufflesRecursively( APInt OpUndef, OpZero; APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); - if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, - OpZero, DAG, Depth, false)) - return SDValue(); - - // Shuffle inputs must not be larger than the shuffle result. - // TODO: Relax this for single input faux shuffles (trunc/extract_subvector). - if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { - return OpInput.getValueSizeInBits() > VT.getSizeInBits(); - })) + if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, + OpZero, DAG, Depth, false)) { + // Shuffle inputs must not be larger than the shuffle result. 
+ // TODO: Relax this for single input faux shuffles (e.g. trunc). + if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { + return OpInput.getValueSizeInBits() > VT.getSizeInBits(); + })) + return SDValue(); + } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 && + !isNullConstant(Op.getOperand(1))) { + SDValue SrcVec = Op.getOperand(0); + int ExtractIdx = Op.getConstantOperandVal(1); + unsigned NumElts = VT.getVectorNumElements(); + OpInputs.assign({SrcVec}); + OpMask.assign(NumElts, SM_SentinelUndef); + std::iota(OpMask.begin(), OpMask.end(), ExtractIdx); + OpZero = OpUndef = APInt::getNullValue(NumElts); + } else { return SDValue(); + } // If the shuffle result was smaller than the root, we need to adjust the // mask indices and pad the mask with undefs. @@ -53436,8 +53447,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. unsigned InOpcode = InVec.getOpcode(); - if (IdxVal == 0 && InVec.hasOneUse()) { - if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { + if (InVec.hasOneUse()) { + if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). 
if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { @@ -53454,7 +53465,8 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); } } - if ((InOpcode == ISD::ANY_EXTEND || + if (IdxVal == 0 && + (InOpcode == ISD::ANY_EXTEND || InOpcode == ISD::ANY_EXTEND_VECTOR_INREG || InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || @@ -53469,7 +53481,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); return DAG.getNode(ExtOp, DL, VT, Ext); } - if (InOpcode == ISD::VSELECT && + if (IdxVal == 0 && InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && InVec.getOperand(1).getValueType().is256BitVector() && InVec.getOperand(2).getValueType().is256BitVector()) { @@ -53479,7 +53491,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } - if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && + if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && (VT.is128BitVector() || VT.is256BitVector())) { SDLoc DL(N); SDValue InVecSrc = InVec.getOperand(0); @@ -53487,6 +53499,13 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); return DAG.getNode(InOpcode, DL, VT, Ext); } + if (InOpcode == X86ISD::MOVDDUP && + (VT.is128BitVector() || VT.is256BitVector())) { + SDLoc DL(N); + SDValue Ext0 = + extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); + return DAG.getNode(InOpcode, DL, VT, Ext0); + } } // Always split vXi64 logical shifts where we're extracting the upper 32-bits diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll index 
9e6c8b8becbc9..85266a7a682cd 100644 --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -223,9 +223,8 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) { ; SKX-LABEL: fsub_noundef_ee: ; SKX: # %bb.0: ; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0 -; SKX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; SKX-NEXT: vsubpd %xmm0, %xmm1, %xmm0 -; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll index e624442020244..db7b8558a096c 100644 --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -439,12 +439,18 @@ define <8 x float> @vec256_eltty_float_source_subvec_1_target_subvec_mask_2_unar } define <8 x float> @vec256_eltty_float_source_subvec_1_target_subvec_mask_2_binary(<8 x float> %x, <8 x float> %y) nounwind { -; CHECK-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_2_binary: -; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vbroadcastss %xmm1, %ymm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; CHECK-NEXT: retq +; CHECK-SLOW-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_2_binary: +; CHECK-SLOW: # %bb.0: +; CHECK-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; CHECK-SLOW-NEXT: vbroadcastss %xmm1, %ymm1 +; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; CHECK-SLOW-NEXT: retq +; +; CHECK-FAST-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_2_binary: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-FAST-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; CHECK-FAST-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> ret <8 x float> %r } @@ -562,12 +568,18 @@ define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_unary(<8 } define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_binary(<8 x i32> %x, <8 x i32> %y) nounwind { -; CHECK-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_binary: -; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vbroadcastss %xmm1, %ymm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; CHECK-NEXT: retq +; CHECK-SLOW-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_binary: +; CHECK-SLOW: # %bb.0: +; CHECK-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; CHECK-SLOW-NEXT: vbroadcastss %xmm1, %ymm1 +; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; CHECK-SLOW-NEXT: retq +; +; CHECK-FAST-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_binary: +; CHECK-FAST: # %bb.0: +; CHECK-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; CHECK-FAST-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> ret <8 x i32> %r } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 2e7e59ab456e0..63bc2331d4f7d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -4,8 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-ALL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-PERLANE ; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST,AVX512VL-FAST-CROSSLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST,AVX512VL-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2 @@ -6826,11 +6826,29 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512VL-SLOW-NEXT: retq +; +; 
AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; XOPAVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index d6731f851d0bf..73d94b208b6ea 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4555,11 +4555,35 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: 
shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VLVBMI-SLOW-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX512VLVBMI-FAST-ALL: # %bb.0: +; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-FAST-ALL-NEXT: retq +; +; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX512VLVBMI-FAST-PERLANE: # %bb.0: +; AVX512VLVBMI-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VLVBMI-FAST-PERLANE-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512VLVBMI-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 96d25cc66cb07..2bc2f9e91c8d3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1466,11 +1466,41 @@ define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_44444444: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_44444444: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss 
%xmm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: shuffle_v8f32_44444444: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_44444444: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_44444444: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-SLOW-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_44444444: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_44444444: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle } @@ -3085,11 +3115,41 @@ define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_44444444: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_44444444: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_44444444: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; 
AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_44444444: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_44444444: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-SLOW-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_44444444: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_44444444: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle } @@ -3101,11 +3161,41 @@ define <8 x i32> @shuffle_v8i32_44444444_bc(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_44444444_bc: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_44444444_bc: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: shuffle_v8i32_44444444_bc: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_44444444_bc: +; AVX2-FAST-PERLANE: # %bb.0: +; 
AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_44444444_bc: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-SLOW-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_44444444_bc: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_44444444_bc: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: retq %tmp0 = bitcast <8 x float> %a to <8 x i32> %tmp1 = bitcast <8 x float> %b to <8 x i32> %shuffle = shufflevector <8 x i32> %tmp0, <8 x i32> %tmp1, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index 2b76d668f5fe2..08b56a32f3347 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -29,8 +29,8 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0 ; ; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: ; SKX: ## %bb.0: -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> ret <32 x i16> %c From 053c2a0020577f7dfbaecf43347b88adf8dc047c Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Sun, 20 Feb 2022 11:29:54 -0800 Subject: [PATCH 370/748] 
[SimplifyCFG][OpaquePtr] Check store type when merging conditional store --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 4 ++- .../test/Transforms/SimplifyCFG/opaque-ptr.ll | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index dbf22ab1e2298..88a7e12a688af 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3732,7 +3732,9 @@ static bool mergeConditionalStoreToAddress( return false; // Now check the stores are compatible. - if (!QStore->isUnordered() || !PStore->isUnordered()) + if (!QStore->isUnordered() || !PStore->isUnordered() || + PStore->getValueOperand()->getType() != + QStore->getValueOperand()->getType()) return false; // Check that sinking the store won't cause program behavior changes. Sinking diff --git a/llvm/test/Transforms/SimplifyCFG/opaque-ptr.ll b/llvm/test/Transforms/SimplifyCFG/opaque-ptr.ll index 909346f90d99a..d63fdd7838574 100644 --- a/llvm/test/Transforms/SimplifyCFG/opaque-ptr.ll +++ b/llvm/test/Transforms/SimplifyCFG/opaque-ptr.ll @@ -44,3 +44,36 @@ join: %phi = phi ptr [ %gep1, %if ], [ %gep2, %else] ret ptr %phi } + +define void @test_cond_store_merge(i1 %arg, i1 %arg2, ptr %p) { +; CHECK-LABEL: @test_cond_store_merge( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i64 0, ptr [[P:%.*]], align 32 +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: br i1 [[ARG2:%.*]], label [[BB4:%.*]], label [[BB5:%.*]] +; CHECK: bb4: +; CHECK-NEXT: store double 0.000000e+00, ptr [[P]], align 32 +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: ret void +; +bb: + br i1 %arg, label %bb2, label %bb3 + +bb2: ; preds = %bb + store i64 0, ptr %p, align 32 + br label %bb3 + +bb3: ; preds = %bb2, %bb + br i1 %arg2, label %bb4, label %bb5 + +bb4: ; preds = %bb3 + store double 0.000000e+00, 
ptr %p, align 32 + br label %bb5 + +bb5: ; preds = %bb4, %bb3 + ret void +} From eb5950666b7c162b8b6cc0142bfc85aeccedd195 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Sun, 20 Feb 2022 11:34:48 -0800 Subject: [PATCH 371/748] [libcxx][test] Update msvc_stdlib_force_include.h for C++23 Make distinct `TEST_STD_VER` values for C++20 and C++23; add C++23 deprecation suppression. Fixes #53597 --- libcxx/test/support/msvc_stdlib_force_include.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h index dedd5d3ef8921..a5ed33b3f7319 100644 --- a/libcxx/test/support/msvc_stdlib_force_include.h +++ b/libcxx/test/support/msvc_stdlib_force_include.h @@ -69,18 +69,21 @@ const AssertionDialogAvoider assertion_dialog_avoider{}; // Restore features that are removed in C++20. #define _HAS_FEATURES_REMOVED_IN_CXX20 1 - // Silence warnings about features that are deprecated in C++17 and C++20. + // Silence warnings about features that are deprecated in non-default language modes. #define _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS #define _SILENCE_ALL_CXX20_DEPRECATION_WARNINGS + #define _SILENCE_ALL_CXX23_DEPRECATION_WARNINGS #endif // _LIBCXX_IN_DEVCRT #include -#if _HAS_CXX20 +#if _HAS_CXX23 #define TEST_STD_VER 99 +#elif _HAS_CXX20 + #define TEST_STD_VER 20 #elif _HAS_CXX17 #define TEST_STD_VER 17 -#else // !(_HAS_CXX20 || _HAS_CXX17) +#else #define TEST_STD_VER 14 #endif From 8d894270a676a97952fabe824d0ac160aa222450 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Sun, 20 Feb 2022 19:42:07 +0000 Subject: [PATCH 372/748] [docs] Update ReleaseNotes template This change makes several updates to the ReleaseNotes template: * Orders the backend/target updates alphabetically * Adds RISC-V to the list * Uses "Backend" rather than a mix of "Target" and "Backend" (I don't have a strong view on which term is used, but we should be consistent!) * Uses * ... 
as the placeholder text, as this matches the format actually used for most updates in recent releases Differential Revision: https://reviews.llvm.org/D120043 --- llvm/docs/ReleaseNotes.rst | 41 +++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 2f840a5f4c91c..8cae7c77b8719 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -67,6 +67,11 @@ Changes to TableGen Changes to the AArch64 Backend ------------------------------ +Changes to the AMDGPU Backend +----------------------------- + +* ... + Changes to the ARM Backend -------------------------- @@ -79,40 +84,40 @@ Changes to the ARM Backend Previously it was on by default for Armv8 and off for all other architecture versions. -Changes to the MIPS Target +Changes to the AVR Backend -------------------------- -During this release ... +* ... -Changes to the Hexagon Target ------------------------------ +Changes to the Hexagon Backend +------------------------------ * ... -Changes to the PowerPC Target ------------------------------ +Changes to the MIPS Backend +--------------------------- -During this release ... +* ... -Changes to the X86 Target -------------------------- +Changes to the PowerPC Backend +------------------------------ -During this release ... +* ... -Changes to the AMDGPU Target +Changes to the RISC-V Backend ----------------------------- -During this release ... +* ... -Changes to the AVR Target ------------------------------ +Changes to the WebAssembly Backend +---------------------------------- -During this release ... +* ... -Changes to the WebAssembly Target ---------------------------------- +Changes to the X86 Backend +-------------------------- -During this release ... +* ... 
Changes to the OCaml bindings ----------------------------- From 7f827ebddc38dbe7c3d7a331d3ad5a95647ff25d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 18 Feb 2022 17:01:37 -0500 Subject: [PATCH 373/748] [AArch64][RISCV][x86] add tests for mul-add demanded bits; NFC See #53829 --- llvm/test/CodeGen/AArch64/mul_pow2.ll | 45 +++++++++++++++++++++++ llvm/test/CodeGen/RISCV/mul.ll | 51 +++++++++++++++++++++++++++ llvm/test/CodeGen/X86/mul-demand.ll | 28 +++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 llvm/test/CodeGen/X86/mul-demand.ll diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll index 59ac56f34aa24..31ff289b7a2f4 100644 --- a/llvm/test/CodeGen/AArch64/mul_pow2.ll +++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll @@ -700,3 +700,48 @@ define i32 @ntest16(i32 %x) { %mul = mul nsw i32 %x, -16 ret i32 %mul } + +define i32 @muladd_demand(i32 %x, i32 %y) { +; CHECK-LABEL: muladd_demand: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #131008 +; CHECK-NEXT: madd w8, w0, w8, w1 +; CHECK-NEXT: and w0, w8, #0x1ffc0 +; CHECK-NEXT: ret +; +; GISEL-LABEL: muladd_demand: +; GISEL: // %bb.0: +; GISEL-NEXT: mov w8, #131008 +; GISEL-NEXT: madd w8, w0, w8, w1 +; GISEL-NEXT: and w0, w8, #0x1ffc0 +; GISEL-NEXT: ret + %m = mul i32 %x, 131008 ; 0x0001ffc0 + %a = add i32 %y, %m + %r = and i32 %a, 131008 + ret i32 %r +} + +define <4 x i32> @muladd_demand_commute(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: muladd_demand_commute: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #131008 +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: movi v0.4s, #1, msl #16 +; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: ret +; +; GISEL-LABEL: muladd_demand_commute: +; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, .LCPI42_1 +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI42_1] +; GISEL-NEXT: adrp x8, .LCPI42_0 +; GISEL-NEXT: mla v1.4s, v0.4s, v2.4s +; GISEL-NEXT: ldr q0, [x8, :lo12:.LCPI42_0] +; GISEL-NEXT: and v0.16b, 
v1.16b, v0.16b +; GISEL-NEXT: ret + %m = mul <4 x i32> %x, + %a = add <4 x i32> %m, %y + %r = and <4 x i32> %a, + ret <4 x i32> %r +} diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index 3884c67d399da..ad720808c1b5d 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1547,3 +1547,54 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind { ret i64 %5 } +define i8 @muladd_demand(i8 %x, i8 %y) nounwind { +; RV32I-LABEL: muladd_demand: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: li a1, 14 +; RV32I-NEXT: call __mulsi3@plt +; RV32I-NEXT: add a0, s0, a0 +; RV32I-NEXT: andi a0, a0, 15 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: muladd_demand: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a2, 14 +; RV32IM-NEXT: mul a0, a0, a2 +; RV32IM-NEXT: add a0, a1, a0 +; RV32IM-NEXT: andi a0, a0, 15 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muladd_demand: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: li a1, 14 +; RV64I-NEXT: call __muldi3@plt +; RV64I-NEXT: addw a0, s0, a0 +; RV64I-NEXT: andi a0, a0, 15 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muladd_demand: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a2, 14 +; RV64IM-NEXT: mulw a0, a0, a2 +; RV64IM-NEXT: addw a0, a1, a0 +; RV64IM-NEXT: andi a0, a0, 15 +; RV64IM-NEXT: ret + %m = mul i8 %x, 14 + %a = add i8 %y, %m + %r = and i8 %a, 15 + ret i8 %r +} diff --git a/llvm/test/CodeGen/X86/mul-demand.ll b/llvm/test/CodeGen/X86/mul-demand.ll new file mode 
100644 index 0000000000000..0af5cb3e7a8e0 --- /dev/null +++ b/llvm/test/CodeGen/X86/mul-demand.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s + +define i64 @muladd_demand(i64 %x, i64 %y) { +; CHECK-LABEL: muladd_demand: +; CHECK: # %bb.0: +; CHECK-NEXT: imull $131008, %edi, %eax # imm = 0x1FFC0 +; CHECK-NEXT: addl %esi, %eax +; CHECK-NEXT: shlq $47, %rax +; CHECK-NEXT: retq + %m = mul i64 %x, 131008 ; 0x0001ffc0 + %a = add i64 %m, %y + %r = shl i64 %a, 47 + ret i64 %r +} + +define <2 x i64> @muladd_demand_commute(<2 x i64> %x, <2 x i64> %y) { +; CHECK-LABEL: muladd_demand_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: paddq %xmm1, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: retq + %m = mul <2 x i64> %x, + %a = add <2 x i64> %y, %m + %r = and <2 x i64> %a, + ret <2 x i64> %r +} From 6694491affa182e9a004411fbddc863ed038d75b Mon Sep 17 00:00:00 2001 From: Yannic Bonenberger Date: Sun, 20 Feb 2022 12:32:38 -0800 Subject: [PATCH 374/748] [llvm] Add missind dep on Symbolize to Debuginfod `llvm/Debuginfod/DIFetcher.h` imports `llvm/DebugInfo/Symbolize/DIFetcher.h`, so there should be a dependency on Symbolize. 
Reviewed By: #debug-info, dblaikie, phosek Differential Revision: https://reviews.llvm.org/D119626 --- llvm/lib/Debuginfod/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Debuginfod/CMakeLists.txt b/llvm/lib/Debuginfod/CMakeLists.txt index 67e2c2d07aeb7..be8965c9b2e43 100644 --- a/llvm/lib/Debuginfod/CMakeLists.txt +++ b/llvm/lib/Debuginfod/CMakeLists.txt @@ -18,4 +18,5 @@ add_llvm_library(LLVMDebuginfod LINK_COMPONENTS Support + Symbolize ) From d0505201c468ec9ed7f738af0ae3da58faf24a0f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 20 Feb 2022 20:42:31 +0000 Subject: [PATCH 375/748] [X86] Regenerate switch-default-only.ll --- llvm/test/CodeGen/X86/switch-default-only.ll | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/X86/switch-default-only.ll b/llvm/test/CodeGen/X86/switch-default-only.ll index 4310e40b57a6d..ac40fbbe389de 100644 --- a/llvm/test/CodeGen/X86/switch-default-only.ll +++ b/llvm/test/CodeGen/X86/switch-default-only.ll @@ -1,12 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 -fast-isel=false -mtriple=i686-- < %s | FileCheck %s ; No need for branching when the default and only destination follows ; immediately after the switch. -; CHECK-LABEL: no_branch: -; CHECK-NOT: jmp -; CHECK: ret - define void @no_branch(i32 %x) { +; CHECK-LABEL: no_branch: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: retl entry: switch i32 %x, label %exit [ ] exit: From d9567babef302cfd7e827df64138151ba2614b83 Mon Sep 17 00:00:00 2001 From: Luis Penagos Date: Sun, 20 Feb 2022 21:35:47 +0100 Subject: [PATCH 376/748] Fix extraneous whitespace addition in line comments on clang-format directives Fixes https://github.com/llvm/llvm-project/issues/53844. I believe this regression was caused by not accounting for clang-format directives in https://reviews.llvm.org/D92257. 
Reviewed By: HazardyKnusperkeks, curdeius Differential Revision: https://reviews.llvm.org/D120188 --- clang/lib/Format/BreakableToken.cpp | 11 +++++++---- clang/unittests/Format/FormatTestComments.cpp | 5 +++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 5138c7cd42cc6..967ddeb82383a 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -815,10 +815,13 @@ BreakableLineCommentSection::BreakableLineCommentSection( assert(Lines[i].size() > IndentPrefix.size()); const auto FirstNonSpace = Lines[i][IndentPrefix.size()]; - const auto AllowsSpaceChange = - SpacesInPrefix != 0 || - (!NoSpaceBeforeFirstCommentChar() || - (FirstNonSpace == '}' && FirstLineSpaceChange != 0)); + const bool IsFormatComment = LineTok && switchesFormatting(*LineTok); + const bool LineRequiresLeadingSpace = + !NoSpaceBeforeFirstCommentChar() || + (FirstNonSpace == '}' && FirstLineSpaceChange != 0); + const bool AllowsSpaceChange = + !IsFormatComment && + (SpacesInPrefix != 0 || LineRequiresLeadingSpace); if (PrefixSpaceChange[i] > 0 && AllowsSpaceChange) { Prefix[i] = IndentPrefix.str(); diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index c988a2869e568..f83ffb393ac2f 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -91,6 +91,11 @@ TEST_F(FormatTestComments, UnderstandsSingleLineComments) { "// line 2\n" "void f() {}\n"); + EXPECT_EQ("// comment\n" + "// clang-format on\n", + format("//comment\n" + "// clang-format on\n")); + verifyFormat("void f() {\n" " // Doesn't do anything\n" "}"); From e021987273bece6e94bc6f43b6b5232de10637c8 Mon Sep 17 00:00:00 2001 From: Marek Kurdej Date: Fri, 18 Feb 2022 18:24:14 +0100 Subject: [PATCH 377/748] [clang-format] Avoid inserting space after C++ casts. 
Fixes https://github.com/llvm/llvm-project/issues/53876. This is a solution for standard C++ casts: const_cast, dynamic_cast, reinterpret_cast, static_cast. A general approach handling all possible casts is not possible without semantic information. Consider the code: ``` static_cast(*function_pointer_variable)(arguments); ``` vs. ``` some_return_type (*function_pointer_variable)(parameters); // Later used as: function_pointer_variable = &some_function; return function_pointer_variable(args); ``` In the latter case, it's not a cast but a variable declaration of a pointer to function. Without knowing what `some_return_type` is (and clang-format does not know it), it's hard to distinguish between the two cases. Theoretically, one could check whether "parameters" are types (not a cast) and "arguments" are value/expressions (a cast), but that might be inefficient (needs lots of lookahead). Reviewed By: MyDeveloperDay, HazardyKnusperkeks, owenpan Differential Revision: https://reviews.llvm.org/D120140 --- clang/lib/Format/TokenAnnotator.cpp | 19 +++++++++++++++++-- clang/unittests/Format/FormatTest.cpp | 7 +++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 9a020eb6ca7dc..51e7c32b7d4c7 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1734,8 +1734,11 @@ class AnnotatingParser { else Current.setType(TT_LineComment); } else if (Current.is(tok::r_paren)) { - if (rParenEndsCast(Current)) + if (rParenEndsCast(Current)) { Current.setType(TT_CastRParen); + assert(Current.MatchingParen); + Current.MatchingParen->setType(TT_Unknown); + } if (Current.MatchingParen && Current.Next && !Current.Next->isBinaryOperator() && !Current.Next->isOneOf(tok::semi, tok::colon, tok::l_brace, @@ -1938,8 +1941,20 @@ class AnnotatingParser { // Certain other tokens right before the parentheses are also signals that // this cannot be a cast. 
+ if (LeftOfParens->is(TT_TemplateCloser)) { + if (LeftOfParens->MatchingParen) { + auto *Prev = LeftOfParens->MatchingParen->getPreviousNonComment(); + if (Prev && + Prev->isOneOf(tok::kw_const_cast, tok::kw_dynamic_cast, + tok::kw_reinterpret_cast, tok::kw_static_cast)) + // FIXME: Maybe we should handle identifiers ending with "_cast", + // e.g. any_cast? + return true; + } + return false; + } if (LeftOfParens->isOneOf(tok::at, tok::r_square, TT_OverloadedOperator, - TT_TemplateCloser, tok::ellipsis)) + tok::ellipsis)) return false; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index f71f8dc5de456..d45146d5242fb 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -10565,6 +10565,13 @@ TEST_F(FormatTest, FormatsBinaryOperatorsPrecedingEquals) { TEST_F(FormatTest, FormatsCasts) { verifyFormat("Type *A = static_cast(P);"); + verifyFormat("static_cast(P);"); + verifyFormat("static_cast(Fun)(Args);"); + verifyFormat("static_cast(*Fun)(Args);"); + verifyFormat("a = static_cast(*Fun)(Args);"); + verifyFormat("const_cast(*Fun)(Args);"); + verifyFormat("dynamic_cast(*Fun)(Args);"); + verifyFormat("reinterpret_cast(*Fun)(Args);"); verifyFormat("Type *A = (Type *)P;"); verifyFormat("Type *A = (vector)P;"); verifyFormat("int a = (int)(2.0f);"); From 4701bcae974704a336ac8e111d5b104f4834099c Mon Sep 17 00:00:00 2001 From: Marek Kurdej Date: Sun, 20 Feb 2022 22:18:16 +0100 Subject: [PATCH 378/748] Revert "[clang-format] Avoid inserting space after C++ casts." This reverts commit e021987273bece6e94bc6f43b6b5232de10637c8. This commit provokes failures in formatting tests of polly. Cf. https://lab.llvm.org/buildbot/#/builders/205/builds/3320. That's probably because of `)` being annotated as `CastRParen` instead of `Unknown` before, hence being kept on the same line with the next token. 
--- clang/lib/Format/TokenAnnotator.cpp | 19 ++----------------- clang/unittests/Format/FormatTest.cpp | 7 ------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 51e7c32b7d4c7..9a020eb6ca7dc 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1734,11 +1734,8 @@ class AnnotatingParser { else Current.setType(TT_LineComment); } else if (Current.is(tok::r_paren)) { - if (rParenEndsCast(Current)) { + if (rParenEndsCast(Current)) Current.setType(TT_CastRParen); - assert(Current.MatchingParen); - Current.MatchingParen->setType(TT_Unknown); - } if (Current.MatchingParen && Current.Next && !Current.Next->isBinaryOperator() && !Current.Next->isOneOf(tok::semi, tok::colon, tok::l_brace, @@ -1941,20 +1938,8 @@ class AnnotatingParser { // Certain other tokens right before the parentheses are also signals that // this cannot be a cast. - if (LeftOfParens->is(TT_TemplateCloser)) { - if (LeftOfParens->MatchingParen) { - auto *Prev = LeftOfParens->MatchingParen->getPreviousNonComment(); - if (Prev && - Prev->isOneOf(tok::kw_const_cast, tok::kw_dynamic_cast, - tok::kw_reinterpret_cast, tok::kw_static_cast)) - // FIXME: Maybe we should handle identifiers ending with "_cast", - // e.g. any_cast? 
- return true; - } - return false; - } if (LeftOfParens->isOneOf(tok::at, tok::r_square, TT_OverloadedOperator, - tok::ellipsis)) + TT_TemplateCloser, tok::ellipsis)) return false; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index d45146d5242fb..f71f8dc5de456 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -10565,13 +10565,6 @@ TEST_F(FormatTest, FormatsBinaryOperatorsPrecedingEquals) { TEST_F(FormatTest, FormatsCasts) { verifyFormat("Type *A = static_cast(P);"); - verifyFormat("static_cast(P);"); - verifyFormat("static_cast(Fun)(Args);"); - verifyFormat("static_cast(*Fun)(Args);"); - verifyFormat("a = static_cast(*Fun)(Args);"); - verifyFormat("const_cast(*Fun)(Args);"); - verifyFormat("dynamic_cast(*Fun)(Args);"); - verifyFormat("reinterpret_cast(*Fun)(Args);"); verifyFormat("Type *A = (Type *)P;"); verifyFormat("Type *A = (vector)P;"); verifyFormat("int a = (int)(2.0f);"); From be9a7fdd6a8aec669bcb1f6a68087ab4a70ddb2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Tue, 15 Feb 2022 22:59:23 +0100 Subject: [PATCH 379/748] [clang-format] Fixed handling of requires clauses followed by attributes Fixes https://github.com/llvm/llvm-project/issues/53820. 
Differential Revision: https://reviews.llvm.org/D119893 --- clang/lib/Format/UnwrappedLineParser.cpp | 32 +++++++++++--- clang/unittests/Format/FormatTest.cpp | 12 +++++- clang/unittests/Format/TokenAnnotatorTest.cpp | 43 +++++++++++++++++++ 3 files changed, 80 insertions(+), 7 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index c1bd45beb7b37..4c5ab5346b7dd 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/raw_ostream.h" #include +#include #define DEBUG_TYPE "format-parser" @@ -3007,7 +3008,16 @@ void UnwrappedLineParser::parseRequiresExpression(FormatToken *RequiresToken) { /// clause. It returns, when the parsing is complete, or the expression is /// incorrect. void UnwrappedLineParser::parseConstraintExpression() { + // The special handling for lambdas is needed since tryToParseLambda() eats a + // token and if a requires expression is the last part of a requires clause + // and followed by an attribute like [[nodiscard]] the ClosesRequiresClause is + // not set on the correct token. Thus we need to be aware if we even expect a + // lambda to be possible. + // template requires requires { ... 
} [[nodiscard]] ...; + bool LambdaNextTimeAllowed = true; do { + bool LambdaThisTimeAllowed = std::exchange(LambdaNextTimeAllowed, false); + switch (FormatTok->Tok.getKind()) { case tok::kw_requires: { auto RequiresToken = FormatTok; @@ -3021,7 +3031,7 @@ void UnwrappedLineParser::parseConstraintExpression() { break; case tok::l_square: - if (!tryToParseLambda()) + if (!LambdaThisTimeAllowed || !tryToParseLambda()) return; break; @@ -3064,10 +3074,15 @@ void UnwrappedLineParser::parseConstraintExpression() { case tok::pipepipe: FormatTok->setType(TT_BinaryOperator); nextToken(); + LambdaNextTimeAllowed = true; + break; + + case tok::comma: + case tok::comment: + LambdaNextTimeAllowed = LambdaThisTimeAllowed; + nextToken(); break; - case tok::kw_true: - case tok::kw_false: case tok::kw_sizeof: case tok::greater: case tok::greaterequal: @@ -3082,11 +3097,16 @@ void UnwrappedLineParser::parseConstraintExpression() { case tok::minus: case tok::star: case tok::slash: - case tok::numeric_constant: case tok::kw_decltype: - case tok::comment: - case tok::comma: + LambdaNextTimeAllowed = true; + // Just eat them. + nextToken(); + break; + + case tok::numeric_constant: case tok::coloncolon: + case tok::kw_true: + case tok::kw_false: // Just eat them. 
nextToken(); break; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index f71f8dc5de456..f6810766d83db 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -23784,7 +23784,7 @@ TEST_F(FormatTest, Concepts) { "concept C = [] && requires(T t) { typename T::size_type; };"); } -TEST_F(FormatTest, RequiresClauses) { +TEST_F(FormatTest, RequiresClausesPositions) { auto Style = getLLVMStyle(); EXPECT_EQ(Style.RequiresClausePosition, FormatStyle::RCPS_OwnLine); EXPECT_EQ(Style.IndentRequiresClause, true); @@ -24007,6 +24007,16 @@ TEST_F(FormatTest, RequiresClauses) { ColumnStyle); } +TEST_F(FormatTest, RequiresClauses) { + verifyFormat("struct [[nodiscard]] zero_t {\n" + " template \n" + " requires requires { number_zero_v; }\n" + " [[nodiscard]] constexpr operator T() const {\n" + " return number_zero_v;\n" + " }\n" + "};"); +} + TEST_F(FormatTest, StatementAttributeLikeMacros) { FormatStyle Style = getLLVMStyle(); StringRef Source = "void Foo::slot() {\n" diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 0abe533dd5fc3..e7bc26b5a9b54 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -232,6 +232,20 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) { "Namespace::Outer::Inner::Constant) {}"); ASSERT_EQ(Tokens.size(), 24u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::kw_requires, TT_RequiresClause); + + Tokens = annotate("struct [[nodiscard]] zero_t {\n" + " template\n" + " requires requires { number_zero_v; }\n" + " [[nodiscard]] constexpr operator T() const { " + "return number_zero_v; }\n" + "};"); + ASSERT_EQ(Tokens.size(), 44u); + EXPECT_TOKEN(Tokens[13], tok::kw_requires, TT_RequiresClause); + EXPECT_TOKEN(Tokens[14], tok::kw_requires, TT_RequiresExpression); + EXPECT_TOKEN(Tokens[15], tok::l_brace, TT_RequiresExpressionLBrace); + 
EXPECT_TOKEN(Tokens[21], tok::r_brace, TT_Unknown); + EXPECT_EQ(Tokens[21]->MatchingParen, Tokens[15]); + EXPECT_TRUE(Tokens[21]->ClosesRequiresClause); } TEST_F(TokenAnnotatorTest, UnderstandsRequiresExpressions) { @@ -507,6 +521,35 @@ TEST_F(TokenAnnotatorTest, RequiresDoesNotChangeParsingOfTheRest) { NumberOfAdditionalRequiresClauseTokens = 14u; NumberOfTokensBeforeRequires = 5u; + ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; + ASSERT_EQ(ConstrainedTokens.size(), + NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) + << ConstrainedTokens; + + for (auto I = 0u; I < NumberOfBaseTokens; ++I) + if (I < NumberOfTokensBeforeRequires) + EXPECT_EQ(*BaseTokens[I], *ConstrainedTokens[I]) << I; + else + EXPECT_EQ(*BaseTokens[I], + *ConstrainedTokens[I + NumberOfAdditionalRequiresClauseTokens]) + << I; + + BaseTokens = annotate("struct [[nodiscard]] zero_t {\n" + " template\n" + " [[nodiscard]] constexpr operator T() const { " + "return number_zero_v; }\n" + "};"); + + ConstrainedTokens = annotate("struct [[nodiscard]] zero_t {\n" + " template\n" + " requires requires { number_zero_v; }\n" + " [[nodiscard]] constexpr operator T() const { " + "return number_zero_v; }\n" + "};"); + NumberOfBaseTokens = 35u; + NumberOfAdditionalRequiresClauseTokens = 9u; + NumberOfTokensBeforeRequires = 13u; + ASSERT_EQ(BaseTokens.size(), NumberOfBaseTokens) << BaseTokens; ASSERT_EQ(ConstrainedTokens.size(), NumberOfBaseTokens + NumberOfAdditionalRequiresClauseTokens) From 9b139923bc6634c2d1667c54000debe00e7858f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Sun, 2 Jan 2022 21:42:08 +0100 Subject: [PATCH 380/748] [clang-format][NFC] Return early in ContinuationIndenter::mustBreak We can return as early as possible and only calculate IsComparison if we really need to. Also cache getPrecedence() instead of querying it at most 4 times. 
Differential Revision: https://reviews.llvm.org/D119923 --- clang/lib/Format/ContinuationIndenter.cpp | 45 +++++++++++++---------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index a49e0f307cef1..f4a755268eae8 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -448,26 +448,31 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { // current style uses wrapping before or after operators for the given // operator. if (Previous.is(TT_BinaryOperator) && Current.CanBreakBefore) { - // If we need to break somewhere inside the LHS of a binary expression, we - // should also break after the operator. Otherwise, the formatting would - // hide the operator precedence, e.g. in: - // if (aaaaaaaaaaaaaa == - // bbbbbbbbbbbbbb && c) {.. - // For comparisons, we only apply this rule, if the LHS is a binary - // expression itself as otherwise, the line breaks seem superfluous. - // We need special cases for ">>" which we have split into two ">" while - // lexing in order to make template parsing easier. - bool IsComparison = (Previous.getPrecedence() == prec::Relational || - Previous.getPrecedence() == prec::Equality || - Previous.getPrecedence() == prec::Spaceship) && - Previous.Previous && - Previous.Previous->isNot(TT_BinaryOperator); // For >>. 
- bool LHSIsBinaryExpr = - Previous.Previous && Previous.Previous->EndsBinaryExpression; - if ((!IsComparison || LHSIsBinaryExpr) && !Current.isTrailingComment() && - Previous.getPrecedence() != prec::Assignment && - CurrentState.BreakBeforeParameter) - return true; + const auto PreviousPrecedence = Previous.getPrecedence(); + if (PreviousPrecedence != prec::Assignment && + CurrentState.BreakBeforeParameter && !Current.isTrailingComment()) { + const bool LHSIsBinaryExpr = + Previous.Previous && Previous.Previous->EndsBinaryExpression; + if (LHSIsBinaryExpr) + return true; + // If we need to break somewhere inside the LHS of a binary expression, we + // should also break after the operator. Otherwise, the formatting would + // hide the operator precedence, e.g. in: + // if (aaaaaaaaaaaaaa == + // bbbbbbbbbbbbbb && c) {.. + // For comparisons, we only apply this rule, if the LHS is a binary + // expression itself as otherwise, the line breaks seem superfluous. + // We need special cases for ">>" which we have split into two ">" while + // lexing in order to make template parsing easier. + const bool IsComparison = + (PreviousPrecedence == prec::Relational || + PreviousPrecedence == prec::Equality || + PreviousPrecedence == prec::Spaceship) && + Previous.Previous && + Previous.Previous->isNot(TT_BinaryOperator); // For >>. + if (!IsComparison) + return true; + } } else if (Current.is(TT_BinaryOperator) && Current.CanBreakBefore && CurrentState.BreakBeforeParameter) { return true; From 8f310d1967c20d348c617af3a30999031c71fee0 Mon Sep 17 00:00:00 2001 From: Krystian Kuzniarek Date: Sun, 20 Feb 2022 21:07:53 +0100 Subject: [PATCH 381/748] [clang-format][docs] Fix incorrect 'clang-format 13' configuration ... 
...options markers Note: Option 'IndentRequiresClause' was previously known as 'IndentRequires' but the version marker should still indicate 'clang-format 15' as this option most recent name wasn't accessible earlier and it would produce: error: unknown key 'IndentRequiresClause' Differential Revision: https://reviews.llvm.org/D119682 --- clang/docs/ClangFormatStyleOptions.rst | 8 +++++--- clang/include/clang/Format/Format.h | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 8d6c80fb87e5a..0cddf022ead3c 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -1988,7 +1988,7 @@ the configuration (without a prefix: ``Auto``). -**BreakBeforeConceptDeclarations** (``BreakBeforeConceptDeclarationsStyle``) :versionbadge:`clang-format 13` +**BreakBeforeConceptDeclarations** (``BreakBeforeConceptDeclarationsStyle``) :versionbadge:`clang-format 12` The concept declaration style to use. Possible values: @@ -2278,7 +2278,7 @@ the configuration (without a prefix: ``Auto``). -**EmptyLineBeforeAccessModifier** (``EmptyLineBeforeAccessModifierStyle``) :versionbadge:`clang-format 13` +**EmptyLineBeforeAccessModifier** (``EmptyLineBeforeAccessModifierStyle``) :versionbadge:`clang-format 12` Defines in which cases to put empty line before access modifiers. Possible values: @@ -2706,10 +2706,12 @@ the configuration (without a prefix: ``Auto``). -**IndentRequiresClause** (``Boolean``) :versionbadge:`clang-format 13` +**IndentRequiresClause** (``Boolean``) :versionbadge:`clang-format 15` Indent the requires clause in a template. This only applies when ``RequiresClausePosition`` is ``OwnLine``, or ``WithFollowing``. + In clang-format 13 and 14 it was named ``IndentRequires``. + .. 
code-block:: c++ true: diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index c86a700097160..d4a479e7c5120 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -1791,7 +1791,7 @@ struct FormatStyle { }; /// The concept declaration style to use. - /// \version 13 + /// \version 12 BreakBeforeConceptDeclarationsStyle BreakBeforeConceptDeclarations; /// If ``true``, ternary operators will be placed after line breaks. @@ -2185,7 +2185,7 @@ struct FormatStyle { }; /// Defines in which cases to put empty line before access modifiers. - /// \version 13 + /// \version 12 EmptyLineBeforeAccessModifierStyle EmptyLineBeforeAccessModifier; /// If ``true``, clang-format detects whether function calls and @@ -2523,6 +2523,8 @@ struct FormatStyle { /// Indent the requires clause in a template. This only applies when /// ``RequiresClausePosition`` is ``OwnLine``, or ``WithFollowing``. + /// + /// In clang-format 13 and 14 it was named ``IndentRequires``. /// \code /// true: /// template @@ -2538,7 +2540,7 @@ struct FormatStyle { /// //.... /// } /// \endcode - /// \version 13 + /// \version 15 bool IndentRequiresClause; /// The number of columns to use for indentation. From c57b8ca721dd2e88ed96b7df65a518fdab738445 Mon Sep 17 00:00:00 2001 From: Kesavan Yogeswaran Date: Sun, 20 Feb 2022 22:00:23 +0000 Subject: [PATCH 382/748] [clang-tidy] Provide fine control of color in run-clang-tidy D90110 modified the behavior of `run-clang-tidy` to always pass the `--use-color` option to clang-tidy, which enabled colored diagnostics output regardless of TTY status or .clang-tidy settings. This left the user with no option to disable the colored output. This presents an issue when trying to parse the output of run-clang-tidy programmaticall, as the output is polluted with ANSI escape characters. This PR fixes this issue in two ways: 1. 
It restores the default behavior of `run-clang-tidy` to let `clang-tidy` decide whether to color output. This allows the user to configure color via the `UseColor` option in a .clang-tidy file. 2. It adds mutually exclusive, optional `-use-color` and `-no-use-color` argument flags that let the user explicitly set the color option via the invocation. After this change the default behavior of `run-clang-tidy` when no .clang-tidy file is available is now to show no color, presumably because `clang-tidy` detects that the output is being piped and defaults to not showing colored output. This seems like an acceptable tradeoff to respect .clang-tidy configurations, as users can still use the `-use-color` option to explicitly enable color. Fixes #49441 (50097 in Bugzilla) Reviewed By: njames93 Differential Revision: https://reviews.llvm.org/D119562 --- .../clang-tidy/tool/run-clang-tidy.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index 090646c1b061b..fa98c217e2381 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -62,6 +62,21 @@ import queue as queue +def strtobool(val): + """Convert a string representation of truth to a bool following LLVM's CLI argument parsing.""" + + val = val.lower() + if val in ['', 'true', '1']: + return True + elif val in ['false', '0']: + return False + + # Return ArgumentTypeError so that argparse does not substitute its own error message + raise argparse.ArgumentTypeError( + "'{}' is invalid value for boolean argument! 
Try 0 or 1.".format(val) + ) + + def find_compilation_database(path): """Adjusts the directory until a compilation database is found.""" result = './' @@ -82,15 +97,20 @@ def make_absolute(f, directory): def get_tidy_invocation(f, clang_tidy_binary, checks, tmpdir, build_path, header_filter, allow_enabling_alpha_checkers, extra_arg, extra_arg_before, quiet, config, - line_filter): + line_filter, use_color): """Gets a command line for clang-tidy.""" - start = [clang_tidy_binary, '--use-color'] + start = [clang_tidy_binary] if allow_enabling_alpha_checkers: start.append('-allow-enabling-analyzer-alpha-checkers') if header_filter is not None: start.append('-header-filter=' + header_filter) if line_filter is not None: start.append('-line-filter=' + line_filter) + if use_color is not None: + if use_color: + start.append('--use-color') + else: + start.append('--use-color=false') if checks: start.append('-checks=' + checks) if tmpdir is not None: @@ -168,7 +188,8 @@ def run_tidy(args, tmpdir, build_path, queue, lock, failed_files): tmpdir, build_path, args.header_filter, args.allow_enabling_alpha_checkers, args.extra_arg, args.extra_arg_before, - args.quiet, args.config, args.line_filter) + args.quiet, args.config, args.line_filter, + args.use_color) proc = subprocess.Popen(invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, err = proc.communicate() @@ -231,6 +252,10 @@ def main(): 'after applying fixes') parser.add_argument('-style', default='file', help='The style of reformat ' 'code after applying fixes') + parser.add_argument('-use-color', type=strtobool, nargs='?', const=True, + help='Use colors in diagnostics, overriding clang-tidy\'s' + ' default behavior. 
This option overrides the \'UseColor' + '\' option in .clang-tidy file, if any.') parser.add_argument('-p', dest='build_path', help='Path used to read a compile command database.') parser.add_argument('-extra-arg', dest='extra_arg', @@ -258,7 +283,8 @@ def main(): None, build_path, args.header_filter, args.allow_enabling_alpha_checkers, args.extra_arg, args.extra_arg_before, - args.quiet, args.config, args.line_filter) + args.quiet, args.config, args.line_filter, + args.use_color) invocation.append('-list-checks') invocation.append('-') if args.quiet: From 323c67278987538bf53eec281fe767504450ca33 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sun, 20 Feb 2022 14:52:33 -0800 Subject: [PATCH 383/748] DebugInfo: Add an assert about cross-unit references in dwo units This is helping me debug some issues with simplified template names --- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 5a2bd479f2774..30fff15b4fa9b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -380,6 +380,8 @@ void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, CU = getUnitDie().getUnit(); if (!EntryCU) EntryCU = getUnitDie().getUnit(); + assert(EntryCU == CU || !DD->useSplitDwarf() || DD->shareAcrossDWOCUs() || + !static_cast(CU)->isDwoUnit()); addAttribute(Die, Attribute, EntryCU == CU ? 
dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr, Entry); From aacc110bdce71d1405d820cf282196855afeee26 Mon Sep 17 00:00:00 2001 From: Krystian Kuzniarek Date: Sun, 20 Feb 2022 17:16:06 -0800 Subject: [PATCH 384/748] [clang-format][NFC] Fix typos and inconsistencies Differential Revision: https://reviews.llvm.org/D120220 --- clang/lib/Format/Format.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 6acd850cac2cb..bc3f0c93426bf 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -990,11 +990,11 @@ std::string ParseErrorCategory::message(int EV) const { case ParseError::InvalidQualifierSpecified: return "Invalid qualifier specified in QualifierOrder"; case ParseError::DuplicateQualifierSpecified: - return "Duplicate qualifier specified in QualfierOrder"; + return "Duplicate qualifier specified in QualifierOrder"; case ParseError::MissingQualifierType: - return "Missing type in QualfierOrder"; + return "Missing type in QualifierOrder"; case ParseError::MissingQualifierOrder: - return "Missing QualfierOrder"; + return "Missing QualifierOrder"; } llvm_unreachable("unexpected parse error"); } @@ -1650,7 +1650,8 @@ ParseError validateQualifierOrder(FormatStyle *Style) { if (token == tok::identifier) return ParseError::InvalidQualifierSpecified; } - // Ensure the list is unqiue (no duplicates). + + // Ensure the list is unique (no duplicates). 
std::set UniqueQualifiers(Style->QualifierOrder.begin(), Style->QualifierOrder.end()); if (Style->QualifierOrder.size() != UniqueQualifiers.size()) { @@ -1660,10 +1661,12 @@ ParseError validateQualifierOrder(FormatStyle *Style) { return ParseError::DuplicateQualifierSpecified; } + // Ensure the list has 'type' in it auto type = std::find(Style->QualifierOrder.begin(), Style->QualifierOrder.end(), "type"); if (type == Style->QualifierOrder.end()) return ParseError::MissingQualifierType; + return ParseError::Success; } From 36ada32727d8c9f075ea8943212d489fdbcf637e Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sun, 20 Feb 2022 14:44:09 -0800 Subject: [PATCH 385/748] [BOLT][NFC] Fix data race in ShrinkWrapping stats Fix data race reported by ThreadSanitizer in clang.test: ``` ThreadSanitizer: data race /data/llvm-project/bolt/lib/Passes/ShrinkWrapping.cpp:1359:28 in llvm::bolt::ShrinkWrapping::moveSaveRestores() ``` The issue is with incrementing global counters from multiple threads. Reviewed By: yota9 Differential Revision: https://reviews.llvm.org/D120218 --- bolt/include/bolt/Passes/ShrinkWrapping.h | 4 ++-- bolt/lib/Passes/ShrinkWrapping.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bolt/include/bolt/Passes/ShrinkWrapping.h b/bolt/include/bolt/Passes/ShrinkWrapping.h index 20dbe4542b906..e8bd09302d527 100644 --- a/bolt/include/bolt/Passes/ShrinkWrapping.h +++ b/bolt/include/bolt/Passes/ShrinkWrapping.h @@ -308,8 +308,8 @@ class ShrinkWrapping { std::vector BestSavePos; /// Pass stats - static uint64_t SpillsMovedRegularMode; - static uint64_t SpillsMovedPushPopMode; + static std::atomic_uint64_t SpillsMovedRegularMode; + static std::atomic_uint64_t SpillsMovedPushPopMode; Optional AnnotationIndex; diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp index 9e390e5e27ebd..0e4e1504766fe 100644 --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -710,8 +710,8 @@ void 
StackLayoutModifier::initialize() { IsInitialized = true; } -uint64_t ShrinkWrapping::SpillsMovedRegularMode = 0; -uint64_t ShrinkWrapping::SpillsMovedPushPopMode = 0; +std::atomic_uint64_t ShrinkWrapping::SpillsMovedRegularMode{0}; +std::atomic_uint64_t ShrinkWrapping::SpillsMovedPushPopMode{0}; using BBIterTy = BinaryBasicBlock::iterator; From d44f99c748e0f35e9a322c9c9bea18d03168fae5 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sun, 20 Feb 2022 17:23:40 -0800 Subject: [PATCH 386/748] [BOLT] Added fuzzer target (llvm-bolt-fuzzer) This adds a target that would consume random binary as an input ELF file. TBD: add structured input support (ELF). Build: ``` cmake /path/to/llvm-project/llvm -GNinja \ -DLLVM_TARGETS_TO_BUILD="X86;AArch64" \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_ASSERTIONS=1 \ -DCMAKE_C_COMPILER= \ -DCMAKE_CXX_COMPILER= \ -DLLVM_ENABLE_PROJECTS="bolt" \ -DLLVM_USE_SANITIZER=Address \ -DLLVM_USE_SANITIZE_COVERAGE=On ninja llvm-bolt-fuzzer ``` Test Plan: ninja llvm-bolt-fuzzer Reviewed By: maksfb Differential Revision: https://reviews.llvm.org/D120016 --- bolt/lib/Rewrite/RewriteInstance.cpp | 2 +- bolt/tools/CMakeLists.txt | 1 + bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt | 7 ++ .../llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp | 70 +++++++++++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt create mode 100644 bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 2671df8ebc31c..51607eab8c8a1 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -138,7 +138,7 @@ KeepTmp("keep-tmp", cl::Hidden, cl::cat(BoltCategory)); -static cl::opt +cl::opt Lite("lite", cl::desc("skip processing of cold functions"), cl::init(false), diff --git a/bolt/tools/CMakeLists.txt b/bolt/tools/CMakeLists.txt index bd5a1d17af4c2..1fe85145d79a1 100644 --- a/bolt/tools/CMakeLists.txt 
+++ b/bolt/tools/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(driver) +add_subdirectory(llvm-bolt-fuzzer) add_subdirectory(merge-fdata) add_subdirectory(heatmap) diff --git a/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt b/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt new file mode 100644 index 0000000000000..3c26b78c855c2 --- /dev/null +++ b/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt @@ -0,0 +1,7 @@ +set(LLVM_LINK_COMPONENTS + BOLTRewrite + ) + +add_llvm_fuzzer(llvm-bolt-fuzzer + llvm-bolt-fuzzer.cpp + ) diff --git a/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp b/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp new file mode 100644 index 0000000000000..ac129a0d9a625 --- /dev/null +++ b/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp @@ -0,0 +1,70 @@ +//===- llvm-bolt-fuzzer.cpp - Fuzzing target for llvm-bolt ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Rewrite/RewriteInstance.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetSelect.h" + +using namespace llvm; +using namespace object; +using namespace bolt; + +namespace opts { +extern cl::opt OutputFilename; +extern cl::opt Lite; +} // namespace opts + +extern "C" int LLVMFuzzerTestOneInput(const char *Data, size_t Size) { + const char *argv[] = {"llvm-bolt", nullptr}; + const char argc = 1; + opts::OutputFilename = "/dev/null"; + opts::Lite = false; + + // Input has to be an ELF - we don't want to fuzz createBinary interface. + if (Size < 4 || strncmp("\177ELF", Data, 4) != 0) + return 0; + // Construct an ELF binary from fuzzer input. 
+ std::unique_ptr Buffer = + MemoryBuffer::getMemBuffer(StringRef(Data, Size), "", false); + Expected> BinaryOrErr = + createBinary(Buffer->getMemBufferRef()); + // Check that the input is a valid binary. + if (Error E = BinaryOrErr.takeError()) { + consumeError(std::move(E)); + return 0; + } + Binary &Binary = *BinaryOrErr.get(); + // Check that the binary is an ELF64LE object file. + auto *E = dyn_cast(&Binary); + if (!E) + return 0; + + // Fuzz RewriteInstance. + auto RIOrErr = + RewriteInstance::createRewriteInstance(E, argc, argv, "llvm-bolt"); + if (Error E = RIOrErr.takeError()) { + consumeError(std::move(E)); + return 0; + } + RewriteInstance &RI = *RIOrErr.get(); + RI.run(); + return 0; +} + +extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(int *argc, + char ***argv) { + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllDisassemblers(); + + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + + return 0; +} From 67ef63138b28369c2fcc3b15c1a4e27c55817143 Mon Sep 17 00:00:00 2001 From: "Luo, Yuanke" Date: Fri, 18 Feb 2022 21:04:08 +0800 Subject: [PATCH 387/748] [SDAG] enable binop identity constant folds for sub This patch extracts the sub folding from D119654 and leaves only add folding in that patch.
Differential Revision: https://reviews.llvm.org/D120116 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 +++++++++++++- llvm/test/CodeGen/X86/vector-bo-select.ll | 16 +++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 89c3e41392882..ecabcacd0d3f1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2144,6 +2144,12 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG, return C->isExactlyValue(1.0); } } + if (ConstantSDNode *C = isConstOrConstSplat(V)) { + switch (Opcode) { + case ISD::SUB: // X - 0 --> X + return C->isZero(); + } + } return false; }; @@ -3326,9 +3332,15 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + auto PeekThroughFreeze = [](SDValue N) { + if (N->getOpcode() == ISD::FREEZE && N.hasOneUse()) + return N->getOperand(0); + return N; + }; + // fold (sub x, x) -> 0 // FIXME: Refactor this and xor and other similar operations together. 
- if (N0 == N1) + if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1)) return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); // fold (sub c1, c2) -> c3 diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 9f2141c48b6ab..8aab13b569b90 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -915,8 +915,8 @@ define <4 x i32> @sub_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z} -; AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> zeroinitializer %r = sub <4 x i32> %x, %s @@ -979,9 +979,9 @@ define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3 ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} -; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y %r = sub <16 x i32> %x, %s @@ -1044,8 +1044,7 @@ define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef ; AVX512VL-LABEL: sub_v8i32_cast_cond: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} -; AVX512VL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 {%k1} ; AVX512VL-NEXT: retq %b = bitcast i8 %pb to <8 x i1> %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer @@ -1110,8 +1109,7 @@ define <8 x i64> 
@sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; AVX512-LABEL: sub_v8i64_cast_cond: ; AVX512: # %bb.0: ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} -; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} ; AVX512-NEXT: retq %b = bitcast i8 %pb to <8 x i1> %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> zeroinitializer From 4abe484525a964dc3afb06845de22d3a1bc8a049 Mon Sep 17 00:00:00 2001 From: Lian Wang Date: Mon, 21 Feb 2022 09:58:08 +0800 Subject: [PATCH 388/748] [RISCV][NFC] Add sched for some instructions in Zb extension Add sched to brev8, zip and unzip instruction. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D120009 --- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 3d93b41320aec..6d183a3a2e727 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -598,11 +598,14 @@ def ORC_B : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">, } // Predicates = [HasStdExtZbbOrZbp] let Predicates = [HasStdExtZbpOrZbkb] in -def BREV8 : RVBUnary<0b0110100, 0b00111, 0b101, OPC_OP_IMM, "brev8">; +def BREV8 : RVBUnary<0b0110100, 0b00111, 0b101, OPC_OP_IMM, "brev8">, + Sched<[]>; let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in { -def ZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b001, OPC_OP_IMM, "zip">; -def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">; +def ZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b001, OPC_OP_IMM, "zip">, + Sched<[]>; +def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">, + Sched<[]>; } // Predicates = [HasStdExtZbpOrZbkb, IsRV32] From efe5b8ad904bfb1d9abe6ac7123494b534040238 Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Thu, 17 Feb 2022 08:57:53 -0500 Subject: [PATCH 389/748] [ISEL] 
remove unnecessary getNode(); NFC Reviewed By: RKSimon, craig.topper Differential Revision: https://reviews.llvm.org/D120049 --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 5 +- llvm/include/llvm/CodeGen/TargetLowering.h | 2 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 141 +++++++++--------- 3 files changed, 72 insertions(+), 76 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 04c6b50197d46..1bc2a8d714faa 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1191,12 +1191,13 @@ inline void SDValue::dumpr(const SelectionDAG *G) const { inline void SDUse::set(const SDValue &V) { if (Val.getNode()) removeFromList(); Val = V; - if (V.getNode()) V.getNode()->addUse(*this); + if (V.getNode()) + V->addUse(*this); } inline void SDUse::setInitial(const SDValue &V) { Val = V; - V.getNode()->addUse(*this); + V->addUse(*this); } inline void SDUse::setNode(SDNode *N) { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index ec9f9b73b8f65..2e17722180dd4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3821,7 +3821,7 @@ class TargetLowering : public TargetLoweringBase { if (Neg && Cost == NegatibleCost::Cheaper) return Neg; // Remove the new created node to avoid the side effect to the DAG. - if (Neg && Neg.getNode()->use_empty()) + if (Neg && Neg->use_empty()) DAG.RemoveDeadNode(Neg.getNode()); return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ecabcacd0d3f1..52a0330e1473c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -927,7 +927,7 @@ bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, /// it is profitable to do so. 
bool DAGCombiner::isOneUseSetCC(SDValue N) const { SDValue N0, N1, N2; - if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) + if (isSetCCEquivalent(N, N0, N1, N2) && N->hasOneUse()) return true; return false; } @@ -1104,7 +1104,7 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: "; - To[0].getNode()->dump(&DAG); + To[0].dump(&DAG); dbgs() << " and " << NumTo - 1 << " other values\n"); for (unsigned i = 0, e = NumTo; i != e; ++i) assert((!To[i].getNode() || @@ -1135,9 +1135,8 @@ void DAGCombiner:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace the old value with the new one. ++NodesCombined; - LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); - dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.dump(&DAG); + dbgs() << "\nWith: "; TLO.New.dump(&DAG); dbgs() << '\n'); // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. @@ -1150,7 +1149,7 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. 
- if (TLO.Old.getNode()->use_empty()) + if (TLO.Old->use_empty()) deleteAndRecombine(TLO.Old.getNode()); } @@ -1197,7 +1196,7 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0)); LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: "; - Trunc.getNode()->dump(&DAG); dbgs() << '\n'); + Trunc.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); @@ -1296,7 +1295,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); - LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG)); bool Replace0 = false; SDValue N0 = Op.getOperand(0); @@ -1323,7 +1322,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { // If operands have a use ordering, make sure we deal with // predecessor first. 
- if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) { + if (Replace0 && Replace1 && N0->isPredecessorOf(N1.getNode())) { std::swap(N0, N1); std::swap(NN0, NN1); } @@ -1364,7 +1363,7 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); - LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG)); bool Replace = false; SDValue N0 = Op.getOperand(0); @@ -1415,7 +1414,7 @@ SDValue DAGCombiner::PromoteExtend(SDValue Op) { // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) - LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG)); return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0)); } return SDValue(); @@ -1456,7 +1455,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: "; - Result.getNode()->dump(&DAG); dbgs() << '\n'); + Result.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); @@ -1570,9 +1569,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { RV.getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned new node!"); - LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG)); + LLVM_DEBUG(dbgs() << " ... 
into: "; RV.dump(&DAG)); - if (N->getNumValues() == RV.getNode()->getNumValues()) + if (N->getNumValues() == RV->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { assert(N->getValueType(0) == RV.getValueType() && @@ -2689,7 +2688,7 @@ static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) return SDValue(); - EVT VT = V.getNode()->getValueType(0); + EVT VT = V->getValueType(0); if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT)) return SDValue(); @@ -4005,12 +4004,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)). if (N0.getOpcode() == ISD::SHL && - isConstantOrConstantVector(N0.getOperand(1)) && - N0.getNode()->hasOneUse()) { + isConstantOrConstantVector(N0.getOperand(1)) && N0->hasOneUse()) { Sh = N0; Y = N1; } else if (N1.getOpcode() == ISD::SHL && isConstantOrConstantVector(N1.getOperand(1)) && - N1.getNode()->hasOneUse()) { + N1->hasOneUse()) { Sh = N1; Y = N0; } @@ -4143,7 +4141,7 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) { SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); SDValue combined; - for (SDNode *User : Op0.getNode()->uses()) { + for (SDNode *User : Op0->uses()) { if (User == Node || User->getOpcode() == ISD::DELETED_NODE || User->use_empty()) continue; @@ -6331,7 +6329,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() == ISD::AND) { - if (!N0.getNode()->hasOneUse()) + if (!N0->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); // Also handle 0xffff since the LHS is guaranteed to have zeros there. 
@@ -6344,7 +6342,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, } if (N1.getOpcode() == ISD::AND) { - if (!N1.getNode()->hasOneUse()) + if (!N1->hasOneUse()) return SDValue(); ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C || N11C->getZExtValue() != 0xFF) @@ -6357,7 +6355,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); - if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse()) + if (!N0->hasOneUse() || !N1->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); @@ -6370,7 +6368,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8) SDValue N00 = N0->getOperand(0); if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) { - if (!N00.getNode()->hasOneUse()) + if (!N00->hasOneUse()) return SDValue(); ConstantSDNode *N001C = dyn_cast(N00.getOperand(1)); if (!N001C || N001C->getZExtValue() != 0xFF) @@ -6381,7 +6379,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, SDValue N10 = N1->getOperand(0); if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) { - if (!N10.getNode()->hasOneUse()) + if (!N10->hasOneUse()) return SDValue(); ConstantSDNode *N101C = dyn_cast(N10.getOperand(1)); // Also allow 0xFFFF since the bits will be shifted out. This is needed @@ -6431,7 +6429,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { - if (!N.getNode()->hasOneUse()) + if (!N->hasOneUse()) return false; unsigned Opc = N.getOpcode(); @@ -6657,7 +6655,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. 
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && // Don't increase # computations. - (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { + (N0->hasOneUse() || N1->hasOneUse())) { // We can only do this xform if we know that bits from X that are set in C2 // but not in C1 are already zero. Likewise for Y. if (const ConstantSDNode *N0O1C = @@ -6685,7 +6683,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { N1.getOpcode() == ISD::AND && N0.getOperand(0) == N1.getOperand(0) && // Don't increase # computations. - (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { + (N0->hasOneUse() || N1->hasOneUse())) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(1), N1.getOperand(1)); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); @@ -6844,7 +6842,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) { return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue()); }; - if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && + if (N0.getOpcode() == ISD::AND && N0->hasOneUse() && ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) { if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, {N1, N0.getOperand(1)})) { @@ -8794,7 +8792,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // Variant of version done on multiply, except mul by a power of 2 is turned // into a shift. 
if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) && - N0.getNode()->hasOneUse() && + N0->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) && TLI.isDesirableToCommuteWithShift(N, Level)) { @@ -8806,7 +8804,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) - if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() && + if (N0.getOpcode() == ISD::MUL && N0->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); @@ -10124,7 +10122,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) { // Any flags available in a select/setcc fold will be on the setcc as they // migrated from fcmp - Flags = N0.getNode()->getFlags(); + Flags = N0->getFlags(); SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1, N2, N0.getOperand(2)); SelectNode->setFlags(Flags); @@ -10318,7 +10316,7 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { // If this is a TRUNC followed by a masked store, fold this into a masked // truncating store. We can do this even if this is already a masked // truncstore. 
- if ((Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && + if ((Value.getOpcode() == ISD::TRUNCATE) && Value->hasOneUse() && MST->isUnindexed() && TLI.canCombineTruncStore(Value.getOperand(0).getValueType(), MST->getMemoryVT(), LegalOperations)) { @@ -11021,9 +11019,8 @@ static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0, const TargetLowering &TLI) { bool HasCopyToRegUses = false; bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType()); - for (SDNode::use_iterator UI = N0.getNode()->use_begin(), - UE = N0.getNode()->use_end(); - UI != UE; ++UI) { + for (SDNode::use_iterator UI = N0->use_begin(), UE = N0->use_end(); UI != UE; + ++UI) { SDNode *User = *UI; if (User == N) continue; @@ -13021,7 +13018,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) // When the adde's carry is not used. if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) && - N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) && + N0.hasOneUse() && !N0->hasAnyUseOfValue(1) && // We only do for addcarry before legalize operation ((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) || TLI.isOperationLegal(N0.getOpcode(), VT))) { @@ -13209,7 +13206,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { (!LegalTypes || (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() && TLI.isTypeLegal(VT.getVectorElementType()))) && - N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() && + N0.getOpcode() == ISD::BUILD_VECTOR && N0->hasOneUse() && cast(N0)->isConstant()) return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), VT.getVectorElementType()); @@ -13277,8 +13274,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // This often reduces constant pool loads. 
if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && - N0.getNode()->hasOneUse() && VT.isInteger() && - !VT.isVector() && !N0.getValueType().isVector()) { + N0->hasOneUse() && VT.isInteger() && !VT.isVector() && + !N0.getValueType().isVector()) { SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(NewConv.getNode()); @@ -13326,9 +13323,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // (xor (bitcast cst), (bitcast x)), 0), // signbit) // (xor (bitcast cst) (build_pair flipbit, flipbit)) - if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && - isa(N0.getOperand(0)) && - VT.isInteger() && !VT.isVector()) { + if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() && + isa(N0.getOperand(0)) && VT.isInteger() && + !VT.isVector()) { unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits(); EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth); if (isTypeLegal(IntXVT)) { @@ -13542,7 +13539,7 @@ static bool isContractableFMUL(const TargetOptions &Options, SDValue N) { // Returns true if `N` can assume no infinities involved in its computation. static bool hasNoInfs(const TargetOptions &Options, SDValue N) { - return Options.NoInfsFPMath || N.getNode()->getFlags().hasNoInfs(); + return Options.NoInfsFPMath || N->getFlags().hasNoInfs(); } /// Try to perform FMA combining on a given FADD node. @@ -13596,7 +13593,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. 
if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { - if (N0.getNode()->use_size() > N1.getNode()->use_size()) + if (N0->use_size() > N1->use_size()) std::swap(N0, N1); } @@ -13826,7 +13823,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. if (isContractableFMUL(N0) && isContractableFMUL(N1) && - (N0.getNode()->use_size() > N1.getNode()->use_size())) { + (N0->use_size() > N1->use_size())) { // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b)) if (SDValue V = tryToFoldXSubYZ(N0, N1)) return V; @@ -15363,7 +15360,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) - if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { + if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse()) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Tmp.getNode()); @@ -15807,7 +15804,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) || - Ptr.getNode()->hasOneUse()) + Ptr->hasOneUse()) return false; // Ask the target to do addressing mode selection. @@ -15867,8 +15864,8 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // a copy of the original base pointer. 
SmallVector OtherUses; if (isa(Offset)) - for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(), - UE = BasePtr.getNode()->use_end(); + for (SDNode::use_iterator UI = BasePtr->use_begin(), + UE = BasePtr->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); // Skip the use that is Ptr and uses of other results from BasePtr's @@ -15906,7 +15903,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // Now check for #3 and #4. bool RealUse = false; - for (SDNode *Use : Ptr.getNode()->uses()) { + for (SDNode *Use : Ptr->uses()) { if (Use == N) continue; if (SDNode::hasPredecessorHelper(Use, Visited, Worklist)) @@ -15939,7 +15936,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { ++PreIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; - Result.getNode()->dump(&DAG); dbgs() << '\n'); + Result.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); @@ -16029,7 +16026,7 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, return false; SmallPtrSet Visited; - for (SDNode *Use : BasePtr.getNode()->uses()) { + for (SDNode *Use : BasePtr->uses()) { if (Use == Ptr.getNode()) continue; @@ -16066,7 +16063,7 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, const TargetLowering &TLI) { if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, Ptr, TLI) || - Ptr.getNode()->hasOneUse()) + Ptr->hasOneUse()) return nullptr; // Try turning it into a post-indexed load / store except when @@ -16126,9 +16123,8 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { BasePtr, Offset, AM); ++PostIndexedNodes; ++NodesCombined; - LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); - dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: "; + 
Result.dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (IsLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); @@ -16369,7 +16365,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { // Now we replace use of chain2 with chain1. This makes the second load // isomorphic to the one we are deleting, and thus makes this load live. LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG); - dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG); + dbgs() << "\nWith chain: "; Chain.dump(&DAG); dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); @@ -16400,7 +16396,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } else Index = DAG.getUNDEF(N->getValueType(1)); LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); - dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG); + dbgs() << "\nWith: "; Undef.dump(&DAG); dbgs() << " and 2 other values\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); @@ -17361,7 +17357,7 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, // If the add only has one use, and the target thinks the folding is // profitable or does not lead to worse code, this would be OK to do. - if (AddNode.getNode()->hasOneUse() && + if (AddNode->hasOneUse() && TLI.isMulAddWithConstProfitable(AddNode, ConstNode)) return true; @@ -18349,7 +18345,7 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl &StoreNodes, for (unsigned i = 0; i < NumElem; ++i) { SDValue Val = StoreNodes[i].MemNode->getOperand(1); CombineTo(StoreNodes[i].MemNode, NewStore); - if (Val.getNode()->use_empty()) + if (Val->use_empty()) recursivelyDeleteUnusedNodes(Val.getNode()); } @@ -18712,7 +18708,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { // truncating store. We can do this even if this is already a truncstore. 
if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE) && - Value.getNode()->hasOneUse() && ST->isUnindexed() && + Value->hasOneUse() && ST->isUnindexed() && TLI.canCombineTruncStore(Value.getOperand(0).getValueType(), ST->getMemoryVT(), LegalOperations)) { return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), @@ -19121,8 +19117,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // Do not combine these two vectors if the output vector will not replace // the input vector. if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) { - Ops.append(InVec.getNode()->op_begin(), - InVec.getNode()->op_end()); + Ops.append(InVec->op_begin(), InVec->op_end()); } else if (InVec.isUndef()) { Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType())); } else { @@ -19226,7 +19221,7 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG, SDValue Index = ExtElt->getOperand(1); auto *IndexC = dyn_cast(Index); if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() || - Vec.getNode()->getNumValues() != 1) + Vec->getNumValues() != 1) return SDValue(); // Targets may want to avoid this to prevent an expensive register transfer. 
@@ -19741,7 +19736,7 @@ SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) { if (!isa(ShiftAmtVal)) return SDValue(); - uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1); + uint64_t ShiftAmt = In.getConstantOperandVal(1); // The extracted value is not extracted at the right position if (ShiftAmt != i * ScalarTypeBitsize) @@ -20781,7 +20776,7 @@ static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue BinOp = Extract->getOperand(0); unsigned BinOpcode = BinOp.getOpcode(); - if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1) + if (!TLI.isBinOp(BinOpcode) || BinOp->getNumValues() != 1) return SDValue(); EVT VecVT = BinOp.getValueType(); @@ -20830,7 +20825,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0)); unsigned BOpcode = BinOp.getOpcode(); - if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1) + if (!TLI.isBinOp(BOpcode) || BinOp->getNumValues() != 1) return SDValue(); // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be @@ -20889,8 +20884,8 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG, BinOp.getOperand(0), NewExtIndex); SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, BinOp.getOperand(1), NewExtIndex); - SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, - BinOp.getNode()->getFlags()); + SDValue NarrowBinOp = + DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, BinOp->getFlags()); return DAG.getBitcast(VT, NarrowBinOp); } @@ -21930,7 +21925,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { int SplatIndex = SVN->getSplatIndex(); if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) && - TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() 
== 1) { + TLI.isBinOp(N0.getOpcode()) && N0->getNumValues() == 1) { // splat (vector_bo L, R), Index --> // splat (scalar_bo (extelt L, Index), (extelt R, Index)) SDValue L = N0.getOperand(0), R = N0.getOperand(1); @@ -21939,8 +21934,8 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL); SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index); SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index); - SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, - N0.getNode()->getFlags()); + SDValue NewBO = + DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, N0->getFlags()); SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO); SmallVector ZeroMask(VT.getVectorNumElements(), 0); return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask); @@ -22990,7 +22985,7 @@ SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, // Check to see if we got a select_cc back (to turn into setcc/select). // Otherwise, just return whatever node we got back, like fabs. if (SCC.getOpcode() == ISD::SELECT_CC) { - const SDNodeFlags Flags = N0.getNode()->getFlags(); + const SDNodeFlags Flags = N0->getFlags(); SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(), SCC.getOperand(0), SCC.getOperand(1), From 7b67d2e398861e9f3bdcc991cd0a900aa9c8d740 Mon Sep 17 00:00:00 2001 From: esmeyi Date: Sun, 20 Feb 2022 21:51:10 -0500 Subject: [PATCH 390/748] Reland [XCOFF][llvm-objdump] change the priority of symbols with the same address by symbol types. Fix the Buildbot failure #19373. 
Differential Revision: https://reviews.llvm.org/D117642 --- .../llvm/MC/MCDisassembler/MCDisassembler.h | 21 ++++++++++---- llvm/include/llvm/Object/ObjectFile.h | 2 +- .../aix-prefixed-instruction-boundary.mir | 2 +- llvm/test/CodeGen/PowerPC/aix-return55.ll | 2 +- .../PowerPC/aix-user-defined-memcpy.ll | 2 +- .../PowerPC/aix-xcoff-mergeable-const.ll | 2 +- .../CodeGen/PowerPC/aix-xcoff-reloc-symb.mir | 2 +- llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll | 2 +- .../PowerPC/aix-xcoff-textdisassembly.ll | 2 +- .../llvm-objdump/XCOFF/disassemble-all.test | 2 +- .../XCOFF/disassemble-symbol-description.test | 2 +- .../XCOFF/disassemble-symbol-priority.ll | 28 +++++++++++++++++++ .../XCOFF/disassemble-symbolize-operands.ll | 7 ++--- .../llvm-objdump/XCOFF/print-linenumber.test | 2 +- llvm/tools/llvm-objdump/llvm-objdump.cpp | 3 ++ 15 files changed, 60 insertions(+), 21 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h index 10037cd66ef12..7060620b6bd4b 100644 --- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h @@ -40,26 +40,35 @@ struct SymbolInfoTy { private: bool IsXCOFF; + bool HasType; public: SymbolInfoTy(uint64_t Addr, StringRef Name, Optional Smc, Optional Idx, bool Label) - : Addr(Addr), Name(Name), XCOFFSymInfo(Smc, Idx, Label), IsXCOFF(true) {} - SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type) - : Addr(Addr), Name(Name), Type(Type), IsXCOFF(false) {} + : Addr(Addr), Name(Name), XCOFFSymInfo(Smc, Idx, Label), IsXCOFF(true), + HasType(false) {} + SymbolInfoTy(uint64_t Addr, StringRef Name, uint8_t Type, + bool IsXCOFF = false) + : Addr(Addr), Name(Name), Type(Type), IsXCOFF(IsXCOFF), HasType(true) {} bool isXCOFF() const { return IsXCOFF; } private: friend bool operator<(const SymbolInfoTy &P1, const SymbolInfoTy &P2) { - 
assert(P1.IsXCOFF == P2.IsXCOFF && - "P1.IsXCOFF should be equal to P2.IsXCOFF."); + assert((P1.IsXCOFF == P2.IsXCOFF && P1.HasType == P2.HasType) && + "The value of IsXCOFF and HasType in P1 and P2 should be the same " + "respectively."); + + if (P1.IsXCOFF && P1.HasType) + return std::tie(P1.Addr, P1.Type, P1.Name) < + std::tie(P2.Addr, P2.Type, P2.Name); + if (P1.IsXCOFF) return std::tie(P1.Addr, P1.XCOFFSymInfo, P1.Name) < std::tie(P2.Addr, P2.XCOFFSymInfo, P2.Name); return std::tie(P1.Addr, P1.Name, P1.Type) < - std::tie(P2.Addr, P2.Name, P2.Type); + std::tie(P2.Addr, P2.Name, P2.Type); } }; diff --git a/llvm/include/llvm/Object/ObjectFile.h b/llvm/include/llvm/Object/ObjectFile.h index bb6f1321a68e8..1faa070052d5e 100644 --- a/llvm/include/llvm/Object/ObjectFile.h +++ b/llvm/include/llvm/Object/ObjectFile.h @@ -170,11 +170,11 @@ class SymbolRef : public BasicSymbolRef { public: enum Type { ST_Unknown, // Type not specified + ST_Other, ST_Data, ST_Debug, ST_File, ST_Function, - ST_Other }; SymbolRef() = default; diff --git a/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir b/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir index 9ea49bf40c897..2947ae2c39989 100644 --- a/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir +++ b/llvm/test/CodeGen/PowerPC/aix-prefixed-instruction-boundary.mir @@ -43,7 +43,7 @@ body: | ... 
# DIS: Disassembly of section .text: -# DIS: 00000000 <.text>: +# DIS: 00000000 <.aix-prefixed-instruction-boundary>: # DIS-NEXT: 0: 38 60 00 02 li 3, 2 # DIS-NEXT: 4: 06 00 00 00 38 63 00 0d paddi 3, 3, 13, 0 # DIS-NEXT: c: 06 00 00 00 38 63 00 0d paddi 3, 3, 13, 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-return55.ll b/llvm/test/CodeGen/PowerPC/aix-return55.ll index c16b75bb68d8d..19e8322f8f8a2 100644 --- a/llvm/test/CodeGen/PowerPC/aix-return55.ll +++ b/llvm/test/CodeGen/PowerPC/aix-return55.ll @@ -21,7 +21,7 @@ entry: ; CHECK: blr } -;CHECKOBJ: 00000000 <.text>: +;CHECKOBJ: 00000000 <.foo>: ;CHECKOBJ-NEXT: 0: 38 60 00 37 li 3, 55 ;CHECKOBJ-NEXT: 4: 4e 80 00 20 blr{{[[:space:]] *}} ;CHECKOBJ-NEXT: 00000008 <.rodata.str1.1>: diff --git a/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll b/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll index b69b3760c9f4e..097eb302e4161 100644 --- a/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll +++ b/llvm/test/CodeGen/PowerPC/aix-user-defined-memcpy.ll @@ -102,7 +102,7 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture r ; 32-REL-NOT: Type: R_RBR (0x1A) ; 32-DIS: Disassembly of section .text: -; 32-DIS: 00000000 <.text>: +; 32-DIS: 00000000 <.memcpy>: ; 32-DIS-NEXT: 0: 38 60 00 03 li 3, 3 ; 32-DIS-NEXT: 4: 4e 80 00 20 blr ; 32-DIS-NEXT: 8: 60 00 00 00 nop diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll index 255472d65c341..c7b1d2a0771c1 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-mergeable-const.ll @@ -62,7 +62,7 @@ entry: ;CHECK-NEXT: .space 1 -;CHECKOBJ: 00000000 <.text>: +;CHECKOBJ: 00000000 <.main>: ;CHECKOBJ-NEXT: 0: 38 60 00 00 li 3, 0 ;CHECKOBJ-NEXT: 4: 4e 80 00 20 blr ;CHECKOBJ-NEXT: ...{{[[:space:]] *}} diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir index 
f650168d5877d..c64552f9852c0 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc-symb.mir @@ -75,7 +75,7 @@ body: | # DIS: Disassembly of section .text: # DIS-EMPTY: -# DIS-NEXT: 00000000 <.text>: +# DIS-NEXT: 00000000 <.foo>: # DIS-NEXT: 0: 80 62 00 00 lwz 3, 0(2) # DIS-NEXT: 4: 4e 80 00 20 blr # DIS-EMPTY: diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll index 6ce251bb49fd8..1bbc12c5a3af5 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll @@ -422,7 +422,7 @@ declare i32 @bar(i32) ; DIS: {{.*}}aix-xcoff-reloc.ll.tmp.o: file format aixcoff-rs6000 ; DIS: Disassembly of section .text: -; DIS: 00000000 <.text>: +; DIS: 00000000 <.foo>: ; DIS-NEXT: 0: 7c 08 02 a6 mflr 0 ; DIS-NEXT: 4: 90 01 00 08 stw 0, 8(1) ; DIS-NEXT: 8: 94 21 ff c0 stwu 1, -64(1) diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll index c8df85da0c855..8b73e748e1a89 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-textdisassembly.ll @@ -13,7 +13,7 @@ entry: } ; CHECK: Disassembly of section .text:{{[[:space:]] *}} -; CHECK-NEXT: 00000000 <.text>: +; CHECK-NEXT: 00000000 <.foo>: ; CHECK-NEXT: 0: 38 60 00 00 li 3, 0 ; CHECK-NEXT: 4: 4e 80 00 20 blr ; CHECK-NEXT: 8: 60 00 00 00 nop diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test index d94d5734a1cbd..4c96662fc854f 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-all.test @@ -18,7 +18,7 @@ CHECK: Inputs/xcoff-section-headers.o: file format aixcoff-rs6000 CHECK: Disassembly of section .text: -CHECK: 00000000 <.text>: +CHECK: 00000000 <.func>: CHECK-NEXT: 0: 80 62 00 04 lwz 3, 4(2) WITH-R-NEXT: 00000002: 
R_TOC a CHECK-NEXT: 4: 80 63 00 00 lwz 3, 0(3) diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test index 16f7137cf3796..f33421cc6c149 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-description.test @@ -22,7 +22,7 @@ COMMON: Inputs/xcoff-section-headers.o: file format aixcoff-rs6000 COMMON: Disassembly of section .text: -PLAIN: 00000000 <.text>: +PLAIN: 00000000 <.func>: DESC: 00000000 (idx: 16) .func: COMMON-NEXT: 0: 80 62 00 04 lwz 3, 4(2) RELOC: 00000002: R_TOC (idx: 26) a[TC] diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll new file mode 100644 index 0000000000000..6db8451ea6a13 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbol-priority.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple=powerpc-ibm-aix-xcoff %s -filetype=obj -o %t +; RUN: llvm-objdump %t -d --no-show-raw-insn | FileCheck %s + +; CHECK: Disassembly of section .text: +; CHECK: 00000000 <.foo3>: +; CHECK: 00000020 <.foo4>: +; CHECK: 00000040 <.foo>: +; CHECK: 00000060 <.foo2>: + +define dso_local signext i32 @foo(i32 noundef signext %a) #0 section "explicit_sec" { +entry: + ret i32 %a +} + +define dso_local signext i32 @foo2(i32 noundef signext %a) #0 section "explicit_sec" { +entry: + ret i32 %a +} + +define dso_local signext i32 @foo3(i32 noundef signext %a) #0 { +entry: + ret i32 %a +} + +define dso_local signext i32 @foo4(i32 noundef signext %a) #0 { +entry: + ret i32 %a +} diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll index a6742285a148e..95399aa4d41d2 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll +++ 
b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll @@ -3,8 +3,7 @@ ; RUN: | FileCheck %s ;; Expect to find the branch labels. -; CHECK-LABEL: <.text>: -;; TODO: <.internal> should be printed instead of <.text>. +; CHECK-LABEL: <.internal>: ; CHECK-NEXT: 0: mr 4, 3 ; CHECK-NEXT: 4: li 3, 0 ; CHECK-NEXT: 8: mtctr 4 @@ -19,11 +18,11 @@ ; CHECK-NEXT: 60: bf 8, 0x84 ; CHECK-NEXT: : ; CHECK-NEXT: 64: mr 3, 31 -; CHECK-NEXT: 68: bl 0x0 <.text> +; CHECK-NEXT: 68: bl 0x0 <.internal> ; CHECK-NEXT: 6c: mr 31, 3 ; CHECK-NEXT: 70: cmplwi 3, 11 ; CHECK-NEXT: 74: bt 0, 0x60 -; CHECK-NEXT: 78: bl 0x0 <.text> +; CHECK-NEXT: 78: bl 0x0 <.internal> ; CHECK-NEXT: 7c: nop ; CHECK-NEXT: 80: b 0x60 ; CHECK-NEXT: : diff --git a/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test b/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test index 0f3acacae4389..8256e27c064dd 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test +++ b/llvm/test/tools/llvm-objdump/XCOFF/print-linenumber.test @@ -17,7 +17,7 @@ # LINES32: Inputs/basic32.o: file format aixcoff-rs6000 # LINES32: Disassembly of section .text: -# LINES32: 00000000 <.text>: +# LINES32: 00000000 <.main>: # LINES32: ; .main(): # LINES32-NEXT: ; /basic.c:1 # LINES32-NEXT: 0: 38 60 00 00 li 3, 0 diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 6b238fa01d258..4cb226b795255 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -957,6 +957,9 @@ SymbolInfoTy objdump::createSymbolInfo(const ObjectFile *Obj, getXCOFFSymbolCsectSMC(XCOFFObj, Symbol); return SymbolInfoTy(Addr, Name, Smc, SymbolIndex, isLabel(XCOFFObj, Symbol)); + } else if (Obj->isXCOFF()) { + const SymbolRef::Type SymType = unwrapOrError(Symbol.getType(), FileName); + return SymbolInfoTy(Addr, Name, SymType, true); } else return SymbolInfoTy(Addr, Name, Obj->isELF() ? 
getElfSymbolType(Obj, Symbol) From 7f2293ba2596248319f9fdd3d97436b81c319a11 Mon Sep 17 00:00:00 2001 From: Serguei Katkov Date: Fri, 18 Feb 2022 18:10:14 +0700 Subject: [PATCH 391/748] [STATEPOINT] Mark LR is early-clobber implicit def. LR is modified at the moment of the call and before any use is read. Reviewers: reames Reviewed By: reames Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D120114 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 ++++++++--- .../AArch64/statepoint-call-lowering-lr.ll | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/statepoint-call-lowering-lr.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d4f9906e687f9..473984c658d39 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2353,9 +2353,14 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case TargetOpcode::STATEPOINT: // STATEPOINT is a pseudo instruction which has no implicit defs/uses // while bl call instruction (where statepoint will be lowered at the end) - // has implicit def. Add this implicit dead def here as a workaround. - MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true, - true, false, true)); + // has implicit def. This def is early-clobber as it will be set at + // the moment of the call and earlier than any use is read. + // Add this implicit dead def here as a workaround. 
+ MI.addOperand(*MI.getMF(), + MachineOperand::CreateReg( + AArch64::LR, /*isDef*/ true, + /*isImp*/ true, /*isKill*/ false, /*isDead*/ true, + /*isUndef*/ false, /*isEarlyClobber*/ true)); LLVM_FALLTHROUGH; case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering-lr.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering-lr.ll new file mode 100644 index 0000000000000..c07360810f4b2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering-lr.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple aarch64-none-linux-gnu -verify-machineinstrs -stop-after=prologepilog < %s | FileCheck %s + +; Check that STATEPOINT instruction has an early clobber implicit def for LR. +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define void @test() "frame-pointer"="all" gc "statepoint-example" { +entry: + %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, void ()* elementtype(void ()) @return_i1, i32 0, i32 0, i32 0, i32 0) ["gc-live" ()] +; CHECK: STATEPOINT 0, 0, 0, @return_i1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, csr_aarch64_aapcs, implicit-def $sp, implicit-def dead early-clobber $lr + ret void +} + + +declare void @return_i1() +declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, void ()*, i32, i32, ...) From 440c4b705ad1d494a183b53cd65f21a481726157 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 20 Feb 2022 21:11:21 -0800 Subject: [PATCH 392/748] [SelectionDAG][RISCV][ARM][PowerPC][X86][WebAssembly] Change default abs expansion to use sra (X, size(X)-1); sub (xor (X, Y), Y). Previous we used sra (X, size(X)-1); xor (add (X, Y), Y). By placing sub at the end, we allow RISCV to combine sign_extend_inreg with it to form subw. Some X86 tests for Z - abs(X) seem to have improved as well. Other targets look to be a wash. 
I had to modify ARM's abs matching code to match from sub instead of xor. Maybe instead ISD::ABS should be made legal. I'll try that in parallel to this patch. This is an alternative to D119099 which was focused on RISCV only. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D119171 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 10 +- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 29 +- llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll | 411 +++++++++--------- llvm/test/CodeGen/RISCV/rv32zbb.ll | 2 +- llvm/test/CodeGen/RISCV/rv64zbb.ll | 9 +- llvm/test/CodeGen/Thumb/iabs.ll | 12 +- .../CodeGen/Thumb/optionaldef-scheduling.ll | 55 ++- llvm/test/CodeGen/Thumb2/abs.ll | 42 +- llvm/test/CodeGen/WebAssembly/PR41149.ll | 4 +- llvm/test/CodeGen/X86/abs.ll | 50 +-- llvm/test/CodeGen/X86/combine-abs.ll | 4 +- llvm/test/CodeGen/X86/iabs.ll | 12 +- llvm/test/CodeGen/X86/neg-abs.ll | 37 +- llvm/test/CodeGen/X86/viabs.ll | 70 +-- .../Inputs/basic.ll.expected | 12 +- 15 files changed, 384 insertions(+), 375 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 0b69496d14f9f..6619f1c42a888 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7443,13 +7443,13 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, Op, DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, ShVT)); - if (!IsNegative) { - SDValue Add = DAG.getNode(ISD::ADD, dl, VT, Op, Shift); - return DAG.getNode(ISD::XOR, dl, VT, Add, Shift); - } + SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, Shift); + + // abs(x) -> Y = sra (X, size(X)-1); sub (xor (X, Y), Y) + if (!IsNegative) + return DAG.getNode(ISD::SUB, dl, VT, Xor, Shift); // 0 - abs(x) -> Y = sra (X, size(X)-1); sub (Y, xor (X, Y)) - SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, Shift); return DAG.getNode(ISD::SUB, dl, VT, Shift, Xor); } diff 
--git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 98c8133282a26..1735c0ddd11a5 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -3464,40 +3464,39 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { return false; } -/// Target-specific DAG combining for ISD::XOR. +/// Target-specific DAG combining for ISD::SUB. /// Target-independent combining lowers SELECT_CC nodes of the form /// select_cc setg[ge] X, 0, X, -X /// select_cc setgt X, -1, X, -X /// select_cc setl[te] X, 0, -X, X /// select_cc setlt X, 1, -X, X /// which represent Integer ABS into: -/// Y = sra (X, size(X)-1); xor (add (X, Y), Y) +/// Y = sra (X, size(X)-1); sub (xor (X, Y), Y) /// ARM instruction selection detects the latter and matches it to /// ARM::ABS or ARM::t2ABS machine node. bool ARMDAGToDAGISel::tryABSOp(SDNode *N){ - SDValue XORSrc0 = N->getOperand(0); - SDValue XORSrc1 = N->getOperand(1); + SDValue SUBSrc0 = N->getOperand(0); + SDValue SUBSrc1 = N->getOperand(1); EVT VT = N->getValueType(0); if (Subtarget->isThumb1Only()) return false; - if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA) + if (SUBSrc0.getOpcode() != ISD::XOR || SUBSrc1.getOpcode() != ISD::SRA) return false; - SDValue ADDSrc0 = XORSrc0.getOperand(0); - SDValue ADDSrc1 = XORSrc0.getOperand(1); - SDValue SRASrc0 = XORSrc1.getOperand(0); - SDValue SRASrc1 = XORSrc1.getOperand(1); + SDValue XORSrc0 = SUBSrc0.getOperand(0); + SDValue XORSrc1 = SUBSrc0.getOperand(1); + SDValue SRASrc0 = SUBSrc1.getOperand(0); + SDValue SRASrc1 = SUBSrc1.getOperand(1); ConstantSDNode *SRAConstant = dyn_cast(SRASrc1); EVT XType = SRASrc0.getValueType(); unsigned Size = XType.getSizeInBits() - 1; - if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 && - XType.isInteger() && SRAConstant != nullptr && - Size == SRAConstant->getZExtValue()) { + if (XORSrc1 == SUBSrc1 && XORSrc0 == SRASrc0 && XType.isInteger() && + 
SRAConstant != nullptr && Size == SRAConstant->getZExtValue()) { unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS; - CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0); + CurDAG->SelectNodeTo(N, Opcode, VT, XORSrc0); return true; } @@ -3673,8 +3672,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { if (tryInlineAsm(N)) return; break; - case ISD::XOR: - // Select special operations if XOR node forms integer ABS pattern + case ISD::SUB: + // Select special operations if SUB node forms integer ABS pattern if (tryABSOp(N)) return; // Other cases are autogenerated. diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll index 1dc8a7b99bc37..ec5e433b57cf7 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll @@ -99,10 +99,10 @@ define <2 x i64> @sub_absv_64(<2 x i64> %a, <2 x i64> %b) local_unnamed_addr { ; CHECK-PWR7-NEXT: sub r4, r5, r6 ; CHECK-PWR7-NEXT: sradi r5, r3, 63 ; CHECK-PWR7-NEXT: sradi r6, r4, 63 -; CHECK-PWR7-NEXT: add r3, r3, r5 -; CHECK-PWR7-NEXT: add r4, r4, r6 ; CHECK-PWR7-NEXT: xor r3, r3, r5 ; CHECK-PWR7-NEXT: xor r4, r4, r6 +; CHECK-PWR7-NEXT: sub r3, r3, r5 +; CHECK-PWR7-NEXT: sub r4, r4, r6 ; CHECK-PWR7-NEXT: std r3, -8(r1) ; CHECK-PWR7-NEXT: addi r3, r1, -16 ; CHECK-PWR7-NEXT: std r4, -16(r1) @@ -307,13 +307,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: sub r4, r7, r4 ; CHECK-PWR9-LE-NEXT: srawi r6, r3, 31 ; CHECK-PWR9-LE-NEXT: srawi r7, r4, 31 -; CHECK-PWR9-LE-NEXT: add r3, r3, r6 -; CHECK-PWR9-LE-NEXT: add r4, r4, r7 -; CHECK-PWR9-LE-NEXT: xor r6, r3, r6 -; CHECK-PWR9-LE-NEXT: srawi r3, r5, 31 +; CHECK-PWR9-LE-NEXT: xor r3, r3, r6 ; CHECK-PWR9-LE-NEXT: xor r4, r4, r7 -; CHECK-PWR9-LE-NEXT: add r5, r5, r3 -; CHECK-PWR9-LE-NEXT: xor r3, r5, r3 +; CHECK-PWR9-LE-NEXT: sub r6, r3, r6 +; CHECK-PWR9-LE-NEXT: srawi r3, r5, 31 +; CHECK-PWR9-LE-NEXT: sub r4, r4, r7 +; CHECK-PWR9-LE-NEXT: xor 
r5, r5, r3 +; CHECK-PWR9-LE-NEXT: sub r3, r5, r3 ; CHECK-PWR9-LE-NEXT: li r5, 3 ; CHECK-PWR9-LE-NEXT: vextubrx r7, r5, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r5, r5, v3 @@ -321,8 +321,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24 ; CHECK-PWR9-LE-NEXT: sub r5, r7, r5 ; CHECK-PWR9-LE-NEXT: srawi r7, r5, 31 -; CHECK-PWR9-LE-NEXT: add r5, r5, r7 ; CHECK-PWR9-LE-NEXT: xor r5, r5, r7 +; CHECK-PWR9-LE-NEXT: sub r5, r5, r7 ; CHECK-PWR9-LE-NEXT: li r7, 4 ; CHECK-PWR9-LE-NEXT: vextubrx r8, r7, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r7, r7, v3 @@ -331,8 +331,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24 ; CHECK-PWR9-LE-NEXT: sub r7, r8, r7 ; CHECK-PWR9-LE-NEXT: srawi r8, r7, 31 -; CHECK-PWR9-LE-NEXT: add r7, r7, r8 ; CHECK-PWR9-LE-NEXT: xor r7, r7, r8 +; CHECK-PWR9-LE-NEXT: sub r7, r7, r8 ; CHECK-PWR9-LE-NEXT: li r8, 5 ; CHECK-PWR9-LE-NEXT: vextubrx r9, r8, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r8, r8, v3 @@ -340,8 +340,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24 ; CHECK-PWR9-LE-NEXT: sub r8, r9, r8 ; CHECK-PWR9-LE-NEXT: srawi r9, r8, 31 -; CHECK-PWR9-LE-NEXT: add r8, r8, r9 ; CHECK-PWR9-LE-NEXT: xor r8, r8, r9 +; CHECK-PWR9-LE-NEXT: sub r8, r8, r9 ; CHECK-PWR9-LE-NEXT: li r9, 6 ; CHECK-PWR9-LE-NEXT: vextubrx r10, r9, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r9, r9, v3 @@ -349,8 +349,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r9, r9, 24 ; CHECK-PWR9-LE-NEXT: sub r9, r10, r9 ; CHECK-PWR9-LE-NEXT: srawi r10, r9, 31 -; CHECK-PWR9-LE-NEXT: add r9, r9, r10 ; CHECK-PWR9-LE-NEXT: xor r9, r9, r10 +; CHECK-PWR9-LE-NEXT: sub r9, r9, r10 ; CHECK-PWR9-LE-NEXT: li r10, 7 ; CHECK-PWR9-LE-NEXT: vextubrx r11, r10, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r10, r10, v3 @@ -358,8 +358,8 @@ define <16 x i8> 
@sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r10, r10, 24 ; CHECK-PWR9-LE-NEXT: sub r10, r11, r10 ; CHECK-PWR9-LE-NEXT: srawi r11, r10, 31 -; CHECK-PWR9-LE-NEXT: add r10, r10, r11 ; CHECK-PWR9-LE-NEXT: xor r10, r10, r11 +; CHECK-PWR9-LE-NEXT: sub r10, r10, r11 ; CHECK-PWR9-LE-NEXT: li r11, 8 ; CHECK-PWR9-LE-NEXT: vextubrx r12, r11, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r11, r11, v3 @@ -368,8 +368,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r11, r11, 24 ; CHECK-PWR9-LE-NEXT: sub r11, r12, r11 ; CHECK-PWR9-LE-NEXT: srawi r12, r11, 31 -; CHECK-PWR9-LE-NEXT: add r11, r11, r12 ; CHECK-PWR9-LE-NEXT: xor r11, r11, r12 +; CHECK-PWR9-LE-NEXT: sub r11, r11, r12 ; CHECK-PWR9-LE-NEXT: li r12, 9 ; CHECK-PWR9-LE-NEXT: vextubrx r0, r12, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r12, r12, v3 @@ -377,8 +377,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r12, r12, 24 ; CHECK-PWR9-LE-NEXT: sub r12, r0, r12 ; CHECK-PWR9-LE-NEXT: srawi r0, r12, 31 -; CHECK-PWR9-LE-NEXT: add r12, r12, r0 ; CHECK-PWR9-LE-NEXT: xor r12, r12, r0 +; CHECK-PWR9-LE-NEXT: sub r12, r12, r0 ; CHECK-PWR9-LE-NEXT: li r0, 10 ; CHECK-PWR9-LE-NEXT: vextubrx r30, r0, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r0, r0, v3 @@ -386,8 +386,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r0, r0, 24 ; CHECK-PWR9-LE-NEXT: sub r0, r30, r0 ; CHECK-PWR9-LE-NEXT: srawi r30, r0, 31 -; CHECK-PWR9-LE-NEXT: add r0, r0, r30 ; CHECK-PWR9-LE-NEXT: xor r0, r0, r30 +; CHECK-PWR9-LE-NEXT: sub r0, r0, r30 ; CHECK-PWR9-LE-NEXT: li r30, 11 ; CHECK-PWR9-LE-NEXT: vextubrx r29, r30, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r30, r30, v3 @@ -395,8 +395,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r30, r30, 24 ; CHECK-PWR9-LE-NEXT: sub r30, r29, r30 ; 
CHECK-PWR9-LE-NEXT: srawi r29, r30, 31 -; CHECK-PWR9-LE-NEXT: add r30, r30, r29 ; CHECK-PWR9-LE-NEXT: xor r30, r30, r29 +; CHECK-PWR9-LE-NEXT: sub r30, r30, r29 ; CHECK-PWR9-LE-NEXT: li r29, 12 ; CHECK-PWR9-LE-NEXT: vextubrx r28, r29, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r29, r29, v3 @@ -404,8 +404,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r29, r29, 24 ; CHECK-PWR9-LE-NEXT: sub r29, r28, r29 ; CHECK-PWR9-LE-NEXT: srawi r28, r29, 31 -; CHECK-PWR9-LE-NEXT: add r29, r29, r28 ; CHECK-PWR9-LE-NEXT: xor r29, r29, r28 +; CHECK-PWR9-LE-NEXT: sub r29, r29, r28 ; CHECK-PWR9-LE-NEXT: li r28, 13 ; CHECK-PWR9-LE-NEXT: vextubrx r27, r28, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r28, r28, v3 @@ -413,8 +413,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r28, r28, 24 ; CHECK-PWR9-LE-NEXT: sub r28, r27, r28 ; CHECK-PWR9-LE-NEXT: srawi r27, r28, 31 -; CHECK-PWR9-LE-NEXT: add r28, r28, r27 ; CHECK-PWR9-LE-NEXT: xor r28, r28, r27 +; CHECK-PWR9-LE-NEXT: sub r28, r28, r27 ; CHECK-PWR9-LE-NEXT: li r27, 14 ; CHECK-PWR9-LE-NEXT: vextubrx r26, r27, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r27, r27, v3 @@ -422,8 +422,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: clrlwi r27, r27, 24 ; CHECK-PWR9-LE-NEXT: sub r27, r26, r27 ; CHECK-PWR9-LE-NEXT: srawi r26, r27, 31 -; CHECK-PWR9-LE-NEXT: add r27, r27, r26 ; CHECK-PWR9-LE-NEXT: xor r27, r27, r26 +; CHECK-PWR9-LE-NEXT: sub r27, r27, r26 ; CHECK-PWR9-LE-NEXT: li r26, 15 ; CHECK-PWR9-LE-NEXT: vextubrx r25, r26, v2 ; CHECK-PWR9-LE-NEXT: vextubrx r26, r26, v3 @@ -441,10 +441,10 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-LE-NEXT: srawi r25, r26, 31 ; CHECK-PWR9-LE-NEXT: vmrghb v3, v4, v3 ; CHECK-PWR9-LE-NEXT: mtvsrd v4, r9 -; CHECK-PWR9-LE-NEXT: add r26, r26, r25 +; CHECK-PWR9-LE-NEXT: xor r26, r26, r25 ; 
CHECK-PWR9-LE-NEXT: vmrghb v4, v5, v4 ; CHECK-PWR9-LE-NEXT: mtvsrd v5, r30 -; CHECK-PWR9-LE-NEXT: xor r26, r26, r25 +; CHECK-PWR9-LE-NEXT: sub r26, r26, r25 ; CHECK-PWR9-LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload ; CHECK-PWR9-LE-NEXT: ld r25, -56(r1) # 8-byte Folded Reload ; CHECK-PWR9-LE-NEXT: mtvsrd v0, r26 @@ -499,13 +499,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: srawi r6, r3, 31 ; CHECK-PWR9-BE-NEXT: srawi r7, r4, 31 ; CHECK-PWR9-BE-NEXT: srawi r8, r5, 31 -; CHECK-PWR9-BE-NEXT: add r3, r3, r6 -; CHECK-PWR9-BE-NEXT: add r4, r4, r7 -; CHECK-PWR9-BE-NEXT: add r5, r5, r8 ; CHECK-PWR9-BE-NEXT: xor r3, r3, r6 -; CHECK-PWR9-BE-NEXT: li r6, 3 ; CHECK-PWR9-BE-NEXT: xor r4, r4, r7 ; CHECK-PWR9-BE-NEXT: xor r5, r5, r8 +; CHECK-PWR9-BE-NEXT: sub r3, r3, r6 +; CHECK-PWR9-BE-NEXT: li r6, 3 +; CHECK-PWR9-BE-NEXT: sub r4, r4, r7 +; CHECK-PWR9-BE-NEXT: sub r5, r5, r8 ; CHECK-PWR9-BE-NEXT: vextublx r7, r6, v2 ; CHECK-PWR9-BE-NEXT: vextublx r6, r6, v3 ; CHECK-PWR9-BE-NEXT: mtvsrwz v1, r3 @@ -513,8 +513,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r6, r6, 24 ; CHECK-PWR9-BE-NEXT: sub r6, r7, r6 ; CHECK-PWR9-BE-NEXT: srawi r7, r6, 31 -; CHECK-PWR9-BE-NEXT: add r6, r6, r7 ; CHECK-PWR9-BE-NEXT: xor r6, r6, r7 +; CHECK-PWR9-BE-NEXT: sub r6, r6, r7 ; CHECK-PWR9-BE-NEXT: li r7, 4 ; CHECK-PWR9-BE-NEXT: vextublx r8, r7, v2 ; CHECK-PWR9-BE-NEXT: vextublx r7, r7, v3 @@ -522,8 +522,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r7, r7, 24 ; CHECK-PWR9-BE-NEXT: sub r7, r8, r7 ; CHECK-PWR9-BE-NEXT: srawi r8, r7, 31 -; CHECK-PWR9-BE-NEXT: add r7, r7, r8 ; CHECK-PWR9-BE-NEXT: xor r7, r7, r8 +; CHECK-PWR9-BE-NEXT: sub r7, r7, r8 ; CHECK-PWR9-BE-NEXT: li r8, 5 ; CHECK-PWR9-BE-NEXT: vextublx r9, r8, v2 ; CHECK-PWR9-BE-NEXT: vextublx r8, r8, v3 @@ -531,8 +531,8 @@ define <16 x i8> 
@sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r8, r8, 24 ; CHECK-PWR9-BE-NEXT: sub r8, r9, r8 ; CHECK-PWR9-BE-NEXT: srawi r9, r8, 31 -; CHECK-PWR9-BE-NEXT: add r8, r8, r9 ; CHECK-PWR9-BE-NEXT: xor r8, r8, r9 +; CHECK-PWR9-BE-NEXT: sub r8, r8, r9 ; CHECK-PWR9-BE-NEXT: li r9, 6 ; CHECK-PWR9-BE-NEXT: vextublx r10, r9, v2 ; CHECK-PWR9-BE-NEXT: vextublx r9, r9, v3 @@ -540,8 +540,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r9, r9, 24 ; CHECK-PWR9-BE-NEXT: sub r9, r10, r9 ; CHECK-PWR9-BE-NEXT: srawi r10, r9, 31 -; CHECK-PWR9-BE-NEXT: add r9, r9, r10 ; CHECK-PWR9-BE-NEXT: xor r9, r9, r10 +; CHECK-PWR9-BE-NEXT: sub r9, r9, r10 ; CHECK-PWR9-BE-NEXT: li r10, 7 ; CHECK-PWR9-BE-NEXT: vextublx r11, r10, v2 ; CHECK-PWR9-BE-NEXT: vextublx r10, r10, v3 @@ -549,8 +549,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r10, r10, 24 ; CHECK-PWR9-BE-NEXT: sub r10, r11, r10 ; CHECK-PWR9-BE-NEXT: srawi r11, r10, 31 -; CHECK-PWR9-BE-NEXT: add r10, r10, r11 ; CHECK-PWR9-BE-NEXT: xor r10, r10, r11 +; CHECK-PWR9-BE-NEXT: sub r10, r10, r11 ; CHECK-PWR9-BE-NEXT: li r11, 8 ; CHECK-PWR9-BE-NEXT: vextublx r12, r11, v2 ; CHECK-PWR9-BE-NEXT: vextublx r11, r11, v3 @@ -558,8 +558,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r11, r11, 24 ; CHECK-PWR9-BE-NEXT: sub r11, r12, r11 ; CHECK-PWR9-BE-NEXT: srawi r12, r11, 31 -; CHECK-PWR9-BE-NEXT: add r11, r11, r12 ; CHECK-PWR9-BE-NEXT: xor r11, r11, r12 +; CHECK-PWR9-BE-NEXT: sub r11, r11, r12 ; CHECK-PWR9-BE-NEXT: li r12, 9 ; CHECK-PWR9-BE-NEXT: vextublx r0, r12, v2 ; CHECK-PWR9-BE-NEXT: vextublx r12, r12, v3 @@ -568,8 +568,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r12, r12, 24 ; CHECK-PWR9-BE-NEXT: sub r12, r0, r12 ; CHECK-PWR9-BE-NEXT: srawi 
r0, r12, 31 -; CHECK-PWR9-BE-NEXT: add r12, r12, r0 ; CHECK-PWR9-BE-NEXT: xor r12, r12, r0 +; CHECK-PWR9-BE-NEXT: sub r12, r12, r0 ; CHECK-PWR9-BE-NEXT: li r0, 10 ; CHECK-PWR9-BE-NEXT: vextublx r30, r0, v2 ; CHECK-PWR9-BE-NEXT: vextublx r0, r0, v3 @@ -577,8 +577,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r0, r0, 24 ; CHECK-PWR9-BE-NEXT: sub r0, r30, r0 ; CHECK-PWR9-BE-NEXT: srawi r30, r0, 31 -; CHECK-PWR9-BE-NEXT: add r0, r0, r30 ; CHECK-PWR9-BE-NEXT: xor r0, r0, r30 +; CHECK-PWR9-BE-NEXT: sub r0, r0, r30 ; CHECK-PWR9-BE-NEXT: li r30, 11 ; CHECK-PWR9-BE-NEXT: vextublx r29, r30, v2 ; CHECK-PWR9-BE-NEXT: vextublx r30, r30, v3 @@ -586,8 +586,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r30, r30, 24 ; CHECK-PWR9-BE-NEXT: sub r30, r29, r30 ; CHECK-PWR9-BE-NEXT: srawi r29, r30, 31 -; CHECK-PWR9-BE-NEXT: add r30, r30, r29 ; CHECK-PWR9-BE-NEXT: xor r30, r30, r29 +; CHECK-PWR9-BE-NEXT: sub r30, r30, r29 ; CHECK-PWR9-BE-NEXT: li r29, 12 ; CHECK-PWR9-BE-NEXT: vextublx r28, r29, v2 ; CHECK-PWR9-BE-NEXT: vextublx r29, r29, v3 @@ -595,8 +595,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r29, r29, 24 ; CHECK-PWR9-BE-NEXT: sub r29, r28, r29 ; CHECK-PWR9-BE-NEXT: srawi r28, r29, 31 -; CHECK-PWR9-BE-NEXT: add r29, r29, r28 ; CHECK-PWR9-BE-NEXT: xor r29, r29, r28 +; CHECK-PWR9-BE-NEXT: sub r29, r29, r28 ; CHECK-PWR9-BE-NEXT: li r28, 13 ; CHECK-PWR9-BE-NEXT: vextublx r27, r28, v2 ; CHECK-PWR9-BE-NEXT: vextublx r28, r28, v3 @@ -606,8 +606,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r28, r28, 24 ; CHECK-PWR9-BE-NEXT: sub r28, r27, r28 ; CHECK-PWR9-BE-NEXT: srawi r27, r28, 31 -; CHECK-PWR9-BE-NEXT: add r28, r28, r27 ; CHECK-PWR9-BE-NEXT: xor r28, r28, r27 +; CHECK-PWR9-BE-NEXT: sub r28, r28, r27 ; 
CHECK-PWR9-BE-NEXT: li r27, 14 ; CHECK-PWR9-BE-NEXT: vextublx r26, r27, v2 ; CHECK-PWR9-BE-NEXT: vextublx r27, r27, v3 @@ -615,8 +615,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: clrlwi r27, r27, 24 ; CHECK-PWR9-BE-NEXT: sub r27, r26, r27 ; CHECK-PWR9-BE-NEXT: srawi r26, r27, 31 -; CHECK-PWR9-BE-NEXT: add r27, r27, r26 ; CHECK-PWR9-BE-NEXT: xor r27, r27, r26 +; CHECK-PWR9-BE-NEXT: sub r27, r27, r26 ; CHECK-PWR9-BE-NEXT: li r26, 15 ; CHECK-PWR9-BE-NEXT: vextublx r25, r26, v2 ; CHECK-PWR9-BE-NEXT: vextublx r26, r26, v3 @@ -629,8 +629,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR9-BE-NEXT: ld r27, -40(r1) # 8-byte Folded Reload ; CHECK-PWR9-BE-NEXT: sub r26, r25, r26 ; CHECK-PWR9-BE-NEXT: srawi r25, r26, 31 -; CHECK-PWR9-BE-NEXT: add r26, r26, r25 ; CHECK-PWR9-BE-NEXT: xor r26, r26, r25 +; CHECK-PWR9-BE-NEXT: sub r26, r26, r25 ; CHECK-PWR9-BE-NEXT: ld r25, -56(r1) # 8-byte Folded Reload ; CHECK-PWR9-BE-NEXT: mtvsrwz v2, r26 ; CHECK-PWR9-BE-NEXT: ld r26, -48(r1) # 8-byte Folded Reload @@ -707,25 +707,25 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: srawi r12, r7, 31 ; CHECK-PWR8-NEXT: clrlwi r10, r0, 24 ; CHECK-PWR8-NEXT: clrlwi r0, r30, 24 -; CHECK-PWR8-NEXT: add r4, r4, r3 -; CHECK-PWR8-NEXT: add r7, r7, r12 +; CHECK-PWR8-NEXT: xor r4, r4, r3 +; CHECK-PWR8-NEXT: xor r7, r7, r12 ; CHECK-PWR8-NEXT: sub r10, r10, r0 ; CHECK-PWR8-NEXT: std r20, -96(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: std r21, -88(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: xor r3, r4, r3 +; CHECK-PWR8-NEXT: sub r3, r4, r3 ; CHECK-PWR8-NEXT: srawi r4, r9, 31 -; CHECK-PWR8-NEXT: xor r7, r7, r12 +; CHECK-PWR8-NEXT: sub r7, r7, r12 ; CHECK-PWR8-NEXT: std r22, -80(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: rldicl r29, r5, 24, 56 ; CHECK-PWR8-NEXT: rldicl r28, r6, 24, 56 -; CHECK-PWR8-NEXT: add r9, r9, r4 +; CHECK-PWR8-NEXT: xor 
r9, r9, r4 ; CHECK-PWR8-NEXT: mtvsrd v3, r7 ; CHECK-PWR8-NEXT: rldicl r27, r5, 16, 56 ; CHECK-PWR8-NEXT: rldicl r25, r6, 16, 56 ; CHECK-PWR8-NEXT: clrlwi r30, r29, 24 ; CHECK-PWR8-NEXT: clrlwi r29, r28, 24 ; CHECK-PWR8-NEXT: mtvsrd v2, r3 -; CHECK-PWR8-NEXT: xor r4, r9, r4 +; CHECK-PWR8-NEXT: sub r4, r9, r4 ; CHECK-PWR8-NEXT: srawi r7, r10, 31 ; CHECK-PWR8-NEXT: srawi r3, r11, 31 ; CHECK-PWR8-NEXT: clrlwi r9, r27, 24 @@ -733,15 +733,15 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: sub r0, r30, r29 ; CHECK-PWR8-NEXT: mtvsrd v4, r4 ; CHECK-PWR8-NEXT: std r23, -72(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: add r10, r10, r7 -; CHECK-PWR8-NEXT: add r11, r11, r3 +; CHECK-PWR8-NEXT: xor r10, r10, r7 +; CHECK-PWR8-NEXT: xor r11, r11, r3 ; CHECK-PWR8-NEXT: sub r9, r9, r12 ; CHECK-PWR8-NEXT: std r18, -112(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: std r19, -104(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: vmrghb v2, v3, v2 -; CHECK-PWR8-NEXT: xor r7, r10, r7 +; CHECK-PWR8-NEXT: sub r7, r10, r7 ; CHECK-PWR8-NEXT: rldicl r5, r5, 8, 56 -; CHECK-PWR8-NEXT: xor r3, r11, r3 +; CHECK-PWR8-NEXT: sub r3, r11, r3 ; CHECK-PWR8-NEXT: rldicl r6, r6, 8, 56 ; CHECK-PWR8-NEXT: srawi r4, r0, 31 ; CHECK-PWR8-NEXT: mtvsrd v0, r7 @@ -754,13 +754,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: clrlwi r5, r6, 24 ; CHECK-PWR8-NEXT: clrldi r22, r24, 56 ; CHECK-PWR8-NEXT: rldicl r21, r26, 56, 56 -; CHECK-PWR8-NEXT: add r10, r0, r4 -; CHECK-PWR8-NEXT: add r9, r9, r7 +; CHECK-PWR8-NEXT: xor r10, r0, r4 +; CHECK-PWR8-NEXT: xor r9, r9, r7 ; CHECK-PWR8-NEXT: rldicl r20, r24, 56, 56 ; CHECK-PWR8-NEXT: rldicl r19, r26, 48, 56 ; CHECK-PWR8-NEXT: sub r3, r3, r5 -; CHECK-PWR8-NEXT: xor r4, r10, r4 -; CHECK-PWR8-NEXT: xor r7, r9, r7 +; CHECK-PWR8-NEXT: sub r4, r10, r4 +; CHECK-PWR8-NEXT: sub r7, r9, r7 ; CHECK-PWR8-NEXT: clrlwi r9, r23, 24 ; CHECK-PWR8-NEXT: rldicl r18, r24, 48, 56 ; 
CHECK-PWR8-NEXT: clrlwi r10, r22, 24 @@ -779,7 +779,7 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: clrlwi r12, r18, 24 ; CHECK-PWR8-NEXT: vmrghb v4, v5, v4 ; CHECK-PWR8-NEXT: std r31, -8(r1) # 8-byte Folded Spill -; CHECK-PWR8-NEXT: add r3, r3, r4 +; CHECK-PWR8-NEXT: xor r3, r3, r4 ; CHECK-PWR8-NEXT: sub r7, r11, r12 ; CHECK-PWR8-NEXT: clrlwi r11, r17, 24 ; CHECK-PWR8-NEXT: clrlwi r12, r16, 24 @@ -787,7 +787,7 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: std r2, -152(r1) # 8-byte Folded Spill ; CHECK-PWR8-NEXT: rldicl r15, r26, 32, 56 ; CHECK-PWR8-NEXT: rldicl r14, r24, 32, 56 -; CHECK-PWR8-NEXT: xor r3, r3, r4 +; CHECK-PWR8-NEXT: sub r3, r3, r4 ; CHECK-PWR8-NEXT: sub r11, r11, r12 ; CHECK-PWR8-NEXT: srawi r4, r9, 31 ; CHECK-PWR8-NEXT: srawi r12, r10, 31 @@ -795,40 +795,40 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: clrlwi r30, r14, 24 ; CHECK-PWR8-NEXT: mtvsrd v5, r3 ; CHECK-PWR8-NEXT: ld r27, -40(r1) # 8-byte Folded Reload -; CHECK-PWR8-NEXT: add r9, r9, r4 -; CHECK-PWR8-NEXT: add r10, r10, r12 +; CHECK-PWR8-NEXT: xor r9, r9, r4 +; CHECK-PWR8-NEXT: xor r10, r10, r12 ; CHECK-PWR8-NEXT: sub r3, r0, r30 ; CHECK-PWR8-NEXT: ld r25, -56(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: ld r23, -72(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: ld r22, -80(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: srawi r28, r11, 31 -; CHECK-PWR8-NEXT: xor r4, r9, r4 -; CHECK-PWR8-NEXT: xor r10, r10, r12 +; CHECK-PWR8-NEXT: sub r4, r9, r4 +; CHECK-PWR8-NEXT: sub r10, r10, r12 ; CHECK-PWR8-NEXT: vmrghb v3, v5, v3 ; CHECK-PWR8-NEXT: ld r21, -88(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: ld r20, -96(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: srawi r29, r7, 31 ; CHECK-PWR8-NEXT: srawi r9, r3, 31 ; CHECK-PWR8-NEXT: mtvsrd v5, r4 -; CHECK-PWR8-NEXT: add r4, r11, r28 +; CHECK-PWR8-NEXT: xor r4, r11, r28 ; 
CHECK-PWR8-NEXT: ld r19, -104(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: ld r18, -112(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: mtvsrd v1, r10 ; CHECK-PWR8-NEXT: ld r10, -160(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: rldicl r31, r26, 24, 56 ; CHECK-PWR8-NEXT: rldicl r2, r24, 24, 56 -; CHECK-PWR8-NEXT: add r7, r7, r29 -; CHECK-PWR8-NEXT: add r3, r3, r9 +; CHECK-PWR8-NEXT: xor r7, r7, r29 +; CHECK-PWR8-NEXT: xor r3, r3, r9 ; CHECK-PWR8-NEXT: rldicl r8, r24, 16, 56 ; CHECK-PWR8-NEXT: rldicl r6, r26, 8, 56 -; CHECK-PWR8-NEXT: xor r4, r4, r28 +; CHECK-PWR8-NEXT: sub r4, r4, r28 ; CHECK-PWR8-NEXT: clrlwi r0, r31, 24 ; CHECK-PWR8-NEXT: clrlwi r30, r2, 24 -; CHECK-PWR8-NEXT: xor r7, r7, r29 +; CHECK-PWR8-NEXT: sub r7, r7, r29 ; CHECK-PWR8-NEXT: rldicl r5, r24, 8, 56 ; CHECK-PWR8-NEXT: clrlwi r10, r10, 24 ; CHECK-PWR8-NEXT: clrlwi r8, r8, 24 -; CHECK-PWR8-NEXT: xor r3, r3, r9 +; CHECK-PWR8-NEXT: sub r3, r3, r9 ; CHECK-PWR8-NEXT: mtvsrd v7, r4 ; CHECK-PWR8-NEXT: clrlwi r4, r6, 24 ; CHECK-PWR8-NEXT: clrlwi r5, r5, 24 @@ -845,18 +845,18 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR8-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: srawi r6, r7, 31 ; CHECK-PWR8-NEXT: srawi r5, r3, 31 -; CHECK-PWR8-NEXT: add r8, r0, r12 +; CHECK-PWR8-NEXT: xor r8, r0, r12 ; CHECK-PWR8-NEXT: vmrghb v5, v1, v5 ; CHECK-PWR8-NEXT: ld r26, -48(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: ld r24, -64(r1) # 8-byte Folded Reload -; CHECK-PWR8-NEXT: add r4, r7, r6 -; CHECK-PWR8-NEXT: add r3, r3, r5 -; CHECK-PWR8-NEXT: xor r8, r8, r12 +; CHECK-PWR8-NEXT: xor r4, r7, r6 +; CHECK-PWR8-NEXT: xor r3, r3, r5 +; CHECK-PWR8-NEXT: sub r8, r8, r12 ; CHECK-PWR8-NEXT: vmrghb v6, v7, v6 ; CHECK-PWR8-NEXT: ld r17, -120(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: ld r16, -128(r1) # 8-byte Folded Reload -; CHECK-PWR8-NEXT: xor r4, r4, r6 -; CHECK-PWR8-NEXT: xor r3, r3, r5 +; CHECK-PWR8-NEXT: sub r4, r4, r6 +; CHECK-PWR8-NEXT: sub r3, r3, r5 ; 
CHECK-PWR8-NEXT: mtvsrd v9, r8 ; CHECK-PWR8-NEXT: ld r15, -136(r1) # 8-byte Folded Reload ; CHECK-PWR8-NEXT: ld r14, -144(r1) # 8-byte Folded Reload @@ -875,15 +875,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; ; CHECK-PWR7-LABEL: sub_absv_8_ext: ; CHECK-PWR7: # %bb.0: # %entry -; CHECK-PWR7-NEXT: stdu r1, -464(r1) -; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 464 -; CHECK-PWR7-NEXT: .cfi_offset r16, -128 -; CHECK-PWR7-NEXT: .cfi_offset r17, -120 -; CHECK-PWR7-NEXT: .cfi_offset r18, -112 -; CHECK-PWR7-NEXT: .cfi_offset r19, -104 -; CHECK-PWR7-NEXT: .cfi_offset r20, -96 -; CHECK-PWR7-NEXT: .cfi_offset r21, -88 -; CHECK-PWR7-NEXT: .cfi_offset r22, -80 +; CHECK-PWR7-NEXT: stdu r1, -416(r1) +; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 416 ; CHECK-PWR7-NEXT: .cfi_offset r23, -72 ; CHECK-PWR7-NEXT: .cfi_offset r24, -64 ; CHECK-PWR7-NEXT: .cfi_offset r25, -56 @@ -893,167 +886,156 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR7-NEXT: .cfi_offset r29, -24 ; CHECK-PWR7-NEXT: .cfi_offset r30, -16 ; CHECK-PWR7-NEXT: addi r3, r1, 304 -; CHECK-PWR7-NEXT: std r16, 336(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r23, 344(r1) # 8-byte Folded Spill ; CHECK-PWR7-NEXT: addi r4, r1, 320 -; CHECK-PWR7-NEXT: std r17, 344(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r18, 352(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r19, 360(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r20, 368(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r21, 376(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r22, 384(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r23, 392(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r24, 400(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r25, 408(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r26, 416(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r27, 424(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r28, 432(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: 
std r29, 440(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r30, 448(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r24, 352(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r25, 360(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r26, 368(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r27, 376(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r28, 384(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r29, 392(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r30, 400(r1) # 8-byte Folded Spill ; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3 ; CHECK-PWR7-NEXT: lbz r3, 304(r1) ; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4 -; CHECK-PWR7-NEXT: lbz r4, 320(r1) -; CHECK-PWR7-NEXT: lbz r5, 305(r1) -; CHECK-PWR7-NEXT: lbz r6, 321(r1) -; CHECK-PWR7-NEXT: lbz r7, 306(r1) -; CHECK-PWR7-NEXT: lbz r8, 322(r1) ; CHECK-PWR7-NEXT: lbz r9, 307(r1) -; CHECK-PWR7-NEXT: sub r3, r3, r4 ; CHECK-PWR7-NEXT: lbz r10, 323(r1) ; CHECK-PWR7-NEXT: lbz r11, 308(r1) -; CHECK-PWR7-NEXT: sub r5, r5, r6 ; CHECK-PWR7-NEXT: lbz r12, 324(r1) ; CHECK-PWR7-NEXT: lbz r0, 309(r1) -; CHECK-PWR7-NEXT: sub r6, r7, r8 ; CHECK-PWR7-NEXT: lbz r30, 325(r1) -; CHECK-PWR7-NEXT: lbz r29, 310(r1) ; CHECK-PWR7-NEXT: sub r9, r9, r10 +; CHECK-PWR7-NEXT: lbz r29, 310(r1) ; CHECK-PWR7-NEXT: lbz r28, 326(r1) -; CHECK-PWR7-NEXT: lbz r23, 313(r1) -; CHECK-PWR7-NEXT: sub r10, r11, r12 -; CHECK-PWR7-NEXT: lbz r22, 329(r1) -; CHECK-PWR7-NEXT: lbz r4, 314(r1) -; CHECK-PWR7-NEXT: sub r0, r0, r30 -; CHECK-PWR7-NEXT: lbz r21, 330(r1) -; CHECK-PWR7-NEXT: lbz r7, 315(r1) -; CHECK-PWR7-NEXT: sub r30, r29, r28 -; CHECK-PWR7-NEXT: srawi r20, r0, 31 -; CHECK-PWR7-NEXT: lbz r8, 331(r1) -; CHECK-PWR7-NEXT: lbz r11, 316(r1) -; CHECK-PWR7-NEXT: sub r23, r23, r22 -; CHECK-PWR7-NEXT: srawi r19, r30, 31 -; CHECK-PWR7-NEXT: lbz r12, 332(r1) -; CHECK-PWR7-NEXT: lbz r29, 317(r1) -; CHECK-PWR7-NEXT: sub r4, r4, r21 -; CHECK-PWR7-NEXT: add r0, r0, r20 -; CHECK-PWR7-NEXT: lbz r28, 333(r1) -; CHECK-PWR7-NEXT: lbz r22, 319(r1) -; CHECK-PWR7-NEXT: sub r7, r7, 
r8 -; CHECK-PWR7-NEXT: add r30, r30, r19 -; CHECK-PWR7-NEXT: lbz r21, 335(r1) +; CHECK-PWR7-NEXT: sub r11, r11, r12 ; CHECK-PWR7-NEXT: lbz r27, 311(r1) -; CHECK-PWR7-NEXT: sub r8, r11, r12 -; CHECK-PWR7-NEXT: xor r0, r0, r20 ; CHECK-PWR7-NEXT: lbz r26, 327(r1) +; CHECK-PWR7-NEXT: sub r0, r0, r30 ; CHECK-PWR7-NEXT: lbz r25, 312(r1) -; CHECK-PWR7-NEXT: sub r11, r29, r28 -; CHECK-PWR7-NEXT: srawi r28, r3, 31 ; CHECK-PWR7-NEXT: lbz r24, 328(r1) -; CHECK-PWR7-NEXT: sub r29, r22, r21 -; CHECK-PWR7-NEXT: add r3, r3, r28 -; CHECK-PWR7-NEXT: xor r30, r30, r19 +; CHECK-PWR7-NEXT: sub r29, r29, r28 +; CHECK-PWR7-NEXT: lbz r10, 315(r1) +; CHECK-PWR7-NEXT: lbz r12, 331(r1) ; CHECK-PWR7-NEXT: sub r27, r27, r26 -; CHECK-PWR7-NEXT: srawi r17, r29, 31 +; CHECK-PWR7-NEXT: lbz r30, 316(r1) +; CHECK-PWR7-NEXT: lbz r28, 332(r1) +; CHECK-PWR7-NEXT: sub r25, r25, r24 +; CHECK-PWR7-NEXT: lbz r4, 320(r1) +; CHECK-PWR7-NEXT: lbz r5, 305(r1) +; CHECK-PWR7-NEXT: sub r10, r10, r12 +; CHECK-PWR7-NEXT: lbz r6, 321(r1) +; CHECK-PWR7-NEXT: lbz r26, 317(r1) +; CHECK-PWR7-NEXT: sub r30, r30, r28 +; CHECK-PWR7-NEXT: lbz r24, 333(r1) +; CHECK-PWR7-NEXT: lbz r12, 319(r1) +; CHECK-PWR7-NEXT: sub r3, r3, r4 +; CHECK-PWR7-NEXT: lbz r28, 335(r1) +; CHECK-PWR7-NEXT: lbz r7, 306(r1) +; CHECK-PWR7-NEXT: sub r5, r5, r6 +; CHECK-PWR7-NEXT: lbz r8, 322(r1) +; CHECK-PWR7-NEXT: sub r26, r26, r24 +; CHECK-PWR7-NEXT: srawi r24, r5, 31 +; CHECK-PWR7-NEXT: lbz r23, 313(r1) +; CHECK-PWR7-NEXT: sub r12, r12, r28 +; CHECK-PWR7-NEXT: srawi r28, r3, 31 +; CHECK-PWR7-NEXT: xor r5, r5, r24 +; CHECK-PWR7-NEXT: lbz r4, 329(r1) +; CHECK-PWR7-NEXT: sub r7, r7, r8 ; CHECK-PWR7-NEXT: xor r3, r3, r28 -; CHECK-PWR7-NEXT: ld r20, 368(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sub r26, r25, r24 -; CHECK-PWR7-NEXT: lbz r25, 318(r1) -; CHECK-PWR7-NEXT: lbz r24, 334(r1) -; CHECK-PWR7-NEXT: add r29, r29, r17 -; CHECK-PWR7-NEXT: xor r29, r29, r17 -; CHECK-PWR7-NEXT: srawi r18, r27, 31 -; CHECK-PWR7-NEXT: ld r19, 360(r1) # 8-byte Folded 
Reload -; CHECK-PWR7-NEXT: sub r12, r25, r24 -; CHECK-PWR7-NEXT: stb r29, 288(r1) -; CHECK-PWR7-NEXT: add r28, r27, r18 -; CHECK-PWR7-NEXT: srawi r29, r12, 31 -; CHECK-PWR7-NEXT: srawi r16, r26, 31 -; CHECK-PWR7-NEXT: xor r28, r28, r18 -; CHECK-PWR7-NEXT: ld r18, 352(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: add r12, r12, r29 -; CHECK-PWR7-NEXT: add r27, r26, r16 -; CHECK-PWR7-NEXT: xor r12, r12, r29 -; CHECK-PWR7-NEXT: srawi r29, r7, 31 -; CHECK-PWR7-NEXT: xor r27, r27, r16 -; CHECK-PWR7-NEXT: ld r16, 336(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: srawi r26, r8, 31 -; CHECK-PWR7-NEXT: srawi r25, r5, 31 -; CHECK-PWR7-NEXT: add r7, r7, r29 -; CHECK-PWR7-NEXT: add r8, r8, r26 -; CHECK-PWR7-NEXT: srawi r24, r6, 31 -; CHECK-PWR7-NEXT: add r5, r5, r25 -; CHECK-PWR7-NEXT: xor r7, r7, r29 -; CHECK-PWR7-NEXT: srawi r22, r9, 31 -; CHECK-PWR7-NEXT: srawi r21, r10, 31 -; CHECK-PWR7-NEXT: xor r8, r8, r26 -; CHECK-PWR7-NEXT: xor r5, r5, r25 -; CHECK-PWR7-NEXT: srawi r17, r11, 31 -; CHECK-PWR7-NEXT: srawi r26, r23, 31 -; CHECK-PWR7-NEXT: add r6, r6, r24 -; CHECK-PWR7-NEXT: add r9, r9, r22 -; CHECK-PWR7-NEXT: srawi r29, r4, 31 -; CHECK-PWR7-NEXT: add r10, r10, r21 -; CHECK-PWR7-NEXT: add r11, r11, r17 -; CHECK-PWR7-NEXT: add r25, r23, r26 -; CHECK-PWR7-NEXT: add r4, r4, r29 -; CHECK-PWR7-NEXT: xor r6, r6, r24 -; CHECK-PWR7-NEXT: xor r9, r9, r22 -; CHECK-PWR7-NEXT: xor r10, r10, r21 -; CHECK-PWR7-NEXT: xor r11, r11, r17 -; CHECK-PWR7-NEXT: xor r4, r4, r29 -; CHECK-PWR7-NEXT: xor r26, r25, r26 -; CHECK-PWR7-NEXT: addi r29, r1, 224 -; CHECK-PWR7-NEXT: stb r12, 272(r1) +; CHECK-PWR7-NEXT: lbz r6, 314(r1) +; CHECK-PWR7-NEXT: lbz r8, 330(r1) +; CHECK-PWR7-NEXT: sub r3, r3, r28 +; CHECK-PWR7-NEXT: srawi r28, r7, 31 +; CHECK-PWR7-NEXT: sub r5, r5, r24 +; CHECK-PWR7-NEXT: srawi r24, r9, 31 +; CHECK-PWR7-NEXT: xor r7, r7, r28 +; CHECK-PWR7-NEXT: xor r9, r9, r24 +; CHECK-PWR7-NEXT: sub r7, r7, r28 +; CHECK-PWR7-NEXT: srawi r28, r11, 31 +; CHECK-PWR7-NEXT: sub r9, r9, r24 +; 
CHECK-PWR7-NEXT: srawi r24, r0, 31 +; CHECK-PWR7-NEXT: xor r11, r11, r28 +; CHECK-PWR7-NEXT: xor r0, r0, r24 +; CHECK-PWR7-NEXT: sub r11, r11, r28 +; CHECK-PWR7-NEXT: srawi r28, r29, 31 +; CHECK-PWR7-NEXT: sub r0, r0, r24 +; CHECK-PWR7-NEXT: srawi r24, r27, 31 +; CHECK-PWR7-NEXT: sub r4, r23, r4 +; CHECK-PWR7-NEXT: xor r29, r29, r28 +; CHECK-PWR7-NEXT: lbz r23, 318(r1) +; CHECK-PWR7-NEXT: xor r27, r27, r24 +; CHECK-PWR7-NEXT: sub r29, r29, r28 +; CHECK-PWR7-NEXT: srawi r28, r25, 31 +; CHECK-PWR7-NEXT: sub r27, r27, r24 +; CHECK-PWR7-NEXT: srawi r24, r4, 31 +; CHECK-PWR7-NEXT: sub r6, r6, r8 +; CHECK-PWR7-NEXT: xor r25, r25, r28 +; CHECK-PWR7-NEXT: lbz r8, 334(r1) +; CHECK-PWR7-NEXT: xor r4, r4, r24 +; CHECK-PWR7-NEXT: sub r28, r25, r28 +; CHECK-PWR7-NEXT: srawi r25, r6, 31 +; CHECK-PWR7-NEXT: sub r4, r4, r24 +; CHECK-PWR7-NEXT: srawi r24, r10, 31 +; CHECK-PWR7-NEXT: xor r6, r6, r25 +; CHECK-PWR7-NEXT: xor r10, r10, r24 +; CHECK-PWR7-NEXT: sub r6, r6, r25 +; CHECK-PWR7-NEXT: srawi r25, r30, 31 +; CHECK-PWR7-NEXT: sub r10, r10, r24 +; CHECK-PWR7-NEXT: srawi r24, r26, 31 +; CHECK-PWR7-NEXT: sub r8, r23, r8 +; CHECK-PWR7-NEXT: xor r30, r30, r25 +; CHECK-PWR7-NEXT: ld r23, 344(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: xor r26, r26, r24 +; CHECK-PWR7-NEXT: sub r30, r30, r25 +; CHECK-PWR7-NEXT: srawi r25, r12, 31 +; CHECK-PWR7-NEXT: sub r26, r26, r24 +; CHECK-PWR7-NEXT: srawi r24, r8, 31 +; CHECK-PWR7-NEXT: xor r12, r12, r25 +; CHECK-PWR7-NEXT: xor r8, r8, r24 +; CHECK-PWR7-NEXT: sub r12, r12, r25 +; CHECK-PWR7-NEXT: addi r25, r1, 272 +; CHECK-PWR7-NEXT: sub r8, r8, r24 +; CHECK-PWR7-NEXT: stb r12, 288(r1) ; CHECK-PWR7-NEXT: addi r12, r1, 288 -; CHECK-PWR7-NEXT: addi r25, r1, 208 -; CHECK-PWR7-NEXT: stb r11, 256(r1) -; CHECK-PWR7-NEXT: addi r11, r1, 272 -; CHECK-PWR7-NEXT: ld r24, 400(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: stb r8, 240(r1) -; CHECK-PWR7-NEXT: stb r7, 224(r1) -; CHECK-PWR7-NEXT: stb r4, 208(r1) -; CHECK-PWR7-NEXT: stb r26, 192(r1) -; 
CHECK-PWR7-NEXT: stb r27, 176(r1) -; CHECK-PWR7-NEXT: stb r28, 160(r1) -; CHECK-PWR7-NEXT: stb r30, 144(r1) +; CHECK-PWR7-NEXT: stb r8, 272(r1) +; CHECK-PWR7-NEXT: stb r26, 256(r1) +; CHECK-PWR7-NEXT: stb r30, 240(r1) +; CHECK-PWR7-NEXT: stb r10, 224(r1) +; CHECK-PWR7-NEXT: stb r6, 208(r1) +; CHECK-PWR7-NEXT: stb r4, 192(r1) +; CHECK-PWR7-NEXT: stb r28, 176(r1) +; CHECK-PWR7-NEXT: stb r27, 160(r1) +; CHECK-PWR7-NEXT: stb r29, 144(r1) ; CHECK-PWR7-NEXT: stb r0, 128(r1) -; CHECK-PWR7-NEXT: stb r10, 112(r1) +; CHECK-PWR7-NEXT: stb r11, 112(r1) ; CHECK-PWR7-NEXT: stb r9, 96(r1) -; CHECK-PWR7-NEXT: stb r6, 80(r1) +; CHECK-PWR7-NEXT: stb r7, 80(r1) ; CHECK-PWR7-NEXT: stb r5, 64(r1) ; CHECK-PWR7-NEXT: stb r3, 48(r1) ; CHECK-PWR7-NEXT: addi r8, r1, 256 -; CHECK-PWR7-NEXT: addi r7, r1, 240 +; CHECK-PWR7-NEXT: addi r26, r1, 240 ; CHECK-PWR7-NEXT: lxvw4x v2, 0, r12 -; CHECK-PWR7-NEXT: lxvw4x v3, 0, r11 +; CHECK-PWR7-NEXT: lxvw4x v3, 0, r25 +; CHECK-PWR7-NEXT: addi r10, r1, 224 +; CHECK-PWR7-NEXT: addi r30, r1, 208 ; CHECK-PWR7-NEXT: addi r3, r1, 192 ; CHECK-PWR7-NEXT: addi r4, r1, 176 ; CHECK-PWR7-NEXT: addi r5, r1, 160 ; CHECK-PWR7-NEXT: addi r6, r1, 144 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r8 -; CHECK-PWR7-NEXT: lxvw4x v5, 0, r7 -; CHECK-PWR7-NEXT: lxvw4x v0, 0, r29 -; CHECK-PWR7-NEXT: lxvw4x v1, 0, r25 +; CHECK-PWR7-NEXT: lxvw4x v5, 0, r26 ; CHECK-PWR7-NEXT: addi r7, r1, 128 ; CHECK-PWR7-NEXT: addi r8, r1, 112 -; CHECK-PWR7-NEXT: lxvw4x v6, 0, r3 -; CHECK-PWR7-NEXT: lxvw4x v7, 0, r4 +; CHECK-PWR7-NEXT: lxvw4x v0, 0, r10 +; CHECK-PWR7-NEXT: lxvw4x v1, 0, r30 ; CHECK-PWR7-NEXT: vmrghb v2, v3, v2 ; CHECK-PWR7-NEXT: addi r9, r1, 96 -; CHECK-PWR7-NEXT: lxvw4x v3, 0, r5 -; CHECK-PWR7-NEXT: lxvw4x v8, 0, r6 +; CHECK-PWR7-NEXT: lxvw4x v6, 0, r3 +; CHECK-PWR7-NEXT: lxvw4x v7, 0, r4 ; CHECK-PWR7-NEXT: addi r3, r1, 80 ; CHECK-PWR7-NEXT: addi r4, r1, 64 +; CHECK-PWR7-NEXT: lxvw4x v3, 0, r5 +; CHECK-PWR7-NEXT: lxvw4x v8, 0, r6 ; CHECK-PWR7-NEXT: addi r5, r1, 48 ; CHECK-PWR7-NEXT: vmrghb 
v4, v5, v4 ; CHECK-PWR7-NEXT: lxvw4x v5, 0, r7 @@ -1063,29 +1045,26 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR7-NEXT: lxvw4x v10, 0, r3 ; CHECK-PWR7-NEXT: vmrghb v6, v7, v6 ; CHECK-PWR7-NEXT: lxvw4x v7, 0, r4 -; CHECK-PWR7-NEXT: ld r30, 448(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: vmrghb v3, v8, v3 ; CHECK-PWR7-NEXT: lxvw4x v8, 0, r5 -; CHECK-PWR7-NEXT: ld r29, 440(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: vmrghb v5, v9, v5 -; CHECK-PWR7-NEXT: ld r28, 432(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r27, 424(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r30, 400(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r29, 392(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: vmrghb v1, v10, v1 -; CHECK-PWR7-NEXT: ld r26, 416(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r25, 408(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r28, 384(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r27, 376(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: vmrghb v7, v8, v7 -; CHECK-PWR7-NEXT: ld r23, 392(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r22, 384(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r26, 368(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r25, 360(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: vmrghh v2, v4, v2 -; CHECK-PWR7-NEXT: ld r21, 376(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r17, 344(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r24, 352(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: vmrghh v4, v6, v0 ; CHECK-PWR7-NEXT: vmrghh v3, v5, v3 ; CHECK-PWR7-NEXT: vmrghh v5, v7, v1 ; CHECK-PWR7-NEXT: vmrghw v2, v4, v2 ; CHECK-PWR7-NEXT: vmrghw v3, v5, v3 ; CHECK-PWR7-NEXT: xxmrghd v2, v3, v2 -; CHECK-PWR7-NEXT: addi r1, r1, 464 +; CHECK-PWR7-NEXT: addi r1, r1, 416 ; CHECK-PWR7-NEXT: blr entry: %vecext = extractelement <16 x i8> %a, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 64c9e35146f63..29e481198246c 100644 --- 
a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -734,8 +734,8 @@ define i32 @abs_i32(i32 %x) { ; RV32I-LABEL: abs_i32: ; RV32I: # %bb.0: ; RV32I-NEXT: srai a1, a0, 31 -; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: xor a0, a0, a1 +; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: abs_i32: diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index a5e3061f50953..0127ac4d33a5b 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -947,8 +947,8 @@ define i32 @abs_i32(i32 %x) { ; RV64I: # %bb.0: ; RV64I-NEXT: sext.w a0, a0 ; RV64I-NEXT: srai a1, a0, 63 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: abs_i32: @@ -961,14 +961,13 @@ define i32 @abs_i32(i32 %x) { ret i32 %abs } -; FIXME: We can remove the sext.w by using addw for RV64I and negw for RV64ZBB. +; FIXME: We can remove the sext.w on RV64ZBB by using negw. 
define signext i32 @abs_i32_sext(i32 signext %x) { ; RV64I-LABEL: abs_i32_sext: ; RV64I: # %bb.0: ; RV64I-NEXT: srai a1, a0, 63 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: abs_i32_sext: @@ -987,8 +986,8 @@ define i64 @abs_i64(i64 %x) { ; RV64I-LABEL: abs_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: srai a1, a0, 63 -; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: abs_i64: diff --git a/llvm/test/CodeGen/Thumb/iabs.ll b/llvm/test/CodeGen/Thumb/iabs.ll index 2d51288b5242a..6bebea67e265a 100644 --- a/llvm/test/CodeGen/Thumb/iabs.ll +++ b/llvm/test/CodeGen/Thumb/iabs.ll @@ -6,8 +6,8 @@ define i8 @test_i8(i8 %a) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: sxtb r1, r0 ; CHECK-NEXT: asrs r1, r1, #7 -; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: subs r0, r0, r1 ; CHECK-NEXT: bx lr %tmp1neg = sub i8 0, %a %b = icmp sgt i8 %a, -1 @@ -20,8 +20,8 @@ define i16 @test_i16(i16 %a) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: sxth r1, r0 ; CHECK-NEXT: asrs r1, r1, #15 -; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: subs r0, r0, r1 ; CHECK-NEXT: bx lr %tmp1neg = sub i16 0, %a %b = icmp sgt i16 %a, -1 @@ -33,8 +33,8 @@ define i32 @test_i32(i32 %a) nounwind { ; CHECK-LABEL: test_i32: ; CHECK: @ %bb.0: ; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: subs r0, r0, r1 ; CHECK-NEXT: bx lr %tmp1neg = sub i32 0, %a %b = icmp sgt i32 %a, -1 @@ -46,10 +46,10 @@ define i64 @test_i64(i64 %a) nounwind { ; CHECK-LABEL: test_i64: ; CHECK: @ %bb.0: ; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: eors r0, r2 ; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbcs r1, r2 ; CHECK-NEXT: bx lr %tmp1neg = sub i64 0, %a %b = 
icmp sgt i64 %a, -1 diff --git a/llvm/test/CodeGen/Thumb/optionaldef-scheduling.ll b/llvm/test/CodeGen/Thumb/optionaldef-scheduling.ll index bd091cf2b6f84..152ac0fa3f168 100644 --- a/llvm/test/CodeGen/Thumb/optionaldef-scheduling.ll +++ b/llvm/test/CodeGen/Thumb/optionaldef-scheduling.ll @@ -1,7 +1,51 @@ -; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | FileCheck %s -; RUN: llc -mtriple=thumbv6-eabi %s -verify-machineinstrs -o - | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | FileCheck %s --check-prefix=THUMB +; RUN: llc -mtriple=thumbv6-eabi %s -verify-machineinstrs -o - | FileCheck %s --check-prefix=THUMBV6 + +; The scheduler used to ignore OptionalDefs, and could unwittingly insert +; a flag-setting instruction in between an ADDS and the corresponding ADC. + +; FIXME: The ABS lowering changed to XOR followed by SUB so this may no longer +; be testing what it used to. 
define i1 @test(i64 %arg) { +; THUMB-LABEL: test: +; THUMB: @ %bb.0: @ %entry +; THUMB-NEXT: .save {r4, lr} +; THUMB-NEXT: push {r4, lr} +; THUMB-NEXT: asrs r2, r1, #31 +; THUMB-NEXT: movs r3, r1 +; THUMB-NEXT: eors r3, r2 +; THUMB-NEXT: movs r4, r0 +; THUMB-NEXT: eors r4, r2 +; THUMB-NEXT: subs r4, r4, r2 +; THUMB-NEXT: sbcs r3, r2 +; THUMB-NEXT: eors r3, r1 +; THUMB-NEXT: eors r0, r4 +; THUMB-NEXT: orrs r0, r3 +; THUMB-NEXT: rsbs r1, r0, #0 +; THUMB-NEXT: adcs r0, r1 +; THUMB-NEXT: pop {r4} +; THUMB-NEXT: pop {r1} +; THUMB-NEXT: bx r1 +; +; THUMBV6-LABEL: test: +; THUMBV6: @ %bb.0: @ %entry +; THUMBV6-NEXT: .save {r4, lr} +; THUMBV6-NEXT: push {r4, lr} +; THUMBV6-NEXT: asrs r2, r1, #31 +; THUMBV6-NEXT: mov r3, r1 +; THUMBV6-NEXT: eors r3, r2 +; THUMBV6-NEXT: mov r4, r0 +; THUMBV6-NEXT: eors r4, r2 +; THUMBV6-NEXT: subs r4, r4, r2 +; THUMBV6-NEXT: sbcs r3, r2 +; THUMBV6-NEXT: eors r3, r1 +; THUMBV6-NEXT: eors r0, r4 +; THUMBV6-NEXT: orrs r0, r3 +; THUMBV6-NEXT: rsbs r1, r0, #0 +; THUMBV6-NEXT: adcs r0, r1 +; THUMBV6-NEXT: pop {r4, pc} entry: %ispos = icmp sgt i64 %arg, -1 %neg = sub i64 0, %arg @@ -9,10 +53,3 @@ entry: %cmp2 = icmp eq i64 %sel, %arg ret i1 %cmp2 } - -; The scheduler used to ignore OptionalDefs, and could unwittingly insert -; a flag-setting instruction in between an ADDS and the corresponding ADC. 
- -; CHECK: adds -; CHECK-NOT: eors -; CHECK: adcs diff --git a/llvm/test/CodeGen/Thumb2/abs.ll b/llvm/test/CodeGen/Thumb2/abs.ll index 02a2a14c2a5cc..88259ba758803 100644 --- a/llvm/test/CodeGen/Thumb2/abs.ll +++ b/llvm/test/CodeGen/Thumb2/abs.ll @@ -120,18 +120,18 @@ define i64 @abs64(i64 %x) { ; CHECKT1-LABEL: abs64: ; CHECKT1: @ %bb.0: ; CHECKT1-NEXT: asrs r2, r1, #31 -; CHECKT1-NEXT: adds r0, r0, r2 -; CHECKT1-NEXT: adcs r1, r2 -; CHECKT1-NEXT: eors r0, r2 ; CHECKT1-NEXT: eors r1, r2 +; CHECKT1-NEXT: eors r0, r2 +; CHECKT1-NEXT: subs r0, r0, r2 +; CHECKT1-NEXT: sbcs r1, r2 ; CHECKT1-NEXT: bx lr ; ; CHECKT2-LABEL: abs64: ; CHECKT2: @ %bb.0: -; CHECKT2-NEXT: adds.w r0, r0, r1, asr #31 -; CHECKT2-NEXT: adc.w r2, r1, r1, asr #31 ; CHECKT2-NEXT: eor.w r0, r0, r1, asr #31 -; CHECKT2-NEXT: eor.w r1, r2, r1, asr #31 +; CHECKT2-NEXT: eor.w r2, r1, r1, asr #31 +; CHECKT2-NEXT: subs.w r0, r0, r1, asr #31 +; CHECKT2-NEXT: sbc.w r1, r2, r1, asr #31 ; CHECKT2-NEXT: bx lr %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true) ret i64 %abs @@ -141,8 +141,8 @@ define i32 @abs32(i32 %x) { ; CHECKT1-LABEL: abs32: ; CHECKT1: @ %bb.0: ; CHECKT1-NEXT: asrs r1, r0, #31 -; CHECKT1-NEXT: adds r0, r0, r1 ; CHECKT1-NEXT: eors r0, r1 +; CHECKT1-NEXT: subs r0, r0, r1 ; CHECKT1-NEXT: bx lr ; ; CHECKT2-LABEL: abs32: @@ -160,15 +160,15 @@ define i16 @abs16(i16 %x) { ; CHECKT1: @ %bb.0: ; CHECKT1-NEXT: sxth r1, r0 ; CHECKT1-NEXT: asrs r1, r1, #15 -; CHECKT1-NEXT: adds r0, r0, r1 ; CHECKT1-NEXT: eors r0, r1 +; CHECKT1-NEXT: subs r0, r0, r1 ; CHECKT1-NEXT: bx lr ; ; CHECKT2-LABEL: abs16: ; CHECKT2: @ %bb.0: ; CHECKT2-NEXT: sxth r1, r0 -; CHECKT2-NEXT: add.w r0, r0, r1, asr #15 ; CHECKT2-NEXT: eor.w r0, r0, r1, asr #15 +; CHECKT2-NEXT: sub.w r0, r0, r1, asr #15 ; CHECKT2-NEXT: bx lr %abs = tail call i16 @llvm.abs.i16(i16 %x, i1 true) ret i16 %abs @@ -180,26 +180,26 @@ define i128 @abs128(i128 %x) { ; CHECKT1-NEXT: .save {r4, lr} ; CHECKT1-NEXT: push {r4, lr} ; CHECKT1-NEXT: asrs r4, r3, #31 -; 
CHECKT1-NEXT: adds r0, r0, r4 -; CHECKT1-NEXT: adcs r1, r4 -; CHECKT1-NEXT: adcs r2, r4 -; CHECKT1-NEXT: adcs r3, r4 -; CHECKT1-NEXT: eors r0, r4 -; CHECKT1-NEXT: eors r1, r4 -; CHECKT1-NEXT: eors r2, r4 ; CHECKT1-NEXT: eors r3, r4 +; CHECKT1-NEXT: eors r2, r4 +; CHECKT1-NEXT: eors r1, r4 +; CHECKT1-NEXT: eors r0, r4 +; CHECKT1-NEXT: subs r0, r0, r4 +; CHECKT1-NEXT: sbcs r1, r4 +; CHECKT1-NEXT: sbcs r2, r4 +; CHECKT1-NEXT: sbcs r3, r4 ; CHECKT1-NEXT: pop {r4, pc} ; ; CHECKT2-LABEL: abs128: ; CHECKT2: @ %bb.0: -; CHECKT2-NEXT: adds.w r0, r0, r3, asr #31 -; CHECKT2-NEXT: adcs.w r1, r1, r3, asr #31 ; CHECKT2-NEXT: eor.w r0, r0, r3, asr #31 -; CHECKT2-NEXT: adcs.w r2, r2, r3, asr #31 ; CHECKT2-NEXT: eor.w r1, r1, r3, asr #31 -; CHECKT2-NEXT: adc.w r12, r3, r3, asr #31 +; CHECKT2-NEXT: subs.w r0, r0, r3, asr #31 ; CHECKT2-NEXT: eor.w r2, r2, r3, asr #31 -; CHECKT2-NEXT: eor.w r3, r12, r3, asr #31 +; CHECKT2-NEXT: sbcs.w r1, r1, r3, asr #31 +; CHECKT2-NEXT: eor.w r12, r3, r3, asr #31 +; CHECKT2-NEXT: sbcs.w r2, r2, r3, asr #31 +; CHECKT2-NEXT: sbc.w r3, r12, r3, asr #31 ; CHECKT2-NEXT: bx lr %abs = tail call i128 @llvm.abs.i128(i128 %x, i1 true) ret i128 %abs diff --git a/llvm/test/CodeGen/WebAssembly/PR41149.ll b/llvm/test/CodeGen/WebAssembly/PR41149.ll index 6a8dee1906f1b..0913bf0eba220 100644 --- a/llvm/test/CodeGen/WebAssembly/PR41149.ll +++ b/llvm/test/CodeGen/WebAssembly/PR41149.ll @@ -13,9 +13,9 @@ define void @mod() { ; CHECK-NEXT: i32.const 31 ; CHECK-NEXT: i32.shr_s ; CHECK-NEXT: local.tee 0 -; CHECK-NEXT: i32.add -; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32.xor +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.sub ; CHECK-NEXT: i32.store8 0 %tmp = load <4 x i8>, <4 x i8>* undef %tmp2 = icmp slt <4 x i8> %tmp, zeroinitializer diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index c03923aa47ff5..df83381ababd3 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -25,11 +25,11 @@ declare <16 x i8> @llvm.abs.v16i8(<16 
x i8>, i1) define i8 @test_i8(i8 %a) nounwind { ; X64-LABEL: test_i8: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movl %edi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarb $7, %cl -; X64-NEXT: leal (%rdi,%rcx), %eax ; X64-NEXT: xorb %cl, %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -38,8 +38,8 @@ define i8 @test_i8(i8 %a) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sarb $7, %cl -; X86-NEXT: addb %cl, %al ; X86-NEXT: xorb %cl, %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: retl %r = call i8 @llvm.abs.i8(i8 %a, i1 false) ret i8 %r @@ -197,8 +197,8 @@ define <2 x i32> @test_v2i32(<2 x i32> %a) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v2i32: @@ -226,8 +226,8 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v3i32: @@ -261,8 +261,8 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: @@ -309,12 +309,12 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: psubd %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: paddd %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: retq ; ; 
AVX1-LABEL: test_v8i32: @@ -496,86 +496,86 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movb %cl, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %cl ; X86-NEXT: xorb %al, %cl +; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb %dl, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %dl ; X86-NEXT: xorb %al, %dl +; X86-NEXT: subb %al, %dl ; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb %ah, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %ah ; X86-NEXT: xorb %al, %ah +; X86-NEXT: subb %al, %ah ; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb %ch, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %ch ; X86-NEXT: xorb %al, %ch +; X86-NEXT: subb %al, %ch ; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb %dh, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %dh ; X86-NEXT: xorb %al, %dh +; X86-NEXT: subb %al, %dh ; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %bl ; X86-NEXT: xorb %al, %bl +; X86-NEXT: subb %al, %bl ; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb %bh, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %bh ; X86-NEXT: xorb %al, %bh +; X86-NEXT: subb %al, %bh ; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %cl ; X86-NEXT: xorb %al, %cl +; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %cl ; X86-NEXT: xorb %al, %cl +; X86-NEXT: subb %al, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh 
; X86-NEXT: movb %bh, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %bh ; X86-NEXT: xorb %al, %bh +; X86-NEXT: subb %al, %bh ; X86-NEXT: movb {{[0-9]+}}(%esp), %bl ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %bl ; X86-NEXT: xorb %al, %bl +; X86-NEXT: subb %al, %bl ; X86-NEXT: movb {{[0-9]+}}(%esp), %dh ; X86-NEXT: movb %dh, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %dh ; X86-NEXT: xorb %al, %dh +; X86-NEXT: subb %al, %dh ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movb %ch, %al ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %ch ; X86-NEXT: xorb %al, %ch +; X86-NEXT: subb %al, %ch ; X86-NEXT: movb {{[0-9]+}}(%esp), %dl ; X86-NEXT: movl %edx, %eax ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %dl ; X86-NEXT: xorb %al, %dl +; X86-NEXT: subb %al, %dl ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarb $7, %al -; X86-NEXT: addb %al, %cl ; X86-NEXT: xorb %al, %cl +; X86-NEXT: subb %al, %cl ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: movb %al, %ah ; X86-NEXT: sarb $7, %ah -; X86-NEXT: addb %ah, %al ; X86-NEXT: xorb %ah, %al +; X86-NEXT: subb %ah, %al ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movb %al, 15(%esi) ; X86-NEXT: movb %cl, 14(%esi) diff --git a/llvm/test/CodeGen/X86/combine-abs.ll b/llvm/test/CodeGen/X86/combine-abs.ll index fd5930217cb06..de20b4dccb20a 100644 --- a/llvm/test/CodeGen/X86/combine-abs.ll +++ b/llvm/test/CodeGen/X86/combine-abs.ll @@ -110,13 +110,13 @@ define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: paddq %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psubq %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; 
SSE42-LABEL: combine_v4i64_abs_abs: diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll index a00ec41516c83..1cbb8360440d9 100644 --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -15,17 +15,17 @@ define i8 @test_i8(i8 %a) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sarb $7, %cl -; X86-NEXT: addb %cl, %al ; X86-NEXT: xorb %cl, %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: retl ; ; X64-LABEL: test_i8: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movl %edi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarb $7, %cl -; X64-NEXT: leal (%rdi,%rcx), %eax ; X64-NEXT: xorb %cl, %al +; X64-NEXT: subb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %tmp1neg = sub i8 0, %a @@ -40,8 +40,8 @@ define i16 @test_i16(i16 %a) nounwind { ; X86-NO-CMOV-NEXT: movswl {{[0-9]+}}(%esp), %eax ; X86-NO-CMOV-NEXT: movl %eax, %ecx ; X86-NO-CMOV-NEXT: sarl $15, %ecx -; X86-NO-CMOV-NEXT: addl %ecx, %eax ; X86-NO-CMOV-NEXT: xorl %ecx, %eax +; X86-NO-CMOV-NEXT: subl %ecx, %eax ; X86-NO-CMOV-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NO-CMOV-NEXT: retl ; @@ -71,8 +71,8 @@ define i32 @test_i32(i32 %a) nounwind { ; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-CMOV-NEXT: movl %eax, %ecx ; X86-NO-CMOV-NEXT: sarl $31, %ecx -; X86-NO-CMOV-NEXT: addl %ecx, %eax ; X86-NO-CMOV-NEXT: xorl %ecx, %eax +; X86-NO-CMOV-NEXT: subl %ecx, %eax ; X86-NO-CMOV-NEXT: retl ; ; X86-CMOV-LABEL: test_i32: diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll index f34f683db078e..ee2564660a066 100644 --- a/llvm/test/CodeGen/X86/neg-abs.ll +++ b/llvm/test/CodeGen/X86/neg-abs.ll @@ -154,24 +154,21 @@ define i128 @neg_abs_i128(i128 %x) nounwind { define i8 @sub_abs_i8(i8 %x, i8 %y) nounwind { ; X86-LABEL: sub_abs_i8: ; X86: # %bb.0: -; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: movb 
{{[0-9]+}}(%esp), %cl -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarb $7, %dl -; X86-NEXT: addb %dl, %cl -; X86-NEXT: xorb %dl, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: xorb %al, %cl ; X86-NEXT: subb %cl, %al +; X86-NEXT: addb {{[0-9]+}}(%esp), %al ; X86-NEXT: retl ; ; X64-LABEL: sub_abs_i8: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: sarb $7, %cl -; X64-NEXT: addb %cl, %dil -; X64-NEXT: xorb %cl, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: sarb $7, %al +; X64-NEXT: xorb %al, %dil ; X64-NEXT: subb %dil, %al -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: addb %sil, %al ; X64-NEXT: retq %abs = tail call i8 @llvm.abs.i8(i8 %x, i1 false) %neg = sub nsw i8 %y, %abs @@ -181,13 +178,12 @@ define i8 @sub_abs_i8(i8 %x, i8 %y) nounwind { define i16 @sub_abs_i16(i16 %x, i16 %y) nounwind { ; X86-LABEL: sub_abs_i16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $15, %edx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $15, %eax +; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %ecx, %eax +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -207,13 +203,12 @@ define i16 @sub_abs_i16(i16 %x, i16 %y) nounwind { define i32 @sub_abs_i32(i32 %x, i32 %y) nounwind { ; X86-LABEL: sub_abs_i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %ecx, %eax +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: sub_abs_i32: diff --git a/llvm/test/CodeGen/X86/viabs.ll b/llvm/test/CodeGen/X86/viabs.ll index 
405d9eaa2c834..d892297d81ea6 100644 --- a/llvm/test/CodeGen/X86/viabs.ll +++ b/llvm/test/CodeGen/X86/viabs.ll @@ -12,8 +12,8 @@ define <4 x i32> @test_abs_gt_v4i32(<4 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_gt_v4i32: @@ -51,8 +51,8 @@ define <4 x i32> @test_abs_ge_v4i32(<4 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_ge_v4i32: @@ -176,8 +176,8 @@ define <4 x i32> @test_abs_le_v4i32(<4 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v4i32: @@ -215,12 +215,12 @@ define <8 x i32> @test_abs_gt_v8i32(<8 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_gt_v8i32: @@ -263,12 +263,12 @@ define <8 x i32> @test_abs_ge_v8i32(<8 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_ge_v8i32: @@ 
-413,12 +413,12 @@ define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i32: @@ -461,20 +461,20 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: psubd %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: psubd %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: psubd %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: psubd %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_16i32: @@ -527,8 +527,8 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_ge_v2i64: @@ -536,8 +536,8 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind { ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: paddq %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubq %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_ge_v2i64: @@ -577,13 +577,13 @@ define <4 x 
i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: psubq %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: paddq %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: psubq %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_gt_v4i64: @@ -591,13 +591,13 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind { ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: paddq %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: psubq %xmm2, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: paddq %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: psubq %xmm2, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_gt_v4i64: @@ -646,23 +646,23 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: psubq %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: psubq %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: psubq %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: 
psubq %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i64: @@ -670,23 +670,23 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind { ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: psubq %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm1 ; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: psubq %xmm4, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: psubq %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i64: @@ -754,23 +754,23 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: psubq %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: psubq %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: psubq %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: paddq %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: psubq 
%xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i64_fold: @@ -782,23 +782,23 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind { ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm0 ; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: psubq %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm1 ; SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSSE3-NEXT: psubq %xmm4, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSSE3-NEXT: psubq %xmm4, %xmm2 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: paddq %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i64_fold: diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/basic.ll.expected index 78abaf5168068..9ae01c167b8da 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/basic.ll.expected @@ -9,17 +9,17 @@ define i8 @test_i8(i8 %a) nounwind { ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sarb $7, %cl -; X86-NEXT: addb %cl, %al ; X86-NEXT: xorb %cl, %al +; X86-NEXT: subb %cl, %al ; X86-NEXT: retl ; ; X64-LABEL: test_i8: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movl %edi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarb $7, %cl -; X64-NEXT: leal (%rdi,%rcx), %eax ; X64-NEXT: xorb %cl, %al +; 
X64-NEXT: subb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %tmp1neg = sub i8 0, %a @@ -34,8 +34,8 @@ define i16 @test_i16(i16 %a) nounwind { ; X86-NO-CMOV-NEXT: movswl {{[0-9]+}}(%esp), %eax ; X86-NO-CMOV-NEXT: movl %eax, %ecx ; X86-NO-CMOV-NEXT: sarl $15, %ecx -; X86-NO-CMOV-NEXT: addl %ecx, %eax ; X86-NO-CMOV-NEXT: xorl %ecx, %eax +; X86-NO-CMOV-NEXT: subl %ecx, %eax ; X86-NO-CMOV-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NO-CMOV-NEXT: retl ; @@ -65,8 +65,8 @@ define i32 @test_i32(i32 %a) nounwind { ; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-CMOV-NEXT: movl %eax, %ecx ; X86-NO-CMOV-NEXT: sarl $31, %ecx -; X86-NO-CMOV-NEXT: addl %ecx, %eax ; X86-NO-CMOV-NEXT: xorl %ecx, %eax +; X86-NO-CMOV-NEXT: subl %ecx, %eax ; X86-NO-CMOV-NEXT: retl ; ; X86-CMOV-LABEL: test_i32: From 079d13668bf1b7f929f1897af90f64caae41c81d Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Mon, 21 Feb 2022 14:06:19 +0800 Subject: [PATCH 393/748] [RISCV] Fix the include search path order between sysroot and resource folder Resource folder[1] should include before sysroot[2] in general (Linux clang toolchain, BareMetal clang toolchain, and GCC using that order), and that prevent sysroot's header file override resource folder's one, this change is reference from BareMetal::AddClangSystemIncludeArgs@BareMetal.cpp[3]. And also fix the behavior of `-nobuiltininc`. 
[1] Include path from resource folder is something like this: `/lib/clang/13.0.0/include/` [2] Include path from sysroot is something like this: `/riscv32-unknown-elf/include` [3] https://github.com/llvm/llvm-project/blob/llvmorg-13.0.1/clang/lib/Driver/ToolChains/BareMetal.cpp#L193 Reviewed By: asb Differential Revision: https://reviews.llvm.org/D119837 --- clang/lib/Driver/ToolChains/RISCVToolchain.cpp | 6 ++++++ .../test/Driver/Inputs/resource_dir/include/.keep | 0 clang/test/Driver/riscv32-toolchain.c | 14 ++++++++++++++ clang/test/Driver/riscv64-toolchain.c | 14 ++++++++++++++ 4 files changed, 34 insertions(+) create mode 100644 clang/test/Driver/Inputs/resource_dir/include/.keep diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index 714325a2db39e..a63ada0cbb7e4 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -98,6 +98,12 @@ void RISCVToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, if (DriverArgs.hasArg(options::OPT_nostdinc)) return; + if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { + SmallString<128> Dir(getDriver().ResourceDir); + llvm::sys::path::append(Dir, "include"); + addSystemInclude(DriverArgs, CC1Args, Dir.str()); + } + if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) { SmallString<128> Dir(computeSysRoot()); llvm::sys::path::append(Dir, "include"); diff --git a/clang/test/Driver/Inputs/resource_dir/include/.keep b/clang/test/Driver/Inputs/resource_dir/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/riscv32-toolchain.c b/clang/test/Driver/riscv32-toolchain.c index fb262a19a0439..c480a7c00a367 100644 --- a/clang/test/Driver/riscv32-toolchain.c +++ b/clang/test/Driver/riscv32-toolchain.c @@ -197,6 +197,20 @@ // C-RV32-RTLIB-COMPILERRT-ILP32: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv32.a" // 
C-RV32-RTLIB-COMPILERRT-ILP32: "{{.*}}clang_rt.crtend-riscv32.o" +// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck -check-prefix=RESOURCE-INC %s +// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir/include" +// RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf/include" + +// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ +// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s +// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}/Inputs/resource_dir/include" +// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf/include" + // RUN: %clang -target riscv32 %s -emit-llvm -S -o - | FileCheck %s typedef __builtin_va_list va_list; diff --git a/clang/test/Driver/riscv64-toolchain.c b/clang/test/Driver/riscv64-toolchain.c index 2774e004854c3..91358298ecdd8 100644 --- a/clang/test/Driver/riscv64-toolchain.c +++ b/clang/test/Driver/riscv64-toolchain.c @@ -153,6 +153,20 @@ // C-RV64-RTLIB-COMPILERRT-LP64: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv64.a" // C-RV64-RTLIB-COMPILERRT-LP64: "{{.*}}clang_rt.crtend-riscv64.o" +// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck -check-prefix=RESOURCE-INC %s +// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir/include" +// RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" + +// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir 
-nobuiltininc 2>&1 \ +// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s +// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}Inputs/resource_dir/include" +// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" + // RUN: %clang -target riscv64 %s -emit-llvm -S -o - | FileCheck %s typedef __builtin_va_list va_list; From 0a17ee1ebe0c3384520ea14fdc1d33e38217341a Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Mon, 21 Feb 2022 14:25:49 +0800 Subject: [PATCH 394/748] Revert "[RISCV] Fix the include search path order between sysroot and resource folder" This reverts commit 079d13668bf1b7f929f1897af90f64caae41c81d. --- clang/lib/Driver/ToolChains/RISCVToolchain.cpp | 6 ------ .../test/Driver/Inputs/resource_dir/include/.keep | 0 clang/test/Driver/riscv32-toolchain.c | 14 -------------- clang/test/Driver/riscv64-toolchain.c | 14 -------------- 4 files changed, 34 deletions(-) delete mode 100644 clang/test/Driver/Inputs/resource_dir/include/.keep diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index a63ada0cbb7e4..714325a2db39e 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -98,12 +98,6 @@ void RISCVToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, if (DriverArgs.hasArg(options::OPT_nostdinc)) return; - if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { - SmallString<128> Dir(getDriver().ResourceDir); - llvm::sys::path::append(Dir, "include"); - addSystemInclude(DriverArgs, CC1Args, Dir.str()); - } - if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) { SmallString<128> Dir(computeSysRoot()); llvm::sys::path::append(Dir, "include"); diff --git a/clang/test/Driver/Inputs/resource_dir/include/.keep b/clang/test/Driver/Inputs/resource_dir/include/.keep deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/riscv32-toolchain.c 
b/clang/test/Driver/riscv32-toolchain.c index c480a7c00a367..fb262a19a0439 100644 --- a/clang/test/Driver/riscv32-toolchain.c +++ b/clang/test/Driver/riscv32-toolchain.c @@ -197,20 +197,6 @@ // C-RV32-RTLIB-COMPILERRT-ILP32: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv32.a" // C-RV32-RTLIB-COMPILERRT-ILP32: "{{.*}}clang_rt.crtend-riscv32.o" -// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ -// RUN: | FileCheck -check-prefix=RESOURCE-INC %s -// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir/include" -// RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf/include" - -// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ -// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s -// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}/Inputs/resource_dir/include" -// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf/include" - // RUN: %clang -target riscv32 %s -emit-llvm -S -o - | FileCheck %s typedef __builtin_va_list va_list; diff --git a/clang/test/Driver/riscv64-toolchain.c b/clang/test/Driver/riscv64-toolchain.c index 91358298ecdd8..2774e004854c3 100644 --- a/clang/test/Driver/riscv64-toolchain.c +++ b/clang/test/Driver/riscv64-toolchain.c @@ -153,20 +153,6 @@ // C-RV64-RTLIB-COMPILERRT-LP64: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv64.a" // C-RV64-RTLIB-COMPILERRT-LP64: "{{.*}}clang_rt.crtend-riscv64.o" -// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ -// RUN: | FileCheck -check-prefix=RESOURCE-INC %s -// RESOURCE-INC: 
"-internal-isystem" "{{.*}}/Inputs/resource_dir/include" -// RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" - -// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ -// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s -// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}Inputs/resource_dir/include" -// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" - // RUN: %clang -target riscv64 %s -emit-llvm -S -o - | FileCheck %s typedef __builtin_va_list va_list; From 47b1fa5fc48821eefefd157ed4af2f2cf3bacef4 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Mon, 21 Feb 2022 14:06:19 +0800 Subject: [PATCH 395/748] [RISCV] Fix the include search path order between sysroot and resource folder (Recommit) Resource folder[1] should include before sysroot[2] in general (Linux clang toolchain, BareMetal clang toolchain, and GCC using that order), and that prevent sysroot's header file override resource folder's one, this change is reference from BareMetal::AddClangSystemIncludeArgs@BareMetal.cpp[3]. And also fix the behavior of `-nobuiltininc`. [1] Include path from resource folder is something like this: `/lib/clang/13.0.0/include/` [2] Include path from sysroot is something like this: `/riscv32-unknown-elf/include` [3] https://github.com/llvm/llvm-project/blob/llvmorg-13.0.1/clang/lib/Driver/ToolChains/BareMetal.cpp#L193 Reviewed By: asb Differential Revision: https://reviews.llvm.org/D119837 The recommit fixes the Windows build failure due to path issue. 
--- clang/lib/Driver/ToolChains/RISCVToolchain.cpp | 6 ++++++ .../test/Driver/Inputs/resource_dir/include/.keep | 0 clang/test/Driver/riscv32-toolchain.c | 14 ++++++++++++++ clang/test/Driver/riscv64-toolchain.c | 14 ++++++++++++++ 4 files changed, 34 insertions(+) create mode 100644 clang/test/Driver/Inputs/resource_dir/include/.keep diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index 714325a2db39e..a63ada0cbb7e4 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -98,6 +98,12 @@ void RISCVToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, if (DriverArgs.hasArg(options::OPT_nostdinc)) return; + if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { + SmallString<128> Dir(getDriver().ResourceDir); + llvm::sys::path::append(Dir, "include"); + addSystemInclude(DriverArgs, CC1Args, Dir.str()); + } + if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) { SmallString<128> Dir(computeSysRoot()); llvm::sys::path::append(Dir, "include"); diff --git a/clang/test/Driver/Inputs/resource_dir/include/.keep b/clang/test/Driver/Inputs/resource_dir/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/riscv32-toolchain.c b/clang/test/Driver/riscv32-toolchain.c index fb262a19a0439..25aaca78dab2c 100644 --- a/clang/test/Driver/riscv32-toolchain.c +++ b/clang/test/Driver/riscv32-toolchain.c @@ -197,6 +197,20 @@ // C-RV32-RTLIB-COMPILERRT-ILP32: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv32.a" // C-RV32-RTLIB-COMPILERRT-ILP32: "{{.*}}clang_rt.crtend-riscv32.o" +// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck -check-prefix=RESOURCE-INC %s +// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" +// 
RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf{{/|\\\\}}include" + +// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ +// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s +// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" +// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf{{/|\\\\}}include" + // RUN: %clang -target riscv32 %s -emit-llvm -S -o - | FileCheck %s typedef __builtin_va_list va_list; diff --git a/clang/test/Driver/riscv64-toolchain.c b/clang/test/Driver/riscv64-toolchain.c index 2774e004854c3..91358298ecdd8 100644 --- a/clang/test/Driver/riscv64-toolchain.c +++ b/clang/test/Driver/riscv64-toolchain.c @@ -153,6 +153,20 @@ // C-RV64-RTLIB-COMPILERRT-LP64: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv64.a" // C-RV64-RTLIB-COMPILERRT-LP64: "{{.*}}clang_rt.crtend-riscv64.o" +// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck -check-prefix=RESOURCE-INC %s +// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir/include" +// RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" + +// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ +// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s +// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}Inputs/resource_dir/include" +// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" + // RUN: %clang -target riscv64 %s -emit-llvm -S -o - | FileCheck %s typedef 
__builtin_va_list va_list; From a6fb1bb3061b4f6e01b9402bde1bef280bb90811 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 20 Feb 2022 22:35:30 -0800 Subject: [PATCH 396/748] [ARM] Remove unused lowerABS function. NFC This function was added in D49837, but no setOperationAction call was added with it. The code is equivalent to what is done by the default ExpandIntRes_ABS implementation when ADDCARRY is supported. Test case added to verify this. There was some existing coverage from Thumb2 MVE tests, but they started from vectors. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 35 ------------------------- llvm/lib/Target/ARM/ARMISelLowering.h | 2 -- llvm/test/CodeGen/ARM/iabs.ll | 14 ++++++++++ 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 34add2c7a811b..93193e97820d8 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -10514,9 +10514,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); - case ISD::ABS: - lowerABS(N, Results, DAG); - return ; case ISD::LOAD: LowerLOAD(N, Results, DAG); break; @@ -20541,38 +20538,6 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; } -void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const { - assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); - MVT HalfT = MVT::i32; - SDLoc dl(N); - SDValue Hi, Lo, Tmp; - - if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || - !isOperationLegalOrCustom(ISD::UADDO, HalfT)) - return ; - - unsigned OpTypeBits = HalfT.getScalarSizeInBits(); - SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); - - Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), - DAG.getConstant(0, dl, HalfT)); - Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), - DAG.getConstant(1, dl, HalfT)); - - Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, - DAG.getConstant(OpTypeBits - 1, dl, - getShiftAmountTy(HalfT, DAG.getDataLayout()))); - Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); - Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, - SDValue(Lo.getNode(), 1)); - Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); - Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); - - Results.push_back(Lo); - Results.push_back(Hi); -} - bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 1c5f8389f57cd..08ccd9db1bb01 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -845,8 +845,6 @@ class VectorType; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSETCC(SDValue Op, SelectionDAG &DAG) const; - void lowerABS(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const; void LowerLOAD(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/ARM/iabs.ll b/llvm/test/CodeGen/ARM/iabs.ll index c52caf605dd1a..00ad2be2edeb9 100644 --- a/llvm/test/CodeGen/ARM/iabs.ll +++ b/llvm/test/CodeGen/ARM/iabs.ll @@ -32,3 +32,17 @@ entry: %cond = select i1 %cmp, i32 %sub, i32 %sub1 ret i32 %cond } + +define i64 @test3(i64 %a) { +; CHECK-LABEL: test3: +; CHECK: @ %bb.0: +; CHECK-NEXT: adds r0, r0, r1, asr #31 +; CHECK-NEXT: adc r2, r1, r1, asr #31 +; CHECK-NEXT: eor r0, r0, r1, asr #31 +; CHECK-NEXT: eor r1, r2, r1, asr #31 +; CHECK-NEXT: bx lr + %tmp1neg = sub i64 0, %a + %b = icmp sgt i64 %a, -1 + %abs = select i1 %b, i64 %a, i64 %tmp1neg + ret i64 %abs +} From cc279529e8317301492f9625b6acc9a0bf52db56 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Mon, 21 Feb 2022 14:56:50 +0800 Subject: [PATCH 397/748] Revert "[RISCV] Fix the include search path order between sysroot and resource folder (Recommit)" This reverts commit 47b1fa5fc48821eefefd157ed4af2f2cf3bacef4. 
--- clang/lib/Driver/ToolChains/RISCVToolchain.cpp | 6 ------ .../test/Driver/Inputs/resource_dir/include/.keep | 0 clang/test/Driver/riscv32-toolchain.c | 14 -------------- clang/test/Driver/riscv64-toolchain.c | 14 -------------- 4 files changed, 34 deletions(-) delete mode 100644 clang/test/Driver/Inputs/resource_dir/include/.keep diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index a63ada0cbb7e4..714325a2db39e 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -98,12 +98,6 @@ void RISCVToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, if (DriverArgs.hasArg(options::OPT_nostdinc)) return; - if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { - SmallString<128> Dir(getDriver().ResourceDir); - llvm::sys::path::append(Dir, "include"); - addSystemInclude(DriverArgs, CC1Args, Dir.str()); - } - if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) { SmallString<128> Dir(computeSysRoot()); llvm::sys::path::append(Dir, "include"); diff --git a/clang/test/Driver/Inputs/resource_dir/include/.keep b/clang/test/Driver/Inputs/resource_dir/include/.keep deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/riscv32-toolchain.c b/clang/test/Driver/riscv32-toolchain.c index 25aaca78dab2c..fb262a19a0439 100644 --- a/clang/test/Driver/riscv32-toolchain.c +++ b/clang/test/Driver/riscv32-toolchain.c @@ -197,20 +197,6 @@ // C-RV32-RTLIB-COMPILERRT-ILP32: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv32.a" // C-RV32-RTLIB-COMPILERRT-ILP32: "{{.*}}clang_rt.crtend-riscv32.o" -// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ -// RUN: | FileCheck -check-prefix=RESOURCE-INC %s -// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" -// 
RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf{{/|\\\\}}include" - -// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ -// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s -// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" -// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}/riscv32-unknown-elf{{/|\\\\}}include" - // RUN: %clang -target riscv32 %s -emit-llvm -S -o - | FileCheck %s typedef __builtin_va_list va_list; diff --git a/clang/test/Driver/riscv64-toolchain.c b/clang/test/Driver/riscv64-toolchain.c index 91358298ecdd8..2774e004854c3 100644 --- a/clang/test/Driver/riscv64-toolchain.c +++ b/clang/test/Driver/riscv64-toolchain.c @@ -153,20 +153,6 @@ // C-RV64-RTLIB-COMPILERRT-LP64: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv64.a" // C-RV64-RTLIB-COMPILERRT-LP64: "{{.*}}clang_rt.crtend-riscv64.o" -// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ -// RUN: | FileCheck -check-prefix=RESOURCE-INC %s -// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir/include" -// RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" - -// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ -// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ -// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s -// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}Inputs/resource_dir/include" -// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}/riscv64-unknown-elf/include" - // RUN: %clang -target riscv64 %s -emit-llvm -S -o - | FileCheck %s typedef 
__builtin_va_list va_list; From c1f17b0a9ea0d467eaa9589cc28db2787efe3ebf Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Mon, 21 Feb 2022 14:06:19 +0800 Subject: [PATCH 398/748] [RISCV] Fix the include search path order between sysroot and resource folder (Recommit again) Resource folder[1] should include before sysroot[2] in general (Linux clang toolchain, BareMetal clang toolchain, and GCC using that order), and that prevent sysroot's header file override resource folder's one, this change is reference from BareMetal::AddClangSystemIncludeArgs@BareMetal.cpp[3]. And also fix the behavior of `-nobuiltininc`. [1] Include path from resource folder is something like this: `/lib/clang/13.0.0/include/` [2] Include path from sysroot is something like this: `/riscv32-unknown-elf/include` [3] https://github.com/llvm/llvm-project/blob/llvmorg-13.0.1/clang/lib/Driver/ToolChains/BareMetal.cpp#L193 Reviewed By: asb Differential Revision: https://reviews.llvm.org/D119837 The recommit fixes the Windows build failure due to path issue. 
--- clang/lib/Driver/ToolChains/RISCVToolchain.cpp | 6 ++++++ .../test/Driver/Inputs/resource_dir/include/.keep | 0 clang/test/Driver/riscv32-toolchain.c | 14 ++++++++++++++ clang/test/Driver/riscv64-toolchain.c | 14 ++++++++++++++ 4 files changed, 34 insertions(+) create mode 100644 clang/test/Driver/Inputs/resource_dir/include/.keep diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index 714325a2db39e..a63ada0cbb7e4 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -98,6 +98,12 @@ void RISCVToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, if (DriverArgs.hasArg(options::OPT_nostdinc)) return; + if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { + SmallString<128> Dir(getDriver().ResourceDir); + llvm::sys::path::append(Dir, "include"); + addSystemInclude(DriverArgs, CC1Args, Dir.str()); + } + if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) { SmallString<128> Dir(computeSysRoot()); llvm::sys::path::append(Dir, "include"); diff --git a/clang/test/Driver/Inputs/resource_dir/include/.keep b/clang/test/Driver/Inputs/resource_dir/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/riscv32-toolchain.c b/clang/test/Driver/riscv32-toolchain.c index fb262a19a0439..50859aaccd7da 100644 --- a/clang/test/Driver/riscv32-toolchain.c +++ b/clang/test/Driver/riscv32-toolchain.c @@ -197,6 +197,20 @@ // C-RV32-RTLIB-COMPILERRT-ILP32: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv32.a" // C-RV32-RTLIB-COMPILERRT-ILP32: "{{.*}}clang_rt.crtend-riscv32.o" +// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck -check-prefix=RESOURCE-INC %s +// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" +// 
RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree{{.*}}riscv32-unknown-elf{{/|\\\\}}include" + +// RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ +// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s +// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" +// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree/{{.*}}riscv32-unknown-elf{{/|\\\\}}include" + // RUN: %clang -target riscv32 %s -emit-llvm -S -o - | FileCheck %s typedef __builtin_va_list va_list; diff --git a/clang/test/Driver/riscv64-toolchain.c b/clang/test/Driver/riscv64-toolchain.c index 2774e004854c3..59580370c0b34 100644 --- a/clang/test/Driver/riscv64-toolchain.c +++ b/clang/test/Driver/riscv64-toolchain.c @@ -153,6 +153,20 @@ // C-RV64-RTLIB-COMPILERRT-LP64: "--start-group" "-lc" "-lgloss" "--end-group" "{{.*}}libclang_rt.builtins-riscv64.a" // C-RV64-RTLIB-COMPILERRT-LP64: "{{.*}}clang_rt.crtend-riscv64.o" +// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ +// RUN: | FileCheck -check-prefix=RESOURCE-INC %s +// RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" +// RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}riscv64-unknown-elf{{/|\\\\}}include" + +// RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ +// RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s +// NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}Inputs/resource_dir{{/|\\\\}}include" +// NO-RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}riscv64-unknown-elf{{/|\\\\}}include" + // RUN: %clang -target riscv64 %s -emit-llvm -S -o 
- | FileCheck %s typedef __builtin_va_list va_list; From c1e4e019454b38e3890589be977a3c2c445fefd1 Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Mon, 21 Feb 2022 12:50:58 +0530 Subject: [PATCH 399/748] [mlir][OpenMP] Added assemblyFormat for SectionsOp This patch adds assemblyFormat for omp.sections operation. Some existing functions have been altered to fit the custom directive in assemblyFormat. This has led to their callsites to get modified too, but those will be removed in later patches, when other operations get their assemblyFormat. All operations were not changed in one patch for ease of review. Reviewed By: Mogball Differential Revision: https://reviews.llvm.org/D120176 --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 15 ++- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 99 +++++-------------- mlir/test/Dialect/OpenMP/invalid.mlir | 16 +-- mlir/test/Dialect/OpenMP/ops.mlir | 4 +- 4 files changed, 49 insertions(+), 85 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 6ed13e6d8ff2c..d316ca6314b53 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -188,7 +188,20 @@ def SectionsOp : OpenMP_Op<"sections", [AttrSizedOperandSegments]> { let regions = (region SizedRegion<1>:$region); - let hasCustomAssemblyFormat = 1; + let assemblyFormat = [{ + oilist( `reduction` `(` + custom( + $reduction_vars, type($reduction_vars), $reductions + ) `)` + | `allocate` `(` + custom( + $allocate_vars, type($allocate_vars), + $allocators_vars, type($allocators_vars) + ) `)` + | `nowait` + ) $region attr-dict + }]; + let hasVerifier = 1; } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index babd71e85bd09..4fa4e5819b339 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -77,7 +77,6 @@ void ParallelOp::build(OpBuilder 
&builder, OperationState &state, /// Parse an allocate clause with allocators and a list of operands with types. /// -/// allocate ::= `allocate` `(` allocate-operand-list `)` /// allocate-operand-list :: = allocate-operand | /// allocator-operand `,` allocate-operand-list /// allocate-operand :: = ssa-id-and-type -> ssa-id-and-type @@ -300,39 +299,35 @@ static void printScheduleClause(OpAsmPrinter &p, ClauseScheduleKind sched, // Parser, printer and verifier for ReductionVarList //===----------------------------------------------------------------------===// -/// reduction ::= `reduction` `(` reduction-entry-list `)` /// reduction-entry-list ::= reduction-entry /// | reduction-entry-list `,` reduction-entry /// reduction-entry ::= symbol-ref `->` ssa-id `:` type -static ParseResult -parseReductionVarList(OpAsmParser &parser, - SmallVectorImpl &symbols, - SmallVectorImpl &operands, - SmallVectorImpl &types) { - if (failed(parser.parseLParen())) - return failure(); - +static ParseResult parseReductionVarList( + OpAsmParser &parser, SmallVectorImpl &operands, + SmallVectorImpl &types, ArrayAttr &redcuctionSymbols) { + SmallVector reductionVec; do { - if (parser.parseAttribute(symbols.emplace_back()) || parser.parseArrow() || - parser.parseOperand(operands.emplace_back()) || + if (parser.parseAttribute(reductionVec.emplace_back()) || + parser.parseArrow() || parser.parseOperand(operands.emplace_back()) || parser.parseColonType(types.emplace_back())) return failure(); } while (succeeded(parser.parseOptionalComma())); - return parser.parseRParen(); + SmallVector reductions(reductionVec.begin(), reductionVec.end()); + redcuctionSymbols = ArrayAttr::get(parser.getContext(), reductions); + return success(); } /// Print Reduction clause -static void printReductionVarList(OpAsmPrinter &p, - Optional reductions, - OperandRange reductionVars) { - p << "reduction("; +static void printReductionVarList(OpAsmPrinter &p, Operation *op, + OperandRange reductionVars, + TypeRange 
reductionTypes, + Optional reductions) { for (unsigned i = 0, e = reductions->size(); i < e; ++i) { if (i != 0) p << ", "; p << (*reductions)[i] << " -> " << reductionVars[i] << " : " << reductionVars[i].getType(); } - p << ") "; } /// Verifies Reduction Clause @@ -552,7 +547,7 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, SmallVector allocates, allocators; SmallVector allocateTypes, allocatorTypes; - SmallVector reductionSymbols; + ArrayAttr reductions; SmallVector reductionVars; SmallVector reductionVarTypes; @@ -639,9 +634,10 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, "proc_bind_val", "proc bind")) return failure(); } else if (clauseKeyword == "reduction") { - if (checkAllowed(reductionClause) || - parseReductionVarList(parser, reductionSymbols, reductionVars, - reductionVarTypes)) + if (checkAllowed(reductionClause) || parser.parseLParen() || + parseReductionVarList(parser, reductionVars, reductionVarTypes, + reductions) || + parser.parseRParen()) return failure(); clauseSegments[pos[reductionClause]] = reductionVars.size(); } else if (clauseKeyword == "nowait") { @@ -746,11 +742,7 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, if (failed(parser.resolveOperands(reductionVars, reductionVarTypes, parser.getNameLoc(), result.operands))) return failure(); - - SmallVector reductions(reductionSymbols.begin(), - reductionSymbols.end()); - result.addAttribute("reductions", - parser.getBuilder().getArrayAttr(reductions)); + result.addAttribute("reductions", reductions); } // Add linear parameters @@ -805,53 +797,9 @@ static ParseResult parseClauses(OpAsmParser &parser, OperationState &result, } //===----------------------------------------------------------------------===// -// Parser, printer and verifier for SectionsOp +// Verifier for SectionsOp //===----------------------------------------------------------------------===// -/// Parses an OpenMP Sections operation 
-/// -/// sections ::= `omp.sections` clause-list -/// clause-list ::= clause clause-list | empty -/// clause ::= reduction | allocate | nowait -ParseResult SectionsOp::parse(OpAsmParser &parser, OperationState &result) { - SmallVector clauses = {reductionClause, allocateClause, - nowaitClause}; - - SmallVector segments; - - if (failed(parseClauses(parser, result, clauses, segments))) - return failure(); - - result.addAttribute("operand_segment_sizes", - parser.getBuilder().getI32VectorAttr(segments)); - - // Now parse the body. - Region *body = result.addRegion(); - if (parser.parseRegion(*body)) - return failure(); - return success(); -} - -void SectionsOp::print(OpAsmPrinter &p) { - p << " "; - - if (!reduction_vars().empty()) - printReductionVarList(p, reductions(), reduction_vars()); - - if (!allocate_vars().empty()) { - printAllocateAndAllocator(p << "allocate(", *this, allocate_vars(), - allocate_vars().getTypes(), allocators_vars(), - allocators_vars().getTypes()); - p << ")"; - } - - if (nowait()) - p << "nowait"; - - p << ' '; - p.printRegion(region()); -} - LogicalResult SectionsOp::verify() { if (allocate_vars().size() != allocators_vars().size()) return emitError( @@ -960,8 +908,11 @@ void WsLoopOp::print(OpAsmPrinter &p) { if (auto order = order_val()) p << "order(" << stringifyClauseOrderKind(*order) << ") "; - if (!reduction_vars().empty()) - printReductionVarList(p, reductions(), reduction_vars()); + if (!reduction_vars().empty()) { + printReductionVarList(p << "reduction(", *this, reduction_vars(), + reduction_vars().getTypes(), reductions()); + p << ")"; + } p << ' '; p.printRegion(region(), /*printEntryBlockArgs=*/false); diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 8a5d50dd0fb96..a991d5f20f6b7 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -793,7 +793,7 @@ func @omp_sections(%data_var : memref) -> () { // ----- func @omp_sections(%cond : i1) { 
- // expected-error @below {{if is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections if(%cond) { omp.terminator } @@ -803,7 +803,7 @@ func @omp_sections(%cond : i1) { // ----- func @omp_sections() { - // expected-error @below {{num_threads is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections num_threads(10) { omp.terminator } @@ -813,7 +813,7 @@ func @omp_sections() { // ----- func @omp_sections() { - // expected-error @below {{proc_bind is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections proc_bind(close) { omp.terminator } @@ -823,7 +823,7 @@ func @omp_sections() { // ----- func @omp_sections(%data_var : memref, %linear_var : i32) { - // expected-error @below {{linear is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections linear(%data_var = %linear_var : memref) { omp.terminator } @@ -833,7 +833,7 @@ func @omp_sections(%data_var : memref, %linear_var : i32) { // ----- func @omp_sections() { - // expected-error @below {{schedule is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections schedule(static, none) { omp.terminator } @@ -843,7 +843,7 @@ func @omp_sections() { // ----- func @omp_sections() { - // expected-error @below {{collapse is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections collapse(3) { omp.terminator } @@ -853,7 +853,7 @@ func @omp_sections() { // ----- func @omp_sections() { - // expected-error @below {{ordered is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections ordered(2) { omp.terminator } @@ -863,7 +863,7 @@ func 
@omp_sections() { // ----- func @omp_sections() { - // expected-error @below {{order is not a valid clause for the omp.sections operation}} + // expected-error @below {{expected '{' to begin a region}} omp.sections order(concurrent) { omp.terminator } diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index cbb8b1f550da4..e2cc900bf3787 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -624,13 +624,13 @@ func @omp_sectionsop(%data_var1 : memref, %data_var2 : memref, "omp.sections" (%data_var1, %data_var1) ({ // CHECK: omp.terminator omp.terminator - }) {operand_segment_sizes = dense<[0,1,1]> : vector<3xi32>} : (memref, memref) -> () + }) {allocate, operand_segment_sizes = dense<[0,1,1]> : vector<3xi32>} : (memref, memref) -> () // CHECK: omp.sections reduction(@add_f32 -> %{{.*}} : !llvm.ptr) "omp.sections" (%redn_var) ({ // CHECK: omp.terminator omp.terminator - }) {operand_segment_sizes = dense<[1,0,0]> : vector<3xi32>, reductions=[@add_f32]} : (!llvm.ptr) -> () + }) {reduction, operand_segment_sizes = dense<[1,0,0]> : vector<3xi32>, reductions=[@add_f32]} : (!llvm.ptr) -> () // CHECK: omp.sections nowait { omp.sections nowait { From 1a2bb03edab9d7aa31beb587d0c863acc6715d27 Mon Sep 17 00:00:00 2001 From: Prateek Gupta Date: Thu, 3 Feb 2022 15:55:30 +0000 Subject: [PATCH 400/748] [MLIR][LINALG] Add canonicalization pattern in `linalg.generic` op for static shape inference. This commit adds canonicalization pattern in `linalg.generic` op for static shape inference. If any of the inputs or outputs have static shape or is casted from a tensor of static shape, then shapes of all the inputs and outputs can be inferred by using the affine map of the static shape input/output. 
Signed-Off-By: Prateek Gupta Reviewed By: mravishankar Differential Revision: https://reviews.llvm.org/D118929 --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 160 ++++++++++++++++++- mlir/test/Dialect/Linalg/canonicalize.mlir | 130 +++++++++++++++ mlir/test/Dialect/Linalg/reshape_fusion.mlir | 19 +-- 3 files changed, 299 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 6b0d22c8f939e..319a1c318ff8a 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -841,11 +841,169 @@ struct EraseIdentityGenericOp : public OpRewritePattern { return success(); } }; + +/// For each of the operand in `operands` this function maps the static sizes of +/// dimensions to their affine dim expressions. +static void populateMap(GenericOp genericOp, ArrayRef operands, + llvm::DenseMap &affineExprToSize) { + for (OpOperand *opOperand : operands) { + if (genericOp.isScalar(opOperand)) + continue; + Value src = opOperand->get(); + auto sourceType = src.getType().cast(); + auto sourceMap = genericOp.getTiedIndexingMap(opOperand); + + // Get the `sourceShape` of the `sourceType`. If the operand is a result of + // `tensor.cast` operation and source of the cast operation has a static + // shape, then assign it to the `sourceShape`. + auto parentOp = src.getDefiningOp(); + ArrayRef sourceShape = sourceType.getShape(); + if (parentOp) { + if (auto castOp = dyn_cast(parentOp)) { + Value castSource = castOp.source(); + auto castSourceType = castSource.getType().cast(); + if (castSourceType.hasStaticShape()) + sourceShape = castSourceType.getShape(); + } + } + + // If the source shape's dimension has a static shape, map the affine dim + // expression to the known static size. 
+ for (unsigned i = 0; i < sourceShape.size(); i++) { + if (sourceType.isDynamicDim(i)) + continue; + if (auto affineDimExpr = sourceMap.getResult(i).dyn_cast()) + affineExprToSize.try_emplace(affineDimExpr, sourceShape[i]); + } + } +} + +/// Creates new operand w.r.t 'opOperand' of `genericOp` with static sizes +/// mapped in `affineExprToSize`. New operands are created in `newOperands` and +/// their result types is stored in `resultTypes`. If `opOperand` requires no +/// change then `changeNeeded` is false and same operand is added in the +/// `newOperands` list. +static void createNewOperandWithStaticSizes( + Location loc, PatternRewriter &rewriter, OpOperand *opOperand, + llvm::DenseMap &affineExprToSize, GenericOp genericOp, + SmallVector &newOperands, SmallVector &resultTypes, + bool &changeNeeded) { + Value src = opOperand->get(); + newOperands.push_back(src); + if (genericOp.isScalar(opOperand)) + return; + auto sourceType = src.getType().cast(); + Type resultType = sourceType; + if (sourceType.hasStaticShape() && genericOp.isOutputTensor(opOperand)) { + resultTypes.push_back(resultType); + return; + } + ArrayRef sourceShape = sourceType.getShape(); + AffineMap sourceMap = genericOp.getTiedIndexingMap(opOperand); + SmallVector newShape; + // If operand is updated with new shape, `newOperandNeeded` will be + // true. + bool newOperandNeeded = false; + for (unsigned i = 0; i < sourceShape.size(); i++) { + int64_t dimShape = sourceShape[i]; + AffineExpr dimExpr = sourceMap.getResult(i); + if (affineExprToSize.find(dimExpr) == affineExprToSize.end() || + !sourceType.isDynamicDim(i)) { + newShape.push_back(dimShape); + continue; + } + // Dimension has a dynamic shape and corresponding affine dim + // expression is present in the map. So assign the size for the + // given affine dim expression to the dimension. 
+ newShape.push_back(affineExprToSize[dimExpr]); + newOperandNeeded = true; + } + resultType = RankedTensorType::get(newShape, sourceType.getElementType()); + if (newOperandNeeded) { + changeNeeded = true; + // Get the new operand value given its size and element type by + // casting it. + Value newOperand = rewriter.create(loc, resultType, src); + unsigned index = opOperand->getOperandNumber(); + newOperands[index] = newOperand; + } + if (genericOp.isOutputTensor(opOperand)) + resultTypes.push_back(resultType); +} + +/// Static shapes for the operands can be inferred if any one of the operands +/// have a static shape. This can be done by referring to the affine dim +/// expressions for the operand. +struct InferStaticShapeOfOperands : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GenericOp genericOp, + PatternRewriter &rewriter) const override { + if (!genericOp.hasTensorSemantics()) + return failure(); + + // Maps must be projected permutations. + if (llvm::any_of(genericOp.getIndexingMaps(), [](AffineMap map) { + return !map.isProjectedPermutation(); + })) + return failure(); + + // Maps affine dim expressions to the static size of that dimension. + llvm::DenseMap affineExprToSize; + Location loc = genericOp.getLoc(); + + // For each of the affine dim expression, check if the size is known. If + // known add that in the map. + populateMap(genericOp, genericOp.getInputAndOutputOperands(), + affineExprToSize); + + SmallVector newOperands; + SmallVector resultTypes; + + // `changeNeeded` is `false` if the operands of `genericOp` require no + // change in their types. + bool changeNeeded = false; + newOperands.reserve(genericOp.getNumInputsAndOutputs()); + resultTypes.reserve(genericOp.getNumOutputs()); + + // Iterate over all the operands and update the static sizes. 
+ for (OpOperand *opOperand : genericOp.getInputAndOutputOperands()) { + createNewOperandWithStaticSizes(loc, rewriter, opOperand, + affineExprToSize, genericOp, newOperands, + resultTypes, changeNeeded); + } + + // If the generic op has all the required static information, no + // canonicalization needed. + if (!changeNeeded) + return failure(); + + // Clone op. + Operation *newOp = + cast(genericOp.getOperation()) + .clone(rewriter, genericOp->getLoc(), resultTypes, newOperands); + SmallVector replacements; + replacements.reserve(newOp->getNumResults()); + for (auto it : llvm::zip(genericOp->getResults(), newOp->getResults())) { + Value newResult = std::get<1>(it); + Value oldResult = std::get<0>(it); + Type newType = newResult.getType(); + Type oldType = oldResult.getType(); + replacements.push_back( + (newType != oldType) + ? rewriter.create(loc, newType, newResult) + : newResult); + } + rewriter.replaceOp(genericOp, replacements); + return success(); + } +}; } // namespace void GenericOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add(context); + results.add(context); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index cbc8e4a50de5f..8a3f201f7cc26 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -650,3 +650,133 @@ func @no_fold_pad_fill_value_mismatch() -> tensor<412x276xf32> { } : tensor<400x273xf32> to tensor<412x276xf32> return %pad : tensor<412x276xf32> } + +// ----- + +// Tests below verify whether static information is propagated through all the operands of generic op. +// 1. If one of the inputs of generic op has static info and it has no cast source. +// 2. If one of the inputs of generic op has static info and it is coming from tensr.cast operation. +// 3. 
If one of the outputs of generic op has static info and it is coming from tenso.cast operation. +#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @static_input_without_cast +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3x4xf32>, %[[ARG1:.*]]: tensor) -> tensor<2x3x4xf32> { +func @static_input_without_cast(%arg0 : tensor<2x3x4xf32>, %arg1: tensor) -> tensor<2x3x4xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %0 = tensor.dim %arg0, %c0 : tensor<2x3x4xf32> + %1 = tensor.dim %arg0, %c1 : tensor<2x3x4xf32> + %2 = tensor.dim %arg0, %c2 : tensor<2x3x4xf32> + %3 = linalg.init_tensor [%0, %1, %2] : tensor + %4 = linalg.generic { + indexing_maps = [#map, #map, #map], + iterator_types = ["parallel", "parallel", "parallel"] + } ins(%arg0, %arg1 : tensor<2x3x4xf32>, tensor) + outs(%3 : tensor) { + ^bb0(%arg2 : f32, %arg3 : f32, %arg4 : f32): + %9 = arith.addf %arg2, %arg3 : f32 + linalg.yield %9 : f32 + } -> (tensor) + %5 = tensor.cast %4 : tensor to tensor<2x3x4xf32> + return %5 : tensor<2x3x4xf32> + // CHECK: %[[CAST_ARG1:.*]] = tensor.cast %[[ARG1]] : tensor to tensor<2x3x4xf32> + // CHECK-NEXT: %[[GENERIC_OP:.*]] = linalg.generic + // CHECK-SAME: ins(%[[ARG0]], %[[CAST_ARG1]] : tensor<2x3x4xf32>, tensor<2x3x4xf32>) + // CHECK-SAME: outs({{.*}} : tensor<2x3x4xf32>) +} + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @static_input_with_cast +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3x4xf32>, %[[ARG1:.*]]: tensor) -> tensor<2x3x4xf32> { +func @static_input_with_cast(%arg0 : tensor<2x3x4xf32>, %arg1: tensor) -> tensor<2x3x4xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %0 = tensor.dim %arg0, %c0 : tensor<2x3x4xf32> + %1 = tensor.dim %arg0, %c1 : tensor<2x3x4xf32> + %2 = tensor.dim %arg0, %c2 : tensor<2x3x4xf32> + %3 = linalg.init_tensor [%0, %1, %2] : tensor + %4 = tensor.cast %arg1 : tensor to 
tensor<2x?x?xf32> + %5 = linalg.generic { + indexing_maps = [#map, #map, #map], + iterator_types = ["parallel", "parallel", "parallel"] + } ins(%arg0, %4 : tensor<2x3x4xf32>, tensor<2x?x?xf32>) + outs(%3 : tensor) { + ^bb0(%arg2 : f32, %arg3 : f32, %arg4 : f32): + %9 = arith.addf %arg2, %arg3 : f32 + linalg.yield %9 : f32 + } -> (tensor) + %6 = tensor.cast %5 : tensor to tensor<2x3x4xf32> + return %6: tensor<2x3x4xf32> + // CHECK: %[[CAST_ARG1:.*]] = tensor.cast %[[ARG1]] : tensor to tensor<2x3x4xf32> + // CHECK-NEXT: %[[GENERIC_OP:.*]] = linalg.generic + // CHECK-SAME: ins(%[[ARG0]], %[[CAST_ARG1]] : tensor<2x3x4xf32>, tensor<2x3x4xf32>) + // CHECK-SAME: outs({{.*}} : tensor<2x3x4xf32>) +} + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @static_output_with_cast +// CHECK-SAME: (%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> { +func @static_output_with_cast(%arg0 : tensor, %arg1: tensor, %arg2: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %0 = tensor.dim %arg2, %c0 : tensor<2x3x4xf32> + %1 = tensor.dim %arg2, %c1 : tensor<2x3x4xf32> + %2 = tensor.dim %arg2, %c2 : tensor<2x3x4xf32> + %3 = linalg.init_tensor [%0, %1, %2] : tensor + %4 = tensor.cast %3 : tensor to tensor<2x3x4xf32> + %5 = tensor.cast %arg1 : tensor to tensor<2x?x?xf32> + %6 = linalg.generic { + indexing_maps = [#map, #map, #map], + iterator_types = ["parallel", "parallel", "parallel"] + } ins(%arg0, %5 : tensor, tensor<2x?x?xf32>) + outs(%4 : tensor<2x3x4xf32>) { + ^bb0(%arg3 : f32, %arg4 : f32, %arg5 : f32): + %9 = arith.addf %arg3, %arg4 : f32 + linalg.yield %9 : f32 + } -> (tensor<2x3x4xf32>) + return %6: tensor<2x3x4xf32> + // CHECK: %[[CAST_ARG0:.*]] = tensor.cast %[[ARG0]] : tensor to tensor<2x3x4xf32> + // CHECK-NEXT: %[[CAST_ARG1:.*]] = tensor.cast %[[ARG1]] : tensor to tensor<2x3x4xf32> + // CHECK-NEXT: 
%[[GENERIC_OP:.*]] = linalg.generic + // CHECK-SAME: ins(%[[CAST_ARG0]], %[[CAST_ARG1]] : tensor<2x3x4xf32>, tensor<2x3x4xf32>) + // CHECK-SAME: outs({{.*}} : tensor<2x3x4xf32>) +} + +// ----- + +// This test checks the folding of tensor.cast operation when the source value of cast +// has more static information than the destination value. +#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +// CHECK-LABEL: func @cast_source +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x3x4xf32>, %[[ARG1:.*]]: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> { +func @cast_source(%arg0 : tensor<2x3x4xf32>, %arg1: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %0 = tensor.dim %arg0, %c0 : tensor<2x3x4xf32> + %1 = tensor.dim %arg0, %c1 : tensor<2x3x4xf32> + %2 = tensor.dim %arg0, %c2 : tensor<2x3x4xf32> + %3 = linalg.init_tensor [%0, %1, %2] : tensor + %4 = tensor.cast %arg0 : tensor<2x3x4xf32> to tensor<2x?x?xf32> + %5 = tensor.cast %arg1 : tensor<2x3x4xf32> to tensor<2x?x?xf32> + %6 = linalg.generic { + indexing_maps = [#map, #map, #map], + iterator_types = ["parallel", "parallel", "parallel"] + } ins(%4, %5 : tensor<2x?x?xf32>, tensor<2x?x?xf32>) + outs(%3 : tensor) { + ^bb0(%arg2 : f32, %arg3 : f32, %arg4 : f32): + %9 = arith.addf %arg2, %arg3 : f32 + linalg.yield %9 : f32 + } -> (tensor) + %7 = tensor.cast %6 : tensor to tensor<2x3x4xf32> + return %7: tensor<2x3x4xf32> + // CHECK: %[[GENERIC_OP:.*]] = linalg.generic + // CHECK-SAME: ins(%[[ARG0]], %[[ARG1]] : tensor<2x3x4xf32>, tensor<2x3x4xf32>) + // CHECK-SAME: outs({{.*}} : tensor<2x3x4xf32>) +} diff --git a/mlir/test/Dialect/Linalg/reshape_fusion.mlir b/mlir/test/Dialect/Linalg/reshape_fusion.mlir index 12d64651cdf36..5aebfcadc33e7 100644 --- a/mlir/test/Dialect/Linalg/reshape_fusion.mlir +++ b/mlir/test/Dialect/Linalg/reshape_fusion.mlir @@ -533,27 +533,28 @@ func @no_fuse_dynamic_dims(%arg0: tensor) -> tensor { // ----- -func 
@no_fuse_mismatched_dynamism(%arg0: tensor<1x1xi64>, %arg1: tensor) -> tensor<1xi64> { - %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<1x1xi64> into tensor<1xi64> - %1 = linalg.init_tensor [1] : tensor<1xi64> +func @no_fuse_mismatched_dynamism(%arg0: tensor<2x1xi64>, %arg1: tensor) -> tensor<2xi64> { + %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<2x1xi64> into tensor<2xi64> + %1 = linalg.init_tensor [2] : tensor<2xi64> %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} - ins(%0, %arg1 : tensor<1xi64>, tensor) - outs(%1 : tensor<1xi64>) { + ins(%0, %arg1 : tensor<2xi64>, tensor) + outs(%1 : tensor<2xi64>) { ^bb0(%arg4: i64, %arg5: i64, %arg6: i64): %3 = arith.addi %arg4, %arg5 : i64 linalg.yield %3 : i64 - } -> tensor<1xi64> - return %2 : tensor<1xi64> + } -> tensor<2xi64> + return %2 : tensor<2xi64> } // CHECK: func @no_fuse_mismatched_dynamism -// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1xi64> +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x1xi64> // CHECK-SAME: %[[ARG1:.+]]: tensor // CHECK: %[[RESHAPE:.+]] = tensor.collapse_shape %[[ARG0]] +// CHECK: %[[CAST:.+]] = tensor.cast %[[ARG1]] : tensor to tensor<2xi64> // CHECK: %[[GENERIC:.+]] = linalg.generic -// CHECK-SAME: ins(%[[RESHAPE]], %[[ARG1]] : tensor<1xi64>, tensor) +// CHECK-SAME: ins(%[[RESHAPE]], %[[CAST]] : tensor<2xi64>, tensor<2xi64>) // CHECK: return %[[GENERIC]] From 41cb504b7c4b18ac15830107431a0c1eec73a6b2 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 21 Feb 2022 17:13:13 +0900 Subject: [PATCH 401/748] [mlir][linalg][bufferize][NFC] Move interface impl to Linalg Transforms This is for consistency with other dialects. 
Differential Revision: https://reviews.llvm.org/D120190 --- .../BufferizableOpInterfaceImpl.h} | 12 ++++------- .../ComprehensiveBufferize/CMakeLists.txt | 14 ------------- .../BufferizableOpInterfaceImpl.cpp} | 19 +++++++---------- .../Dialect/Linalg/Transforms/CMakeLists.txt | 2 +- .../Transforms/ComprehensiveBufferizePass.cpp | 7 +++---- mlir/test/lib/Dialect/Linalg/CMakeLists.txt | 1 - .../Linalg/TestComprehensiveBufferize.cpp | 4 ++-- .../llvm-project-overlay/mlir/BUILD.bazel | 21 +------------------ .../mlir/test/BUILD.bazel | 1 - 9 files changed, 19 insertions(+), 62 deletions(-) rename mlir/include/mlir/Dialect/Linalg/{ComprehensiveBufferize/LinalgInterfaceImpl.h => Transforms/BufferizableOpInterfaceImpl.h} (84%) rename mlir/lib/Dialect/Linalg/{ComprehensiveBufferize/LinalgInterfaceImpl.cpp => Transforms/BufferizableOpInterfaceImpl.cpp} (97%) diff --git a/mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h b/mlir/include/mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h similarity index 84% rename from mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h rename to mlir/include/mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h index 010fd565faa92..13d6f189721be 100644 --- a/mlir/include/mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h @@ -1,4 +1,4 @@ -//===- LinalgInterfaceImpl.h - Linalg Impl. of BufferizableOpInterface ----===// +//===- BufferizableOpInterfaceImpl.h - Impl. of BufferizableOpInterface ---===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_LINALG_COMPREHENSIVEBUFFERIZE_LINALGINTERFACEIMPL_H -#define MLIR_DIALECT_LINALG_COMPREHENSIVEBUFFERIZE_LINALGINTERFACEIMPL_H +#ifndef MLIR_DIALECT_LINALG_BUFFERIZABLEOPINTERFACEIMPL_H +#define MLIR_DIALECT_LINALG_BUFFERIZABLEOPINTERFACEIMPL_H #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" @@ -15,8 +15,6 @@ namespace mlir { class DialectRegistry; namespace linalg { -namespace comprehensive_bufferize { -namespace linalg_ext { /// A function that matches anchor OpOperands for InitTensorOp elimination. /// If an OpOperand is matched, the function should populate the SmallVector @@ -53,9 +51,7 @@ LogicalResult insertSliceAnchoredInitTensorEliminationStep( void registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry); -} // namespace linalg_ext -} // namespace comprehensive_bufferize } // namespace linalg } // namespace mlir -#endif // MLIR_DIALECT_LINALG_COMPREHENSIVEBUFFERIZE_LINALGINTERFACEIMPL_H +#endif // MLIR_DIALECT_LINALG_BUFFERIZABLEOPINTERFACEIMPL_H diff --git a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/CMakeLists.txt b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/CMakeLists.txt index c72e99caf6dba..74104ec1d5ebe 100644 --- a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/CMakeLists.txt @@ -1,9 +1,6 @@ set(LLVM_OPTIONAL_SOURCES AffineInterfaceImpl.cpp - LinalgInterfaceImpl.cpp ModuleBufferization.cpp - StdInterfaceImpl.cpp - VectorInterfaceImpl.cpp ) add_mlir_dialect_library(MLIRAffineBufferizableOpInterfaceImpl @@ -14,17 +11,6 @@ add_mlir_dialect_library(MLIRAffineBufferizableOpInterfaceImpl MLIRBufferization ) -add_mlir_dialect_library(MLIRLinalgBufferizableOpInterfaceImpl - LinalgInterfaceImpl.cpp - - LINK_LIBS PUBLIC - MLIRBufferization - MLIRBufferizationTransforms - MLIRIR - MLIRLinalg - MLIRTensor -) - 
add_mlir_dialect_library(MLIRModuleBufferization ModuleBufferization.cpp diff --git a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp similarity index 97% rename from mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp rename to mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp index 7a6a04fb03873..f08be273248fe 100644 --- a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1,4 +1,4 @@ -//===- LinalgInterfaceImpl.cpp - Linalg Impl. of BufferizableOpInterface --===// +//===- BufferizableOpInterfaceImpl.cpp - Impl. of BufferizableOpInterface -===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h" +#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" @@ -17,7 +17,6 @@ using namespace mlir; using namespace linalg; -using namespace comprehensive_bufferize; using namespace mlir::bufferization; namespace { @@ -529,8 +528,7 @@ findValidInsertionPoint(Operation *initTensorOp, /// OpOperand. "Anchored" means that there is a path on the reverse SSA use-def /// chain, starting from the OpOperand and always following the aliasing /// OpOperand, that eventually ends at a single InitTensorOp. 
-LogicalResult -mlir::linalg::comprehensive_bufferize::linalg_ext::eliminateInitTensors( +LogicalResult mlir::linalg::eliminateInitTensors( Operation *op, BufferizationState &state, BufferizationAliasInfo &aliasInfo, AnchorMatchFn anchorMatchFunc, RewriteFn rewriteFunc, SmallVector &newOps) { @@ -632,10 +630,9 @@ mlir::linalg::comprehensive_bufferize::linalg_ext::eliminateInitTensors( /// /// Note that the newly inserted ExtractSliceOp may have to bufferize /// out-of-place due to RaW conflicts. -LogicalResult mlir::linalg::comprehensive_bufferize::linalg_ext:: - insertSliceAnchoredInitTensorEliminationStep( - Operation *op, BufferizationState &state, - BufferizationAliasInfo &aliasInfo, SmallVector &newOps) { +LogicalResult mlir::linalg::insertSliceAnchoredInitTensorEliminationStep( + Operation *op, BufferizationState &state, BufferizationAliasInfo &aliasInfo, + SmallVector &newOps) { return eliminateInitTensors( op, state, aliasInfo, /*anchorMatchFunc=*/ @@ -688,8 +685,8 @@ LogicalResult mlir::linalg::comprehensive_bufferize::linalg_ext:: newOps); } -void mlir::linalg::comprehensive_bufferize::linalg_ext:: - registerBufferizableOpInterfaceExternalModels(DialectRegistry ®istry) { +void mlir::linalg::registerBufferizableOpInterfaceExternalModels( + DialectRegistry ®istry) { registry.addOpInterface(); registry.addOpInterface(); registry.addOpInterface(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 57bef39d65338..f758546bb9afc 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -1,4 +1,5 @@ add_mlir_dialect_library(MLIRLinalgTransforms + BufferizableOpInterfaceImpl.cpp Bufferize.cpp CodegenStrategy.cpp ComprehensiveBufferizePass.cpp @@ -43,7 +44,6 @@ add_mlir_dialect_library(MLIRLinalgTransforms MLIRMemRef MLIRLinalg MLIRLinalgAnalysis - MLIRLinalgBufferizableOpInterfaceImpl MLIRLinalgUtils MLIRModuleBufferization MLIRSCF 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp index b4ac512463cb5..ed5067c0db220 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp @@ -13,9 +13,9 @@ #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" #include "mlir/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.h" -#include "mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h" #include "mlir/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.h" #include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" @@ -52,7 +52,7 @@ struct LinalgComprehensiveModuleBufferize arith::ArithmeticDialect, StandardOpsDialect, AffineDialect>(); affine_ext::registerBufferizableOpInterfaceExternalModels(registry); arith::registerBufferizableOpInterfaceExternalModels(registry); - linalg_ext::registerBufferizableOpInterfaceExternalModels(registry); + linalg::registerBufferizableOpInterfaceExternalModels(registry); scf::registerBufferizableOpInterfaceExternalModels(registry); std_ext::registerModuleBufferizationExternalModels(registry); tensor::registerBufferizableOpInterfaceExternalModels(registry); @@ -98,8 +98,7 @@ void LinalgComprehensiveModuleBufferize::runOnOperation() { opt.printConflicts = printConflicts; opt.testAnalysisOnly = testAnalysisOnly; if (initTensorElimination) { - opt.addPostAnalysisStep( - linalg_ext::insertSliceAnchoredInitTensorEliminationStep); + opt.addPostAnalysisStep(insertSliceAnchoredInitTensorEliminationStep); } } else { opt = *options; diff --git 
a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt index 92a3f74177eed..51996a7df576b 100644 --- a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt @@ -20,7 +20,6 @@ add_mlir_library(MLIRLinalgTestPasses MLIRBufferizationTransforms MLIRGPUTransforms MLIRLinalg - MLIRLinalgBufferizableOpInterfaceImpl MLIRLinalgTransforms MLIRLLVMToLLVMIRTranslation MLIRMemRef diff --git a/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp b/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp index 9eb68343eaadf..f4e9b871398fe 100644 --- a/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp @@ -17,9 +17,9 @@ #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" #include "mlir/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.h" -#include "mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" @@ -59,7 +59,7 @@ struct TestComprehensiveFunctionBufferize arith::ArithmeticDialect, AffineDialect>(); affine_ext::registerBufferizableOpInterfaceExternalModels(registry); arith::registerBufferizableOpInterfaceExternalModels(registry); - linalg_ext::registerBufferizableOpInterfaceExternalModels(registry); + linalg::registerBufferizableOpInterfaceExternalModels(registry); scf::registerBufferizableOpInterfaceExternalModels(registry); tensor::registerBufferizableOpInterfaceExternalModels(registry); vector::registerBufferizableOpInterfaceExternalModels(registry); diff --git 
a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 4fb2c01892f3f..bb61bb6a9c7b8 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6841,25 +6841,6 @@ cc_library( ], ) -cc_library( - name = "LinalgBufferizableOpInterfaceImpl", - srcs = [ - "lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp", - ], - hdrs = [ - "include/mlir/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.h", - ], - includes = ["include"], - deps = [ - ":BufferizationDialect", - ":BufferizationTransforms", - ":IR", - ":LinalgOps", - ":LinalgStructuredOpsIncGen", - ":TensorDialect", - ], -) - td_library( name = "LinalgDocTdFiles", srcs = ["include/mlir/Dialect/Linalg/IR/LinalgDoc.td"], @@ -7050,6 +7031,7 @@ cc_library( hdrs = [ "include/mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h", "include/mlir/Dialect/Linalg/Passes.h", + "include/mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h", "include/mlir/Dialect/Linalg/Transforms/CodegenStrategy.h", "include/mlir/Dialect/Linalg/Transforms/HoistPadding.h", "include/mlir/Dialect/Linalg/Transforms/Hoisting.h", @@ -7073,7 +7055,6 @@ cc_library( ":DialectUtils", ":IR", ":InferTypeOpInterface", - ":LinalgBufferizableOpInterfaceImpl", ":LinalgOps", ":LinalgPassIncGen", ":LinalgStructuredOpsIncGen", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 86aaabedd14f2..a48d35fd71434 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -396,7 +396,6 @@ cc_library( "//mlir:BufferizationTransforms", "//mlir:GPUDialect", "//mlir:IR", - "//mlir:LinalgBufferizableOpInterfaceImpl", "//mlir:LinalgOps", "//mlir:LinalgTransforms", "//mlir:MemRefDialect", From ae1ba6194f09b7e310fd49cf18a2829dcbeb7f6b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 21 
Feb 2022 00:39:26 -0800 Subject: [PATCH 402/748] [ELF] Replace uncompressed InputSectionBase::data() with rawData. NFC In many call sites we know uncompression cannot happen (non-SHF_ALLOC, or the data (even if compressed) must have been uncompressed by a previous pass). Prefer rawData in these cases. data() increases code size and prevents optimization on rawData. --- lld/ELF/AArch64ErrataFix.cpp | 6 +++--- lld/ELF/ARMErrataFix.cpp | 4 ++-- lld/ELF/Arch/X86_64.cpp | 2 +- lld/ELF/Driver.cpp | 2 +- lld/ELF/EhFrame.cpp | 2 +- lld/ELF/ICF.cpp | 4 ++-- lld/ELF/InputFiles.cpp | 4 ++-- lld/ELF/InputSection.cpp | 5 +++-- lld/ELF/InputSection.h | 14 +++++++------- lld/ELF/MarkLive.cpp | 2 +- lld/ELF/Relocations.cpp | 6 +++--- lld/ELF/SyntheticSections.cpp | 12 ++++++------ lld/ELF/Writer.cpp | 2 +- 13 files changed, 33 insertions(+), 32 deletions(-) diff --git a/lld/ELF/AArch64ErrataFix.cpp b/lld/ELF/AArch64ErrataFix.cpp index 9c9bd41d6e6c4..c0774be3fd2c9 100644 --- a/lld/ELF/AArch64ErrataFix.cpp +++ b/lld/ELF/AArch64ErrataFix.cpp @@ -349,7 +349,7 @@ static uint64_t scanCortexA53Errata843419(InputSection *isec, uint64_t &off, } uint64_t patchOff = 0; - const uint8_t *buf = isec->data().begin(); + const uint8_t *buf = isec->rawData.begin(); const ulittle32_t *instBuf = reinterpret_cast(buf + off); uint32_t instr1 = *instBuf++; uint32_t instr2 = *instBuf++; @@ -408,7 +408,7 @@ uint64_t Patch843419Section::getLDSTAddr() const { void Patch843419Section::writeTo(uint8_t *buf) { // Copy the instruction that we will be replacing with a branch in the // patchee Section. - write32le(buf, read32le(patchee->data().begin() + patcheeOffset)); + write32le(buf, read32le(patchee->rawData.begin() + patcheeOffset)); // Apply any relocation transferred from the original patchee section. 
relocateAlloc(buf, buf + getSize()); @@ -591,7 +591,7 @@ AArch64Err843419Patcher::patchInputSectionDescription( auto dataSym = std::next(codeSym); uint64_t off = (*codeSym)->value; uint64_t limit = - (dataSym == mapSyms.end()) ? isec->data().size() : (*dataSym)->value; + (dataSym == mapSyms.end()) ? isec->rawData.size() : (*dataSym)->value; while (off < limit) { uint64_t startAddr = isec->getVA(off); diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index bf277b298587d..bfeeb5db9f33b 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -265,7 +265,7 @@ static ScanResult scanCortexA8Errata657417(InputSection *isec, uint64_t &off, } ScanResult scanRes = {0, 0, nullptr}; - const uint8_t *buf = isec->data().begin(); + const uint8_t *buf = isec->rawData.begin(); // ARMv7-A Thumb 32-bit instructions are encoded 2 consecutive // little-endian halfwords. const ulittle16_t *instBuf = reinterpret_cast(buf + off); @@ -497,7 +497,7 @@ ARMErr657417Patcher::patchInputSectionDescription( while (thumbSym != mapSyms.end()) { auto nonThumbSym = std::next(thumbSym); uint64_t off = (*thumbSym)->value; - uint64_t limit = (nonThumbSym == mapSyms.end()) ? isec->data().size() + uint64_t limit = (nonThumbSym == mapSyms.end()) ? isec->rawData.size() : (*nonThumbSym)->value; while (off < limit) { diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index ebf0a479b62a1..8c2333cd9bd8c 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -263,7 +263,7 @@ bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file, Relocation &r = is.relocations[rIndex]; // Check if the relocation corresponds to a direct jmp. - const uint8_t *secContents = is.data().data(); + const uint8_t *secContents = is.rawData.data(); // If it is not a direct jmp instruction, there is nothing to do here. 
if (*(secContents + r.offset - 1) != 0xe9) return false; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 4f5f58dadbd06..4910a7d5a1633 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1923,7 +1923,7 @@ static void readSymbolPartitionSection(InputSectionBase *s) { if (!isa(sym) || !sym->includeInDynsym()) return; - StringRef partName = reinterpret_cast(s->data().data()); + StringRef partName = reinterpret_cast(s->rawData.data()); for (Partition &part : partitions) { if (part.name == partName) { sym->partition = part.getNumber(); diff --git a/lld/ELF/EhFrame.cpp b/lld/ELF/EhFrame.cpp index 9ac2ed772073a..794fe04346a99 100644 --- a/lld/ELF/EhFrame.cpp +++ b/lld/ELF/EhFrame.cpp @@ -42,7 +42,7 @@ class EhReader { private: template void failOn(const P *loc, const Twine &msg) { fatal("corrupted .eh_frame: " + msg + "\n>>> defined in " + - isec->getObjMsg((const uint8_t *)loc - isec->data().data())); + isec->getObjMsg((const uint8_t *)loc - isec->rawData.data())); } uint8_t readByte(); diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp index 4c2c0a76f01db..76beff491f52e 100644 --- a/lld/ELF/ICF.cpp +++ b/lld/ELF/ICF.cpp @@ -312,7 +312,7 @@ bool ICF::constantEq(const InputSection *secA, ArrayRef ra, template bool ICF::equalsConstant(const InputSection *a, const InputSection *b) { if (a->flags != b->flags || a->getSize() != b->getSize() || - a->data() != b->data()) + a->rawData != b->rawData) return false; // If two sections have different output sections, we cannot merge them. @@ -491,7 +491,7 @@ template void ICF::run() { // Initially, we use hash values to partition sections. parallelForEach(sections, [&](InputSection *s) { // Set MSB to 1 to avoid collisions with unique IDs. - s->eqClass[0] = xxHash64(s->data()) | (1U << 31); + s->eqClass[0] = xxHash64(s->rawData) | (1U << 31); }); // Perform 2 rounds of relocation hash propagation. 
2 is an empirical value to diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 4f5c69f4e08ac..e4509c1f78880 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -789,10 +789,10 @@ template static uint32_t readAndFeatures(const InputSection &sec) { using Elf_Note = typename ELFT::Note; uint32_t featuresSet = 0; - ArrayRef data = sec.data(); + ArrayRef data = sec.rawData; auto reportFatal = [&](const uint8_t *place, const char *msg) { fatal(toString(sec.file) + ":(" + sec.name + "+0x" + - Twine::utohexstr(place - sec.data().data()) + "): " + msg); + Twine::utohexstr(place - sec.rawData.data()) + "): " + msg); }; while (!data.empty()) { // Read one NOTE record. diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 01a10a797d5eb..7f6d275e327d4 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -367,6 +367,7 @@ template void InputSection::copyRelocations(uint8_t *buf, ArrayRef rels) { const TargetInfo &target = *elf::target; InputSectionBase *sec = getRelocatedSection(); + (void)sec->data(); // uncompress if needed for (const RelTy &rel : rels) { RelType type = rel.getType(config->isMips64EL); @@ -419,7 +420,7 @@ void InputSection::copyRelocations(uint8_t *buf, ArrayRef rels) { } int64_t addend = getAddend(rel); - const uint8_t *bufLoc = sec->data().begin() + rel.r_offset; + const uint8_t *bufLoc = sec->rawData.begin() + rel.r_offset; if (!RelTy::IsRela) addend = target.getImplicitAddend(bufLoc, type); @@ -1423,7 +1424,7 @@ void MergeInputSection::splitIntoPieces() { } SectionPiece *MergeInputSection::getSectionPiece(uint64_t offset) { - if (this->data().size() <= offset) + if (this->rawData.size() <= offset) fatal(toString(this) + ": offset is outside the section"); // If Offset is not at beginning of a section piece, it is not in the map. 
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index e4fb354592a09..a6413ff2e11ba 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -215,18 +215,18 @@ class InputSectionBase : public SectionBase { template llvm::ArrayRef getDataAs() const { - size_t s = data().size(); + size_t s = rawData.size(); assert(s % sizeof(T) == 0); - return llvm::makeArrayRef((const T *)data().data(), s / sizeof(T)); + return llvm::makeArrayRef((const T *)rawData.data(), s / sizeof(T)); } + mutable ArrayRef rawData; + protected: template void parseCompressedHeader(); void uncompress() const; - mutable ArrayRef rawData; - // This field stores the uncompressed size of the compressed data in rawData, // or -1 if rawData is not compressed (either because the section wasn't // compressed in the first place, or because we ended up uncompressing it). @@ -277,8 +277,8 @@ class MergeInputSection : public InputSectionBase { llvm::CachedHashStringRef getData(size_t i) const { size_t begin = pieces[i].inputOff; size_t end = - (pieces.size() - 1 == i) ? data().size() : pieces[i + 1].inputOff; - return {toStringRef(data().slice(begin, end - begin)), pieces[i].hash}; + (pieces.size() - 1 == i) ? rawData.size() : pieces[i + 1].inputOff; + return {toStringRef(rawData.slice(begin, end - begin)), pieces[i].hash}; } // Returns the SectionPiece at a given input section offset. 
@@ -300,7 +300,7 @@ struct EhSectionPiece { : inputOff(off), sec(sec), size(size), firstRelocation(firstRelocation) {} ArrayRef data() const { - return {sec->data().data() + this->inputOff, size}; + return {sec->rawData.data() + this->inputOff, size}; } size_t inputOff; diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index a55758b20f99c..b197dd45d765b 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -74,7 +74,7 @@ template class MarkLive { template static uint64_t getAddend(InputSectionBase &sec, const typename ELFT::Rel &rel) { - return target->getImplicitAddend(sec.data().begin() + rel.r_offset, + return target->getImplicitAddend(sec.rawData.begin() + rel.r_offset, rel.getType(config->isMips64EL)); } diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 20433b4552edd..8bc52ed2e3771 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -497,7 +497,7 @@ int64_t RelocationScanner::computeMipsAddend(const RelTy &rel, RelExpr expr, if (pairTy == R_MIPS_NONE) return 0; - const uint8_t *buf = sec.data().data(); + const uint8_t *buf = sec.rawData.data(); uint32_t symIndex = rel.getSymbol(config->isMips64EL); // To make things worse, paired relocations might not be contiguous in @@ -524,7 +524,7 @@ int64_t RelocationScanner::computeAddend(const RelTy &rel, RelExpr expr, if (RelTy::IsRela) { addend = getAddend(rel); } else { - const uint8_t *buf = sec.data().data(); + const uint8_t *buf = sec.rawData.data(); addend = target.getImplicitAddend(buf + rel.r_offset, type); } @@ -1326,7 +1326,7 @@ template void RelocationScanner::scanOne(RelTy *&i) { maybeReportUndefined(cast(sym), sec, offset)) return; - const uint8_t *relocatedAddr = sec.data().begin() + offset; + const uint8_t *relocatedAddr = sec.rawData.begin() + offset; RelExpr expr = target.getRelExpr(type, sym, relocatedAddr); // Ignore R_*_NONE and other marker relocations. 
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 37b6877d699d1..2a93e66e7c029 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -112,7 +112,7 @@ std::unique_ptr> MipsAbiFlagsSection::create() { create = true; std::string filename = toString(sec->file); - const size_t size = sec->data().size(); + const size_t size = sec->rawData.size(); // Older version of BFD (such as the default FreeBSD linker) concatenate // .MIPS.abiflags instead of merging. To allow for this case (or potential // zero padding) we ignore everything after the first Elf_Mips_ABIFlags @@ -121,7 +121,7 @@ std::unique_ptr> MipsAbiFlagsSection::create() { Twine(size) + " instead of " + Twine(sizeof(Elf_Mips_ABIFlags))); return nullptr; } - auto *s = reinterpret_cast(sec->data().data()); + auto *s = reinterpret_cast(sec->rawData.data()); if (s->version != 0) { error(filename + ": unexpected .MIPS.abiflags version " + Twine(s->version)); @@ -184,7 +184,7 @@ std::unique_ptr> MipsOptionsSection::create() { sec->markDead(); std::string filename = toString(sec->file); - ArrayRef d = sec->data(); + ArrayRef d = sec->rawData; while (!d.empty()) { if (d.size() < sizeof(Elf_Mips_Options)) { @@ -240,12 +240,12 @@ std::unique_ptr> MipsReginfoSection::create() { for (InputSectionBase *sec : sections) { sec->markDead(); - if (sec->data().size() != sizeof(Elf_Mips_RegInfo)) { + if (sec->rawData.size() != sizeof(Elf_Mips_RegInfo)) { error(toString(sec->file) + ": invalid size of .reginfo section"); return nullptr; } - auto *r = reinterpret_cast(sec->data().data()); + auto *r = reinterpret_cast(sec->rawData.data()); reginfo.ri_gprmask |= r->ri_gprmask; sec->getFile()->mipsGp0 = r->ri_gp_value; }; @@ -3535,7 +3535,7 @@ void ARMExidxSyntheticSection::writeTo(uint8_t *buf) { for (InputSection *isec : executableSections) { assert(isec->getParent() != nullptr); if (InputSection *d = findExidxSection(isec)) { - memcpy(buf + offset, d->data().data(), 
d->data().size()); + memcpy(buf + offset, d->rawData.data(), d->rawData.size()); d->relocateAlloc(buf + d->outSecOff, buf + d->outSecOff + d->getSize()); offset += d->getSize(); } else { diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 0282d7d6b5a78..cd43e79b82760 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1695,7 +1695,7 @@ static void fixSymbolsAfterShrinking() { if (!inputSec || !inputSec->bytesDropped) return; - const size_t OldSize = inputSec->data().size(); + const size_t OldSize = inputSec->rawData.size(); const size_t NewSize = OldSize - inputSec->bytesDropped; if (def->value > NewSize && def->value <= OldSize) { From ab28488efe6de6f8fa856a1dfd8c0320d41d7608 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Thu, 11 Feb 2021 00:15:56 +0000 Subject: [PATCH 403/748] [C++20][Modules][1/8] Track valid import state. In C++20 modules imports must be together and at the start of the module. Rather than growing more ad-hoc flags to test state, this keeps track of the phase of a valid module TU (first decl, global module frag, module, private module frag). If the phasing is broken (with some diagnostic) the pattern does not conform to a valid C++20 module, and we set the state accordingly. We can thus issue diagnostics when imports appear in the wrong places and decouple the C++20 modules state from other module variants (modules-ts and clang modules). Additionally, we attempt to diagnose wrong imports before trying to find the module where possible (the latter will generally emit an unhelpful diagnostic about the module not being available). Although this generally simplifies the handling of C++20 module import diagnostics, the motivation was that, in particular, it allows detecting invalid imports like: import module A; int some_decl(); import module B; where being in a module purview is insufficient to identify them. 
Differential Revision: https://reviews.llvm.org/D118893 --- .../clang/Basic/DiagnosticParseKinds.td | 4 + clang/include/clang/Parse/Parser.h | 14 +- clang/include/clang/Sema/Sema.h | 15 +- clang/lib/Interpreter/IncrementalParser.cpp | 5 +- clang/lib/Parse/ParseAST.cpp | 5 +- clang/lib/Parse/ParseObjc.cpp | 3 +- clang/lib/Parse/Parser.cpp | 91 ++++++++++-- clang/lib/Sema/SemaModule.cpp | 46 ++++-- .../Modules/cxx20-import-diagnostics-a.cpp | 140 ++++++++++++++++++ .../Clang/ClangModulesDeclVendor.cpp | 4 +- 10 files changed, 288 insertions(+), 39 deletions(-) create mode 100644 clang/test/Modules/cxx20-import-diagnostics-a.cpp diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e23810f402365..f21e841bcdd38 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1539,6 +1539,10 @@ def err_private_module_fragment_expected_semi : Error< def err_missing_before_module_end : Error<"expected %0 at end of module">; def err_unsupported_module_partition : Error< "sorry, module partitions are not yet supported">; +def err_import_not_allowed_here : Error< + "imports must immediately follow the module declaration">; +def err_import_in_wrong_fragment : Error< + "module%select{| partition}0 imports cannot be in the %select{global|private}1 module fragment">; def err_export_empty : Error<"export declaration cannot be empty">; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 981800a7e2356..08d492a7ec721 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -464,14 +464,17 @@ class Parser : public CodeCompletionHandler { void Initialize(); /// Parse the first top-level declaration in a translation unit. 
- bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result); + bool ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState); /// ParseTopLevelDecl - Parse one top-level declaration. Returns true if /// the EOF was encountered. - bool ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl = false); + bool ParseTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState); bool ParseTopLevelDecl() { DeclGroupPtrTy Result; - return ParseTopLevelDecl(Result); + Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; + return ParseTopLevelDecl(Result, IS); } /// ConsumeToken - Consume the current 'peek token' and lex the next one. @@ -3491,8 +3494,9 @@ class Parser : public CodeCompletionHandler { //===--------------------------------------------------------------------===// // Modules - DeclGroupPtrTy ParseModuleDecl(bool IsFirstDecl); - Decl *ParseModuleImport(SourceLocation AtLoc); + DeclGroupPtrTy ParseModuleDecl(Sema::ModuleImportState &ImportState); + Decl *ParseModuleImport(SourceLocation AtLoc, + Sema::ModuleImportState &ImportState); bool parseMisplacedModuleImport(); bool tryParseMisplacedModuleImport() { tok::TokenKind Kind = Tok.getKind(); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index c1e846c55dee7..dfa12ad40b72a 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -2949,11 +2949,24 @@ class Sema final { Implementation, ///< 'module X;' }; + /// An enumeration to represent the transition of states in parsing module + /// fragments and imports. If we are not parsing a C++20 TU, or we find + /// an error in state transition, the state is set to NotACXX20Module. + enum class ModuleImportState { + FirstDecl, ///< Parsing the first decl in a TU. + GlobalFragment, ///< after 'module;' but before 'module X;' + ImportAllowed, ///< after 'module X;' but before any non-import decl. + ImportFinished, ///< after any non-import decl. 
+ PrivateFragment, ///< after 'module :private;'. + NotACXX20Module ///< Not a C++20 TU, or an invalid state was found. + }; + /// The parser has processed a module-declaration that begins the definition /// of a module interface or implementation. DeclGroupPtrTy ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, ModuleDeclKind MDK, - ModuleIdPath Path, bool IsFirstDecl); + ModuleIdPath Path, + ModuleImportState &ImportState); /// The parser has processed a global-module-fragment declaration that begins /// the definition of the global module fragment of the current module unit. diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp index 4ade8b8bb0741..0f1ef3233a2a1 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -164,8 +164,9 @@ IncrementalParser::ParseOrWrapTopLevelDecl() { } Parser::DeclGroupPtrTy ADecl; - for (bool AtEOF = P->ParseFirstTopLevelDecl(ADecl); !AtEOF; - AtEOF = P->ParseTopLevelDecl(ADecl)) { + Sema::ModuleImportState ImportState; + for (bool AtEOF = P->ParseFirstTopLevelDecl(ADecl, ImportState); !AtEOF; + AtEOF = P->ParseTopLevelDecl(ADecl, ImportState)) { // If we got a null return and something *was* parsed, ignore it. This // is due to a top-level semicolon, an action override, or a parse error // skipping something. 
diff --git a/clang/lib/Parse/ParseAST.cpp b/clang/lib/Parse/ParseAST.cpp index 01510e8caf3b7..fd79ed3ca158b 100644 --- a/clang/lib/Parse/ParseAST.cpp +++ b/clang/lib/Parse/ParseAST.cpp @@ -154,8 +154,9 @@ void clang::ParseAST(Sema &S, bool PrintStats, bool SkipFunctionBodies) { llvm::TimeTraceScope TimeScope("Frontend"); P.Initialize(); Parser::DeclGroupPtrTy ADecl; - for (bool AtEOF = P.ParseFirstTopLevelDecl(ADecl); !AtEOF; - AtEOF = P.ParseTopLevelDecl(ADecl)) { + Sema::ModuleImportState ImportState; + for (bool AtEOF = P.ParseFirstTopLevelDecl(ADecl, ImportState); !AtEOF; + AtEOF = P.ParseTopLevelDecl(ADecl, ImportState)) { // If we got a null return and something *was* parsed, ignore it. This // is due to a top-level semicolon, an action override, or a parse error // skipping something. diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index f493ac9b92caf..08f131ed0d874 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -79,7 +79,8 @@ Parser::ParseObjCAtDirectives(ParsedAttributesWithRange &Attrs) { break; case tok::objc_import: if (getLangOpts().Modules || getLangOpts().DebuggerSupport) { - SingleDecl = ParseModuleImport(AtLoc); + Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; + SingleDecl = ParseModuleImport(AtLoc, IS); break; } Diag(AtLoc, diag::err_atimport); diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index ffa1e0f027f1d..87500a0405531 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -581,15 +581,20 @@ void Parser::DestroyTemplateIds() { /// top-level-declaration-seq[opt] private-module-fragment[opt] /// /// Note that in C, it is an error if there is no first declaration. 
-bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result) { +bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState) { Actions.ActOnStartOfTranslationUnit(); + // For C++20 modules, a module decl must be the first in the TU. We also + // need to track module imports. + ImportState = Sema::ModuleImportState::FirstDecl; + bool NoTopLevelDecls = ParseTopLevelDecl(Result, ImportState); + // C11 6.9p1 says translation units must have at least one top-level // declaration. C++ doesn't have this restriction. We also don't want to // complain if we have a precompiled header, although technically if the PCH // is empty we should still emit the (pedantic) diagnostic. // If the main file is a header, we're only pretending it's a TU; don't warn. - bool NoTopLevelDecls = ParseTopLevelDecl(Result, true); if (NoTopLevelDecls && !Actions.getASTContext().getExternalSource() && !getLangOpts().CPlusPlus && !getLangOpts().IsHeaderFile) Diag(diag::ext_empty_translation_unit); @@ -603,7 +608,8 @@ bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result) { /// top-level-declaration: /// declaration /// [C++20] module-import-declaration -bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { +bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, + Sema::ModuleImportState &ImportState) { DestroyTemplateIdAnnotationsRAIIObj CleanupRAII(*this); // Skip over the EOF token, flagging end of previous input for incremental @@ -647,13 +653,12 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { case tok::kw_module: module_decl: - Result = ParseModuleDecl(IsFirstDecl); + Result = ParseModuleDecl(ImportState); return false; - // tok::kw_import is handled by ParseExternalDeclaration. (Under the Modules - // TS, an import can occur within an export block.) 
+ case tok::kw_import: import_decl: { - Decl *ImportDecl = ParseModuleImport(SourceLocation()); + Decl *ImportDecl = ParseModuleImport(SourceLocation(), ImportState); Result = Actions.ConvertDeclToDeclGroup(ImportDecl); return false; } @@ -669,12 +674,14 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { Actions.ActOnModuleBegin(Tok.getLocation(), reinterpret_cast( Tok.getAnnotationValue())); ConsumeAnnotationToken(); + ImportState = Sema::ModuleImportState::NotACXX20Module; return false; case tok::annot_module_end: Actions.ActOnModuleEnd(Tok.getLocation(), reinterpret_cast( Tok.getAnnotationValue())); ConsumeAnnotationToken(); + ImportState = Sema::ModuleImportState::NotACXX20Module; return false; case tok::eof: @@ -718,6 +725,16 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, bool IsFirstDecl) { MaybeParseCXX11Attributes(attrs); Result = ParseExternalDeclaration(attrs); + // An empty Result might mean a line with ';' or some parsing error, ignore + // it. + if (Result) { + if (ImportState == Sema::ModuleImportState::FirstDecl) + // First decl was not modular. + ImportState = Sema::ModuleImportState::NotACXX20Module; + else if (ImportState == Sema::ModuleImportState::ImportAllowed) + // Non-imports disallow further imports. + ImportState = Sema::ModuleImportState::ImportFinished; + } return false; } @@ -887,11 +904,17 @@ Parser::ParseExternalDeclaration(ParsedAttributesWithRange &attrs, getCurScope(), CurParsedObjCImpl ? 
Sema::PCC_ObjCImplementation : Sema::PCC_Namespace); return nullptr; - case tok::kw_import: - SingleDecl = ParseModuleImport(SourceLocation()); - break; + case tok::kw_import: { + Sema::ModuleImportState IS = Sema::ModuleImportState::NotACXX20Module; + if (getLangOpts().CPlusPlusModules) { + llvm_unreachable("not expecting a c++20 import here"); + ProhibitAttributes(attrs); + } + SingleDecl = ParseModuleImport(SourceLocation(), IS); + } break; case tok::kw_export: if (getLangOpts().CPlusPlusModules || getLangOpts().ModulesTS) { + ProhibitAttributes(attrs); SingleDecl = ParseExportDeclaration(); break; } @@ -2291,7 +2314,8 @@ void Parser::ParseMicrosoftIfExistsExternalDeclaration() { /// attribute-specifier-seq[opt] ';' /// private-module-fragment: [C++2a] /// 'module' ':' 'private' ';' top-level-declaration-seq[opt] -Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { +Parser::DeclGroupPtrTy +Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { SourceLocation StartLoc = Tok.getLocation(); Sema::ModuleDeclKind MDK = TryConsumeToken(tok::kw_export) @@ -2311,7 +2335,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { // Parse a global-module-fragment, if present. 
if (getLangOpts().CPlusPlusModules && Tok.is(tok::semi)) { SourceLocation SemiLoc = ConsumeToken(); - if (!IsFirstDecl) { + if (ImportState != Sema::ModuleImportState::FirstDecl) { Diag(StartLoc, diag::err_global_module_introducer_not_at_start) << SourceRange(StartLoc, SemiLoc); return nullptr; @@ -2320,6 +2344,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { Diag(StartLoc, diag::err_module_fragment_exported) << /*global*/0 << FixItHint::CreateRemoval(StartLoc); } + ImportState = Sema::ModuleImportState::GlobalFragment; return Actions.ActOnGlobalModuleFragmentDecl(ModuleLoc); } @@ -2334,6 +2359,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { SourceLocation PrivateLoc = ConsumeToken(); DiagnoseAndSkipCXX11Attributes(); ExpectAndConsumeSemi(diag::err_private_module_fragment_expected_semi); + ImportState = Sema::ModuleImportState::PrivateFragment; return Actions.ActOnPrivateModuleFragmentDecl(ModuleLoc, PrivateLoc); } @@ -2361,7 +2387,7 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { ExpectAndConsumeSemi(diag::err_module_expected_semi); - return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, IsFirstDecl); + return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, ImportState); } /// Parse a module import declaration. This is essentially the same for @@ -2379,7 +2405,8 @@ Parser::DeclGroupPtrTy Parser::ParseModuleDecl(bool IsFirstDecl) { /// attribute-specifier-seq[opt] ';' /// 'export'[opt] 'import' header-name /// attribute-specifier-seq[opt] ';' -Decl *Parser::ParseModuleImport(SourceLocation AtLoc) { +Decl *Parser::ParseModuleImport(SourceLocation AtLoc, + Sema::ModuleImportState &ImportState) { SourceLocation StartLoc = AtLoc.isInvalid() ? Tok.getLocation() : AtLoc; SourceLocation ExportLoc; @@ -2428,6 +2455,42 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc) { return nullptr; } + // Diagnose mis-imports. 
+ bool SeenError = true; + switch (ImportState) { + case Sema::ModuleImportState::ImportAllowed: + SeenError = false; + break; + case Sema::ModuleImportState::FirstDecl: + case Sema::ModuleImportState::NotACXX20Module: + // TODO: These cases will be an error when partitions are implemented. + SeenError = false; + break; + case Sema::ModuleImportState::GlobalFragment: + // We can only have pre-processor directives in the global module + // fragment. We can, however have a header unit import here. + if (!HeaderUnit) + // We do not have partition support yet, so first arg is 0. + Diag(ImportLoc, diag::err_import_in_wrong_fragment) << 0 << 0; + else + SeenError = false; + break; + case Sema::ModuleImportState::ImportFinished: + if (getLangOpts().CPlusPlusModules) + Diag(ImportLoc, diag::err_import_not_allowed_here); + else + SeenError = false; + break; + case Sema::ModuleImportState::PrivateFragment: + // We do not have partition support yet, so first arg is 0. + Diag(ImportLoc, diag::err_import_in_wrong_fragment) << 0 << 1; + break; + } + if (SeenError) { + ExpectAndConsumeSemi(diag::err_module_expected_semi); + return nullptr; + } + DeclResult Import; if (HeaderUnit) Import = diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 85e58640044dc..9bed3cb769f70 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -80,12 +80,20 @@ Sema::ActOnGlobalModuleFragmentDecl(SourceLocation ModuleLoc) { return nullptr; } -Sema::DeclGroupPtrTy -Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, - ModuleDeclKind MDK, ModuleIdPath Path, bool IsFirstDecl) { +Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, + SourceLocation ModuleLoc, + ModuleDeclKind MDK, + ModuleIdPath Path, + ModuleImportState &ImportState) { assert((getLangOpts().ModulesTS || getLangOpts().CPlusPlusModules) && "should only have module decl in Modules TS or C++20"); + bool IsFirstDecl = ImportState == 
ModuleImportState::FirstDecl; + bool SeenGMF = ImportState == ModuleImportState::GlobalFragment; + // If any of the steps here fail, we count that as invalidating C++20 + // module state; + ImportState = ModuleImportState::NotACXX20Module; + // A module implementation unit requires that we are not compiling a module // of any kind. A module interface unit requires that we are not compiling a // module map. @@ -134,9 +142,13 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, ModuleScopes.back().Module->Kind == Module::GlobalModuleFragment) GlobalModuleFragment = ModuleScopes.back().Module; + assert((!getLangOpts().CPlusPlusModules || + SeenGMF == (bool)GlobalModuleFragment) && + "mismatched global module state"); + // In C++20, the module-declaration must be the first declaration if there // is no global module fragment. - if (getLangOpts().CPlusPlusModules && !IsFirstDecl && !GlobalModuleFragment) { + if (getLangOpts().CPlusPlusModules && !IsFirstDecl && !SeenGMF) { Diag(ModuleLoc, diag::err_module_decl_not_at_start); SourceLocation BeginLoc = ModuleScopes.empty() @@ -231,6 +243,10 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, TU->setModuleOwnershipKind(Decl::ModuleOwnershipKind::ModulePrivate); TU->setLocalOwningModule(Mod); + // We are in the module purview, but before any other (non import) + // statements, so imports are allowed. + ImportState = ModuleImportState::ImportAllowed; + // FIXME: Create a ModuleDecl. return nullptr; } @@ -301,10 +317,10 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, SourceLocation ExportLoc, SourceLocation ImportLoc, ModuleIdPath Path) { - // Flatten the module path for a Modules TS module name. + // Flatten the module path for a C++20 or Modules TS module name. 
std::pair ModuleNameLoc; - if (getLangOpts().ModulesTS) { - std::string ModuleName; + std::string ModuleName; + if (getLangOpts().CPlusPlusModules || getLangOpts().ModulesTS) { for (auto &Piece : Path) { if (!ModuleName.empty()) ModuleName += "."; @@ -314,6 +330,14 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, Path = ModuleIdPath(ModuleNameLoc); } + // Diagnose self-import before attempting a load. + if (getLangOpts().CPlusPlusModules && isCurrentModulePurview() && + getCurrentModule()->Name == ModuleName) { + Diag(ImportLoc, diag::err_module_self_import) + << ModuleName << getLangOpts().CurrentModule; + return true; + } + Module *Mod = getModuleLoader().loadModule(ImportLoc, Path, Module::AllVisible, /*IsInclusionDirective=*/false); @@ -342,11 +366,9 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, // FIXME: we should support importing a submodule within a different submodule // of the same top-level module. Until we do, make it an error rather than // silently ignoring the import. - // Import-from-implementation is valid in the Modules TS. FIXME: Should we - // warn on a redundant import of the current module? - // FIXME: Import of a module from an implementation partition of the same - // module is permitted. - if (Mod->getTopLevelModuleName() == getLangOpts().CurrentModule && + // FIXME: Should we warn on a redundant import of the current module? + if (!getLangOpts().CPlusPlusModules && + Mod->getTopLevelModuleName() == getLangOpts().CurrentModule && (getLangOpts().isCompilingModule() || !getLangOpts().ModulesTS)) { Diag(ImportLoc, getLangOpts().isCompilingModule() ? 
diag::err_module_self_import diff --git a/clang/test/Modules/cxx20-import-diagnostics-a.cpp b/clang/test/Modules/cxx20-import-diagnostics-a.cpp new file mode 100644 index 0000000000000..fd4085bcb4713 --- /dev/null +++ b/clang/test/Modules/cxx20-import-diagnostics-a.cpp @@ -0,0 +1,140 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=0 -x c++ %s \ +// RUN: -o %t/B.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=1 -x c++ %s \ +// RUN: -o %t/C.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=2 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/AOK1.pcm + +// RUN: %clang_cc1 -std=c++20 -S -D TU=3 -x c++ %s \ +// RUN: -fmodule-file=%t/AOK1.pcm -o %t/tu_3.s -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=4 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/BC.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -S -D TU=5 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/tu_5.s -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=6 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -o %t/D.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=7 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -o %t/D.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -S -D TU=8 -x c++ %s \ +// RUN: -fmodule-file=%t/B.pcm -o %t/tu_8.s -verify + +// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=9 -x c++ %s \ +// RUN: -o %t/B.pcm -verify + +// RUN: %clang_cc1 -std=c++20 -emit-obj -D TU=10 -x c++ %s \ +// RUN: -fmodule-file=%t/C.pcm -o %t/impl.o + +// Test diagnostics for incorrect module import sequences. 
+ +#if TU == 0 + +export module B; + +int foo (); + +// expected-no-diagnostics + +#elif TU == 1 + +export module C; + +int bar (); + +// expected-no-diagnostics + +#elif TU == 2 + +export module AOK1; + +import B; +export import C; + +export int theAnswer (); + +// expected-no-diagnostics + +#elif TU == 3 + +module; + +module AOK1; + +export import C; // expected-error {{export declaration can only be used within a module interface unit}} + +int theAnswer () { return 42; } + +#elif TU == 4 + +export module BC; + +export import B; + +int foo () { return 10; } + +import C; // expected-error {{imports must immediately follow the module declaration}} + +#elif TU == 5 + +module B; // implicitly imports B. + +int foo () { return 10; } + +import C; // expected-error {{imports must immediately follow the module declaration}} + +#elif TU == 6 + +module; +// We can only have preprocessor commands here, which could include an include +// translated header unit. However those are identified specifically by the +// preprocessor; non-preprocessed user code should not contain an import here. 
+import B; // expected-error {{module imports cannot be in the global module fragment}} + +export module D; + +int delta (); + +#elif TU == 7 + +export module D; + +int delta (); + +module :private; + +import B; // expected-error {{module imports cannot be in the private module fragment}} + +#elif TU == 8 + +module B; + +import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}} + +#elif TU == 9 + +export module B; + +import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}} + +#elif TU == 10 + +int x; + +import C; + +int baz() { return 6174; } + +// expected-no-diagnostics + +#else +#error "no MODE set" +#endif diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp index 169f1612afc5f..a15e0d8c588e6 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp @@ -726,8 +726,8 @@ ClangModulesDeclVendor::Create(Target &target) { parser->Initialize(); clang::Parser::DeclGroupPtrTy parsed; - - while (!parser->ParseTopLevelDecl(parsed)) + auto ImportState = clang::Sema::ModuleImportState::NotACXX20Module; + while (!parser->ParseTopLevelDecl(parsed, ImportState)) ; return new ClangModulesDeclVendorImpl(std::move(diagnostics_engine), From cf964eb5bd666c870cc21963fc5bf017699c29d7 Mon Sep 17 00:00:00 2001 From: Simon Moll Date: Mon, 21 Feb 2022 10:37:47 +0100 Subject: [PATCH 404/748] [VE] v512i1 mask arithmetic isel Packed vector and mask registers (v512) are composed of two v256 subregisters that occupy the even and odd element positions. We add packing support SDNodes (vec_unpack_lo|hi and vec_pack) and splitting of v512i1 mask arithmetic ops with those. 
Reviewed By: kaz7 Differential Revision: https://reviews.llvm.org/D120053 --- llvm/lib/Target/VE/VECustomDAG.cpp | 25 +++++++++++++ llvm/lib/Target/VE/VECustomDAG.h | 12 +++++++ llvm/lib/Target/VE/VEISelLowering.cpp | 25 +++++++++++++ llvm/lib/Target/VE/VEISelLowering.h | 11 ++++-- llvm/lib/Target/VE/VEInstrInfo.td | 12 +++++++ llvm/lib/Target/VE/VEInstrPatternsVec.td | 13 +++++++ llvm/test/CodeGen/VE/Packed/mask_binary.ll | 42 ++++++++++++++++++++++ 7 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/VE/Packed/mask_binary.ll diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp index 7b12bb898c391..d605cdcc7ee15 100644 --- a/llvm/lib/Target/VE/VECustomDAG.cpp +++ b/llvm/lib/Target/VE/VECustomDAG.cpp @@ -41,6 +41,17 @@ bool isMaskType(EVT SomeVT) { return SomeVT.getVectorElementType() == MVT::i1; } +bool isMaskArithmetic(SDValue Op) { + switch (Op.getOpcode()) { + default: + return false; + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return isMaskType(Op.getValueType()); + } +} + /// \returns the VVP_* SDNode opcode corresponsing to \p OC. Optional getVVPOpcode(unsigned Opcode) { switch (Opcode) { @@ -206,4 +217,18 @@ SDValue VECustomDAG::annotateLegalAVL(SDValue AVL) const { return getNode(VEISD::LEGALAVL, AVL.getValueType(), AVL); } +SDValue VECustomDAG::getUnpack(EVT DestVT, SDValue Vec, PackElem Part, + SDValue AVL) { + // TODO: Peek through VEC_PACK and VEC_BROADCAST(REPL_ ..) operands. + unsigned OC = + (Part == PackElem::Lo) ? VEISD::VEC_UNPACK_LO : VEISD::VEC_UNPACK_HI; + return DAG.getNode(OC, DL, DestVT, Vec, AVL); +} + +SDValue VECustomDAG::getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, + SDValue AVL) { + // TODO: Peek through VEC_UNPACK_LO|HI operands. 
+ return DAG.getNode(VEISD::VEC_PACK, DL, DestVT, LoVec, HiVec, AVL); +} + } // namespace llvm diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h index ff57645b4d11a..4adceef341f48 100644 --- a/llvm/lib/Target/VE/VECustomDAG.h +++ b/llvm/lib/Target/VE/VECustomDAG.h @@ -29,6 +29,8 @@ bool isPackedVectorType(EVT SomeVT); bool isMaskType(EVT SomeVT); +bool isMaskArithmetic(SDValue Op); + bool isVVPOrVEC(unsigned); bool maySafelyIgnoreMask(SDValue Op); @@ -86,6 +88,11 @@ MVT getLegalVectorType(Packing P, MVT ElemVT); // Whether this type belongs to a packed mask or vector register. Packing getTypePacking(EVT); +enum class PackElem : int8_t { + Lo = 0, // Integer (63, 32] + Hi = 1 // Float (32, 0] +}; + class VECustomDAG { SelectionDAG &DAG; SDLoc DL; @@ -127,6 +134,11 @@ class VECustomDAG { SDValue getUNDEF(EVT VT) const { return DAG.getUNDEF(VT); } /// } getNode + /// Packing { + SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL); + SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL); + /// } Packing + SDValue getConstant(uint64_t Val, EVT VT, bool IsTarget = false, bool IsOpaque = false) const; diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 0585bfa9ae78c..38182dca7ba76 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -299,6 +299,9 @@ void VETargetLowering::initVPUActions() { for (MVT LegalMaskVT : AllMaskVTs) setOperationAction(ISD::BUILD_VECTOR, LegalMaskVT, Custom); + for (unsigned Opc : {ISD::AND, ISD::OR, ISD::XOR}) + setOperationAction(Opc, MVT::v512i1, Custom); + for (MVT LegalVecVT : AllVectorVTs) { setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal); @@ -903,6 +906,9 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const { TARGET_NODE_CASE(MEMBARRIER) TARGET_NODE_CASE(RET_FLAG) TARGET_NODE_CASE(TS1AM) + 
TARGET_NODE_CASE(VEC_UNPACK_LO) + TARGET_NODE_CASE(VEC_UNPACK_HI) + TARGET_NODE_CASE(VEC_PACK) TARGET_NODE_CASE(VEC_BROADCAST) TARGET_NODE_CASE(REPL_I32) TARGET_NODE_CASE(REPL_F32) @@ -1746,6 +1752,8 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { // Translate into a VEC_*/VVP_* layer operation. #define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME: #include "VVPNodes.def" + if (isMaskArithmetic(Op) && isPackedVectorType(Op.getValueType())) + return splitMaskArithmetic(Op, DAG); return lowerToVVP(Op, DAG); } } @@ -2690,6 +2698,23 @@ bool VETargetLowering::hasAndNot(SDValue Y) const { return true; } +SDValue VETargetLowering::splitMaskArithmetic(SDValue Op, + SelectionDAG &DAG) const { + VECustomDAG CDAG(DAG, Op); + SDValue AVL = + CDAG.getConstant(Op.getValueType().getVectorNumElements(), MVT::i32); + SDValue A = Op->getOperand(0); + SDValue B = Op->getOperand(1); + SDValue LoA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Lo, AVL); + SDValue HiA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Hi, AVL); + SDValue LoB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Lo, AVL); + SDValue HiB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Hi, AVL); + unsigned Opc = Op.getOpcode(); + auto LoRes = CDAG.getNode(Opc, MVT::v256i1, {LoA, LoB}); + auto HiRes = CDAG.getNode(Opc, MVT::v256i1, {HiA, HiB}); + return CDAG.getPack(MVT::v512i1, LoRes, HiRes, AVL); +} + SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const { // Can we represent this as a VVP node. const unsigned Opcode = Op->getOpcode(); diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h index 30d1faa7495d8..604f34fa2086a 100644 --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -38,8 +38,14 @@ enum NodeType : unsigned { MEMBARRIER, // Compiler barrier only; generate a no-op. RET_FLAG, // Return with a flag operand. TS1AM, // A TS1AM instruction used for 1/2 bytes swap. 
-  VEC_BROADCAST, // A vector broadcast instruction.
-                 //    0: scalar value, 1: VL
+  VEC_UNPACK_LO, // unpack the lo v256 slice of a packed v512 vector.
+  VEC_UNPACK_HI, // unpack the hi v256 slice of a packed v512 vector.
+                 //    0: v512 vector, 1: AVL
+  VEC_PACK, // pack a lo and a hi vector into one v512 vector
+            //    0: v256 lo vector, 1: v256 hi vector, 2: AVL
+
+  VEC_BROADCAST, // A vector broadcast instruction.
+                 //    0: scalar value, 1: VL
   REPL_I32,
   REPL_F32, // Replicate subregister to other half.
@@ -182,6 +188,7 @@ class VETargetLowering : public TargetLowering {
   SDValue lowerToVVP(SDValue Op, SelectionDAG &DAG) const;
   SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const;
+  SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const;
   /// } VVPLowering

   /// Custom DAGCombine {
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 717427c3f48da..dc9fa4352170b 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -2293,6 +2293,18 @@ class IsVLVT : SDTCisVT;
 def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2,
                            [SDTCisVec<0>, IsVLVT<2>]>>;

+///// Packed mode Support /////
+// unpack the lo part of this vector
+def vec_unpack_lo : SDNode<"VEISD::VEC_UNPACK_LO", SDTypeProfile<1, 2,
+                           [SDTCisVec<0>, SDTCisVec<1>, IsVLVT<2>]>>;
+// unpack the hi part of this vector
+def vec_unpack_hi : SDNode<"VEISD::VEC_UNPACK_HI", SDTypeProfile<1, 2,
+                           [SDTCisVec<0>, SDTCisVec<1>, IsVLVT<2>]>>;
+// re-pack v256i32, v256f32 back into one v512.32
+def vec_pack : SDNode<"VEISD::VEC_PACK", SDTypeProfile<1, 3,
+                      [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>,
+                       SDTCisSameNumEltsAs<1,2>, IsVLVT<3>]>>;
+
 // replicate lower 32bit to upper 32bit (f32 scalar replication).
def repl_f32 : SDNode<"VEISD::REPL_F32", SDTypeProfile<1, 1, diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td index f33c4ac0fb42a..e17b418201c65 100644 --- a/llvm/lib/Target/VE/VEInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td @@ -112,3 +112,16 @@ class Mask_Binary : def: Mask_Binary; def: Mask_Binary; def: Mask_Binary; + +///// Packing support ///// + +// v256i1 <> v512i1 +def : Pat<(v256i1 (vec_unpack_lo v512i1:$vm, (i32 srcvalue))), + (EXTRACT_SUBREG $vm, sub_vm_odd)>; +def : Pat<(v256i1 (vec_unpack_hi v512i1:$vm, (i32 srcvalue))), + (EXTRACT_SUBREG $vm, sub_vm_even)>; +def : Pat<(v512i1 (vec_pack v256i1:$vlo, v256i1:$vhi, (i32 srcvalue))), + (INSERT_SUBREG (INSERT_SUBREG + (v512i1 (IMPLICIT_DEF)), + $vlo, sub_vm_odd), + $vhi, sub_vm_even)>; diff --git a/llvm/test/CodeGen/VE/Packed/mask_binary.ll b/llvm/test/CodeGen/VE/Packed/mask_binary.ll new file mode 100644 index 0000000000000..d6b2d7fdcfe15 --- /dev/null +++ b/llvm/test/CodeGen/VE/Packed/mask_binary.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=ve -mattr=+vpu | FileCheck %s + +; Function Attrs: nounwind +define fastcc <512 x i1> @and_mm_v512i1(<512 x i1> %x, <512 x i1> %y) { +; CHECK-LABEL: and_mm_v512i1: +; CHECK: # %bb.0: +; CHECK-NEXT: andm %vm6, %vm2, %vm4 +; CHECK-NEXT: andm %vm7, %vm3, %vm5 +; CHECK-NEXT: andm %vm2, %vm0, %vm6 +; CHECK-NEXT: andm %vm3, %vm0, %vm7 +; CHECK-NEXT: b.l.t (, %s10) + %z = and <512 x i1> %x, %y + ret <512 x i1> %z +} + +; Function Attrs: nounwind +define fastcc <512 x i1> @or_mm_v512i1(<512 x i1> %x, <512 x i1> %y) { +; CHECK-LABEL: or_mm_v512i1: +; CHECK: # %bb.0: +; CHECK-NEXT: orm %vm6, %vm2, %vm4 +; CHECK-NEXT: orm %vm7, %vm3, %vm5 +; CHECK-NEXT: andm %vm2, %vm0, %vm6 +; CHECK-NEXT: andm %vm3, %vm0, %vm7 +; CHECK-NEXT: b.l.t (, %s10) + %z = or <512 x i1> %x, %y + ret <512 x i1> %z +} + +; Function Attrs: nounwind +define fastcc 
<512 x i1> @xor_mm_v512i1(<512 x i1> %x, <512 x i1> %y) { +; CHECK-LABEL: xor_mm_v512i1: +; CHECK: # %bb.0: +; CHECK-NEXT: xorm %vm6, %vm2, %vm4 +; CHECK-NEXT: xorm %vm7, %vm3, %vm5 +; CHECK-NEXT: andm %vm2, %vm0, %vm6 +; CHECK-NEXT: andm %vm3, %vm0, %vm7 +; CHECK-NEXT: b.l.t (, %s10) + %z = xor <512 x i1> %x, %y + ret <512 x i1> %z +} + From 2e153038b4a7003f1032ab0340c4e9cce3a066cd Mon Sep 17 00:00:00 2001 From: Lian Wang Date: Mon, 21 Feb 2022 09:58:55 +0000 Subject: [PATCH 405/748] [RISCV] Add tests for SHFLI and UNSHFLI aliases in Zbp extension Supplement tests alias of SHFLI and UNSHFLI instructions. RV32: zip8/zip4/zip2/unzip8/unzip4/unzip2 RV64: zip8.w/zip4.w/zip2.w/zip.w/zip8/zip4/zip2/zip/ unzip8.w/unzip4.w/unzip2.w/unzip.w/unzip8/unzip4/unzip2/unzip Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D120015 --- llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll | 54 ++++++ llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll | 188 ++++++++++++++++++- 2 files changed, 238 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll index 4f1dd3d588844..816a27b2be4da 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll @@ -92,6 +92,24 @@ define i32 @shfli32(i32 %a) nounwind { ret i32 %tmp } +define i32 @zip4i32(i32 %a) nounwind { +; RV32ZBP-LABEL: zip4i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip4 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 12) + ret i32 %tmp +} + +define i32 @zip2i32(i32 %a) nounwind { +; RV32ZBP-LABEL: zip2i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip2 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 14) + ret i32 %tmp +} + define i32 @zipi32(i32 %a) nounwind { ; RV32ZBP-LABEL: zipi32: ; RV32ZBP: # %bb.0: @@ -101,6 +119,15 @@ define i32 @zipi32(i32 %a) nounwind { ret i32 %tmp } +define i32 @zip8i32(i32 %a) nounwind { +; RV32ZBP-LABEL: 
zip8i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip8 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 8) + ret i32 %tmp +} + declare i32 @llvm.riscv.unshfl.i32(i32 %a, i32 %b) define i32 @unshfl32(i32 %a, i32 %b) nounwind { @@ -131,6 +158,24 @@ define i32 @unshfli32(i32 %a) nounwind { ret i32 %tmp } +define i32 @unzip4i32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzip4i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip4 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 12) + ret i32 %tmp +} + +define i32 @unzip2i32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzip2i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip2 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 14) + ret i32 %tmp +} + define i32 @unzipi32(i32 %a) nounwind { ; RV32ZBP-LABEL: unzipi32: ; RV32ZBP: # %bb.0: @@ -140,6 +185,15 @@ define i32 @unzipi32(i32 %a) nounwind { ret i32 %tmp } +define i32 @unzip8i32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzip8i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip8 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 8) + ret i32 %tmp +} + declare i32 @llvm.riscv.xperm.n.i32(i32 %a, i32 %b) define i32 @xpermn32(i32 %a, i32 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll index 50b2a1b322c92..5a1736a250e57 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll @@ -100,8 +100,26 @@ define signext i32 @shfli32(i32 signext %a) nounwind { ret i32 %tmp } -define signext i32 @zip_w(i32 signext %a) nounwind { -; RV64ZBP-LABEL: zip_w: +define signext i32 @zip4wi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zip4wi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip4.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 12) + ret i32 %tmp +} + +define signext i32 @zip2wi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zip2wi32: +; 
RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip2.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 14) + ret i32 %tmp +} + +define signext i32 @zipwi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zipwi32: ; RV64ZBP: # %bb.0: ; RV64ZBP-NEXT: zip.w a0, a0 ; RV64ZBP-NEXT: ret @@ -109,6 +127,15 @@ define signext i32 @zip_w(i32 signext %a) nounwind { ret i32 %tmp } +define signext i32 @zip8wi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zip8wi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip8.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 8) + ret i32 %tmp +} + declare i32 @llvm.riscv.unshfl.i32(i32 %a, i32 %b) define signext i32 @unshfl32(i32 signext %a, i32 signext %b) nounwind { @@ -141,8 +168,26 @@ define signext i32 @unshfli32(i32 signext %a) nounwind { ret i32 %tmp } -define signext i32 @unzip_w(i32 signext %a) nounwind { -; RV64ZBP-LABEL: unzip_w: +define signext i32 @unzip4wi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzip4wi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip4.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 12) + ret i32 %tmp +} + +define signext i32 @unzip2wi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzip2wi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip2.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 14) + ret i32 %tmp +} + +define signext i32 @unzipwi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzipwi32: ; RV64ZBP: # %bb.0: ; RV64ZBP-NEXT: unzip.w a0, a0 ; RV64ZBP-NEXT: ret @@ -150,6 +195,15 @@ define signext i32 @unzip_w(i32 signext %a) nounwind { ret i32 %tmp } +define signext i32 @unzip8wi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzip8wi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip8.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 8) + ret i32 %tmp +} + declare i64 @llvm.riscv.grev.i64(i64 %a, i64 %b) define i64 @grev64(i64 %a, i64 %b) nounwind { @@ -242,6 
+296,69 @@ define i64 @shfli64(i64 %a) nounwind { ret i64 %tmp } +define i64 @zip4wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip4wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip4.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 12) + ret i64 %tmp +} + +define i64 @zip2wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip2wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip2.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 14) + ret i64 %tmp +} + +define i64 @zipwi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zipwi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 15) + ret i64 %tmp +} + +define i64 @zip8i64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip8i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip8 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 24) + ret i64 %tmp +} + +define i64 @zip4i64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip4i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip4 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 28) + ret i64 %tmp +} + +define i64 @zip2i64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip2i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip2 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 30) + ret i64 %tmp +} + +define i64 @zipi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zipi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 31) + ret i64 %tmp +} + declare i64 @llvm.riscv.unshfl.i64(i64 %a, i64 %b) define i64 @unshfl64(i64 %a, i64 %b) nounwind { @@ -272,6 +389,69 @@ define i64 @unshfli64(i64 %a) nounwind { ret i64 %tmp } +define i64 @unzip4wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzip4wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip4.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 12) + ret i64 %tmp +} + +define i64 @unzip2wi64(i64 %a) 
nounwind { +; RV64ZBP-LABEL: unzip2wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip2.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 14) + ret i64 %tmp +} + +define i64 @unzipwi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzipwi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 15) + ret i64 %tmp +} + +define i64 @unzip8i64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzip8i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip8 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 24) + ret i64 %tmp +} + +define i64 @unzip4i64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzip4i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip4 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 28) + ret i64 %tmp +} + +define i64 @unzip2i64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzip2i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip2 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 30) + ret i64 %tmp +} + +define i64 @unzipi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzipi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 31) + ret i64 %tmp +} + declare i64 @llvm.riscv.xperm.n.i64(i64 %a, i64 %b) define i64 @xpermn64(i64 %a, i64 %b) nounwind { From 722ad3c48d639f9072120b452184cd04652f823e Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Mon, 21 Feb 2022 11:23:15 +0100 Subject: [PATCH 406/748] [DebugInfo][test] XFAIL DebugInfo/Generic/no-empty-child-vars.ll on SPARC `DebugInfo/Generic/no-empty-child-vars.ll` `FAIL`s on SPARC. As discussed in D95617 , this is yet another instance of Issue #46473. As was done for other failures due to this bug, this patch `XFAIL`s the test. Tested on `sparcv9-sun-solaris2.11`. 
Differential Revision: https://reviews.llvm.org/D120238 --- llvm/test/DebugInfo/Generic/no-empty-child-vars.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/DebugInfo/Generic/no-empty-child-vars.ll b/llvm/test/DebugInfo/Generic/no-empty-child-vars.ll index 70e5ead6fc59d..53579580db859 100644 --- a/llvm/test/DebugInfo/Generic/no-empty-child-vars.ll +++ b/llvm/test/DebugInfo/Generic/no-empty-child-vars.ll @@ -1,5 +1,8 @@ ; RUN: %llc_dwarf %s -o - -filetype=obj | llvm-dwarfdump - | FileCheck %s -implicit-check-not=DW_TAG ; +; Issue #46473 +; XFAIL: sparc +; ; This tests that we do not create concrete variable DIEs for variables that ; have no location -- for both ways that LLVM-IR can express a variable with ; no location. It's possible to: From 9a547e7009f7502f6e32361f6e8812837a30576f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 18 Feb 2022 11:55:09 +0000 Subject: [PATCH 407/748] [StableHashing] Hash vregs with multiple defs This allows stableHashValue to be used on Machine IR that is not in SSA form. Differential Revision: https://reviews.llvm.org/D120121 --- llvm/lib/CodeGen/MachineStableHash.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 0803c2b8b85a3..6b213f8d0bdfc 100644 --- a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -64,7 +64,10 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) { case MachineOperand::MO_Register: if (Register::isVirtualRegister(MO.getReg())) { const MachineRegisterInfo &MRI = MO.getParent()->getMF()->getRegInfo(); - return MRI.getVRegDef(MO.getReg())->getOpcode(); + SmallVector DefOpcodes; + for (auto &Def : MRI.def_instructions(MO.getReg())) + DefOpcodes.push_back(Def.getOpcode()); + return hash_combine_range(DefOpcodes.begin(), DefOpcodes.end()); } // Register operands don't have target flags. 
From b32ead41b16af993918b14b243d08d04dcdcd4c9 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 21 Feb 2022 11:43:40 +0100 Subject: [PATCH 408/748] Increase the limit on parser diagnostics We're really close to the limit $ grep -c DIAG tools/clang/include/clang/Basic/DiagnosticParseKinds.inc 598 --- clang/include/clang/Basic/DiagnosticIDs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index ba5f5acc8ce68..8139ffd375a28 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -34,7 +34,7 @@ namespace clang { DIAG_SIZE_FRONTEND = 150, DIAG_SIZE_SERIALIZATION = 120, DIAG_SIZE_LEX = 400, - DIAG_SIZE_PARSE = 600, + DIAG_SIZE_PARSE = 700, DIAG_SIZE_AST = 250, DIAG_SIZE_COMMENT = 100, DIAG_SIZE_CROSSTU = 100, From 18bfc577088df93334f46479aad1f6c15a34d907 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Mon, 21 Feb 2022 17:48:08 +0700 Subject: [PATCH 409/748] [Test] Add failing test for PR53969 --- .../test/Transforms/IndVarSimplify/pr53969.ll | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 llvm/test/Transforms/IndVarSimplify/pr53969.ll diff --git a/llvm/test/Transforms/IndVarSimplify/pr53969.ll b/llvm/test/Transforms/IndVarSimplify/pr53969.ll new file mode 100644 index 0000000000000..2765f01637779 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/pr53969.ll @@ -0,0 +1,83 @@ +; RUN: opt -passes="loop(indvars,loop-deletion)" -S < %s | FileCheck %s +; XFAIL: * +; REQUIRES: asserts + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" +target triple = "x86_64-unknown-linux-gnu" + +; Make sure we don't crash. 
+define void @test() { +; CHECK-LABEL: test +bb: + br label %bb1 + +bb1: ; preds = %bb31, %bb + %tmp = phi i32 [ %tmp29, %bb31 ], [ undef, %bb ] + %tmp2 = phi i32 [ %tmp4, %bb31 ], [ 11, %bb ] + %tmp3 = add nsw i32 112, -1 + %tmp4 = add nuw nsw i32 %tmp2, 1 + %tmp5 = mul i32 %tmp3, %tmp3 + %tmp6 = mul nsw i32 %tmp2, -6 + %tmp7 = mul i32 %tmp6, %tmp5 + %tmp8 = add i32 %tmp7, %tmp2 + %tmp9 = and i32 undef, 1 + %tmp10 = icmp eq i32 %tmp9, 0 + br i1 %tmp10, label %bb33, label %bb34 + +bb11: ; preds = %bb34 + br i1 undef, label %bb33, label %bb34 + +bb12: ; preds = %bb34 + %tmp13 = icmp eq i8 addrspace(1)* undef, null + br label %bb14 + +bb14: ; preds = %bb25, %bb12 + %tmp15 = phi i32 [ %tmp29, %bb25 ], [ %tmp37, %bb12 ] + %tmp16 = phi i64 [ undef, %bb25 ], [ %tmp41, %bb12 ] + %tmp17 = phi i32 [ %tmp26, %bb25 ], [ 4, %bb12 ] + %tmp18 = add i64 %tmp16, undef + %tmp19 = add i32 %tmp15, 1 + %tmp20 = and i32 %tmp19, 1 + %tmp21 = icmp eq i32 %tmp20, 0 + br i1 %tmp21, label %bb32, label %bb22 + +bb22: ; preds = %bb14 + %tmp23 = or i32 %tmp17, undef + %tmp24 = add i32 %tmp23, undef + br i1 %tmp13, label %bb42, label %bb25 + +bb25: ; preds = %bb22 + %tmp26 = add nuw nsw i32 %tmp17, 1 + %tmp27 = zext i32 %tmp26 to i64 + %tmp28 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %tmp27 + %tmp29 = add i32 %tmp15, 3 + %tmp30 = icmp ugt i32 %tmp17, 110 + br i1 %tmp30, label %bb31, label %bb14 + +bb31: ; preds = %bb25 + br label %bb1 + +bb32: ; preds = %bb14 + ret void + +bb33: ; preds = %bb11, %bb1 + call void @use(i32 %tmp2) + ret void + +bb34: ; preds = %bb11, %bb1 + %tmp35 = phi i32 [ %tmp37, %bb11 ], [ %tmp, %bb1 ] + %tmp36 = xor i32 0, %tmp8 + %tmp37 = add i32 %tmp35, 2 + %tmp38 = add i32 %tmp36, undef + %tmp39 = add i32 %tmp38, undef + %tmp40 = sext i32 %tmp39 to i64 + %tmp41 = add i64 undef, %tmp40 + br i1 undef, label %bb11, label %bb12 + +bb42: ; preds = %bb22 + store atomic i64 %tmp18, i64 addrspace(1)* undef unordered, align 8 + call void @use(i32 %tmp24) + ret 
void +} + +declare void @use(i32) From 359a792f9b13f6eefeb5fb2a1af2dcbb1cded6e8 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 28 Jan 2022 13:49:12 +0000 Subject: [PATCH 410/748] [AMDGPU] SILoadStoreOptimizer: avoid unbounded register pressure increases Previously when combining two loads this pass would sink the first one down to the second one, putting the combined load where the second one was. It would also sink any intervening instructions which depended on the first load down to just after the combined load. For example, if we started with this sequence of instructions (code flowing from left to right): X A B C D E F Y After combining loads X and Y into XY we might end up with: A B C D E F XY But if B D and F depended on X, we would get: A C E XY B D F Now if the original code had some short disjoint live ranges from A to B, C to D and E to F, in the transformed code these live ranges will be long and overlapping. In this way a single merge of two loads could cause an unbounded increase in register pressure. To fix this, change the way the way that loads are moved in order to merge them so that: - The second load is moved up to the first one. (But when merging stores, we still move the first store down to the second one.) - Intervening instructions are never moved. - Instead, if we find an intervening instruction that would need to be moved, give up on the merge. But this case should now be pretty rare because normal stores have no outputs, and normal loads only have address register inputs, but these will be identical for any pair of loads that we try to merge. As well as fixing the unbounded register pressure increase problem, moving loads up and stores down seems like it should usually be a win for memory latency reasons. 
Differential Revision: https://reviews.llvm.org/D119006 --- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 323 +++++++----------- .../AMDGPU/ds-combine-with-dependence.ll | 8 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 14 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 16 +- .../AMDGPU/merge-load-store-physreg.mir | 2 +- .../AMDGPU/merge-out-of-order-ldst.mir | 3 +- llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir | 8 +- .../AMDGPU/si-triv-disjoint-mem-access.ll | 2 +- 8 files changed, 151 insertions(+), 225 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index fdc8f30c01b07..d041c831b6db0 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -185,6 +185,9 @@ class SILoadStoreOptimizer : public MachineFunctionPass { AliasAnalysis *AA = nullptr; bool OptimizeAgain; + bool canSwapInstructions(const DenseSet &ARegDefs, + const DenseSet &ARegUses, + const MachineInstr &A, const MachineInstr &B) const; static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII, const CombineInfo &Paired); @@ -199,38 +202,37 @@ class SILoadStoreOptimizer : public MachineFunctionPass { const CombineInfo &Paired); const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; - bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl &InstsToMove); + CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, - CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator + mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator 
mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove); + MachineBasicBlock::iterator InsertBefore); void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; @@ -580,74 +582,31 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { return new SILoadStoreOptimizer(); } -static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); - ++I; - for (MachineInstr *MI : InstsToMove) { - MI->removeFromParent(); - MBB->insert(I, MI); - } -} - static void addDefsUsesToList(const MachineInstr &MI, DenseSet &RegDefs, - DenseSet &PhysRegUses) { - for (const MachineOperand &Op : MI.operands()) { - if (Op.isReg()) { - if (Op.isDef()) - RegDefs.insert(Op.getReg()); - else if (Op.readsReg() && Op.getReg().isPhysical()) - PhysRegUses.insert(Op.getReg()); - } - } -} - -static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, - 
MachineBasicBlock::iterator B, - AliasAnalysis *AA) { - // RAW or WAR - cannot reorder - // WAW - cannot reorder - // RAR - safe to reorder - return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); -} - -// Add MI and its defs to the lists if MI reads one of the defs that are -// already in the list. Returns true in that case. -static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, - DenseSet &PhysRegUses, - SmallVectorImpl &Insts) { - for (MachineOperand &Use : MI.operands()) { - // If one of the defs is read, then there is a use of Def between I and the - // instruction that I will potentially be merged with. We will need to move - // this instruction after the merged instructions. - // - // Similarly, if there is a def which is read by an instruction that is to - // be moved for merging, then we need to move the def-instruction as well. - // This can only happen for physical registers such as M0; virtual - // registers are in SSA form. - if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || - (Use.isDef() && RegDefs.count(Use.getReg())) || - (Use.isDef() && Use.getReg().isPhysical() && - PhysRegUses.count(Use.getReg())))) { - Insts.push_back(&MI); - addDefsUsesToList(MI, RegDefs, PhysRegUses); - return true; - } + DenseSet &RegUses) { + for (const auto &Op : MI.operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + if (Op.readsReg()) + RegUses.insert(Op.getReg()); } - - return false; } -static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef InstsToMove, - AliasAnalysis *AA) { - assert(MemOp.mayLoadOrStore()); - - for (MachineInstr *InstToMove : InstsToMove) { - if (!InstToMove->mayLoadOrStore()) +bool SILoadStoreOptimizer::canSwapInstructions( + const DenseSet &ARegDefs, const DenseSet &ARegUses, + const MachineInstr &A, const MachineInstr &B) const { + if (A.mayLoadOrStore() && B.mayLoadOrStore() && + (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 
+ return false; + for (const auto &BOp : B.operands()) { + if (!BOp.isReg()) continue; - if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) + if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) + return false; + if (BOp.isDef() && ARegUses.contains(BOp.getReg())) return false; } return true; @@ -890,86 +849,59 @@ SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { return nullptr; } -/// This function assumes that CI comes before Paired in a basic block. -bool SILoadStoreOptimizer::checkAndPrepareMerge( - CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl &InstsToMove) { +/// This function assumes that CI comes before Paired in a basic block. Return +/// an insertion point for the merged instruction or nullptr on failure. +SILoadStoreOptimizer::CombineInfo * +SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, + CombineInfo &Paired) { // If another instruction has already been merged into CI, it may now be a // type that we can't do any further merging into. if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) - return false; + return nullptr; assert(CI.InstClass == Paired.InstClass); if (getInstSubclass(CI.I->getOpcode(), *TII) != getInstSubclass(Paired.I->getOpcode(), *TII)) - return false; + return nullptr; // Check both offsets (or masks for MIMG) can be combined and fit in the // reduced range. if (CI.InstClass == MIMG) { if (!dmasksCanBeCombined(CI, *TII, Paired)) - return false; + return nullptr; } else { if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) - return false; + return nullptr; } - DenseSet RegDefsToMove; - DenseSet PhysRegUsesToMove; - addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - - MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); - for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { - if (MBBI == MBBE) { - // CombineInfo::Order is a hint on the instruction ordering within the - // basic block. 
This hint suggests that CI precedes Paired, which is - // true most of the time. However, moveInstsAfter() processing a - // previous list may have changed this order in a situation when it - // moves an instruction which exists in some other merge list. - // In this case it must be dependent. - return false; + DenseSet RegDefs; + DenseSet RegUses; + CombineInfo *Where; + if (CI.I->mayLoad()) { + // Try to hoist Paired up to CI. + addDefsUsesToList(*Paired.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) + return nullptr; } - - // Keep going as long as one of these conditions are met: - // 1. It is safe to move I down past MBBI. - // 2. It is safe to move MBBI down past the instruction that I will - // be merged into. - - if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { - // We fail condition #1, but we may still be able to satisfy condition - // #2. Add this instruction to the move list and then we will check - // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); - addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); - continue; + Where = &CI; + } else { + // Try to sink CI down to Paired. + addDefsUsesToList(*CI.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) + return nullptr; } - - // When we match I with another load/store instruction we will be moving I - // down to the location of the matched instruction any uses of I will need - // to be moved down as well. - addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, InstsToMove); + Where = &Paired; } - // If Paired depends on any of the instructions we plan to move, give up. 
- if (addToListsIfDependent(*Paired.I, RegDefsToMove, PhysRegUsesToMove, - InstsToMove)) - return false; - - // We need to go through the list of instructions that we plan to - // move and make sure they are all safe to move down past the merged - // instruction. - if (!canMoveInstsAcrossMemOp(*Paired.I, InstsToMove, AA)) - return false; - // Call offsetsCanBeCombined with modify = true so that the offsets are // correct for the new instruction. This should return true, because // this function should only be called on CombineInfo objects that // have already been confirmed to be mergeable. if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) offsetsCanBeCombined(CI, *STM, Paired, true); - return true; + return Where; } unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { @@ -988,7 +920,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -1027,13 +959,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1041,7 +973,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Read2 = - 
BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) + BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 @@ -1053,14 +985,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1085,9 +1015,9 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. 
We want to be @@ -1121,13 +1051,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1135,7 +1065,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Write2 = - BuildMI(*MBB, Paired.I, DL, Write2Desc) + BuildMI(*MBB, InsertBefore, DL, Write2Desc) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .add(*Data0) // data0 .add(*Data1) // data1 @@ -1144,8 +1074,6 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - moveInstsAfter(Write2, InstsToMove); - CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1155,7 +1083,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1167,7 +1095,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, unsigned DMaskIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); for (unsigned I 
= 1, E = (*CI.I).getNumOperands(); I != E; ++I) { if (I == DMaskIdx) MIB.addImm(MergedDMask); @@ -1193,14 +1121,12 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1209,7 +1135,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1228,11 +1154,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); MachineInstr *New = - BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.CPol) // cpol - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.CPol) // cpol + .addMemOperand( + combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = 
getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1243,14 +1170,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1259,7 +1184,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1271,7 +1196,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1304,14 +1229,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1320,7 +1243,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1332,7 +1255,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1370,14 +1293,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1386,7 +1307,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1403,13 +1324,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1439,8 +1360,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( .addMemOperand( combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - moveInstsAfter(MIB, InstsToMove); - CI.I->eraseFromParent(); Paired.I->eraseFromParent(); return New; @@ -1545,7 +1464,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = 
CI.I->getDebugLoc(); @@ -1562,13 +1481,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1594,8 +1513,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - moveInstsAfter(MIB, InstsToMove); - CI.I->eraseFromParent(); Paired.I->eraseFromParent(); return New; @@ -2074,8 +1991,8 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( CombineInfo &CI = *First; CombineInfo &Paired = *Second; - SmallVector InstsToMove; - if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) { + CombineInfo *Where = checkAndPrepareMerge(CI, Paired); + if (!Where) { ++I; continue; } @@ -2090,38 +2007,38 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( llvm_unreachable("unknown InstClass"); break; case DS_READ: - NewMI = mergeRead2Pair(CI, Paired, InstsToMove); + NewMI = mergeRead2Pair(CI, Paired, Where->I); break; case DS_WRITE: - NewMI = mergeWrite2Pair(CI, Paired, InstsToMove); + NewMI = mergeWrite2Pair(CI, Paired, Where->I); break; case S_BUFFER_LOAD_IMM: - NewMI = mergeSBufferLoadImmPair(CI, Paired, InstsToMove); + NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 8; break; case BUFFER_LOAD: - NewMI = mergeBufferLoadPair(CI, Paired, InstsToMove); + NewMI = mergeBufferLoadPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width 
< 4; break; case BUFFER_STORE: - NewMI = mergeBufferStorePair(CI, Paired, InstsToMove); + NewMI = mergeBufferStorePair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case MIMG: - NewMI = mergeImagePair(CI, Paired, InstsToMove); + NewMI = mergeImagePair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case TBUFFER_LOAD: - NewMI = mergeTBufferLoadPair(CI, Paired, InstsToMove); + NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case TBUFFER_STORE: - NewMI = mergeTBufferStorePair(CI, Paired, InstsToMove); + NewMI = mergeTBufferStorePair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; } CI.setMI(NewMI, *this); - CI.Order = Paired.Order; + CI.Order = Where->Order; if (I == Second) I = Next; diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll index c6ba1119ed685..0f4b2778b5915 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll @@ -66,13 +66,15 @@ define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrsp } -; The second load depends on the store. We can combine the two loads, and the combined load is -; at the original place of the second load. +; The second load depends on the store. We could combine the two loads, putting +; the combined load at the original place of the second load, but we prefer to +; leave the first load near the start of the function to hide its latency. 
; GCN-LABEL: {{^}}ds_combine_RAW ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27 -; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26 +; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104 define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) { %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)* diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index c44c597775966..7af9c948a1e41 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1244,28 +1244,28 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving( ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v1 -; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v4 -; CI-NEXT: v_add_i32_e32 v5, vcc, s6, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v0 ; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 ; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4 -; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v4 -; CI-NEXT: ds_read2_b32 v[4:5], v5 offset1:1 +; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1 ; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4 ; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 -; CI-NEXT: v_mul_f32_e32 v1, v1, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mul_f32_e32 v2, v4, v6 ; CI-NEXT: v_sub_f32_e32 v0, v0, v2 +; CI-NEXT: v_mul_f32_e32 v1, v1, v3 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1 ; CI-NEXT: v_mul_f32_e32 v1, v5, v7 -; CI-NEXT: 
s_mov_b32 s2, -1 ; CI-NEXT: v_sub_f32_e32 v0, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40 ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index 6b24510fc9253..82665e435f30b 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -6,9 +6,15 @@ @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4 @c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4 +; FIXME: Should combine the DS instructions into ds_write2 and ds_read2. This +; does not happen because when SILoadStoreOptimizer is run, the reads and writes +; are not adjacent. They are only moved later by MachineScheduler. + ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2: -; GCN: ds_write2st64_b32 -; GCN: ds_read2st64_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: ds_read_b32 +; GCN: ds_read_b32 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2 ; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3 @@ -30,9 +36,11 @@ bb: } ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3: -; GCN-DAG: ds_write2st64_b32 ; GCN-DAG: ds_write_b32 -; GCN-DAG: ds_read2st64_b32 +; GCN-DAG: ds_write_b32 +; GCN-DAG: ds_write_b32 +; GCN-DAG: ds_read_b32 +; GCN-DAG: ds_read_b32 ; GCN-DAG: ds_read_b32 ; CHECK-LABEL: @no_clobber_ds_load_stores_x3 diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir index 15b3607d79d91..cc2054ae4c89c 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir @@ -7,9 +7,9 @@ # However, an equivalent situation can occur with buffer instructions as well. 
# CHECK-LABEL: name: scc_def_and_use_no_dependency +# CHECK: DS_READ2_B32 # CHECK: S_ADD_U32 # CHECK: S_ADDC_U32 -# CHECK: DS_READ2_B32 --- name: scc_def_and_use_no_dependency machineFunctionInfo: diff --git a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir index 5477f5ea3b2a7..21149d33afa89 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir @@ -2,9 +2,8 @@ # GCN-LABEL: name: out_of_order_merge # GCN: DS_READ2_B64_gfx9 -# GCN: DS_WRITE_B64_gfx9 # GCN: DS_READ2_B64_gfx9 -# GCN: DS_WRITE_B64_gfx9 +# GCN: DS_WRITE2_B64_gfx9 # GCN: DS_WRITE_B64_gfx9 --- name: out_of_order_merge diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir index 961e8213a85bf..45f3eb87d8b12 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir @@ -780,8 +780,8 @@ body: | # GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +# GFX9-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +# GFX9-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) name: gfx9_tbuffer_load_merge_across_swizzle body: | bb.0.entry: @@ -1597,8 +1597,8 @@ body: | # GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) -# GFX10: 
%{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +# GFX10-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +# GFX10-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) name: gfx10_tbuffer_load_merge_across_swizzle body: | bb.0.entry: diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 0196f7d5eb447..bc978cc3347f8 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -9,9 +9,9 @@ ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 ; CI: buffer_store_dword -; GFX9: global_store_dword ; GFX9: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 ; GFX9: global_store_dword +; GFX9: global_store_dword define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 From 55c181a6c786cfbfa8b7aabe0a8ba721a65b1445 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 21 Feb 2022 10:52:09 +0000 Subject: [PATCH 411/748] Revert "[AArch64][GlobalISel] Optimize conjunctions of compares to conditional compares." This reverts commit 2a46450849de6904fc64f9a65303b20ca7fc9dbd. This triggers the following assertion in an internal project: Assertion failed: (VRegInfo[Reg.id()].first.is() && "Register class not set, wrong accessor"), function getRegClass, file MachineRegisterInfo.h, line 646. I'll work with the author directly to get a reproducer. 
--- .../CodeGen/GlobalISel/GenericMachineInstrs.h | 32 -- .../GISel/AArch64InstructionSelector.cpp | 381 +----------------- llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 260 ++++++++---- 3 files changed, 194 insertions(+), 479 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 58fe48200e732..7103656365b1b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -14,7 +14,6 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H #define LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H -#include "llvm/IR/Instructions.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -227,37 +226,6 @@ class GSelect : public GenericMachineInstr { } }; -/// Represent a G_ICMP or G_FCMP. -class GAnyCmp : public GenericMachineInstr { -public: - CmpInst::Predicate getCond() const { - return static_cast(getOperand(1).getPredicate()); - } - Register getLHSReg() const { return getReg(2); } - Register getRHSReg() const { return getReg(3); } - - static bool classof(const MachineInstr *MI) { - return MI->getOpcode() == TargetOpcode::G_ICMP || - MI->getOpcode() == TargetOpcode::G_FCMP; - } -}; - -/// Represent a G_ICMP. -class GICmp : public GAnyCmp { -public: - static bool classof(const MachineInstr *MI) { - return MI->getOpcode() == TargetOpcode::G_ICMP; - } -}; - -/// Represent a G_FCMP. 
-class GFCmp : public GAnyCmp { -public: - static bool classof(const MachineInstr *MI) { - return MI->getOpcode() == TargetOpcode::G_FCMP; - } -}; - } // namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 0b065398ccee5..8a79d2426c8f0 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" @@ -64,7 +63,6 @@ namespace { #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET - class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, @@ -296,20 +294,6 @@ class AArch64InstructionSelector : public InstructionSelector { emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; - /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). - /// In some cases this is even possible with OR operations in the expression. 
- MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, - MachineIRBuilder &MIB) const; - MachineInstr *emitConditionalComparison(Register LHS, Register RHS, - CmpInst::Predicate CC, - AArch64CC::CondCode Predicate, - AArch64CC::CondCode OutCC, - MachineIRBuilder &MIB) const; - MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, - bool Negate, Register CCOp, - AArch64CC::CondCode Predicate, - MachineIRBuilder &MIB) const; - /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. /// \p IsNegative is true if the test should be "not zero". /// This will also optimize the test bit instruction when possible. @@ -441,8 +425,7 @@ class AArch64InstructionSelector : public InstructionSelector { void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); // Optimization methods. - bool tryOptSelect(GSelect &Sel); - bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); + bool tryOptSelect(MachineInstr &MI); MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; @@ -1327,90 +1310,6 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { } } -/// changeFPCCToAArch64CC - Convert an IR fp condition code to an AArch64 CC. 
-static void changeFPCCToAArch64CC(CmpInst::Predicate CC, - AArch64CC::CondCode &CondCode, - AArch64CC::CondCode &CondCode2) { - CondCode2 = AArch64CC::AL; - switch (CC) { - default: - llvm_unreachable("Unknown FP condition!"); - case CmpInst::FCMP_OEQ: - CondCode = AArch64CC::EQ; - break; - case CmpInst::FCMP_OGT: - CondCode = AArch64CC::GT; - break; - case CmpInst::FCMP_OGE: - CondCode = AArch64CC::GE; - break; - case CmpInst::FCMP_OLT: - CondCode = AArch64CC::MI; - break; - case CmpInst::FCMP_OLE: - CondCode = AArch64CC::LS; - break; - case CmpInst::FCMP_ONE: - CondCode = AArch64CC::MI; - CondCode2 = AArch64CC::GT; - break; - case CmpInst::FCMP_ORD: - CondCode = AArch64CC::VC; - break; - case CmpInst::FCMP_UNO: - CondCode = AArch64CC::VS; - break; - case CmpInst::FCMP_UEQ: - CondCode = AArch64CC::EQ; - CondCode2 = AArch64CC::VS; - break; - case CmpInst::FCMP_UGT: - CondCode = AArch64CC::HI; - break; - case CmpInst::FCMP_UGE: - CondCode = AArch64CC::PL; - break; - case CmpInst::FCMP_ULT: - CondCode = AArch64CC::LT; - break; - case CmpInst::FCMP_ULE: - CondCode = AArch64CC::LE; - break; - case CmpInst::FCMP_UNE: - CondCode = AArch64CC::NE; - break; - } -} - -/// Convert an IR fp condition code to an AArch64 CC. -/// This differs from changeFPCCToAArch64CC in that it returns cond codes that -/// should be AND'ed instead of OR'ed. 
-static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC, - AArch64CC::CondCode &CondCode, - AArch64CC::CondCode &CondCode2) { - CondCode2 = AArch64CC::AL; - switch (CC) { - default: - changeFPCCToAArch64CC(CC, CondCode, CondCode2); - assert(CondCode2 == AArch64CC::AL); - break; - case CmpInst::FCMP_ONE: - // (a one b) - // == ((a olt b) || (a ogt b)) - // == ((a ord b) && (a une b)) - CondCode = AArch64CC::VC; - CondCode2 = AArch64CC::NE; - break; - case CmpInst::FCMP_UEQ: - // (a ueq b) - // == ((a uno b) || (a oeq b)) - // == ((a ule b) && (a uge b)) - CondCode = AArch64CC::PL; - CondCode2 = AArch64CC::LE; - break; - } -} - /// Return a register which can be used as a bit to test in a TB(N)Z. static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, MachineRegisterInfo &MRI) { @@ -3393,18 +3292,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_SELECT: { - auto &Sel = cast(I); - if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) { + if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty << ", expected: " << LLT::scalar(1) << '\n'); return false; } - const Register CondReg = Sel.getCondReg(); - const Register TReg = Sel.getTrueReg(); - const Register FReg = Sel.getFalseReg(); + const Register CondReg = I.getOperand(1).getReg(); + const Register TReg = I.getOperand(2).getReg(); + const Register FReg = I.getOperand(3).getReg(); - if (tryOptSelect(Sel)) + if (tryOptSelect(I)) return true; // Make sure to use an unused vreg instead of wzr, so that the peephole @@ -3413,9 +3311,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) + if 
(!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB)) return false; - Sel.eraseFromParent(); + I.eraseFromParent(); return true; } case TargetOpcode::G_ICMP: { @@ -4804,263 +4702,7 @@ AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, } } -/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be -/// expressed as a conjunction. -/// \param CanNegate Set to true if we can negate the whole sub-tree just by -/// changing the conditions on the CMP tests. -/// (this means we can call emitConjunctionRec() with -/// Negate==true on this sub-tree) -/// \param MustBeFirst Set to true if this subtree needs to be negated and we -/// cannot do the negation naturally. We are required to -/// emit the subtree first in this case. -/// \param WillNegate Is true if are called when the result of this -/// subexpression must be negated. This happens when the -/// outer expression is an OR. We can use this fact to know -/// that we have a double negation (or (or ...) ...) that -/// can be implemented for free. -static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, - bool WillNegate, MachineRegisterInfo &MRI, - unsigned Depth = 0) { - if (!MRI.hasOneNonDBGUse(Val)) - return false; - MachineInstr *ValDef = MRI.getVRegDef(Val); - unsigned Opcode = ValDef->getOpcode(); - if (Opcode == TargetOpcode::G_TRUNC) { - // Look through a trunc. - Val = ValDef->getOperand(1).getReg(); - ValDef = MRI.getVRegDef(Val); - Opcode = ValDef->getOpcode(); - } - if (isa(ValDef)) { - CanNegate = true; - MustBeFirst = false; - return true; - } - // Protect against exponential runtime and stack overflow. 
- if (Depth > 6) - return false; - if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { - bool IsOR = Opcode == TargetOpcode::G_OR; - Register O0 = ValDef->getOperand(1).getReg(); - Register O1 = ValDef->getOperand(2).getReg(); - bool CanNegateL; - bool MustBeFirstL; - if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) - return false; - bool CanNegateR; - bool MustBeFirstR; - if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) - return false; - - if (MustBeFirstL && MustBeFirstR) - return false; - - if (IsOR) { - // For an OR expression we need to be able to naturally negate at least - // one side or we cannot do the transformation at all. - if (!CanNegateL && !CanNegateR) - return false; - // If we the result of the OR will be negated and we can naturally negate - // the leafs, then this sub-tree as a whole negates naturally. - CanNegate = WillNegate && CanNegateL && CanNegateR; - // If we cannot naturally negate the whole sub-tree, then this must be - // emitted first. - MustBeFirst = !CanNegate; - } else { - assert(Opcode == TargetOpcode::G_AND && "Must be G_AND"); - // We cannot naturally negate an AND operation. - CanNegate = false; - MustBeFirst = MustBeFirstL || MustBeFirstR; - } - return true; - } - return false; -} - -MachineInstr *AArch64InstructionSelector::emitConditionalComparison( - Register LHS, Register RHS, CmpInst::Predicate CC, - AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, - MachineIRBuilder &MIB) const { - // TODO: emit CMN as an optimization. - auto &MRI = *MIB.getMRI(); - LLT OpTy = MRI.getType(LHS); - assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); - unsigned CCmpOpc; - if (CmpInst::isIntPredicate(CC)) { - CCmpOpc = OpTy.getSizeInBits() == 32 ? 
AArch64::CCMPWr : AArch64::CCMPXr; - } else { - switch (OpTy.getSizeInBits()) { - case 16: - CCmpOpc = AArch64::FCCMPHrr; - break; - case 32: - CCmpOpc = AArch64::FCCMPSrr; - break; - case 64: - CCmpOpc = AArch64::FCCMPDrr; - break; - default: - return nullptr; - } - } - AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); - auto CCmp = - MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate); - constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); - return &*CCmp; -} - -MachineInstr *AArch64InstructionSelector::emitConjunctionRec( - Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, - AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { - // We're at a tree leaf, produce a conditional comparison operation. - auto &MRI = *MIB.getMRI(); - MachineInstr *ValDef = MRI.getVRegDef(Val); - unsigned Opcode = ValDef->getOpcode(); - if (Opcode == TargetOpcode::G_TRUNC) { - // Look through a trunc. - Val = ValDef->getOperand(1).getReg(); - ValDef = MRI.getVRegDef(Val); - Opcode = ValDef->getOpcode(); - } - if (auto *Cmp = dyn_cast(ValDef)) { - Register LHS = Cmp->getLHSReg(); - Register RHS = Cmp->getRHSReg(); - CmpInst::Predicate CC = Cmp->getCond(); - if (Negate) - CC = CmpInst::getInversePredicate(CC); - // We only handle integer compares for now. - if (isa(Cmp)) { - OutCC = changeICMPPredToAArch64CC(CC); - } else { - // Handle special FP cases. - AArch64CC::CondCode ExtraCC; - changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); - // Some floating point conditions can't be tested with a single condition - // code. Construct an additional comparison in this case. 
- if (ExtraCC != AArch64CC::AL) { - MachineInstr *ExtraCmp; - if (!CCOp) - ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); - else - ExtraCmp = - emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); - CCOp = ExtraCmp->getOperand(0).getReg(); - Predicate = ExtraCC; - } - } - - // Produce a normal comparison if we are first in the chain - if (!CCOp) { - auto Dst = MRI.cloneVirtualRegister(LHS); - if (isa(Cmp)) - return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); - return emitFPCompare(Cmp->getOperand(2).getReg(), - Cmp->getOperand(3).getReg(), MIB); - } - // Otherwise produce a ccmp. - return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); - } - assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); - - bool IsOR = Opcode == TargetOpcode::G_OR; - - Register LHS = ValDef->getOperand(1).getReg(); - bool CanNegateL; - bool MustBeFirstL; - bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); - assert(ValidL && "Valid conjunction/disjunction tree"); - (void)ValidL; - - Register RHS = ValDef->getOperand(2).getReg(); - bool CanNegateR; - bool MustBeFirstR; - bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); - assert(ValidR && "Valid conjunction/disjunction tree"); - (void)ValidR; - - // Swap sub-tree that must come first to the right side. - if (MustBeFirstL) { - assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); - std::swap(LHS, RHS); - std::swap(CanNegateL, CanNegateR); - std::swap(MustBeFirstL, MustBeFirstR); - } - - bool NegateR; - bool NegateAfterR; - bool NegateL; - bool NegateAfterAll; - if (Opcode == TargetOpcode::G_OR) { - // Swap the sub-tree that we can negate naturally to the left. 
- if (!CanNegateL) { - assert(CanNegateR && "at least one side must be negatable"); - assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); - assert(!Negate); - std::swap(LHS, RHS); - NegateR = false; - NegateAfterR = true; - } else { - // Negate the left sub-tree if possible, otherwise negate the result. - NegateR = CanNegateR; - NegateAfterR = !CanNegateR; - } - NegateL = true; - NegateAfterAll = !Negate; - } else { - assert(Opcode == TargetOpcode::G_AND && - "Valid conjunction/disjunction tree"); - assert(!Negate && "Valid conjunction/disjunction tree"); - - NegateL = false; - NegateR = false; - NegateAfterR = false; - NegateAfterAll = false; - } - - // Emit sub-trees. - AArch64CC::CondCode RHSCC; - MachineInstr *CmpR = - emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); - if (NegateAfterR) - RHSCC = AArch64CC::getInvertedCondCode(RHSCC); - MachineInstr *CmpL = emitConjunctionRec( - LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); - if (NegateAfterAll) - OutCC = AArch64CC::getInvertedCondCode(OutCC); - return CmpL; -} - -MachineInstr *AArch64InstructionSelector::emitConjunction( - Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { - bool DummyCanNegate; - bool DummyMustBeFirst; - if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, - *MIB.getMRI())) - return nullptr; - return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); -} - -bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, - MachineInstr &CondMI) { - MachineRegisterInfo &MRI = *MIB.getMRI(); - AArch64CC::CondCode AArch64CC; - MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); - if (!ConjMI) - return false; - auto CSel = - MIB.buildInstr(MRI.getType(SelI.getReg(0)).getSizeInBits() == 32 - ? 
AArch64::CSELWr - : AArch64::CSELXr, - {SelI.getReg(0)}, {SelI.getTrueReg(), SelI.getFalseReg()}) - .addImm(AArch64CC); - constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI); - SelI.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { +bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { MachineRegisterInfo &MRI = *MIB.getMRI(); // We want to recognize this pattern: // @@ -5113,11 +4755,8 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { return false; unsigned CondOpc = CondDef->getOpcode(); - if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { - if (tryOptSelectConjunction(I, *CondDef)) - return true; + if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) return false; - } AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index 58bf419715519..f81ed69b137f6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -569,10 +569,14 @@ define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_and: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 -; GISEL-NEXT: cmp w8, w1 -; GISEL-NEXT: ccmp w0, w1, #0, ne -; GISEL-NEXT: csel x0, x2, x3, lt +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: cset w8, lt +; GISEL-NEXT: mov w9, #5 +; GISEL-NEXT: cmp w9, w1 +; GISEL-NEXT: cset w9, ne +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel x0, x2, x3, ne ; GISEL-NEXT: ret %1 = icmp slt i32 %w0, %w1 %2 = icmp ne i32 5, %w1 @@ -591,10 +595,14 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_or: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 -; GISEL-NEXT: cmp w8, w1 -; GISEL-NEXT: ccmp w0, w1, #8, eq -; GISEL-NEXT: csel x0, x2, x3, lt +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: cset w8, lt +; GISEL-NEXT: mov w9, #5 +; GISEL-NEXT: cmp w9, w1 
+; GISEL-NEXT: cset w9, ne +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel x0, x2, x3, ne ; GISEL-NEXT: ret %1 = icmp slt i32 %w0, %w1 %2 = icmp ne i32 5, %w1 @@ -615,13 +623,17 @@ define i64 @gccbug(i64 %x0, i64 %x1) { ; ; GISEL-LABEL: gccbug: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #2 -; GISEL-NEXT: mov w9, #4 -; GISEL-NEXT: mov w10, #1 +; GISEL-NEXT: cmp x1, #0 +; GISEL-NEXT: cset w8, eq +; GISEL-NEXT: mov w9, #2 ; GISEL-NEXT: cmp x0, #2 -; GISEL-NEXT: ccmp x0, x9, #4, ne -; GISEL-NEXT: ccmp x1, xzr, #0, eq -; GISEL-NEXT: csel x0, x8, x10, eq +; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: cmp x0, #4 +; GISEL-NEXT: cset w11, eq +; GISEL-NEXT: orr w10, w11, w10 +; GISEL-NEXT: and w8, w10, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csinc x0, x9, xzr, ne ; GISEL-NEXT: ret %cmp0 = icmp eq i64 %x1, 0 %cmp1 = icmp eq i64 %x0, 2 @@ -646,13 +658,19 @@ define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) { ; ; GISEL-LABEL: select_ororand: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #13 -; GISEL-NEXT: mov w9, #2 +; GISEL-NEXT: cmp w0, #0 +; GISEL-NEXT: cset w8, eq +; GISEL-NEXT: cmp w1, #13 +; GISEL-NEXT: cset w9, hi +; GISEL-NEXT: cmp w2, #2 +; GISEL-NEXT: cset w10, lt ; GISEL-NEXT: cmp w3, #4 -; GISEL-NEXT: ccmp w2, w9, #0, gt -; GISEL-NEXT: ccmp w1, w8, #2, ge -; GISEL-NEXT: ccmp w0, wzr, #4, ls -; GISEL-NEXT: csel w0, w3, wzr, eq +; GISEL-NEXT: cset w11, gt +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: and w9, w10, w11 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w3, wzr, ne ; GISEL-NEXT: ret %c0 = icmp eq i32 %w0, 0 %c1 = icmp ugt i32 %w1, 13 @@ -676,10 +694,16 @@ define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) { ; ; GISEL-LABEL: select_andor: ; GISEL: ; %bb.0: +; GISEL-NEXT: cmp w0, w1 +; GISEL-NEXT: cset w8, eq ; GISEL-NEXT: cmp w1, w2 -; GISEL-NEXT: ccmp w0, wzr, #4, lt -; GISEL-NEXT: ccmp w0, w1, #0, eq -; GISEL-NEXT: csel w0, w0, w1, eq +; GISEL-NEXT: cset w9, ge +; GISEL-NEXT: cmp w0, 
#0 +; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: orr w9, w10, w9 +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = icmp eq i32 %v1, %v2 %c1 = icmp sge i32 %v2, %v3 @@ -848,9 +872,14 @@ define i32 @select_and_olt_one(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_olt_one: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d2, d3, #4, mi -; GISEL-NEXT: fccmp d2, d3, #1, ne -; GISEL-NEXT: csel w0, w0, w1, vc +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, mi +; GISEL-NEXT: cset w10, gt +; GISEL-NEXT: orr w9, w9, w10 +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp one double %v2, %v3 @@ -871,9 +900,14 @@ define i32 @select_and_one_olt(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_one_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d0, d1, #1, ne -; GISEL-NEXT: fccmp d2, d3, #0, vc -; GISEL-NEXT: csel w0, w0, w1, mi +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: cset w9, gt +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, mi +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp one double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -894,9 +928,14 @@ define i32 @select_and_olt_ueq(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_olt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d2, d3, #0, mi -; GISEL-NEXT: fccmp d2, d3, #8, le -; GISEL-NEXT: csel w0, w0, w1, pl +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, eq +; GISEL-NEXT: cset w10, vs +; GISEL-NEXT: orr w9, w9, w10 +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 
%c1 = fcmp ueq double %v2, %v3 @@ -917,9 +956,14 @@ define i32 @select_and_ueq_olt(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_ueq_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d0, d1, #8, le -; GISEL-NEXT: fccmp d2, d3, #0, pl -; GISEL-NEXT: csel w0, w0, w1, mi +; GISEL-NEXT: cset w8, eq +; GISEL-NEXT: cset w9, vs +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, mi +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp ueq double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -940,9 +984,14 @@ define i32 @select_or_olt_one(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_olt_one: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d2, d3, #0, pl -; GISEL-NEXT: fccmp d2, d3, #8, le -; GISEL-NEXT: csel w0, w0, w1, mi +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, mi +; GISEL-NEXT: cset w10, gt +; GISEL-NEXT: orr w9, w9, w10 +; GISEL-NEXT: orr w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp one double %v2, %v3 @@ -963,9 +1012,14 @@ define i32 @select_or_one_olt(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_one_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d0, d1, #8, le -; GISEL-NEXT: fccmp d2, d3, #8, pl -; GISEL-NEXT: csel w0, w0, w1, mi +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: cset w9, gt +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, mi +; GISEL-NEXT: orr w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp one double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -986,9 +1040,14 @@ define i32 @select_or_olt_ueq(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_olt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: 
fcmp d0, d1 -; GISEL-NEXT: fccmp d2, d3, #4, pl -; GISEL-NEXT: fccmp d2, d3, #1, ne -; GISEL-NEXT: csel w0, w0, w1, vs +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, eq +; GISEL-NEXT: cset w10, vs +; GISEL-NEXT: orr w9, w9, w10 +; GISEL-NEXT: orr w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -1009,9 +1068,14 @@ define i32 @select_or_ueq_olt(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_ueq_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d0, d1, #1, ne -; GISEL-NEXT: fccmp d2, d3, #8, vc -; GISEL-NEXT: csel w0, w0, w1, mi +; GISEL-NEXT: cset w8, eq +; GISEL-NEXT: cset w9, vs +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, mi +; GISEL-NEXT: orr w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp ueq double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -1033,10 +1097,17 @@ define i32 @select_or_olt_ogt_ueq(double %v0, double %v1, double %v2, double %v3 ; GISEL-LABEL: select_or_olt_ogt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d2, d3, #0, pl -; GISEL-NEXT: fccmp d4, d5, #4, le -; GISEL-NEXT: fccmp d4, d5, #1, ne -; GISEL-NEXT: csel w0, w0, w1, vs +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, gt +; GISEL-NEXT: fcmp d4, d5 +; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: cset w11, vs +; GISEL-NEXT: orr w10, w10, w11 +; GISEL-NEXT: orr w8, w9, w8 +; GISEL-NEXT: orr w8, w10, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ogt double %v2, %v3 @@ -1060,10 +1131,17 @@ define i32 @select_or_olt_ueq_ogt(double %v0, double %v1, double %v2, double %v3 ; GISEL-LABEL: select_or_olt_ueq_ogt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: fccmp d2, d3, #4, pl -; 
GISEL-NEXT: fccmp d2, d3, #1, ne -; GISEL-NEXT: fccmp d4, d5, #0, vc -; GISEL-NEXT: csel w0, w0, w1, gt +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcmp d2, d3 +; GISEL-NEXT: cset w9, eq +; GISEL-NEXT: cset w10, vs +; GISEL-NEXT: orr w9, w9, w10 +; GISEL-NEXT: fcmp d4, d5 +; GISEL-NEXT: cset w10, gt +; GISEL-NEXT: orr w8, w9, w8 +; GISEL-NEXT: orr w8, w10, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -1092,11 +1170,15 @@ define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: fcvt s1, h1 -; GISEL-NEXT: fcvt s2, h2 -; GISEL-NEXT: fcvt s3, h3 ; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: fccmp s2, s3, #8, mi -; GISEL-NEXT: csel w0, w0, w1, ge +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcvt s0, h2 +; GISEL-NEXT: fcvt s1, h3 +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: cset w9, ge +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp oge half %v2, %v3 @@ -1122,12 +1204,17 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: fcvt s1, h1 -; GISEL-NEXT: fcvt s2, h2 -; GISEL-NEXT: fcvt s3, h3 ; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: fccmp s2, s3, #4, mi -; GISEL-NEXT: fccmp s2, s3, #1, ne -; GISEL-NEXT: csel w0, w0, w1, vc +; GISEL-NEXT: cset w8, mi +; GISEL-NEXT: fcvt s0, h2 +; GISEL-NEXT: fcvt s1, h3 +; GISEL-NEXT: fcmp s0, s1 +; GISEL-NEXT: cset w9, mi +; GISEL-NEXT: cset w10, gt +; GISEL-NEXT: orr w9, w9, w10 +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne ; GISEL-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp one half %v2, %v3 @@ -1207,11 +1294,18 @@ define i32 @deep_or(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or: ; GISEL: ; %bb.0: 
-; GISEL-NEXT: mov w8, #15 +; GISEL-NEXT: cmp w0, #0 +; GISEL-NEXT: cset w8, ne +; GISEL-NEXT: cmp w1, #0 +; GISEL-NEXT: cset w9, ne +; GISEL-NEXT: cmp w2, #15 +; GISEL-NEXT: cset w10, eq ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: ccmp w2, w8, #4, ne -; GISEL-NEXT: ccmp w1, wzr, #4, eq -; GISEL-NEXT: ccmp w0, wzr, #4, ne +; GISEL-NEXT: cset w11, eq +; GISEL-NEXT: orr w10, w10, w11 +; GISEL-NEXT: and w9, w10, w9 +; GISEL-NEXT: and w8, w9, w8 +; GISEL-NEXT: tst w8, #0x1 ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 @@ -1239,11 +1333,18 @@ define i32 @deep_or1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or1: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #15 +; GISEL-NEXT: cmp w0, #0 +; GISEL-NEXT: cset w8, ne +; GISEL-NEXT: cmp w1, #0 +; GISEL-NEXT: cset w9, ne +; GISEL-NEXT: cmp w2, #15 +; GISEL-NEXT: cset w10, eq ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: ccmp w2, w8, #4, ne -; GISEL-NEXT: ccmp w0, wzr, #4, eq -; GISEL-NEXT: ccmp w1, wzr, #4, ne +; GISEL-NEXT: cset w11, eq +; GISEL-NEXT: orr w10, w10, w11 +; GISEL-NEXT: and w8, w8, w10 +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: tst w8, #0x1 ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 @@ -1271,11 +1372,18 @@ define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or2: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #15 +; GISEL-NEXT: cmp w0, #0 +; GISEL-NEXT: cset w8, ne +; GISEL-NEXT: cmp w1, #0 +; GISEL-NEXT: cset w9, ne +; GISEL-NEXT: cmp w2, #15 +; GISEL-NEXT: cset w10, eq ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: ccmp w2, w8, #4, ne -; GISEL-NEXT: ccmp w1, wzr, #4, eq -; GISEL-NEXT: ccmp w0, wzr, #4, ne +; GISEL-NEXT: cset w11, eq +; GISEL-NEXT: orr w10, w10, w11 +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: and w8, w8, w10 +; GISEL-NEXT: tst w8, #0x1 ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 From 52df8666158cc9716976b3bd15c9ed799c5d2d4d Mon Sep 17 00:00:00 
2001 From: Sven van Haastregt Date: Mon, 21 Feb 2022 11:29:10 +0000 Subject: [PATCH 412/748] [OpenCL] opencl-c.h: remove arg names from atomics; NFC This simplifies completeness comparisons against OpenCLBuiltins.td and also makes the header no longer "claim" the identifiers "success", "failure", "desired", "value". Differential Revision: https://reviews.llvm.org/D119560 --- clang/lib/Headers/opencl-c.h | 2239 +++++++++++++++------------------- 1 file changed, 1003 insertions(+), 1236 deletions(-) diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h index bf3f01253df32..6c9c3cacf3ec6 100644 --- a/clang/lib/Headers/opencl-c.h +++ b/clang/lib/Headers/opencl-c.h @@ -12426,7 +12426,7 @@ void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p); void __ovld __conv barrier(cl_mem_fence_flags flags); #if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope); +void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope); void __ovld __conv work_group_barrier(cl_mem_fence_flags flags); #endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) @@ -13274,38 +13274,38 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v // atomic_init() #if defined(__opencl_c_generic_address_space) -void __ovld atomic_init(volatile atomic_int *object, int value); -void __ovld atomic_init(volatile atomic_uint *object, uint value); -void __ovld atomic_init(volatile atomic_float *object, float value); +void __ovld atomic_init(volatile atomic_int *, int); +void __ovld atomic_init(volatile atomic_uint *, uint); +void __ovld atomic_init(volatile atomic_float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -void __ovld atomic_init(volatile atomic_long *object, long value); -void __ovld atomic_init(volatile atomic_ulong *object, ulong 
value); +void __ovld atomic_init(volatile atomic_long *, long); +void __ovld atomic_init(volatile atomic_ulong *, ulong); #ifdef cl_khr_fp64 -void __ovld atomic_init(volatile atomic_double *object, double value); +void __ovld atomic_init(volatile atomic_double *, double); #endif //cl_khr_fp64 #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -void __ovld atomic_init(volatile __global atomic_int *object, int value); -void __ovld atomic_init(volatile __local atomic_int *object, int value); -void __ovld atomic_init(volatile __global atomic_uint *object, uint value); -void __ovld atomic_init(volatile __local atomic_uint *object, uint value); -void __ovld atomic_init(volatile __global atomic_float *object, float value); -void __ovld atomic_init(volatile __local atomic_float *object, float value); +void __ovld atomic_init(volatile __global atomic_int *, int); +void __ovld atomic_init(volatile __local atomic_int *, int); +void __ovld atomic_init(volatile __global atomic_uint *, uint); +void __ovld atomic_init(volatile __local atomic_uint *, uint); +void __ovld atomic_init(volatile __global atomic_float *, float); +void __ovld atomic_init(volatile __local atomic_float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -void __ovld atomic_init(volatile __global atomic_long *object, long value); -void __ovld atomic_init(volatile __local atomic_long *object, long value); -void __ovld atomic_init(volatile __global atomic_ulong *object, ulong value); -void __ovld atomic_init(volatile __local atomic_ulong *object, ulong value); +void __ovld atomic_init(volatile __global atomic_long *, long); +void __ovld atomic_init(volatile __local atomic_long *, long); +void __ovld atomic_init(volatile __global atomic_ulong *, ulong); +void __ovld atomic_init(volatile __local atomic_ulong *, ulong); #ifdef cl_khr_fp64 -void __ovld atomic_init(volatile __global 
atomic_double *object, double value); -void __ovld atomic_init(volatile __local atomic_double *object, double value); +void __ovld atomic_init(volatile __global atomic_double *, double); +void __ovld atomic_init(volatile __local atomic_double *, double); #endif //cl_khr_fp64 #endif #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) // atomic_work_item_fence() -void __ovld atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope); +void __ovld atomic_work_item_fence(cl_mem_fence_flags, memory_order, memory_scope); // atomic_fetch() // OpenCL v2.0 s6.13.11.7.5: @@ -13313,356 +13313,356 @@ void __ovld atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, #if defined(__opencl_c_atomic_order_seq_cst) && defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -int __ovld atomic_fetch_add(volatile atomic_int *object, int operand); -uint __ovld atomic_fetch_add(volatile atomic_uint *object, uint operand); -int __ovld atomic_fetch_sub(volatile atomic_int *object, int operand); -uint __ovld atomic_fetch_sub(volatile atomic_uint *object, uint operand); -int __ovld atomic_fetch_or(volatile atomic_int *object, int operand); -uint __ovld atomic_fetch_or(volatile atomic_uint *object, uint operand); -int __ovld atomic_fetch_xor(volatile atomic_int *object, int operand); -uint __ovld atomic_fetch_xor(volatile atomic_uint *object, uint operand); -int __ovld atomic_fetch_and(volatile atomic_int *object, int operand); -uint __ovld atomic_fetch_and(volatile atomic_uint *object, uint operand); -int __ovld atomic_fetch_min(volatile atomic_int *object, int operand); -uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand); -int __ovld atomic_fetch_max(volatile atomic_int *object, int operand); -uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand); +int __ovld atomic_fetch_add(volatile atomic_int *, int); +uint __ovld atomic_fetch_add(volatile 
atomic_uint *, uint); +int __ovld atomic_fetch_sub(volatile atomic_int *, int); +uint __ovld atomic_fetch_sub(volatile atomic_uint *, uint); +int __ovld atomic_fetch_or(volatile atomic_int *, int); +uint __ovld atomic_fetch_or(volatile atomic_uint *, uint); +int __ovld atomic_fetch_xor(volatile atomic_int *, int); +uint __ovld atomic_fetch_xor(volatile atomic_uint *, uint); +int __ovld atomic_fetch_and(volatile atomic_int *, int); +uint __ovld atomic_fetch_and(volatile atomic_uint *, uint); +int __ovld atomic_fetch_min(volatile atomic_int *, int); +uint __ovld atomic_fetch_min(volatile atomic_uint *, uint); +int __ovld atomic_fetch_max(volatile atomic_int *, int); +uint __ovld atomic_fetch_max(volatile atomic_uint *, uint); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -long __ovld atomic_fetch_add(volatile atomic_long *object, long operand); -ulong __ovld atomic_fetch_add(volatile atomic_ulong *object, ulong operand); -long __ovld atomic_fetch_sub(volatile atomic_long *object, long operand); -ulong __ovld atomic_fetch_sub(volatile atomic_ulong *object, ulong operand); -long __ovld atomic_fetch_or(volatile atomic_long *object, long operand); -ulong __ovld atomic_fetch_or(volatile atomic_ulong *object, ulong operand); -long __ovld atomic_fetch_xor(volatile atomic_long *object, long operand); -ulong __ovld atomic_fetch_xor(volatile atomic_ulong *object, ulong operand); -long __ovld atomic_fetch_and(volatile atomic_long *object, long operand); -ulong __ovld atomic_fetch_and(volatile atomic_ulong *object, ulong operand); -long __ovld atomic_fetch_min(volatile atomic_long *object, long operand); -ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand); -long __ovld atomic_fetch_max(volatile atomic_long *object, long operand); -ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand); -uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand); -uintptr_t __ovld 
atomic_fetch_sub(volatile atomic_uintptr_t *object, ptrdiff_t operand); +long __ovld atomic_fetch_add(volatile atomic_long *, long); +ulong __ovld atomic_fetch_add(volatile atomic_ulong *, ulong); +long __ovld atomic_fetch_sub(volatile atomic_long *, long); +ulong __ovld atomic_fetch_sub(volatile atomic_ulong *, ulong); +long __ovld atomic_fetch_or(volatile atomic_long *, long); +ulong __ovld atomic_fetch_or(volatile atomic_ulong *, ulong); +long __ovld atomic_fetch_xor(volatile atomic_long *, long); +ulong __ovld atomic_fetch_xor(volatile atomic_ulong *, ulong); +long __ovld atomic_fetch_and(volatile atomic_long *, long); +ulong __ovld atomic_fetch_and(volatile atomic_ulong *, ulong); +long __ovld atomic_fetch_min(volatile atomic_long *, long); +ulong __ovld atomic_fetch_min(volatile atomic_ulong *, ulong); +long __ovld atomic_fetch_max(volatile atomic_long *, long); +ulong __ovld atomic_fetch_max(volatile atomic_ulong *, ulong); +uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *, ptrdiff_t); +uintptr_t __ovld atomic_fetch_sub(volatile atomic_uintptr_t *, ptrdiff_t); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_fetch_add(volatile __global atomic_int *object, int operand); -int __ovld atomic_fetch_add(volatile __local atomic_int *object, int operand); -uint __ovld atomic_fetch_add(volatile __global atomic_uint *object, uint operand); -uint __ovld atomic_fetch_add(volatile __local atomic_uint *object, uint operand); -int __ovld atomic_fetch_sub(volatile __global atomic_int *object, int operand); -int __ovld atomic_fetch_sub(volatile __local atomic_int *object, int operand); -uint __ovld atomic_fetch_sub(volatile __global atomic_uint *object, uint operand); -uint __ovld atomic_fetch_sub(volatile __local atomic_uint *object, uint operand); -int __ovld 
atomic_fetch_or(volatile __global atomic_int *object, int operand); -int __ovld atomic_fetch_or(volatile __local atomic_int *object, int operand); -uint __ovld atomic_fetch_or(volatile __global atomic_uint *object, uint operand); -uint __ovld atomic_fetch_or(volatile __local atomic_uint *object, uint operand); -int __ovld atomic_fetch_xor(volatile __global atomic_int *object, int operand); -int __ovld atomic_fetch_xor(volatile __local atomic_int *object, int operand); -uint __ovld atomic_fetch_xor(volatile __global atomic_uint *object, uint operand); -uint __ovld atomic_fetch_xor(volatile __local atomic_uint *object, uint operand); -int __ovld atomic_fetch_and(volatile __global atomic_int *object, int operand); -int __ovld atomic_fetch_and(volatile __local atomic_int *object, int operand); -uint __ovld atomic_fetch_and(volatile __global atomic_uint *object, uint operand); -uint __ovld atomic_fetch_and(volatile __local atomic_uint *object, uint operand); -int __ovld atomic_fetch_min(volatile __global atomic_int *object, int operand); -int __ovld atomic_fetch_min(volatile __local atomic_int *object, int operand); -uint __ovld atomic_fetch_min(volatile __global atomic_uint *object, uint operand); -uint __ovld atomic_fetch_min(volatile __local atomic_uint *object, uint operand); -int __ovld atomic_fetch_max(volatile __global atomic_int *object, int operand); -int __ovld atomic_fetch_max(volatile __local atomic_int *object, int operand); -uint __ovld atomic_fetch_max(volatile __global atomic_uint *object, uint operand); -uint __ovld atomic_fetch_max(volatile __local atomic_uint *object, uint operand); +int __ovld atomic_fetch_add(volatile __global atomic_int *, int); +int __ovld atomic_fetch_add(volatile __local atomic_int *, int); +uint __ovld atomic_fetch_add(volatile __global atomic_uint *, uint); +uint __ovld atomic_fetch_add(volatile __local atomic_uint *, uint); +int __ovld atomic_fetch_sub(volatile __global atomic_int *, int); +int __ovld 
atomic_fetch_sub(volatile __local atomic_int *, int); +uint __ovld atomic_fetch_sub(volatile __global atomic_uint *, uint); +uint __ovld atomic_fetch_sub(volatile __local atomic_uint *, uint); +int __ovld atomic_fetch_or(volatile __global atomic_int *, int); +int __ovld atomic_fetch_or(volatile __local atomic_int *, int); +uint __ovld atomic_fetch_or(volatile __global atomic_uint *, uint); +uint __ovld atomic_fetch_or(volatile __local atomic_uint *, uint); +int __ovld atomic_fetch_xor(volatile __global atomic_int *, int); +int __ovld atomic_fetch_xor(volatile __local atomic_int *, int); +uint __ovld atomic_fetch_xor(volatile __global atomic_uint *, uint); +uint __ovld atomic_fetch_xor(volatile __local atomic_uint *, uint); +int __ovld atomic_fetch_and(volatile __global atomic_int *, int); +int __ovld atomic_fetch_and(volatile __local atomic_int *, int); +uint __ovld atomic_fetch_and(volatile __global atomic_uint *, uint); +uint __ovld atomic_fetch_and(volatile __local atomic_uint *, uint); +int __ovld atomic_fetch_min(volatile __global atomic_int *, int); +int __ovld atomic_fetch_min(volatile __local atomic_int *, int); +uint __ovld atomic_fetch_min(volatile __global atomic_uint *, uint); +uint __ovld atomic_fetch_min(volatile __local atomic_uint *, uint); +int __ovld atomic_fetch_max(volatile __global atomic_int *, int); +int __ovld atomic_fetch_max(volatile __local atomic_int *, int); +uint __ovld atomic_fetch_max(volatile __global atomic_uint *, uint); +uint __ovld atomic_fetch_max(volatile __local atomic_uint *, uint); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -long __ovld atomic_fetch_add(volatile __global atomic_long *object, long operand); -long __ovld atomic_fetch_add(volatile __local atomic_long *object, long operand); -ulong __ovld atomic_fetch_add(volatile __global atomic_ulong *object, ulong operand); -ulong __ovld atomic_fetch_add(volatile __local atomic_ulong *object, ulong operand); -uintptr_t __ovld 
atomic_fetch_add(volatile __global atomic_uintptr_t *object, ptrdiff_t operand); -uintptr_t __ovld atomic_fetch_add(volatile __local atomic_uintptr_t *object, ptrdiff_t operand); -long __ovld atomic_fetch_sub(volatile __global atomic_long *object, long operand); -long __ovld atomic_fetch_sub(volatile __local atomic_long *object, long operand); -ulong __ovld atomic_fetch_sub(volatile __global atomic_ulong *object, ulong operand); -ulong __ovld atomic_fetch_sub(volatile __local atomic_ulong *object, ulong operand); -uintptr_t __ovld atomic_fetch_sub(volatile __global atomic_uintptr_t *object, ptrdiff_t operand); -uintptr_t __ovld atomic_fetch_sub(volatile __local atomic_uintptr_t *object, ptrdiff_t operand); -long __ovld atomic_fetch_or(volatile __global atomic_long *object, long operand); -long __ovld atomic_fetch_or(volatile __local atomic_long *object, long operand); -ulong __ovld atomic_fetch_or(volatile __global atomic_ulong *object, ulong operand); -ulong __ovld atomic_fetch_or(volatile __local atomic_ulong *object, ulong operand); -uintptr_t __ovld atomic_fetch_or(volatile __global atomic_uintptr_t *object, intptr_t operand); -uintptr_t __ovld atomic_fetch_or(volatile __local atomic_uintptr_t *object, intptr_t operand); -intptr_t __ovld atomic_fetch_or(volatile __global atomic_intptr_t *object, uintptr_t operand); -intptr_t __ovld atomic_fetch_or(volatile __local atomic_intptr_t *object, uintptr_t operand); -long __ovld atomic_fetch_xor(volatile __global atomic_long *object, long operand); -long __ovld atomic_fetch_xor(volatile __local atomic_long *object, long operand); -ulong __ovld atomic_fetch_xor(volatile __global atomic_ulong *object, ulong operand); -ulong __ovld atomic_fetch_xor(volatile __local atomic_ulong *object, ulong operand); -uintptr_t __ovld atomic_fetch_xor(volatile __global atomic_uintptr_t *object, intptr_t operand); -uintptr_t __ovld atomic_fetch_xor(volatile __local atomic_uintptr_t *object, intptr_t operand); -intptr_t __ovld 
atomic_fetch_xor(volatile __global atomic_intptr_t *object, uintptr_t operand); -intptr_t __ovld atomic_fetch_xor(volatile __local atomic_intptr_t *object, uintptr_t operand); -long __ovld atomic_fetch_and(volatile __global atomic_long *object, long operand); -long __ovld atomic_fetch_and(volatile __local atomic_long *object, long operand); -ulong __ovld atomic_fetch_and(volatile __global atomic_ulong *object, ulong operand); -ulong __ovld atomic_fetch_and(volatile __local atomic_ulong *object, ulong operand); -uintptr_t __ovld atomic_fetch_and(volatile __global atomic_uintptr_t *object, intptr_t operand); -uintptr_t __ovld atomic_fetch_and(volatile __local atomic_uintptr_t *object, intptr_t operand); -intptr_t __ovld atomic_fetch_and(volatile __global atomic_intptr_t *object, uintptr_t operand); -intptr_t __ovld atomic_fetch_and(volatile __local atomic_intptr_t *object, uintptr_t operand); -long __ovld atomic_fetch_min(volatile __global atomic_long *object, long operand); -long __ovld atomic_fetch_min(volatile __local atomic_long *object, long operand); -ulong __ovld atomic_fetch_min(volatile __global atomic_ulong *object, ulong operand); -ulong __ovld atomic_fetch_min(volatile __local atomic_ulong *object, ulong operand); -uintptr_t __ovld atomic_fetch_min(volatile __global atomic_uintptr_t *object, intptr_t operand); -uintptr_t __ovld atomic_fetch_min(volatile __local atomic_uintptr_t *object, intptr_t operand); -intptr_t __ovld atomic_fetch_min(volatile __global atomic_intptr_t *object, uintptr_t operand); -intptr_t __ovld atomic_fetch_min(volatile __local atomic_intptr_t *object, uintptr_t operand); -long __ovld atomic_fetch_max(volatile __global atomic_long *object, long operand); -long __ovld atomic_fetch_max(volatile __local atomic_long *object, long operand); -ulong __ovld atomic_fetch_max(volatile __global atomic_ulong *object, ulong operand); -ulong __ovld atomic_fetch_max(volatile __local atomic_ulong *object, ulong operand); -uintptr_t __ovld 
atomic_fetch_max(volatile __global atomic_uintptr_t *object, uintptr_t operand); -uintptr_t __ovld atomic_fetch_max(volatile __local atomic_uintptr_t *object, uintptr_t operand); +long __ovld atomic_fetch_add(volatile __global atomic_long *, long); +long __ovld atomic_fetch_add(volatile __local atomic_long *, long); +ulong __ovld atomic_fetch_add(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_fetch_add(volatile __local atomic_ulong *, ulong); +uintptr_t __ovld atomic_fetch_add(volatile __global atomic_uintptr_t *, ptrdiff_t); +uintptr_t __ovld atomic_fetch_add(volatile __local atomic_uintptr_t *, ptrdiff_t); +long __ovld atomic_fetch_sub(volatile __global atomic_long *, long); +long __ovld atomic_fetch_sub(volatile __local atomic_long *, long); +ulong __ovld atomic_fetch_sub(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_fetch_sub(volatile __local atomic_ulong *, ulong); +uintptr_t __ovld atomic_fetch_sub(volatile __global atomic_uintptr_t *, ptrdiff_t); +uintptr_t __ovld atomic_fetch_sub(volatile __local atomic_uintptr_t *, ptrdiff_t); +long __ovld atomic_fetch_or(volatile __global atomic_long *, long); +long __ovld atomic_fetch_or(volatile __local atomic_long *, long); +ulong __ovld atomic_fetch_or(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_fetch_or(volatile __local atomic_ulong *, ulong); +uintptr_t __ovld atomic_fetch_or(volatile __global atomic_uintptr_t *, intptr_t); +uintptr_t __ovld atomic_fetch_or(volatile __local atomic_uintptr_t *, intptr_t); +intptr_t __ovld atomic_fetch_or(volatile __global atomic_intptr_t *, uintptr_t); +intptr_t __ovld atomic_fetch_or(volatile __local atomic_intptr_t *, uintptr_t); +long __ovld atomic_fetch_xor(volatile __global atomic_long *, long); +long __ovld atomic_fetch_xor(volatile __local atomic_long *, long); +ulong __ovld atomic_fetch_xor(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_fetch_xor(volatile __local atomic_ulong *, ulong); +uintptr_t __ovld 
atomic_fetch_xor(volatile __global atomic_uintptr_t *, intptr_t); +uintptr_t __ovld atomic_fetch_xor(volatile __local atomic_uintptr_t *, intptr_t); +intptr_t __ovld atomic_fetch_xor(volatile __global atomic_intptr_t *, uintptr_t); +intptr_t __ovld atomic_fetch_xor(volatile __local atomic_intptr_t *, uintptr_t); +long __ovld atomic_fetch_and(volatile __global atomic_long *, long); +long __ovld atomic_fetch_and(volatile __local atomic_long *, long); +ulong __ovld atomic_fetch_and(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_fetch_and(volatile __local atomic_ulong *, ulong); +uintptr_t __ovld atomic_fetch_and(volatile __global atomic_uintptr_t *, intptr_t); +uintptr_t __ovld atomic_fetch_and(volatile __local atomic_uintptr_t *, intptr_t); +intptr_t __ovld atomic_fetch_and(volatile __global atomic_intptr_t *, uintptr_t); +intptr_t __ovld atomic_fetch_and(volatile __local atomic_intptr_t *, uintptr_t); +long __ovld atomic_fetch_min(volatile __global atomic_long *, long); +long __ovld atomic_fetch_min(volatile __local atomic_long *, long); +ulong __ovld atomic_fetch_min(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_fetch_min(volatile __local atomic_ulong *, ulong); +uintptr_t __ovld atomic_fetch_min(volatile __global atomic_uintptr_t *, intptr_t); +uintptr_t __ovld atomic_fetch_min(volatile __local atomic_uintptr_t *, intptr_t); +intptr_t __ovld atomic_fetch_min(volatile __global atomic_intptr_t *, uintptr_t); +intptr_t __ovld atomic_fetch_min(volatile __local atomic_intptr_t *, uintptr_t); +long __ovld atomic_fetch_max(volatile __global atomic_long *, long); +long __ovld atomic_fetch_max(volatile __local atomic_long *, long); +ulong __ovld atomic_fetch_max(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_fetch_max(volatile __local atomic_ulong *, ulong); +uintptr_t __ovld atomic_fetch_max(volatile __global atomic_uintptr_t *, uintptr_t); +uintptr_t __ovld atomic_fetch_max(volatile __local atomic_uintptr_t *, uintptr_t); 
#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order); +int __ovld atomic_fetch_add_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_sub_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_fetch_sub_explicit(volatile 
atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_or_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_xor_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_and_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_min_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_max_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *, uint, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); -long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); -long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); -long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); -long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); -long 
__ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); -long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order); -uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order); +long __ovld atomic_fetch_add_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *, ulong, memory_order); +long __ovld atomic_fetch_sub_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *, ulong, memory_order); +long __ovld atomic_fetch_or_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *, ulong, memory_order); +long __ovld atomic_fetch_xor_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *, ulong, memory_order); +long __ovld atomic_fetch_and_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *, ulong, memory_order); +long __ovld atomic_fetch_min_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *, ulong, memory_order); +long __ovld atomic_fetch_max_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *, ptrdiff_t, memory_order); +uintptr_t __ovld 
atomic_fetch_sub_explicit(volatile atomic_uintptr_t *, ptrdiff_t, memory_order); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_fetch_add_explicit(volatile __global atomic_int *object, int operand, memory_order order); -int __ovld atomic_fetch_add_explicit(volatile __local atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_add_explicit(volatile __global atomic_uint *object, uint operand, memory_order order); -uint __ovld atomic_fetch_add_explicit(volatile __local atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_sub_explicit(volatile __global atomic_int *object, int operand, memory_order order); -int __ovld atomic_fetch_sub_explicit(volatile __local atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_sub_explicit(volatile __global atomic_uint *object, uint operand, memory_order order); -uint __ovld atomic_fetch_sub_explicit(volatile __local atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_or_explicit(volatile __global atomic_int *object, int operand, memory_order order); -int __ovld atomic_fetch_or_explicit(volatile __local atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_or_explicit(volatile __global atomic_uint *object, uint operand, memory_order order); -uint __ovld atomic_fetch_or_explicit(volatile __local atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_xor_explicit(volatile __global atomic_int *object, int operand, memory_order order); -int __ovld atomic_fetch_xor_explicit(volatile __local atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_xor_explicit(volatile __global atomic_uint *object, uint operand, memory_order order); -uint __ovld 
atomic_fetch_xor_explicit(volatile __local atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_and_explicit(volatile __global atomic_int *object, int operand, memory_order order); -int __ovld atomic_fetch_and_explicit(volatile __local atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_and_explicit(volatile __global atomic_uint *object, uint operand, memory_order order); -uint __ovld atomic_fetch_and_explicit(volatile __local atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_min_explicit(volatile __global atomic_int *object, int operand, memory_order order); -int __ovld atomic_fetch_min_explicit(volatile __local atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_min_explicit(volatile __global atomic_uint *object, uint operand, memory_order order); -uint __ovld atomic_fetch_min_explicit(volatile __local atomic_uint *object, uint operand, memory_order order); -int __ovld atomic_fetch_max_explicit(volatile __global atomic_int *object, int operand, memory_order order); -int __ovld atomic_fetch_max_explicit(volatile __local atomic_int *object, int operand, memory_order order); -uint __ovld atomic_fetch_max_explicit(volatile __global atomic_uint *object, uint operand, memory_order order); -uint __ovld atomic_fetch_max_explicit(volatile __local atomic_uint *object, uint operand, memory_order order); +int __ovld atomic_fetch_add_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_fetch_add_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld atomic_fetch_add_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_fetch_add_explicit(volatile __local atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_sub_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_fetch_sub_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld 
atomic_fetch_sub_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_fetch_sub_explicit(volatile __local atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_or_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_fetch_or_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld atomic_fetch_or_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_fetch_or_explicit(volatile __local atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_xor_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_fetch_xor_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld atomic_fetch_xor_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_fetch_xor_explicit(volatile __local atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_and_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_fetch_and_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld atomic_fetch_and_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_fetch_and_explicit(volatile __local atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_min_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_fetch_min_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld atomic_fetch_min_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_fetch_min_explicit(volatile __local atomic_uint *, uint, memory_order); +int __ovld atomic_fetch_max_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_fetch_max_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld atomic_fetch_max_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_fetch_max_explicit(volatile __local atomic_uint *, uint, memory_order); #if 
defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -long __ovld atomic_fetch_add_explicit(volatile __global atomic_long *object, long operand, memory_order order); -long __ovld atomic_fetch_add_explicit(volatile __local atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_add_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order); -ulong __ovld atomic_fetch_add_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld atomic_fetch_add_explicit(volatile __global atomic_uintptr_t *object, ptrdiff_t operand, memory_order order); -uintptr_t __ovld atomic_fetch_add_explicit(volatile __local atomic_uintptr_t *object, ptrdiff_t operand, memory_order order); -long __ovld atomic_fetch_sub_explicit(volatile __global atomic_long *object, long operand, memory_order order); -long __ovld atomic_fetch_sub_explicit(volatile __local atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_sub_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order); -ulong __ovld atomic_fetch_sub_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld atomic_fetch_sub_explicit(volatile __global atomic_uintptr_t *object, ptrdiff_t operand, memory_order order); -uintptr_t __ovld atomic_fetch_sub_explicit(volatile __local atomic_uintptr_t *object, ptrdiff_t operand, memory_order order); -long __ovld atomic_fetch_or_explicit(volatile __global atomic_long *object, long operand, memory_order order); -long __ovld atomic_fetch_or_explicit(volatile __local atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_or_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order); -ulong __ovld atomic_fetch_or_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld 
atomic_fetch_or_explicit(volatile __global atomic_uintptr_t *object, intptr_t operand, memory_order order); -uintptr_t __ovld atomic_fetch_or_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_or_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_or_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, memory_order order); -long __ovld atomic_fetch_xor_explicit(volatile __global atomic_long *object, long operand, memory_order order); -long __ovld atomic_fetch_xor_explicit(volatile __local atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_xor_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order); -ulong __ovld atomic_fetch_xor_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_uintptr_t *object, intptr_t operand, memory_order order); -uintptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, memory_order order); -long __ovld atomic_fetch_and_explicit(volatile __global atomic_long *object, long operand, memory_order order); -long __ovld atomic_fetch_and_explicit(volatile __local atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_and_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order); -ulong __ovld atomic_fetch_and_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_uintptr_t *object, intptr_t 
operand, memory_order order); -uintptr_t __ovld atomic_fetch_and_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_and_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, memory_order order); -long __ovld atomic_fetch_min_explicit(volatile __global atomic_long *object, long operand, memory_order order); -long __ovld atomic_fetch_min_explicit(volatile __local atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_min_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order); -ulong __ovld atomic_fetch_min_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_uintptr_t *object, intptr_t operand, memory_order order); -uintptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order); -intptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, memory_order order); -long __ovld atomic_fetch_max_explicit(volatile __global atomic_long *object, long operand, memory_order order); -long __ovld atomic_fetch_max_explicit(volatile __local atomic_long *object, long operand, memory_order order); -ulong __ovld atomic_fetch_max_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order); -ulong __ovld atomic_fetch_max_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order); -uintptr_t __ovld atomic_fetch_max_explicit(volatile __global atomic_uintptr_t *object, uintptr_t operand, memory_order order); -uintptr_t __ovld 
atomic_fetch_max_explicit(volatile __local atomic_uintptr_t *object, uintptr_t operand, memory_order order); +long __ovld atomic_fetch_add_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_fetch_add_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_add_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld atomic_fetch_add_explicit(volatile __local atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_add_explicit(volatile __global atomic_uintptr_t *, ptrdiff_t, memory_order); +uintptr_t __ovld atomic_fetch_add_explicit(volatile __local atomic_uintptr_t *, ptrdiff_t, memory_order); +long __ovld atomic_fetch_sub_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_fetch_sub_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_sub_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld atomic_fetch_sub_explicit(volatile __local atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_sub_explicit(volatile __global atomic_uintptr_t *, ptrdiff_t, memory_order); +uintptr_t __ovld atomic_fetch_sub_explicit(volatile __local atomic_uintptr_t *, ptrdiff_t, memory_order); +long __ovld atomic_fetch_or_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_fetch_or_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_or_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld atomic_fetch_or_explicit(volatile __local atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_or_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order); +uintptr_t __ovld atomic_fetch_or_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order); +intptr_t __ovld atomic_fetch_or_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order); +intptr_t __ovld 
atomic_fetch_or_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order); +long __ovld atomic_fetch_xor_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_fetch_xor_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_xor_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld atomic_fetch_xor_explicit(volatile __local atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order); +uintptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order); +intptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order); +intptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order); +long __ovld atomic_fetch_and_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_fetch_and_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_and_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld atomic_fetch_and_explicit(volatile __local atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order); +uintptr_t __ovld atomic_fetch_and_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order); +intptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order); +intptr_t __ovld atomic_fetch_and_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order); +long __ovld atomic_fetch_min_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_fetch_min_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_min_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld 
atomic_fetch_min_explicit(volatile __local atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order); +uintptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order); +intptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order); +intptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order); +long __ovld atomic_fetch_max_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_fetch_max_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_fetch_max_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld atomic_fetch_max_explicit(volatile __local atomic_ulong *, ulong, memory_order); +uintptr_t __ovld atomic_fetch_max_explicit(volatile __global atomic_uintptr_t *, uintptr_t, memory_order); +uintptr_t __ovld atomic_fetch_max_explicit(volatile __local atomic_uintptr_t *, uintptr_t, memory_order); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_generic_address_space) -int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, 
uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope); +int __ovld atomic_fetch_add_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_sub_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_or_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_xor_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_and_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); 
+int __ovld atomic_fetch_min_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_max_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -long __ovld 
atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope); +long __ovld atomic_fetch_add_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); +long __ovld atomic_fetch_sub_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); +long __ovld atomic_fetch_or_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); +long __ovld atomic_fetch_xor_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); +long __ovld atomic_fetch_and_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); +long __ovld atomic_fetch_min_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); +long __ovld atomic_fetch_max_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *, ptrdiff_t, 
memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *, ptrdiff_t, memory_order, memory_scope); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_fetch_add_explicit(volatile __global atomic_int *object, int operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_add_explicit(volatile __local atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_add_explicit(volatile __global atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_add_explicit(volatile __local atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_sub_explicit(volatile __global atomic_int *object, int operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_sub_explicit(volatile __local atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_sub_explicit(volatile __global atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_sub_explicit(volatile __local atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_or_explicit(volatile __global atomic_int *object, int operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_or_explicit(volatile __local atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_or_explicit(volatile __global atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_or_explicit(volatile __local atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_xor_explicit(volatile __global atomic_int *object, int operand, memory_order order, memory_scope scope); -int __ovld 
atomic_fetch_xor_explicit(volatile __local atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_xor_explicit(volatile __global atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_xor_explicit(volatile __local atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_and_explicit(volatile __global atomic_int *object, int operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_and_explicit(volatile __local atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_and_explicit(volatile __global atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_and_explicit(volatile __local atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_min_explicit(volatile __global atomic_int *object, int operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_min_explicit(volatile __local atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_min_explicit(volatile __global atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_min_explicit(volatile __local atomic_uint *object, uint operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_max_explicit(volatile __global atomic_int *object, int operand, memory_order order, memory_scope scope); -int __ovld atomic_fetch_max_explicit(volatile __local atomic_int *object, int operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_max_explicit(volatile __global atomic_uint *object, uint operand, memory_order order, memory_scope scope); -uint __ovld atomic_fetch_max_explicit(volatile __local atomic_uint *object, uint operand, memory_order order, memory_scope scope); +int __ovld 
atomic_fetch_add_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_fetch_add_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_add_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld atomic_fetch_add_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_sub_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_fetch_sub_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_sub_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld atomic_fetch_sub_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_or_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_fetch_or_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_or_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld atomic_fetch_or_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_xor_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_fetch_xor_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_xor_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld atomic_fetch_xor_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_and_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_fetch_and_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_and_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld 
atomic_fetch_and_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_min_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_fetch_min_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_min_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld atomic_fetch_min_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +int __ovld atomic_fetch_max_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_fetch_max_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_fetch_max_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld atomic_fetch_max_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) -long __ovld atomic_fetch_add_explicit(volatile __global atomic_long *object, long operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_add_explicit(volatile __local atomic_long *object, long operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_add_explicit(volatile __global atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_add_explicit(volatile __local atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_add_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_add_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_sub_explicit(volatile __global atomic_long *object, long operand, memory_order order, memory_scope scope); -long __ovld 
atomic_fetch_sub_explicit(volatile __local atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_sub_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_sub_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_sub_explicit(volatile __global atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_sub_explicit(volatile __local atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_or_explicit(volatile __global atomic_long *object, long operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_or_explicit(volatile __local atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_or_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_or_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_or_explicit(volatile __global atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_or_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_or_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_or_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_xor_explicit(volatile __global atomic_long *object, long operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_xor_explicit(volatile __local atomic_long 
*object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_xor_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_xor_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_and_explicit(volatile __global atomic_long *object, long operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_and_explicit(volatile __local atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_and_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_and_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_and_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_and_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, 
memory_order order, memory_scope scope); -long __ovld atomic_fetch_min_explicit(volatile __global atomic_long *object, long operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_min_explicit(volatile __local atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_min_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_min_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -intptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_max_explicit(volatile __global atomic_long *object, long operand, memory_order order, memory_scope scope); -long __ovld atomic_fetch_max_explicit(volatile __local atomic_long *object, long operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_max_explicit(volatile __global atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -ulong __ovld atomic_fetch_max_explicit(volatile __local atomic_ulong *object, ulong operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_max_explicit(volatile __global atomic_uintptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); -uintptr_t __ovld atomic_fetch_max_explicit(volatile __local atomic_uintptr_t *object, uintptr_t operand, memory_order order, memory_scope scope); +long 
__ovld atomic_fetch_add_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_fetch_add_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_add_explicit(volatile __global atomic_uintptr_t *, ptrdiff_t, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_add_explicit(volatile __local atomic_uintptr_t *, ptrdiff_t, memory_order, memory_scope); +ulong __ovld atomic_fetch_add_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_fetch_add_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); +long __ovld atomic_fetch_sub_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_fetch_sub_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_sub_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_fetch_sub_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_sub_explicit(volatile __global atomic_uintptr_t *, ptrdiff_t, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_sub_explicit(volatile __local atomic_uintptr_t *, ptrdiff_t, memory_order, memory_scope); +long __ovld atomic_fetch_or_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_fetch_or_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_or_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_fetch_or_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_or_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_or_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order, 
memory_scope); +intptr_t __ovld atomic_fetch_or_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +intptr_t __ovld atomic_fetch_or_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +long __ovld atomic_fetch_xor_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_fetch_xor_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_xor_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_fetch_xor_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order, memory_scope); +intptr_t __ovld atomic_fetch_xor_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +intptr_t __ovld atomic_fetch_xor_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +long __ovld atomic_fetch_and_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_fetch_and_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_and_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_fetch_and_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_and_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order, memory_scope); +intptr_t __ovld atomic_fetch_and_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +intptr_t __ovld 
atomic_fetch_and_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +long __ovld atomic_fetch_min_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_fetch_min_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_min_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_fetch_min_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_uintptr_t *, intptr_t, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_uintptr_t *, intptr_t, memory_order, memory_scope); +intptr_t __ovld atomic_fetch_min_explicit(volatile __global atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +intptr_t __ovld atomic_fetch_min_explicit(volatile __local atomic_intptr_t *, uintptr_t, memory_order, memory_scope); +long __ovld atomic_fetch_max_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_fetch_max_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_fetch_max_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_fetch_max_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_max_explicit(volatile __global atomic_uintptr_t *, uintptr_t, memory_order, memory_scope); +uintptr_t __ovld atomic_fetch_max_explicit(volatile __local atomic_uintptr_t *, uintptr_t, memory_order, memory_scope); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) @@ -13670,373 +13670,308 @@ uintptr_t __ovld atomic_fetch_max_explicit(volatile __local atomic_uintptr_t *ob #if 
defined(cl_ext_float_atomics) #if defined(__opencl_c_ext_fp16_global_atomic_load_store) -void __ovld atomic_store(volatile __global atomic_half *object, half operand); -void __ovld atomic_store_explicit(volatile __global atomic_half *object, - half operand, memory_order order); -void __ovld atomic_store_explicit(volatile __global atomic_half *object, - half operand, memory_order order, - memory_scope scope); -half __ovld atomic_load(volatile __global atomic_half *object); -half __ovld atomic_load_explicit(volatile __global atomic_half *object, - memory_order order); -half __ovld atomic_load_explicit(volatile __global atomic_half *object, - memory_order order, memory_scope scope); -half __ovld atomic_exchange(volatile __global atomic_half *object, - half operand); -half __ovld atomic_exchange_explicit(volatile __global atomic_half *object, - half operand, memory_order order); -half __ovld atomic_exchange_explicit(volatile __global atomic_half *object, - half operand, memory_order order, - memory_scope scope); +void __ovld atomic_store(volatile __global atomic_half *, half); +void __ovld atomic_store_explicit(volatile __global atomic_half *, + half, memory_order); +void __ovld atomic_store_explicit(volatile __global atomic_half *, + half, memory_order, memory_scope); +half __ovld atomic_load(volatile __global atomic_half *); +half __ovld atomic_load_explicit(volatile __global atomic_half *, + memory_order); +half __ovld atomic_load_explicit(volatile __global atomic_half *, + memory_order, memory_scope); +half __ovld atomic_exchange(volatile __global atomic_half *, half); +half __ovld atomic_exchange_explicit(volatile __global atomic_half *, + half, memory_order); +half __ovld atomic_exchange_explicit(volatile __global atomic_half *, + half, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_global_atomic_load_store) #if defined(__opencl_c_ext_fp16_local_atomic_load_store) -void __ovld atomic_store(volatile __local atomic_half *object, half operand); 
-void __ovld atomic_store_explicit(volatile __local atomic_half *object, - half operand, memory_order order); -void __ovld atomic_store_explicit(volatile __local atomic_half *object, - half operand, memory_order order, - memory_scope scope); -half __ovld atomic_load(volatile __local atomic_half *object); -half __ovld atomic_load_explicit(volatile __local atomic_half *object, - memory_order order); -half __ovld atomic_load_explicit(volatile __local atomic_half *object, - memory_order order, memory_scope scope); -half __ovld atomic_exchange(volatile __local atomic_half *object, half operand); -half __ovld atomic_exchange_explicit(volatile __local atomic_half *object, - half operand, memory_order order); -half __ovld atomic_exchange_explicit(volatile __local atomic_half *object, - half operand, memory_order order, - memory_scope scope); +void __ovld atomic_store(volatile __local atomic_half *, half); +void __ovld atomic_store_explicit(volatile __local atomic_half *, + half, memory_order); +void __ovld atomic_store_explicit(volatile __local atomic_half *, + half, memory_order, memory_scope); +half __ovld atomic_load(volatile __local atomic_half *); +half __ovld atomic_load_explicit(volatile __local atomic_half *, + memory_order); +half __ovld atomic_load_explicit(volatile __local atomic_half *, + memory_order, memory_scope); +half __ovld atomic_exchange(volatile __local atomic_half *, half); +half __ovld atomic_exchange_explicit(volatile __local atomic_half *, + half, memory_order); +half __ovld atomic_exchange_explicit(volatile __local atomic_half *, + half, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_local_atomic_load_store) #if defined(__opencl_c_ext_fp16_global_atomic_load_store) && \ defined(__opencl_c_ext_fp16_local_atomic_load_store) -void __ovld atomic_store(volatile atomic_half *object, half operand); -void __ovld atomic_store_explicit(volatile atomic_half *object, half operand, - memory_order order); -void __ovld 
atomic_store_explicit(volatile atomic_half *object, half operand, - memory_order order, memory_scope scope); -half __ovld atomic_load(volatile atomic_half *object); -half __ovld atomic_load_explicit(volatile atomic_half *object, - memory_order order); -half __ovld atomic_load_explicit(volatile atomic_half *object, - memory_order order, memory_scope scope); -half __ovld atomic_exchange(volatile atomic_half *object, half operand); -half __ovld atomic_exchange_explicit(volatile atomic_half *object, half operand, - memory_order order); -half __ovld atomic_exchange_explicit(volatile atomic_half *object, half operand, - memory_order order, memory_scope scope); +void __ovld atomic_store(volatile atomic_half *, half); +void __ovld atomic_store_explicit(volatile atomic_half *, half, + memory_order); +void __ovld atomic_store_explicit(volatile atomic_half *, half, + memory_order, memory_scope); +half __ovld atomic_load(volatile atomic_half *); +half __ovld atomic_load_explicit(volatile atomic_half *, + memory_order); +half __ovld atomic_load_explicit(volatile atomic_half *, + memory_order, memory_scope); +half __ovld atomic_exchange(volatile atomic_half *, half); +half __ovld atomic_exchange_explicit(volatile atomic_half *, half, + memory_order); +half __ovld atomic_exchange_explicit(volatile atomic_half *, half, + memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_global_atomic_load_store) && // defined(__opencl_c_ext_fp16_local_atomic_load_store) #if defined(__opencl_c_ext_fp16_global_atomic_min_max) -half __ovld atomic_fetch_min(volatile __global atomic_half *object, - half operand); -half __ovld atomic_fetch_max(volatile __global atomic_half *object, - half operand); -half __ovld atomic_fetch_min_explicit(volatile __global atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_max_explicit(volatile __global atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_min_explicit(volatile __global 
atomic_half *object, - half operand, memory_order order, - memory_scope scope); -half __ovld atomic_fetch_max_explicit(volatile __global atomic_half *object, - half operand, memory_order order, - memory_scope scope); +half __ovld atomic_fetch_min(volatile __global atomic_half *, half); +half __ovld atomic_fetch_max(volatile __global atomic_half *, half); +half __ovld atomic_fetch_min_explicit(volatile __global atomic_half *, + half, memory_order); +half __ovld atomic_fetch_max_explicit(volatile __global atomic_half *, + half, memory_order); +half __ovld atomic_fetch_min_explicit(volatile __global atomic_half *, + half, memory_order, memory_scope); +half __ovld atomic_fetch_max_explicit(volatile __global atomic_half *, + half, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_global_atomic_min_max) #if defined(__opencl_c_ext_fp16_local_atomic_min_max) -half __ovld atomic_fetch_min(volatile __local atomic_half *object, - half operand); -half __ovld atomic_fetch_max(volatile __local atomic_half *object, - half operand); -half __ovld atomic_fetch_min_explicit(volatile __local atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_max_explicit(volatile __local atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_min_explicit(volatile __local atomic_half *object, - half operand, memory_order order, - memory_scope scope); -half __ovld atomic_fetch_max_explicit(volatile __local atomic_half *object, - half operand, memory_order order, - memory_scope scope); +half __ovld atomic_fetch_min(volatile __local atomic_half *, half); +half __ovld atomic_fetch_max(volatile __local atomic_half *, half); +half __ovld atomic_fetch_min_explicit(volatile __local atomic_half *, + half, memory_order); +half __ovld atomic_fetch_max_explicit(volatile __local atomic_half *, + half, memory_order); +half __ovld atomic_fetch_min_explicit(volatile __local atomic_half *, + half, memory_order, memory_scope); +half __ovld 
atomic_fetch_max_explicit(volatile __local atomic_half *, + half, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_local_atomic_min_max) #if defined(__opencl_c_ext_fp16_global_atomic_min_max) && \ defined(__opencl_c_ext_fp16_local_atomic_min_max) -half __ovld atomic_fetch_min(volatile atomic_half *object, half operand); -half __ovld atomic_fetch_max(volatile atomic_half *object, half operand); -half __ovld atomic_fetch_min_explicit(volatile atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_max_explicit(volatile atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_min_explicit(volatile atomic_half *object, - half operand, memory_order order, - memory_scope scope); -half __ovld atomic_fetch_max_explicit(volatile atomic_half *object, - half operand, memory_order order, - memory_scope scope); +half __ovld atomic_fetch_min(volatile atomic_half *, half); +half __ovld atomic_fetch_max(volatile atomic_half *, half); +half __ovld atomic_fetch_min_explicit(volatile atomic_half *, + half, memory_order); +half __ovld atomic_fetch_max_explicit(volatile atomic_half *, + half, memory_order); +half __ovld atomic_fetch_min_explicit(volatile atomic_half *, + half, memory_order, memory_scope); +half __ovld atomic_fetch_max_explicit(volatile atomic_half *, + half, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_global_atomic_min_max) && \ defined(__opencl_c_ext_fp16_local_atomic_min_max) #if defined(__opencl_c_ext_fp32_global_atomic_min_max) -float __ovld atomic_fetch_min(volatile __global atomic_float *object, - float operand); -float __ovld atomic_fetch_max(volatile __global atomic_float *object, - float operand); -float __ovld atomic_fetch_min_explicit(volatile __global atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_max_explicit(volatile __global atomic_float *object, - float operand, memory_order order); -float __ovld 
atomic_fetch_min_explicit(volatile __global atomic_float *object, - float operand, memory_order order, - memory_scope scope); -float __ovld atomic_fetch_max_explicit(volatile __global atomic_float *object, - float operand, memory_order order, - memory_scope scope); +float __ovld atomic_fetch_min(volatile __global atomic_float *, float); +float __ovld atomic_fetch_max(volatile __global atomic_float *, float); +float __ovld atomic_fetch_min_explicit(volatile __global atomic_float *, + float, memory_order); +float __ovld atomic_fetch_max_explicit(volatile __global atomic_float *, + float, memory_order); +float __ovld atomic_fetch_min_explicit(volatile __global atomic_float *, + float, memory_order, memory_scope); +float __ovld atomic_fetch_max_explicit(volatile __global atomic_float *, + float, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp32_global_atomic_min_max) #if defined(__opencl_c_ext_fp32_local_atomic_min_max) -float __ovld atomic_fetch_min(volatile __local atomic_float *object, - float operand); -float __ovld atomic_fetch_max(volatile __local atomic_float *object, - float operand); -float __ovld atomic_fetch_min_explicit(volatile __local atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_max_explicit(volatile __local atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_min_explicit(volatile __local atomic_float *object, - float operand, memory_order order, - memory_scope scope); -float __ovld atomic_fetch_max_explicit(volatile __local atomic_float *object, - float operand, memory_order order, - memory_scope scope); +float __ovld atomic_fetch_min(volatile __local atomic_float *, float); +float __ovld atomic_fetch_max(volatile __local atomic_float *, float); +float __ovld atomic_fetch_min_explicit(volatile __local atomic_float *, + float, memory_order); +float __ovld atomic_fetch_max_explicit(volatile __local atomic_float *, + float, memory_order); +float __ovld 
atomic_fetch_min_explicit(volatile __local atomic_float *, + float, memory_order, memory_scope); +float __ovld atomic_fetch_max_explicit(volatile __local atomic_float *, + float, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp32_local_atomic_min_max) #if defined(__opencl_c_ext_fp32_global_atomic_min_max) && \ defined(__opencl_c_ext_fp32_local_atomic_min_max) -float __ovld atomic_fetch_min(volatile atomic_float *object, float operand); -float __ovld atomic_fetch_max(volatile atomic_float *object, float operand); -float __ovld atomic_fetch_min_explicit(volatile atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_max_explicit(volatile atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_min_explicit(volatile atomic_float *object, - float operand, memory_order order, - memory_scope scope); -float __ovld atomic_fetch_max_explicit(volatile atomic_float *object, - float operand, memory_order order, - memory_scope scope); +float __ovld atomic_fetch_min(volatile atomic_float *, float); +float __ovld atomic_fetch_max(volatile atomic_float *, float); +float __ovld atomic_fetch_min_explicit(volatile atomic_float *, + float, memory_order); +float __ovld atomic_fetch_max_explicit(volatile atomic_float *, + float, memory_order); +float __ovld atomic_fetch_min_explicit(volatile atomic_float *, + float, memory_order, memory_scope); +float __ovld atomic_fetch_max_explicit(volatile atomic_float *, + float, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp32_global_atomic_min_max) && \ defined(__opencl_c_ext_fp32_local_atomic_min_max) #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #if defined(__opencl_c_ext_fp64_global_atomic_min_max) -double __ovld atomic_fetch_min(volatile __global atomic_double *object, - double operand); -double __ovld atomic_fetch_max(volatile __global atomic_double *object, - double operand); -double __ovld 
atomic_fetch_min_explicit(volatile __global atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_max_explicit(volatile __global atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_min_explicit(volatile __global atomic_double *object, - double operand, memory_order order, - memory_scope scope); -double __ovld atomic_fetch_max_explicit(volatile __global atomic_double *object, - double operand, memory_order order, - memory_scope scope); +double __ovld atomic_fetch_min(volatile __global atomic_double *, double); +double __ovld atomic_fetch_max(volatile __global atomic_double *, double); +double __ovld atomic_fetch_min_explicit(volatile __global atomic_double *, + double, memory_order); +double __ovld atomic_fetch_max_explicit(volatile __global atomic_double *, + double, memory_order); +double __ovld atomic_fetch_min_explicit(volatile __global atomic_double *, + double, memory_order, memory_scope); +double __ovld atomic_fetch_max_explicit(volatile __global atomic_double *, + double, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp64_global_atomic_min_max) #if defined(__opencl_c_ext_fp64_local_atomic_min_max) -double __ovld atomic_fetch_min(volatile __local atomic_double *object, - double operand); -double __ovld atomic_fetch_max(volatile __local atomic_double *object, - double operand); -double __ovld atomic_fetch_min_explicit(volatile __local atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_max_explicit(volatile __local atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_min_explicit(volatile __local atomic_double *object, - double operand, memory_order order, - memory_scope scope); -double __ovld atomic_fetch_max_explicit(volatile __local atomic_double *object, - double operand, memory_order order, - memory_scope scope); +double __ovld atomic_fetch_min(volatile __local atomic_double *, double); 
+double __ovld atomic_fetch_max(volatile __local atomic_double *, double); +double __ovld atomic_fetch_min_explicit(volatile __local atomic_double *, + double, memory_order); +double __ovld atomic_fetch_max_explicit(volatile __local atomic_double *, + double, memory_order); +double __ovld atomic_fetch_min_explicit(volatile __local atomic_double *, + double, memory_order, memory_scope); +double __ovld atomic_fetch_max_explicit(volatile __local atomic_double *, + double, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp64_local_atomic_min_max) #if defined(__opencl_c_ext_fp64_global_atomic_min_max) && \ defined(__opencl_c_ext_fp64_local_atomic_min_max) -double __ovld atomic_fetch_min(volatile atomic_double *object, double operand); -double __ovld atomic_fetch_max(volatile atomic_double *object, double operand); -double __ovld atomic_fetch_min_explicit(volatile atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_max_explicit(volatile atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_min_explicit(volatile atomic_double *object, - double operand, memory_order order, - memory_scope scope); -double __ovld atomic_fetch_max_explicit(volatile atomic_double *object, - double operand, memory_order order, - memory_scope scope); +double __ovld atomic_fetch_min(volatile atomic_double *, double); +double __ovld atomic_fetch_max(volatile atomic_double *, double); +double __ovld atomic_fetch_min_explicit(volatile atomic_double *, + double, memory_order); +double __ovld atomic_fetch_max_explicit(volatile atomic_double *, + double, memory_order); +double __ovld atomic_fetch_min_explicit(volatile atomic_double *, + double, memory_order, memory_scope); +double __ovld atomic_fetch_max_explicit(volatile atomic_double *, + double, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp64_global_atomic_min_max) && \ defined(__opencl_c_ext_fp64_local_atomic_min_max) #endif // 
defined(cl_khr_int64_base_atomics) && \ defined(cl_khr_int64_extended_atomics) #if defined(__opencl_c_ext_fp16_global_atomic_add) -half __ovld atomic_fetch_add(volatile __global atomic_half *object, - half operand); -half __ovld atomic_fetch_sub(volatile __global atomic_half *object, - half operand); -half __ovld atomic_fetch_add_explicit(volatile __global atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_sub_explicit(volatile __global atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_add_explicit(volatile __global atomic_half *object, - half operand, memory_order order, - memory_scope scope); -half __ovld atomic_fetch_sub_explicit(volatile __global atomic_half *object, - half operand, memory_order order, - memory_scope scope); +half __ovld atomic_fetch_add(volatile __global atomic_half *, half); +half __ovld atomic_fetch_sub(volatile __global atomic_half *, half); +half __ovld atomic_fetch_add_explicit(volatile __global atomic_half *, + half, memory_order); +half __ovld atomic_fetch_sub_explicit(volatile __global atomic_half *, + half, memory_order); +half __ovld atomic_fetch_add_explicit(volatile __global atomic_half *, + half, memory_order, memory_scope); +half __ovld atomic_fetch_sub_explicit(volatile __global atomic_half *, + half, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_global_atomic_add) #if defined(__opencl_c_ext_fp16_local_atomic_add) -half __ovld atomic_fetch_add(volatile __local atomic_half *object, - half operand); -half __ovld atomic_fetch_sub(volatile __local atomic_half *object, - half operand); -half __ovld atomic_fetch_add_explicit(volatile __local atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_sub_explicit(volatile __local atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_add_explicit(volatile __local atomic_half *object, - half operand, memory_order order, - memory_scope scope); 
-half __ovld atomic_fetch_sub_explicit(volatile __local atomic_half *object, - half operand, memory_order order, - memory_scope scope); +half __ovld atomic_fetch_add(volatile __local atomic_half *, half); +half __ovld atomic_fetch_sub(volatile __local atomic_half *, half); +half __ovld atomic_fetch_add_explicit(volatile __local atomic_half *, + half, memory_order); +half __ovld atomic_fetch_sub_explicit(volatile __local atomic_half *, + half, memory_order); +half __ovld atomic_fetch_add_explicit(volatile __local atomic_half *, + half, memory_order, memory_scope); +half __ovld atomic_fetch_sub_explicit(volatile __local atomic_half *, + half, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp16_local_atomic_add) #if defined(__opencl_c_ext_fp16_global_atomic_add) && \ defined(__opencl_c_ext_fp16_local_atomic_add) -half __ovld atomic_fetch_add(volatile atomic_half *object, half operand); -half __ovld atomic_fetch_sub(volatile atomic_half *object, half operand); -half __ovld atomic_fetch_add_explicit(volatile atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_sub_explicit(volatile atomic_half *object, - half operand, memory_order order); -half __ovld atomic_fetch_add_explicit(volatile atomic_half *object, - half operand, memory_order order, - memory_scope scope); -half __ovld atomic_fetch_sub_explicit(volatile atomic_half *object, - half operand, memory_order order, - memory_scope scope); +half __ovld atomic_fetch_add(volatile atomic_half *, half); +half __ovld atomic_fetch_sub(volatile atomic_half *, half); +half __ovld atomic_fetch_add_explicit(volatile atomic_half *, + half, memory_order); +half __ovld atomic_fetch_sub_explicit(volatile atomic_half *, + half, memory_order); +half __ovld atomic_fetch_add_explicit(volatile atomic_half *, + half, memory_order, memory_scope); +half __ovld atomic_fetch_sub_explicit(volatile atomic_half *, + half, memory_order, memory_scope); #endif // 
defined(__opencl_c_ext_fp16_global_atomic_add) && \ defined(__opencl_c_ext_fp16_local_atomic_add) #if defined(__opencl_c_ext_fp32_global_atomic_add) -float __ovld atomic_fetch_add(volatile __global atomic_float *object, - float operand); -float __ovld atomic_fetch_sub(volatile __global atomic_float *object, - float operand); -float __ovld atomic_fetch_add_explicit(volatile __global atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_sub_explicit(volatile __global atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_add_explicit(volatile __global atomic_float *object, - float operand, memory_order order, - memory_scope scope); -float __ovld atomic_fetch_sub_explicit(volatile __global atomic_float *object, - float operand, memory_order order, - memory_scope scope); +float __ovld atomic_fetch_add(volatile __global atomic_float *, float); +float __ovld atomic_fetch_sub(volatile __global atomic_float *, float); +float __ovld atomic_fetch_add_explicit(volatile __global atomic_float *, + float, memory_order); +float __ovld atomic_fetch_sub_explicit(volatile __global atomic_float *, + float, memory_order); +float __ovld atomic_fetch_add_explicit(volatile __global atomic_float *, + float, memory_order, memory_scope); +float __ovld atomic_fetch_sub_explicit(volatile __global atomic_float *, + float, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp32_global_atomic_add) #if defined(__opencl_c_ext_fp32_local_atomic_add) -float __ovld atomic_fetch_add(volatile __local atomic_float *object, - float operand); -float __ovld atomic_fetch_sub(volatile __local atomic_float *object, - float operand); -float __ovld atomic_fetch_add_explicit(volatile __local atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_sub_explicit(volatile __local atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_add_explicit(volatile __local atomic_float 
*object, - float operand, memory_order order, - memory_scope scope); -float __ovld atomic_fetch_sub_explicit(volatile __local atomic_float *object, - float operand, memory_order order, - memory_scope scope); +float __ovld atomic_fetch_add(volatile __local atomic_float *, float); +float __ovld atomic_fetch_sub(volatile __local atomic_float *, float); +float __ovld atomic_fetch_add_explicit(volatile __local atomic_float *, + float, memory_order); +float __ovld atomic_fetch_sub_explicit(volatile __local atomic_float *, + float, memory_order); +float __ovld atomic_fetch_add_explicit(volatile __local atomic_float *, + float, memory_order, memory_scope); +float __ovld atomic_fetch_sub_explicit(volatile __local atomic_float *, + float, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp32_local_atomic_add) #if defined(__opencl_c_ext_fp32_global_atomic_add) && \ defined(__opencl_c_ext_fp32_local_atomic_add) -float __ovld atomic_fetch_add(volatile atomic_float *object, float operand); -float __ovld atomic_fetch_sub(volatile atomic_float *object, float operand); -float __ovld atomic_fetch_add_explicit(volatile atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_sub_explicit(volatile atomic_float *object, - float operand, memory_order order); -float __ovld atomic_fetch_add_explicit(volatile atomic_float *object, - float operand, memory_order order, - memory_scope scope); -float __ovld atomic_fetch_sub_explicit(volatile atomic_float *object, - float operand, memory_order order, - memory_scope scope); +float __ovld atomic_fetch_add(volatile atomic_float *, float); +float __ovld atomic_fetch_sub(volatile atomic_float *, float); +float __ovld atomic_fetch_add_explicit(volatile atomic_float *, + float, memory_order); +float __ovld atomic_fetch_sub_explicit(volatile atomic_float *, + float, memory_order); +float __ovld atomic_fetch_add_explicit(volatile atomic_float *, + float, memory_order, memory_scope); +float __ovld 
atomic_fetch_sub_explicit(volatile atomic_float *, + float, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp32_global_atomic_add) && \ defined(__opencl_c_ext_fp32_local_atomic_add) #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #if defined(__opencl_c_ext_fp64_global_atomic_add) -double __ovld atomic_fetch_add(volatile __global atomic_double *object, - double operand); -double __ovld atomic_fetch_sub(volatile __global atomic_double *object, - double operand); -double __ovld atomic_fetch_add_explicit(volatile __global atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_sub_explicit(volatile __global atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_add_explicit(volatile __global atomic_double *object, - double operand, memory_order order, - memory_scope scope); -double __ovld atomic_fetch_sub_explicit(volatile __global atomic_double *object, - double operand, memory_order order, - memory_scope scope); +double __ovld atomic_fetch_add(volatile __global atomic_double *, double); +double __ovld atomic_fetch_sub(volatile __global atomic_double *, double); +double __ovld atomic_fetch_add_explicit(volatile __global atomic_double *, + double, memory_order); +double __ovld atomic_fetch_sub_explicit(volatile __global atomic_double *, + double, memory_order); +double __ovld atomic_fetch_add_explicit(volatile __global atomic_double *, + double, memory_order, memory_scope); +double __ovld atomic_fetch_sub_explicit(volatile __global atomic_double *, + double, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp64_global_atomic_add) #if defined(__opencl_c_ext_fp64_local_atomic_add) -double __ovld atomic_fetch_add(volatile __local atomic_double *object, - double operand); -double __ovld atomic_fetch_sub(volatile __local atomic_double *object, - double operand); -double __ovld atomic_fetch_add_explicit(volatile __local atomic_double *object, - 
double operand, memory_order order); -double __ovld atomic_fetch_sub_explicit(volatile __local atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_add_explicit(volatile __local atomic_double *object, - double operand, memory_order order, - memory_scope scope); -double __ovld atomic_fetch_sub_explicit(volatile __local atomic_double *object, - double operand, memory_order order, - memory_scope scope); +double __ovld atomic_fetch_add(volatile __local atomic_double *, double); +double __ovld atomic_fetch_sub(volatile __local atomic_double *, double); +double __ovld atomic_fetch_add_explicit(volatile __local atomic_double *, + double, memory_order); +double __ovld atomic_fetch_sub_explicit(volatile __local atomic_double *, + double, memory_order); +double __ovld atomic_fetch_add_explicit(volatile __local atomic_double *, + double, memory_order, memory_scope); +double __ovld atomic_fetch_sub_explicit(volatile __local atomic_double *, + double, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp64_local_atomic_add) #if defined(__opencl_c_ext_fp64_global_atomic_add) && \ defined(__opencl_c_ext_fp64_local_atomic_add) -double __ovld atomic_fetch_add(volatile atomic_double *object, double operand); -double __ovld atomic_fetch_sub(volatile atomic_double *object, double operand); -double __ovld atomic_fetch_add_explicit(volatile atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_sub_explicit(volatile atomic_double *object, - double operand, memory_order order); -double __ovld atomic_fetch_add_explicit(volatile atomic_double *object, - double operand, memory_order order, - memory_scope scope); -double __ovld atomic_fetch_sub_explicit(volatile atomic_double *object, - double operand, memory_order order, - memory_scope scope); +double __ovld atomic_fetch_add(volatile atomic_double *, double); +double __ovld atomic_fetch_sub(volatile atomic_double *, double); +double __ovld 
atomic_fetch_add_explicit(volatile atomic_double *, + double, memory_order); +double __ovld atomic_fetch_sub_explicit(volatile atomic_double *, + double, memory_order); +double __ovld atomic_fetch_add_explicit(volatile atomic_double *, + double, memory_order, memory_scope); +double __ovld atomic_fetch_sub_explicit(volatile atomic_double *, + double, memory_order, memory_scope); #endif // defined(__opencl_c_ext_fp64_global_atomic_add) && \ defined(__opencl_c_ext_fp64_local_atomic_add) #endif // defined(cl_khr_int64_base_atomics) && \ @@ -14048,197 +13983,197 @@ double __ovld atomic_fetch_sub_explicit(volatile atomic_double *object, #if defined(__opencl_c_atomic_order_seq_cst) && defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -void __ovld atomic_store(volatile atomic_int *object, int desired); -void __ovld atomic_store(volatile atomic_uint *object, uint desired); -void __ovld atomic_store(volatile atomic_float *object, float desired); +void __ovld atomic_store(volatile atomic_int *, int); +void __ovld atomic_store(volatile atomic_uint *, uint); +void __ovld atomic_store(volatile atomic_float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -void __ovld atomic_store(volatile atomic_double *object, double desired); +void __ovld atomic_store(volatile atomic_double *, double); #endif //cl_khr_fp64 -void __ovld atomic_store(volatile atomic_long *object, long desired); -void __ovld atomic_store(volatile atomic_ulong *object, ulong desired); +void __ovld atomic_store(volatile atomic_long *, long); +void __ovld atomic_store(volatile atomic_ulong *, ulong); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -void __ovld atomic_store(volatile __global atomic_int *object, int desired); -void __ovld atomic_store(volatile __local atomic_int *object, int desired); -void __ovld 
atomic_store(volatile __global atomic_uint *object, uint desired); -void __ovld atomic_store(volatile __local atomic_uint *object, uint desired); -void __ovld atomic_store(volatile __global atomic_float *object, float desired); -void __ovld atomic_store(volatile __local atomic_float *object, float desired); +void __ovld atomic_store(volatile __global atomic_int *, int); +void __ovld atomic_store(volatile __local atomic_int *, int); +void __ovld atomic_store(volatile __global atomic_uint *, uint); +void __ovld atomic_store(volatile __local atomic_uint *, uint); +void __ovld atomic_store(volatile __global atomic_float *, float); +void __ovld atomic_store(volatile __local atomic_float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -void __ovld atomic_store(volatile __global atomic_double *object, double desired); -void __ovld atomic_store(volatile __local atomic_double *object, double desired); +void __ovld atomic_store(volatile __global atomic_double *, double); +void __ovld atomic_store(volatile __local atomic_double *, double); #endif //cl_khr_fp64 -void __ovld atomic_store(volatile __global atomic_long *object, long desired); -void __ovld atomic_store(volatile __local atomic_long *object, long desired); -void __ovld atomic_store(volatile __global atomic_ulong *object, ulong desired); -void __ovld atomic_store(volatile __local atomic_ulong *object, ulong desired); +void __ovld atomic_store(volatile __global atomic_long *, long); +void __ovld atomic_store(volatile __local atomic_long *, long); +void __ovld atomic_store(volatile __global atomic_ulong *, ulong); +void __ovld atomic_store(volatile __local atomic_ulong *, ulong); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -void __ovld 
atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order); -void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order); -void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order); +void __ovld atomic_store_explicit(volatile atomic_int *, int, memory_order); +void __ovld atomic_store_explicit(volatile atomic_uint *, uint, memory_order); +void __ovld atomic_store_explicit(volatile atomic_float *, float, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order); +void __ovld atomic_store_explicit(volatile atomic_double *, double, memory_order); #endif //cl_khr_fp64 -void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order); -void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order); +void __ovld atomic_store_explicit(volatile atomic_long *, long, memory_order); +void __ovld atomic_store_explicit(volatile atomic_ulong *, ulong, memory_order); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -void __ovld atomic_store_explicit(volatile __global atomic_int *object, int desired, memory_order order); -void __ovld atomic_store_explicit(volatile __local atomic_int *object, int desired, memory_order order); -void __ovld atomic_store_explicit(volatile __global atomic_uint *object, uint desired, memory_order order); -void __ovld atomic_store_explicit(volatile __local atomic_uint *object, uint desired, memory_order order); -void __ovld atomic_store_explicit(volatile __global atomic_float *object, float desired, memory_order order); -void __ovld atomic_store_explicit(volatile __local atomic_float *object, float desired, memory_order order); +void 
__ovld atomic_store_explicit(volatile __global atomic_int *, int, memory_order); +void __ovld atomic_store_explicit(volatile __local atomic_int *, int, memory_order); +void __ovld atomic_store_explicit(volatile __global atomic_uint *, uint, memory_order); +void __ovld atomic_store_explicit(volatile __local atomic_uint *, uint, memory_order); +void __ovld atomic_store_explicit(volatile __global atomic_float *, float, memory_order); +void __ovld atomic_store_explicit(volatile __local atomic_float *, float, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -void __ovld atomic_store_explicit(volatile __global atomic_double *object, double desired, memory_order order); -void __ovld atomic_store_explicit(volatile __local atomic_double *object, double desired, memory_order order); +void __ovld atomic_store_explicit(volatile __global atomic_double *, double, memory_order); +void __ovld atomic_store_explicit(volatile __local atomic_double *, double, memory_order); #endif -void __ovld atomic_store_explicit(volatile __global atomic_long *object, long desired, memory_order order); -void __ovld atomic_store_explicit(volatile __local atomic_long *object, long desired, memory_order order); -void __ovld atomic_store_explicit(volatile __global atomic_ulong *object, ulong desired, memory_order order); -void __ovld atomic_store_explicit(volatile __local atomic_ulong *object, ulong desired, memory_order order); +void __ovld atomic_store_explicit(volatile __global atomic_long *, long, memory_order); +void __ovld atomic_store_explicit(volatile __local atomic_long *, long, memory_order); +void __ovld atomic_store_explicit(volatile __global atomic_ulong *, ulong, memory_order); +void __ovld atomic_store_explicit(volatile __local atomic_ulong *, ulong, memory_order); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ 
>= 202100) #endif #if defined(__opencl_c_generic_address_space) -void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope); +void __ovld atomic_store_explicit(volatile atomic_int *, int, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile atomic_uint *, uint, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile atomic_float *, float, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope); +void __ovld atomic_store_explicit(volatile atomic_double *, double, memory_order, memory_scope); #endif //cl_khr_fp64 -void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope); +void __ovld atomic_store_explicit(volatile atomic_long *, long, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -void __ovld atomic_store_explicit(volatile __global atomic_int *object, int desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile __local atomic_int *object, int desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile __global atomic_uint *object, uint desired, memory_order order, memory_scope 
scope); -void __ovld atomic_store_explicit(volatile __local atomic_uint *object, uint desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile __global atomic_float *object, float desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile __local atomic_float *object, float desired, memory_order order, memory_scope scope); +void __ovld atomic_store_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __local atomic_int *, int, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __global atomic_float *, float, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __local atomic_float *, float, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -void __ovld atomic_store_explicit(volatile __global atomic_double *object, double desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile __local atomic_double *object, double desired, memory_order order, memory_scope scope); +void __ovld atomic_store_explicit(volatile __global atomic_double *, double, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __local atomic_double *, double, memory_order, memory_scope); #endif //cl_khr_fp64 -void __ovld atomic_store_explicit(volatile __global atomic_long *object, long desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile __local atomic_long *object, long desired, memory_order order, memory_scope scope); -void __ovld atomic_store_explicit(volatile __global atomic_ulong *object, ulong desired, memory_order order, memory_scope scope); 
-void __ovld atomic_store_explicit(volatile __local atomic_ulong *object, ulong desired, memory_order order, memory_scope scope); +void __ovld atomic_store_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +void __ovld atomic_store_explicit(volatile __local atomic_ulong *, ulong, memory_order, memory_scope); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) // atomic_load() #if defined(__opencl_c_atomic_order_seq_cst) && defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -int __ovld atomic_load(volatile atomic_int *object); -uint __ovld atomic_load(volatile atomic_uint *object); -float __ovld atomic_load(volatile atomic_float *object); +int __ovld atomic_load(volatile atomic_int *); +uint __ovld atomic_load(volatile atomic_uint *); +float __ovld atomic_load(volatile atomic_float *); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_load(volatile atomic_double *object); +double __ovld atomic_load(volatile atomic_double *); #endif //cl_khr_fp64 -long __ovld atomic_load(volatile atomic_long *object); -ulong __ovld atomic_load(volatile atomic_ulong *object); +long __ovld atomic_load(volatile atomic_long *); +ulong __ovld atomic_load(volatile atomic_ulong *); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_load(volatile __global atomic_int *object); -int __ovld atomic_load(volatile __local atomic_int *object); -uint __ovld atomic_load(volatile __global atomic_uint *object); -uint __ovld atomic_load(volatile 
__local atomic_uint *object); -float __ovld atomic_load(volatile __global atomic_float *object); -float __ovld atomic_load(volatile __local atomic_float *object); +int __ovld atomic_load(volatile __global atomic_int *); +int __ovld atomic_load(volatile __local atomic_int *); +uint __ovld atomic_load(volatile __global atomic_uint *); +uint __ovld atomic_load(volatile __local atomic_uint *); +float __ovld atomic_load(volatile __global atomic_float *); +float __ovld atomic_load(volatile __local atomic_float *); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_load(volatile __global atomic_double *object); -double __ovld atomic_load(volatile __local atomic_double *object); +double __ovld atomic_load(volatile __global atomic_double *); +double __ovld atomic_load(volatile __local atomic_double *); #endif //cl_khr_fp64 -long __ovld atomic_load(volatile __global atomic_long *object); -long __ovld atomic_load(volatile __local atomic_long *object); -ulong __ovld atomic_load(volatile __global atomic_ulong *object); -ulong __ovld atomic_load(volatile __local atomic_ulong *object); +long __ovld atomic_load(volatile __global atomic_long *); +long __ovld atomic_load(volatile __local atomic_long *); +ulong __ovld atomic_load(volatile __global atomic_ulong *); +ulong __ovld atomic_load(volatile __local atomic_ulong *); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order); -uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order); -float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order); +int __ovld atomic_load_explicit(volatile atomic_int *, memory_order); +uint 
__ovld atomic_load_explicit(volatile atomic_uint *, memory_order); +float __ovld atomic_load_explicit(volatile atomic_float *, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order); +double __ovld atomic_load_explicit(volatile atomic_double *, memory_order); #endif //cl_khr_fp64 -long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order); -ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order); +long __ovld atomic_load_explicit(volatile atomic_long *, memory_order); +ulong __ovld atomic_load_explicit(volatile atomic_ulong *, memory_order); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_load_explicit(volatile __global atomic_int *object, memory_order order); -int __ovld atomic_load_explicit(volatile __local atomic_int *object, memory_order order); -uint __ovld atomic_load_explicit(volatile __global atomic_uint *object, memory_order order); -uint __ovld atomic_load_explicit(volatile __local atomic_uint *object, memory_order order); -float __ovld atomic_load_explicit(volatile __global atomic_float *object, memory_order order); -float __ovld atomic_load_explicit(volatile __local atomic_float *object, memory_order order); +int __ovld atomic_load_explicit(volatile __global atomic_int *, memory_order); +int __ovld atomic_load_explicit(volatile __local atomic_int *, memory_order); +uint __ovld atomic_load_explicit(volatile __global atomic_uint *, memory_order); +uint __ovld atomic_load_explicit(volatile __local atomic_uint *, memory_order); +float __ovld atomic_load_explicit(volatile __global atomic_float *, memory_order); +float __ovld atomic_load_explicit(volatile __local atomic_float *, memory_order); #if defined(cl_khr_int64_base_atomics) && 
defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_load_explicit(volatile __global atomic_double *object, memory_order order); -double __ovld atomic_load_explicit(volatile __local atomic_double *object, memory_order order); +double __ovld atomic_load_explicit(volatile __global atomic_double *, memory_order); +double __ovld atomic_load_explicit(volatile __local atomic_double *, memory_order); #endif //cl_khr_fp64 -long __ovld atomic_load_explicit(volatile __global atomic_long *object, memory_order order); -long __ovld atomic_load_explicit(volatile __local atomic_long *object, memory_order order); -ulong __ovld atomic_load_explicit(volatile __global atomic_ulong *object, memory_order order); -ulong __ovld atomic_load_explicit(volatile __local atomic_ulong *object, memory_order order); +long __ovld atomic_load_explicit(volatile __global atomic_long *, memory_order); +long __ovld atomic_load_explicit(volatile __local atomic_long *, memory_order); +ulong __ovld atomic_load_explicit(volatile __global atomic_ulong *, memory_order); +ulong __ovld atomic_load_explicit(volatile __local atomic_ulong *, memory_order); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_generic_address_space) -int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order, memory_scope scope); -uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order, memory_scope scope); -float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order, memory_scope scope); +int __ovld atomic_load_explicit(volatile atomic_int *, memory_order, memory_scope); +uint __ovld atomic_load_explicit(volatile atomic_uint *, memory_order, memory_scope); +float __ovld atomic_load_explicit(volatile atomic_float *, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && 
defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order, memory_scope scope); +double __ovld atomic_load_explicit(volatile atomic_double *, memory_order, memory_scope); #endif //cl_khr_fp64 -long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order, memory_scope scope); -ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order, memory_scope scope); +long __ovld atomic_load_explicit(volatile atomic_long *, memory_order, memory_scope); +ulong __ovld atomic_load_explicit(volatile atomic_ulong *, memory_order, memory_scope); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_load_explicit(volatile __global atomic_int *object, memory_order order, memory_scope scope); -int __ovld atomic_load_explicit(volatile __local atomic_int *object, memory_order order, memory_scope scope); -uint __ovld atomic_load_explicit(volatile __global atomic_uint *object, memory_order order, memory_scope scope); -uint __ovld atomic_load_explicit(volatile __local atomic_uint *object, memory_order order, memory_scope scope); -float __ovld atomic_load_explicit(volatile __global atomic_float *object, memory_order order, memory_scope scope); -float __ovld atomic_load_explicit(volatile __local atomic_float *object, memory_order order, memory_scope scope); +int __ovld atomic_load_explicit(volatile __global atomic_int *, memory_order, memory_scope); +int __ovld atomic_load_explicit(volatile __local atomic_int *, memory_order, memory_scope); +uint __ovld atomic_load_explicit(volatile __global atomic_uint *, memory_order, memory_scope); +uint __ovld atomic_load_explicit(volatile __local atomic_uint *, memory_order, memory_scope); +float __ovld atomic_load_explicit(volatile __global atomic_float *, memory_order, memory_scope); +float __ovld 
atomic_load_explicit(volatile __local atomic_float *, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_load_explicit(volatile __global atomic_double *object, memory_order order, memory_scope scope); -double __ovld atomic_load_explicit(volatile __local atomic_double *object, memory_order order, memory_scope scope); +double __ovld atomic_load_explicit(volatile __global atomic_double *, memory_order, memory_scope); +double __ovld atomic_load_explicit(volatile __local atomic_double *, memory_order, memory_scope); #endif -long __ovld atomic_load_explicit(volatile __global atomic_long *object, memory_order order, memory_scope scope); -long __ovld atomic_load_explicit(volatile __local atomic_long *object, memory_order order, memory_scope scope); -ulong __ovld atomic_load_explicit(volatile __global atomic_ulong *object, memory_order order, memory_scope scope); -ulong __ovld atomic_load_explicit(volatile __local atomic_ulong *object, memory_order order, memory_scope scope); +long __ovld atomic_load_explicit(volatile __global atomic_long *, memory_order, memory_scope); +long __ovld atomic_load_explicit(volatile __local atomic_long *, memory_order, memory_scope); +ulong __ovld atomic_load_explicit(volatile __global atomic_ulong *, memory_order, memory_scope); +ulong __ovld atomic_load_explicit(volatile __local atomic_ulong *, memory_order, memory_scope); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) @@ -14246,599 +14181,431 @@ ulong __ovld atomic_load_explicit(volatile __local atomic_ulong *object, memory_ #if defined(__opencl_c_atomic_order_seq_cst) && defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -int __ovld atomic_exchange(volatile atomic_int *object, int desired); -uint __ovld atomic_exchange(volatile 
atomic_uint *object, uint desired); -float __ovld atomic_exchange(volatile atomic_float *object, float desired); +int __ovld atomic_exchange(volatile atomic_int *, int); +uint __ovld atomic_exchange(volatile atomic_uint *, uint); +float __ovld atomic_exchange(volatile atomic_float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_exchange(volatile atomic_double *object, double desired); +double __ovld atomic_exchange(volatile atomic_double *, double); #endif //cl_khr_fp64 -long __ovld atomic_exchange(volatile atomic_long *object, long desired); -ulong __ovld atomic_exchange(volatile atomic_ulong *object, ulong desired); +long __ovld atomic_exchange(volatile atomic_long *, long); +ulong __ovld atomic_exchange(volatile atomic_ulong *, ulong); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_exchange(volatile __global atomic_int *object, int desired); -int __ovld atomic_exchange(volatile __local atomic_int *object, int desired); -uint __ovld atomic_exchange(volatile __global atomic_uint *object, uint desired); -uint __ovld atomic_exchange(volatile __local atomic_uint *object, uint desired); -float __ovld atomic_exchange(volatile __global atomic_float *object, float desired); -float __ovld atomic_exchange(volatile __local atomic_float *object, float desired); +int __ovld atomic_exchange(volatile __global atomic_int *, int); +int __ovld atomic_exchange(volatile __local atomic_int *, int); +uint __ovld atomic_exchange(volatile __global atomic_uint *, uint); +uint __ovld atomic_exchange(volatile __local atomic_uint *, uint); +float __ovld atomic_exchange(volatile __global atomic_float *, float); +float __ovld atomic_exchange(volatile __local atomic_float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld 
atomic_exchange(volatile __global atomic_double *object, double desired); -double __ovld atomic_exchange(volatile __local atomic_double *object, double desired); +double __ovld atomic_exchange(volatile __global atomic_double *, double); +double __ovld atomic_exchange(volatile __local atomic_double *, double); #endif //cl_khr_fp64 -long __ovld atomic_exchange(volatile __global atomic_long *object, long desired); -long __ovld atomic_exchange(volatile __local atomic_long *object, long desired); -ulong __ovld atomic_exchange(volatile __global atomic_ulong *object, ulong desired); -ulong __ovld atomic_exchange(volatile __local atomic_ulong *object, ulong desired); +long __ovld atomic_exchange(volatile __global atomic_long *, long); +long __ovld atomic_exchange(volatile __local atomic_long *, long); +ulong __ovld atomic_exchange(volatile __global atomic_ulong *, ulong); +ulong __ovld atomic_exchange(volatile __local atomic_ulong *, ulong); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order); -uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order); -float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order); +int __ovld atomic_exchange_explicit(volatile atomic_int *, int, memory_order); +uint __ovld atomic_exchange_explicit(volatile atomic_uint *, uint, memory_order); +float __ovld atomic_exchange_explicit(volatile atomic_float *, float, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order); +double __ovld 
atomic_exchange_explicit(volatile atomic_double *, double, memory_order); #endif //cl_khr_fp64 -long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order); -ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order); +long __ovld atomic_exchange_explicit(volatile atomic_long *, long, memory_order); +ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *, ulong, memory_order); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_exchange_explicit(volatile __global atomic_int *object, int desired, memory_order order); -int __ovld atomic_exchange_explicit(volatile __local atomic_int *object, int desired, memory_order order); -uint __ovld atomic_exchange_explicit(volatile __global atomic_uint *object, uint desired, memory_order order); -uint __ovld atomic_exchange_explicit(volatile __local atomic_uint *object, uint desired, memory_order order); -float __ovld atomic_exchange_explicit(volatile __global atomic_float *object, float desired, memory_order order); -float __ovld atomic_exchange_explicit(volatile __local atomic_float *object, float desired, memory_order order); +int __ovld atomic_exchange_explicit(volatile __global atomic_int *, int, memory_order); +int __ovld atomic_exchange_explicit(volatile __local atomic_int *, int, memory_order); +uint __ovld atomic_exchange_explicit(volatile __global atomic_uint *, uint, memory_order); +uint __ovld atomic_exchange_explicit(volatile __local atomic_uint *, uint, memory_order); +float __ovld atomic_exchange_explicit(volatile __global atomic_float *, float, memory_order); +float __ovld atomic_exchange_explicit(volatile __local atomic_float *, float, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_exchange_explicit(volatile __global 
atomic_double *object, double desired, memory_order order); -double __ovld atomic_exchange_explicit(volatile __local atomic_double *object, double desired, memory_order order); +double __ovld atomic_exchange_explicit(volatile __global atomic_double *, double, memory_order); +double __ovld atomic_exchange_explicit(volatile __local atomic_double *, double, memory_order); #endif //cl_khr_fp64 -long __ovld atomic_exchange_explicit(volatile __global atomic_long *object, long desired, memory_order order); -long __ovld atomic_exchange_explicit(volatile __local atomic_long *object, long desired, memory_order order); -ulong __ovld atomic_exchange_explicit(volatile __global atomic_ulong *object, ulong desired, memory_order order); -ulong __ovld atomic_exchange_explicit(volatile __local atomic_ulong *object, ulong desired, memory_order order); +long __ovld atomic_exchange_explicit(volatile __global atomic_long *, long, memory_order); +long __ovld atomic_exchange_explicit(volatile __local atomic_long *, long, memory_order); +ulong __ovld atomic_exchange_explicit(volatile __global atomic_ulong *, ulong, memory_order); +ulong __ovld atomic_exchange_explicit(volatile __local atomic_ulong *, ulong, memory_order); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_generic_address_space) -int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope); -uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope); -float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope); +int __ovld atomic_exchange_explicit(volatile atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_exchange_explicit(volatile atomic_uint *, uint, memory_order, 
memory_scope); +float __ovld atomic_exchange_explicit(volatile atomic_float *, float, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope); +double __ovld atomic_exchange_explicit(volatile atomic_double *, double, memory_order, memory_scope); #endif //cl_khr_fp64 -long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope); -ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope); +long __ovld atomic_exchange_explicit(volatile atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *, ulong, memory_order, memory_scope); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -int __ovld atomic_exchange_explicit(volatile __global atomic_int *object, int desired, memory_order order, memory_scope scope); -int __ovld atomic_exchange_explicit(volatile __local atomic_int *object, int desired, memory_order order, memory_scope scope); -uint __ovld atomic_exchange_explicit(volatile __global atomic_uint *object, uint desired, memory_order order, memory_scope scope); -uint __ovld atomic_exchange_explicit(volatile __local atomic_uint *object, uint desired, memory_order order, memory_scope scope); -float __ovld atomic_exchange_explicit(volatile __global atomic_float *object, float desired, memory_order order, memory_scope scope); -float __ovld atomic_exchange_explicit(volatile __local atomic_float *object, float desired, memory_order order, memory_scope scope); +int __ovld atomic_exchange_explicit(volatile __global atomic_int *, int, memory_order, memory_scope); +int __ovld atomic_exchange_explicit(volatile __local 
atomic_int *, int, memory_order, memory_scope); +uint __ovld atomic_exchange_explicit(volatile __global atomic_uint *, uint, memory_order, memory_scope); +uint __ovld atomic_exchange_explicit(volatile __local atomic_uint *, uint, memory_order, memory_scope); +float __ovld atomic_exchange_explicit(volatile __global atomic_float *, float, memory_order, memory_scope); +float __ovld atomic_exchange_explicit(volatile __local atomic_float *, float, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -double __ovld atomic_exchange_explicit(volatile __global atomic_double *object, double desired, memory_order order, memory_scope scope); -double __ovld atomic_exchange_explicit(volatile __local atomic_double *object, double desired, memory_order order, memory_scope scope); +double __ovld atomic_exchange_explicit(volatile __global atomic_double *, double, memory_order, memory_scope); +double __ovld atomic_exchange_explicit(volatile __local atomic_double *, double, memory_order, memory_scope); #endif //cl_khr_fp64 -long __ovld atomic_exchange_explicit(volatile __global atomic_long *object, long desired, memory_order order, memory_scope scope); -long __ovld atomic_exchange_explicit(volatile __local atomic_long *object, long desired, memory_order order, memory_scope scope); -ulong __ovld atomic_exchange_explicit(volatile __global atomic_ulong *object, ulong desired, memory_order order, memory_scope scope); -ulong __ovld atomic_exchange_explicit(volatile __local atomic_ulong *object, ulong desired, memory_order order, memory_scope scope); +long __ovld atomic_exchange_explicit(volatile __global atomic_long *, long, memory_order, memory_scope); +long __ovld atomic_exchange_explicit(volatile __local atomic_long *, long, memory_order, memory_scope); +ulong __ovld atomic_exchange_explicit(volatile __global atomic_ulong *, ulong, memory_order, memory_scope); +ulong __ovld atomic_exchange_explicit(volatile 
__local atomic_ulong *, ulong, memory_order, memory_scope); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) // atomic_compare_exchange_strong() and atomic_compare_exchange_weak() #if defined(__opencl_c_atomic_order_seq_cst) && defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -bool __ovld atomic_compare_exchange_strong(volatile atomic_int *object, int *expected, int desired); -bool __ovld atomic_compare_exchange_strong(volatile atomic_uint *object, uint *expected, uint desired); -bool __ovld atomic_compare_exchange_weak(volatile atomic_int *object, int *expected, int desired); -bool __ovld atomic_compare_exchange_weak(volatile atomic_uint *object, uint *expected, uint desired); -bool __ovld atomic_compare_exchange_strong(volatile atomic_float *object, float *expected, float desired); -bool __ovld atomic_compare_exchange_weak(volatile atomic_float *object, float *expected, float desired); +bool __ovld atomic_compare_exchange_strong(volatile atomic_int *, int *, int); +bool __ovld atomic_compare_exchange_strong(volatile atomic_uint *, uint *, uint); +bool __ovld atomic_compare_exchange_weak(volatile atomic_int *, int *, int); +bool __ovld atomic_compare_exchange_weak(volatile atomic_uint *, uint *, uint); +bool __ovld atomic_compare_exchange_strong(volatile atomic_float *, float *, float); +bool __ovld atomic_compare_exchange_weak(volatile atomic_float *, float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, double *expected, double desired); -bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, double *expected, double desired); +bool __ovld atomic_compare_exchange_strong(volatile atomic_double *, double *, double); +bool __ovld 
atomic_compare_exchange_weak(volatile atomic_double *, double *, double); #endif //cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, long *expected, long desired); -bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, long *expected, long desired); -bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *object, ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *object, ulong *expected, ulong desired); +bool __ovld atomic_compare_exchange_strong(volatile atomic_long *, long *, long); +bool __ovld atomic_compare_exchange_weak(volatile atomic_long *, long *, long); +bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *, ulong *, ulong); +bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *, ulong *, ulong); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_int *object, __global int *expected, int desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_int *object, __local int *expected, int desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_int *object, __private int *expected, int desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_int *object, __global int *expected, int desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_int *object, __local int *expected, int desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_int *object, __private int *expected, int desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_uint *object, __global uint *expected, uint desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_uint *object, __local uint *expected, uint desired); -bool __ovld 
atomic_compare_exchange_strong(volatile __global atomic_uint *object, __private uint *expected, uint desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_uint *object, __global uint *expected, uint desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_uint *object, __local uint *expected, uint desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_uint *object, __private uint *expected, uint desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_float *object, __global float *expected, float desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_float *object, __local float *expected, float desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_float *object, __private float *expected, float desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_float *object, __global float *expected, float desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_float *object, __local float *expected, float desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_float *object, __private float *expected, float desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_int *object, __global int *expected, int desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_int *object, __local int *expected, int desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_int *object, __private int *expected, int desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_int *object, __global int *expected, int desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_int *object, __local int *expected, int desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_int *object, __private int *expected, int desired); -bool __ovld atomic_compare_exchange_weak(volatile __global 
atomic_uint *object, __global uint *expected, uint desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_uint *object, __local uint *expected, uint desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_uint *object, __private uint *expected, uint desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_uint *object, __global uint *expected, uint desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_uint *object, __local uint *expected, uint desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_uint *object, __private uint *expected, uint desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_float *object, __global float *expected, float desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_float *object, __local float *expected, float desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_float *object, __private float *expected, float desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_float *object, __global float *expected, float desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_float *object, __local float *expected, float desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_float *object, __private float *expected, float desired); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_int *, __global int *, int); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_int *, __local int *, int); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_int *, __private int *, int); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_int *, __global int *, int); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_int *, __local int *, int); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_int *, __private int *, int); +bool 
__ovld atomic_compare_exchange_strong(volatile __global atomic_uint *, __global uint *, uint); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_uint *, __local uint *, uint); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_uint *, __private uint *, uint); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_uint *, __global uint *, uint); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_uint *, __local uint *, uint); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_uint *, __private uint *, uint); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_float *, __global float *, float); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_float *, __local float *, float); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_float *, __private float *, float); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_float *, __global float *, float); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_float *, __local float *, float); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_float *, __private float *, float); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_int *, __global int *, int); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_int *, __local int *, int); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_int *, __private int *, int); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_int *, __global int *, int); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_int *, __local int *, int); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_int *, __private int *, int); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_uint *, __global uint *, uint); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_uint *, __local uint *, uint); +bool 
__ovld atomic_compare_exchange_weak(volatile __global atomic_uint *, __private uint *, uint); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_uint *, __global uint *, uint); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_uint *, __local uint *, uint); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_uint *, __private uint *, uint); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_float *, __global float *, float); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_float *, __local float *, float); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_float *, __private float *, float); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_float *, __global float *, float); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_float *, __local float *, float); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_float *, __private float *, float); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_double *object, __global double *expected, double desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_double *object, __local double *expected, double desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_double *object, __private double *expected, double desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_double *object, __global double *expected, double desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_double *object, __local double *expected, double desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_double *object, __private double *expected, double desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_double *object, __global double *expected, double desired); 
-bool __ovld atomic_compare_exchange_weak(volatile __global atomic_double *object, __local double *expected, double desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_double *object, __private double *expected, double desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_double *object, __global double *expected, double desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_double *object, __local double *expected, double desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_double *object, __private double *expected, double desired); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_double *, __global double *, double); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_double *, __local double *, double); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_double *, __private double *, double); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_double *, __global double *, double); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_double *, __local double *, double); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_double *, __private double *, double); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_double *, __global double *, double); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_double *, __local double *, double); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_double *, __private double *, double); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_double *, __global double *, double); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_double *, __local double *, double); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_double *, __private double *, double); #endif //cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_long 
*object, __global long *expected, long desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_long *object, __local long *expected, long desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_long *object, __private long *expected, long desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_long *object, __global long *expected, long desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_long *object, __local long *expected, long desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_long *object, __private long *expected, long desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_ulong *object, __global ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_ulong *object, __local ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_strong(volatile __global atomic_ulong *object, __private ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_ulong *object, __global ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_ulong *object, __local ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_strong(volatile __local atomic_ulong *object, __private ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_long *object, __global long *expected, long desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_long *object, __local long *expected, long desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_long *object, __private long *expected, long desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_long *object, __global long *expected, long desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_long *object, __local long 
*expected, long desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_long *object, __private long *expected, long desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_ulong *object, __global ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_ulong *object, __local ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_weak(volatile __global atomic_ulong *object, __private ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_ulong *object, __global ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_ulong *object, __local ulong *expected, ulong desired); -bool __ovld atomic_compare_exchange_weak(volatile __local atomic_ulong *object, __private ulong *expected, ulong desired); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_long *, __global long *, long); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_long *, __local long *, long); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_long *, __private long *, long); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_long *, __global long *, long); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_long *, __local long *, long); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_long *, __private long *, long); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_ulong *, __global ulong *, ulong); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_ulong *, __local ulong *, ulong); +bool __ovld atomic_compare_exchange_strong(volatile __global atomic_ulong *, __private ulong *, ulong); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_ulong *, __global ulong *, ulong); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_ulong *, __local ulong *, 
ulong); +bool __ovld atomic_compare_exchange_strong(volatile __local atomic_ulong *, __private ulong *, ulong); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_long *, __global long *, long); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_long *, __local long *, long); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_long *, __private long *, long); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_long *, __global long *, long); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_long *, __local long *, long); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_long *, __private long *, long); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_ulong *, __global ulong *, ulong); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_ulong *, __local ulong *, ulong); +bool __ovld atomic_compare_exchange_weak(volatile __global atomic_ulong *, __private ulong *, ulong); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_ulong *, __global ulong *, ulong); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_ulong *, __local ulong *, ulong); +bool __ovld atomic_compare_exchange_weak(volatile __local atomic_ulong *, __private ulong *, ulong); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_generic_address_space) -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected, - int desired, memory_order success, memory_order 
failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected, - float desired, memory_order success, memory_order failure); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *, int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *, uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *, int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *, uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *, float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *, float *, float, memory_order, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected, - double desired, memory_order success, memory_order failure); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *, double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *, double *, double, memory_order, memory_order); #endif //cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long 
*expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected, - ulong desired, memory_order success, memory_order failure); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *, long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *, long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *, ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *, ulong *, ulong, memory_order, memory_order); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *object, __local int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *object, __local int *expected, - int 
desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *object, __global float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *object, __global float *expected, - float desired, memory_order success, 
memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *object, __local int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *object, __local int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld 
atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *object, __global float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *object, __global float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order failure); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *, __global int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *, __local int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *, __private int *, int, memory_order, memory_order); +bool 
__ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *, __global int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *, __local int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *, __private int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *, __global uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *, __local uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *, __private uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *, __global uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *, __local uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *, __private uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *, __global float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *, __local float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *, __private float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *, __global float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *, __local float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local 
atomic_float *, __private float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *, __global int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *, __local int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *, __private int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *, __global int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *, __local int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *, __private int *, int, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *, __global uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *, __local uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *, __private uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *, __global uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *, __local uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *, __private uint *, uint, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *, __global float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *, __local float *, float, memory_order, memory_order); +bool __ovld 
atomic_compare_exchange_weak_explicit(volatile __global atomic_float *, __private float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *, __global float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *, __local float *, float, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *, __private float *, float, memory_order, memory_order); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *object, __local double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *object, __local double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global 
atomic_double *object, __local double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *object, __local double *expected, - double desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *, __global double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *, __local double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *, __private double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *, __global double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *, __local double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *, __private double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *, __global double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *, __local double *, double, memory_order, memory_order); +bool __ovld 
atomic_compare_exchange_weak_explicit(volatile __global atomic_double *, __private double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *, __global double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *, __local double *, double, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *, __private double *, double, memory_order, memory_order); #endif //cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *object, __global long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *object, __private long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *object, __global long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *object, __private long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure); 
-bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *object, __global long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *object, __private long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *object, __global long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *object, __private long *expected, - long desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld 
atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *, __global long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *, __local long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *, __private long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *, __global long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *, __local long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *, __private long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *, __global ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *, __local ulong *, ulong, memory_order, memory_order); +bool __ovld 
atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *, __private ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *, __global ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *, __local ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *, __private ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *, __global long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *, __local long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *, __private long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *, __global long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *, __local long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *, __private long *, long, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *, __global ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *, __local ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *, __private ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *, __global ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong 
*, __local ulong *, ulong, memory_order, memory_order); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *, __private ulong *, ulong, memory_order, memory_order); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #if defined(__opencl_c_generic_address_space) -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *, int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *, uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *, int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *, uint *, uint, memory_order, memory_order, 
memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *, float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *, float *, float, memory_order, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *, double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *, double *, double, memory_order, memory_order, memory_scope); #endif //cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *, long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *, long *, 
long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *, ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *, ulong *, ulong, memory_order, memory_order, memory_scope); #endif #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *object, __local int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *object, __local int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order 
failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *object, __global float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *object, __global float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order 
failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *object, __local int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *object, __global int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *object, __local int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *object, __private int *expected, - int desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *object, __global uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld 
atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *object, __local uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *object, __private uint *expected, - uint desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *object, __global float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *object, __global float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *object, __local float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *object, __private float *expected, - float desired, memory_order success, memory_order failure, memory_scope scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *, __global int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *, __local int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_int *, __private int *, int, memory_order, memory_order, 
memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *, __global int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *, __local int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_int *, __private int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *, __global uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *, __local uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_uint *, __private uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *, __global uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *, __local uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_uint *, __private uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *, __global float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *, __local float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_float *, __private float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *, __global float *, float, memory_order, memory_order, memory_scope); +bool __ovld 
atomic_compare_exchange_strong_explicit(volatile __local atomic_float *, __local float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_float *, __private float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *, __global int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *, __local int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_int *, __private int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *, __global int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *, __local int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_int *, __private int *, int, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *, __global uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *, __local uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_uint *, __private uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *, __global uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_uint *, __local uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local 
atomic_uint *, __private uint *, uint, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *, __global float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *, __local float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_float *, __private float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *, __global float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *, __local float *, float, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_float *, __private float *, float, memory_order, memory_order, memory_scope); #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #ifdef cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *object, __local double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double 
*object, __local double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *object, __local double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *object, __global double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *object, __local double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *object, __private double *expected, - double desired, memory_order success, memory_order failure, memory_scope scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *, __global double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *, __local double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_double *, __private double *, double, memory_order, memory_order, 
memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *, __global double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *, __local double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_double *, __private double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *, __global double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *, __local double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_double *, __private double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *, __global double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *, __local double *, double, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_double *, __private double *, double, memory_order, memory_order, memory_scope); #endif //cl_khr_fp64 -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *object, __global long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *object, __private long *expected, - long desired, memory_order success, 
memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *object, __global long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *object, __private long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *object, __global long *expected, - long desired, memory_order success, 
memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *object, __private long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *object, __global long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *object, __local long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *object, __private long *expected, - long desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *object, __global ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *object, __local ulong *expected, - ulong desired, memory_order success, memory_order failure, 
memory_scope scope); -bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *object, __private ulong *expected, - ulong desired, memory_order success, memory_order failure, memory_scope scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *, __global long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *, __local long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_long *, __private long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *, __global long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *, __local long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_long *, __private long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *, __global ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *, __local ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __global atomic_ulong *, __private ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *, __global ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *, __local ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_strong_explicit(volatile __local atomic_ulong *, __private 
ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *, __global long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *, __local long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_long *, __private long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *, __global long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *, __local long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_long *, __private long *, long, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *, __global ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *, __local ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __global atomic_ulong *, __private ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *, __global ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *, __local ulong *, ulong, memory_order, memory_order, memory_scope); +bool __ovld atomic_compare_exchange_weak_explicit(volatile __local atomic_ulong *, __private ulong *, ulong, memory_order, memory_order, memory_scope); #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) #endif // (__OPENCL_C_VERSION__ >= 
CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) // atomic_flag_test_and_set() and atomic_flag_clear() #if defined(__opencl_c_atomic_order_seq_cst) && defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -bool __ovld atomic_flag_test_and_set(volatile atomic_flag *object); -void __ovld atomic_flag_clear(volatile atomic_flag *object); +bool __ovld atomic_flag_test_and_set(volatile atomic_flag *); +void __ovld atomic_flag_clear(volatile atomic_flag *); #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -bool __ovld atomic_flag_test_and_set(volatile __global atomic_flag *object); -bool __ovld atomic_flag_test_and_set(volatile __local atomic_flag *object); -void __ovld atomic_flag_clear(volatile __global atomic_flag *object); -void __ovld atomic_flag_clear(volatile __local atomic_flag *object); +bool __ovld atomic_flag_test_and_set(volatile __global atomic_flag *); +bool __ovld atomic_flag_test_and_set(volatile __local atomic_flag *); +void __ovld atomic_flag_clear(volatile __global atomic_flag *); +void __ovld atomic_flag_clear(volatile __local atomic_flag *); #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_atomic_scope_device) #if defined(__opencl_c_generic_address_space) -bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order); -void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order); +bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order); +void __ovld atomic_flag_clear_explicit(volatile atomic_flag *, memory_order); #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -bool __ovld atomic_flag_test_and_set_explicit(volatile __global atomic_flag *object, memory_order order); -bool __ovld 
atomic_flag_test_and_set_explicit(volatile __local atomic_flag *object, memory_order order); -void __ovld atomic_flag_clear_explicit(volatile __global atomic_flag *object, memory_order order); -void __ovld atomic_flag_clear_explicit(volatile __local atomic_flag *object, memory_order order); +bool __ovld atomic_flag_test_and_set_explicit(volatile __global atomic_flag *, memory_order); +bool __ovld atomic_flag_test_and_set_explicit(volatile __local atomic_flag *, memory_order); +void __ovld atomic_flag_clear_explicit(volatile __global atomic_flag *, memory_order); +void __ovld atomic_flag_clear_explicit(volatile __local atomic_flag *, memory_order); #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif #if defined(__opencl_c_generic_address_space) -bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope); -void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope); +bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order, memory_scope); +void __ovld atomic_flag_clear_explicit(volatile atomic_flag *, memory_order, memory_scope); #endif //defined(__opencl_c_generic_address_space) #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) -bool __ovld atomic_flag_test_and_set_explicit(volatile __global atomic_flag *object, memory_order order, memory_scope scope); -bool __ovld atomic_flag_test_and_set_explicit(volatile __local atomic_flag *object, memory_order order, memory_scope scope); -void __ovld atomic_flag_clear_explicit(volatile __global atomic_flag *object, memory_order order, memory_scope scope); -void __ovld atomic_flag_clear_explicit(volatile __local atomic_flag *object, memory_order order, memory_scope scope); +bool __ovld atomic_flag_test_and_set_explicit(volatile __global atomic_flag *, memory_order, memory_scope); +bool __ovld 
atomic_flag_test_and_set_explicit(volatile __local atomic_flag *, memory_order, memory_scope); +void __ovld atomic_flag_clear_explicit(volatile __global atomic_flag *, memory_order, memory_scope); +void __ovld atomic_flag_clear_explicit(volatile __local atomic_flag *, memory_order, memory_scope); #endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) #endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) @@ -16507,7 +16274,7 @@ void __ovld set_user_event_status(clk_event_t e, int state); bool __ovld is_valid_event (clk_event_t event); -void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value); +void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void*); queue_t __ovld get_default_queue(void); #endif //__opencl_c_device_enqueue @@ -16528,7 +16295,7 @@ uint __ovld get_sub_group_local_id(void); void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags); #if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope); +void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope); #endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __conv sub_group_all(int predicate); From 9453cda088c65b117ed127dffcd07ba9c7c929da Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Mon, 21 Feb 2022 18:39:04 +0700 Subject: [PATCH 413/748] [Test] Move test for PR53969 to LoopDeletion folder where it truly belongs --- llvm/test/Transforms/{IndVarSimplify => LoopDeletion}/pr53969.ll | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/Transforms/{IndVarSimplify => LoopDeletion}/pr53969.ll (100%) diff --git a/llvm/test/Transforms/IndVarSimplify/pr53969.ll b/llvm/test/Transforms/LoopDeletion/pr53969.ll similarity index 100% rename from llvm/test/Transforms/IndVarSimplify/pr53969.ll rename 
to llvm/test/Transforms/LoopDeletion/pr53969.ll From 85f4023e731c0c42e45bf32bfcbf5f73c2013384 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Sat, 19 Feb 2022 20:29:08 +0200 Subject: [PATCH 414/748] [COFF] Move section name encoding into BinaryFormat Large COFF section names are moved into the string table and the section header field is the offset into the string table encoded in ASCII for offset smaller than 7 digits and in base64 for larger offsets. The operation of taking the string table offsets is done in a few places in the codebase, so it is helpful to move this operation into `BinaryFormat` so that it can be shared everywhere it's done. So this patch takes the implementation of this operation from `llvm/lib/MC/WinCOFFObjectWriter.cpp` and moves it into `BinaryFormat`. Reviewed By: jhenderson, rnk Differential Revision: https://reviews.llvm.org/D118793 --- llvm/include/llvm/BinaryFormat/COFF.h | 4 ++ llvm/lib/BinaryFormat/CMakeLists.txt | 1 + llvm/lib/BinaryFormat/COFF.cpp | 57 +++++++++++++++++++++++++++ llvm/lib/MC/WinCOFFObjectWriter.cpp | 41 +------------------ 4 files changed, 64 insertions(+), 39 deletions(-) create mode 100644 llvm/lib/BinaryFormat/COFF.cpp diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h index e7dde986784f7..016fe02894065 100644 --- a/llvm/include/llvm/BinaryFormat/COFF.h +++ b/llvm/include/llvm/BinaryFormat/COFF.h @@ -731,6 +731,10 @@ inline bool isReservedSectionNumber(int32_t SectionNumber) { return SectionNumber <= 0; } +/// Encode section name based on string table offset. +/// The size of Out must be at least COFF::NameSize. +bool encodeSectionName(char *Out, uint64_t Offset); + } // End namespace COFF. } // End namespace llvm. 
diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 37f6865a487e8..50c1713804368 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMBinaryFormat AMDGPUMetadataVerifier.cpp + COFF.cpp Dwarf.cpp ELF.cpp MachO.cpp diff --git a/llvm/lib/BinaryFormat/COFF.cpp b/llvm/lib/BinaryFormat/COFF.cpp new file mode 100644 index 0000000000000..8fbee0218b79b --- /dev/null +++ b/llvm/lib/BinaryFormat/COFF.cpp @@ -0,0 +1,57 @@ +//===- llvm/BinaryFormat/COFF.cpp - The COFF format -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" + +// Maximum offsets for different string table entry encodings. +enum : unsigned { Max7DecimalOffset = 9999999U }; +enum : uint64_t { MaxBase64Offset = 0xFFFFFFFFFULL }; // 64^6, including 0 + +// Encode a string table entry offset in base 64, padded to 6 chars, and +// prefixed with a double slash: '//AAAAAA', '//AAAAAB', ... +// Buffer must be at least 8 bytes large. No terminating null appended. 
+static void encodeBase64StringEntry(char *Buffer, uint64_t Value) { + assert(Value > Max7DecimalOffset && Value <= MaxBase64Offset && + "Illegal section name encoding for value"); + + static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + + Buffer[0] = '/'; + Buffer[1] = '/'; + + char *Ptr = Buffer + 7; + for (unsigned i = 0; i < 6; ++i) { + unsigned Rem = Value % 64; + Value /= 64; + *(Ptr--) = Alphabet[Rem]; + } +} + +bool llvm::COFF::encodeSectionName(char *Out, uint64_t Offset) { + if (Offset <= Max7DecimalOffset) { + // Offsets of 7 digits or less are encoded in ASCII. + SmallVector Buffer; + Twine('/').concat(Twine(Offset)).toVector(Buffer); + assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2); + std::memcpy(Out, Buffer.data(), Buffer.size()); + return true; + } + + if (Offset <= MaxBase64Offset) { + // Starting with 10,000,000, offsets are encoded as base64. + encodeBase64StringEntry(Out, Offset); + return true; + } + + // The offset is too large to be encoded. + return false; +} diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 0cbe51dfe3250..2af94b4f69a7b 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -451,32 +451,6 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym, Sym->MC = &MCSym; } -// Maximum offsets for different string table entry encodings. -enum : unsigned { Max7DecimalOffset = 9999999U }; -enum : uint64_t { MaxBase64Offset = 0xFFFFFFFFFULL }; // 64^6, including 0 - -// Encode a string table entry offset in base 64, padded to 6 chars, and -// prefixed with a double slash: '//AAAAAA', '//AAAAAB', ... -// Buffer must be at least 8 bytes large. No terminating null appended. 
-static void encodeBase64StringEntry(char *Buffer, uint64_t Value) { - assert(Value > Max7DecimalOffset && Value <= MaxBase64Offset && - "Illegal section name encoding for value"); - - static const char Alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - - Buffer[0] = '/'; - Buffer[1] = '/'; - - char *Ptr = Buffer + 7; - for (unsigned i = 0; i < 6; ++i) { - unsigned Rem = Value % 64; - Value /= 64; - *(Ptr--) = Alphabet[Rem]; - } -} - void WinCOFFObjectWriter::SetSectionName(COFFSection &S) { if (S.Name.size() <= COFF::NameSize) { std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size()); @@ -484,19 +458,8 @@ void WinCOFFObjectWriter::SetSectionName(COFFSection &S) { } uint64_t StringTableEntry = Strings.getOffset(S.Name); - if (StringTableEntry <= Max7DecimalOffset) { - SmallVector Buffer; - Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer); - assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2); - std::memcpy(S.Header.Name, Buffer.data(), Buffer.size()); - return; - } - if (StringTableEntry <= MaxBase64Offset) { - // Starting with 10,000,000, offsets are encoded as base64. 
- encodeBase64StringEntry(S.Header.Name, StringTableEntry); - return; - } - report_fatal_error("COFF string table is greater than 64 GB."); + if (!COFF::encodeSectionName(S.Header.Name, StringTableEntry)) + report_fatal_error("COFF string table is greater than 64 GB."); } void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) { From ddf528b7a092fd24647d7c6186ece7392c92de92 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Sat, 19 Feb 2022 20:28:38 +0200 Subject: [PATCH 415/748] [llvm-objcopy][COFF] Fix section name encoding The section name encoding for `llvm-objcopy` had two main issues, the first is that the size used for the `snprintf` in the original code is incorrect because `snprintf` adds a null byte, so this code was only able to encode offsets of 6 digits - `/`, `\0` and 6 digits of the offset - rather than the 7 digits it should support. And the second part is that it didn't support the base64 encoding for offsets larger than 7 digits. This issue specifically showed up when using the `clang-offload-bundler` with a binary containing a lot of symbols/sections, since it uses `llvm-objcopy` to add the sections containing the offload code. 
Reviewed By: jhenderson Differential Revision: https://reviews.llvm.org/D118692 --- llvm/lib/ObjCopy/COFF/Writer.cpp | 21 ++-- llvm/lib/ObjCopy/COFF/Writer.h | 2 +- .../llvm-objcopy/COFF/section-name-encoding.s | 95 +++++++++++++++++++ llvm/tools/llvm-objcopy/CMakeLists.txt | 1 + 4 files changed, 112 insertions(+), 7 deletions(-) create mode 100644 llvm/test/tools/llvm-objcopy/COFF/section-name-encoding.s diff --git a/llvm/lib/ObjCopy/COFF/Writer.cpp b/llvm/lib/ObjCopy/COFF/Writer.cpp index cbd0e42612387..fcbfef96d8609 100644 --- a/llvm/lib/ObjCopy/COFF/Writer.cpp +++ b/llvm/lib/ObjCopy/COFF/Writer.cpp @@ -116,7 +116,7 @@ void COFFWriter::layoutSections() { } } -size_t COFFWriter::finalizeStringTable() { +Expected COFFWriter::finalizeStringTable() { for (const auto &S : Obj.getSections()) if (S.Name.size() > COFF::NameSize) StrTabBuilder.add(S.Name); @@ -129,11 +129,16 @@ size_t COFFWriter::finalizeStringTable() { for (auto &S : Obj.getMutableSections()) { memset(S.Header.Name, 0, sizeof(S.Header.Name)); - if (S.Name.size() > COFF::NameSize) { - snprintf(S.Header.Name, sizeof(S.Header.Name), "/%d", - (int)StrTabBuilder.getOffset(S.Name)); - } else { + if (S.Name.size() <= COFF::NameSize) { + // Short names can go in the field directly. memcpy(S.Header.Name, S.Name.data(), S.Name.size()); + } else { + // Offset of the section name in the string table. 
+ size_t Offset = StrTabBuilder.getOffset(S.Name); + if (!COFF::encodeSectionName(S.Header.Name, Offset)) + return createStringError(object_error::invalid_section_index, + "COFF string table is greater than 64GB, " + "unable to encode section name offset"); } } for (auto &S : Obj.getMutableSymbols()) { @@ -219,7 +224,11 @@ Error COFFWriter::finalize(bool IsBigObj) { Obj.PeHeader.CheckSum = 0; } - size_t StrTabSize = finalizeStringTable(); + Expected StrTabSizeOrErr = finalizeStringTable(); + if (!StrTabSizeOrErr) + return StrTabSizeOrErr.takeError(); + + size_t StrTabSize = *StrTabSizeOrErr; size_t PointerToSymbolTable = FileSize; // StrTabSize <= 4 is the size of an empty string table, only consisting diff --git a/llvm/lib/ObjCopy/COFF/Writer.h b/llvm/lib/ObjCopy/COFF/Writer.h index 5856c0f30b9f0..95e7f5da1ad4b 100644 --- a/llvm/lib/ObjCopy/COFF/Writer.h +++ b/llvm/lib/ObjCopy/COFF/Writer.h @@ -35,7 +35,7 @@ class COFFWriter { Error finalizeRelocTargets(); Error finalizeSymbolContents(); void layoutSections(); - size_t finalizeStringTable(); + Expected finalizeStringTable(); Error finalize(bool IsBigObj); diff --git a/llvm/test/tools/llvm-objcopy/COFF/section-name-encoding.s b/llvm/test/tools/llvm-objcopy/COFF/section-name-encoding.s new file mode 100644 index 0000000000000..bd8b7c1bcf960 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/COFF/section-name-encoding.s @@ -0,0 +1,95 @@ +## Check that COFF section names of sections added by llvm-objcopy are properly +## encoded. +## +## Encodings for different name lengths and string table index: +## [0, 8]: raw name +## (8, 999999]: base 10 string table index (/9999999) +## (999999, 0xFFFFFFFF]: base 64 string table index (##AAAAAA) +## +## Note: the names in the string table will be sorted in reverse +## lexicographical order. Use a suffix letter (z, y, x, ...) to +## get the preferred ordering of names in the test. 
+## +# REQUIRES: x86-registered-target +## +# RUN: echo DEADBEEF > %t.sec +# RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s -o %t.obj +# RUN: llvm-objcopy --add-section=s1234567=%t.sec \ +# RUN: --add-section=s1234567z=%t.sec \ +# RUN: --add-section=sevendigitx=%t.sec \ +# RUN: --add-section=doubleslashv=%t.sec \ +# RUN: %t.obj %t +# RUN: llvm-readobj --sections %t | FileCheck %s + +## Raw encoding + +# CHECK: Section { +# CHECK: Number: 14 +# CHECK: Name: s1234567 (73 31 32 33 34 35 36 37) +# CHECK: } + +## Base 10 encoding with a small offset, section name at the beginning of the +## string table. + +## /4 +## +# CHECK: Section { +# CHECK: Number: 15 +# CHECK: Name: s1234567z (2F 34 00 00 00 00 00 00) +# CHECK: } + +## Base 10 encoding with a 7 digit offset, section name after the y padding in +## the string table. + +## /1000029 == 4 + 10 + (5 * (2 + (20 * 10 * 1000) + 1)) +## v | | v ~~~~~~~~~~~~~~ v +## table size v v "p0" y pad NULL separator +## "s1234567z\0" # of pad sections +## +# CHECK: Section { +# CHECK: Number: 16 +# CHECK: Name: sevendigitx (2F 31 30 30 30 30 32 39) +# CHECK: } + +## Base 64 encoding, section name after the w padding in the string table. + +## //AAmJa4 == 1000029 + 12 + (5 * (2 + (9 * 20 * 10 * 1000) + 1)) == 38*64^3 + 9*64^2 + 26*64 + 56 +## v | | v ~~~~~~~~~~~~~~~~~~ v +## sevendigitx offset v v "p0" w pad NULL separator +## "sevendigitx\0" # of pad sections +## +## "2F 2F 41 41 6D 4A 61 34" is "//AAmJa4", which decodes to "0 0 38 9 26 56". +## +# CHECK: Section { +# CHECK: Number: 17 +# CHECK: Name: doubleslashv (2F 2F 41 41 6D 4A 61 34) +# CHECK: } + +## Generate padding sections to increase the string table size to at least +## 1,000,000 bytes. 
+.macro pad_sections2 pad + ## 10x \pad + .section p0\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad; .long 1 + .section p1\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad; .long 1 + .section p2\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad; .long 1 + .section p3\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad; .long 1 + .section p4\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad; .long 1 +.endm + +.macro pad_sections pad + ## 20x \pad + pad_sections2 \pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad +.endm + +## 1000x 'y' +pad_sections yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy + +## Generate padding sections to increase the string table size to at least +## 10,000,000 bytes. 
+.macro pad_sections_ex pad + ## 9x \pad + pad_sections \pad\pad\pad\pad\pad\pad\pad\pad\pad +.endm + +## 1000x 'w' +pad_sections_ex wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww diff --git a/llvm/tools/llvm-objcopy/CMakeLists.txt b/llvm/tools/llvm-objcopy/CMakeLists.txt index 99e884a8cf0fa..493cc87b0768c 100644 --- a/llvm/tools/llvm-objcopy/CMakeLists.txt +++ b/llvm/tools/llvm-objcopy/CMakeLists.txt @@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS Option Support MC + BinaryFormat ) set(LLVM_TARGET_DEFINITIONS ObjcopyOpts.td) From c234b2793b57ecfb6723abae895fab22f0e05b44 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 21 Feb 2022 11:51:36 +0000 Subject: [PATCH 416/748] [gn build] Port 85f4023e731c --- llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn index a70ffd10cd8bc..2b07b8b76c9d8 100644 --- a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn +++ 
b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn @@ -3,6 +3,7 @@ static_library("BinaryFormat") { deps = [ "//llvm/lib/Support" ] sources = [ "AMDGPUMetadataVerifier.cpp", + "COFF.cpp", "Dwarf.cpp", "ELF.cpp", "MachO.cpp", From 8b734798a55bdd8e49d1c0f782382a3c87529352 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Wed, 19 Jan 2022 13:43:24 +0100 Subject: [PATCH 417/748] [mlir] Annotate methods on a correct class in PybindAdaptors.h The `.def` and `.def_property_readonly` functions in PybindAdaptors.h should construct the functions as method of the current class rather than as method of pybind11:none(), which is an object and not even a class. Depends On D117658 Reviewed By: gysit Differential Revision: https://reviews.llvm.org/D117659 --- mlir/include/mlir/Bindings/Python/PybindAdaptors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h index 9d5a512a4dbe6..661ed48f9c1dd 100644 --- a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h @@ -249,7 +249,7 @@ class pure_subclass { template pure_subclass &def(const char *name, Func &&f, const Extra &... extra) { py::cpp_function cf( - std::forward(f), py::name(name), py::is_method(py::none()), + std::forward(f), py::name(name), py::is_method(thisClass), py::sibling(py::getattr(thisClass, name, py::none())), extra...); thisClass.attr(cf.name()) = cf; return *this; @@ -259,7 +259,7 @@ class pure_subclass { pure_subclass &def_property_readonly(const char *name, Func &&f, const Extra &... 
extra) { py::cpp_function cf( - std::forward(f), py::name(name), py::is_method(py::none()), + std::forward(f), py::name(name), py::is_method(thisClass), py::sibling(py::getattr(thisClass, name, py::none())), extra...); auto builtinProperty = py::reinterpret_borrow((PyObject *)&PyProperty_Type); From 90faaf811f38f649a7629db8e9a1d1637e38c516 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 21 Feb 2022 04:02:55 -0800 Subject: [PATCH 418/748] issue-subscriber: Fix handling of labels with spaces Fixes #53288 Reviewed By: mehdi_amini, asl, Quuxplusone Differential Revision: https://reviews.llvm.org/D117745 --- .github/workflows/issue-subscriber.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/issue-subscriber.yml b/.github/workflows/issue-subscriber.yml index 3801eec26d545..43014e4ccc256 100644 --- a/.github/workflows/issue-subscriber.yml +++ b/.github/workflows/issue-subscriber.yml @@ -18,9 +18,12 @@ jobs: pip install -r requirements.txt - name: Update watchers + # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable + env: + LABEL_NAME: ${{ github.event.label.name }} run: | ./github-automation.py \ - --token ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} \ + --token '${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}' \ issue-subscriber \ - --issue-number ${{ github.event.issue.number }} \ - --label-name ${{ github.event.label.name }} + --issue-number '${{ github.event.issue.number }}' \ + --label-name "$LABEL_NAME" From b9b6938183e837e66ff7450fb2b8a73dce5889c0 Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Fri, 11 Feb 2022 12:09:57 +0100 Subject: [PATCH 419/748] [clangd] Treat 'auto' params as deduced if there's a single instantiation. This makes hover/go-to-definition/expand-auto etc work for auto params in many common cases. This includes when a generic lambda is passed to a function accepting std::function. 
(The tests don't use this case, it requires a lot of setup). Note that this doesn't affect the AST of the function body itself, cause its nodes not to be dependent, improve code completion etc. (These sort of improvements seem possible, in a similar "if there's a single instantiation, traverse it instead of the primary template" way). Fixes https://github.com/clangd/clangd/issues/493 Fixes https://github.com/clangd/clangd/issues/1015 Differential Revision: https://reviews.llvm.org/D119537 --- clang-tools-extra/clangd/AST.cpp | 81 +++++++++++++++++++ .../clangd/refactor/tweaks/ExpandAutoType.cpp | 28 ++++--- .../clangd/unittests/ASTTests.cpp | 64 ++++++++++++++- .../clangd/unittests/XRefsTests.cpp | 6 ++ .../unittests/tweaks/ExpandAutoTypeTests.cpp | 12 ++- 5 files changed, 175 insertions(+), 16 deletions(-) diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp index b970325098c65..fe9c4c9f0b18a 100644 --- a/clang-tools-extra/clangd/AST.cpp +++ b/clang-tools-extra/clangd/AST.cpp @@ -486,6 +486,87 @@ class DeducedTypeVisitor : public RecursiveASTVisitor { return true; } + // Handle functions/lambdas with `auto` typed parameters. + // We'll examine visible specializations and see if they yield a unique type. + bool VisitParmVarDecl(ParmVarDecl *PVD) { + if (!PVD->getType()->isDependentType()) + return true; + // 'auto' here does not name an AutoType, but an implicit template param. + TemplateTypeParmTypeLoc Auto = + findContainedAutoTTPLoc(PVD->getTypeSourceInfo()->getTypeLoc()); + if (Auto.isNull() || Auto.getNameLoc() != SearchedLocation) + return true; + // We expect the TTP to be attached to this function template. + // Find the template and the param index. 
+ auto *FD = llvm::dyn_cast(PVD->getDeclContext()); + if (!FD) + return true; + auto *FTD = FD->getDescribedFunctionTemplate(); + if (!FTD) + return true; + int ParamIndex = paramIndex(*FTD, *Auto.getDecl()); + if (ParamIndex < 0) { + assert(false && "auto TTP is not from enclosing function?"); + return true; + } + + // Now determine the unique type arg among the implicit specializations. + const ASTContext &Ctx = PVD->getASTContext(); + QualType UniqueType; + CanQualType CanUniqueType; + for (const FunctionDecl *Spec : FTD->specializations()) { + // Meaning `auto` is a bit overloaded if the function is specialized. + if (Spec->getTemplateSpecializationKind() == TSK_ExplicitSpecialization) + return true; + // Find the type for this specialization. + const auto *Args = Spec->getTemplateSpecializationArgs(); + if (Args->size() != FTD->getTemplateParameters()->size()) + continue; // no weird variadic stuff + QualType SpecType = Args->get(ParamIndex).getAsType(); + if (SpecType.isNull()) + continue; + + // Deduced types need only be *canonically* equal. + CanQualType CanSpecType = Ctx.getCanonicalType(SpecType); + if (CanUniqueType.isNull()) { + CanUniqueType = CanSpecType; + UniqueType = SpecType; + continue; + } + if (CanUniqueType != CanSpecType) + return true; // deduced type is not unique + } + DeducedType = UniqueType; + return true; + } + + // Find the abbreviated-function-template `auto` within a type. + // Similar to getContainedAutoTypeLoc, but these `auto`s are + // TemplateTypeParmTypes for implicit TTPs, instead of AutoTypes. + // Also we don't look very hard, just stripping const, references, pointers. + // FIXME: handle more types: vector? 
+ static TemplateTypeParmTypeLoc findContainedAutoTTPLoc(TypeLoc TL) { + if (auto QTL = TL.getAs()) + return findContainedAutoTTPLoc(QTL.getUnqualifiedLoc()); + if (llvm::isa(TL.getTypePtr())) + return findContainedAutoTTPLoc(TL.getNextTypeLoc()); + if (auto TTPTL = TL.getAs()) { + if (TTPTL.getTypePtr()->getDecl()->isImplicit()) + return TTPTL; + } + return {}; + } + + static int paramIndex(const TemplateDecl &TD, NamedDecl &Param) { + unsigned I = 0; + for (auto *ND : *TD.getTemplateParameters()) { + if (&Param == ND) + return I; + ++I; + } + return -1; + } + QualType DeducedType; }; } // namespace diff --git a/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp b/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp index 914564e9ae218..a717743ce3a71 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/ExpandAutoType.cpp @@ -45,8 +45,7 @@ class ExpandAutoType : public Tweak { std::string title() const override; private: - /// Cache the AutoTypeLoc, so that we do not need to search twice. - llvm::Optional CachedLocation; + SourceRange AutoRange; }; REGISTER_TWEAK(ExpandAutoType) @@ -91,27 +90,35 @@ bool isTemplateParam(const SelectionTree::Node *Node) { return false; } -bool ExpandAutoType::prepare(const Selection& Inputs) { - CachedLocation = llvm::None; +bool ExpandAutoType::prepare(const Selection &Inputs) { if (auto *Node = Inputs.ASTSelection.commonAncestor()) { if (auto *TypeNode = Node->ASTNode.get()) { if (const AutoTypeLoc Result = TypeNode->getAs()) { if (!isStructuredBindingType(Node) && !isDeducedAsLambda(Node, Result.getBeginLoc()) && !isTemplateParam(Node)) - CachedLocation = Result; + AutoRange = Result.getSourceRange(); + } + if (auto TTPAuto = TypeNode->getAs()) { + // We exclude concept constraints for now, as the SourceRange is wrong. + // void foo(C auto x) {}; + // ^^^^ + // TTPAuto->getSourceRange only covers "auto", not "C auto". 
+ if (TTPAuto.getDecl()->isImplicit() && + !TTPAuto.getDecl()->hasTypeConstraint()) + AutoRange = TTPAuto.getSourceRange(); } } } - return (bool) CachedLocation; + return AutoRange.isValid(); } Expected ExpandAutoType::apply(const Selection& Inputs) { auto &SrcMgr = Inputs.AST->getSourceManager(); - llvm::Optional DeducedType = getDeducedType( - Inputs.AST->getASTContext(), CachedLocation->getBeginLoc()); + llvm::Optional DeducedType = + getDeducedType(Inputs.AST->getASTContext(), AutoRange.getBegin()); // if we can't resolve the type, return an error message if (DeducedType == llvm::None || (*DeducedType)->isUndeducedAutoType()) @@ -133,9 +140,8 @@ Expected ExpandAutoType::apply(const Selection& Inputs) { std::string PrettyTypeName = printType(*DeducedType, Inputs.ASTSelection.commonAncestor()->getDeclContext()); - tooling::Replacement - Expansion(SrcMgr, CharSourceRange(CachedLocation->getSourceRange(), true), - PrettyTypeName); + tooling::Replacement Expansion(SrcMgr, CharSourceRange(AutoRange, true), + PrettyTypeName); return Effect::mainFileEdit(SrcMgr, tooling::Replacements(Expansion)); } diff --git a/clang-tools-extra/clangd/unittests/ASTTests.cpp b/clang-tools-extra/clangd/unittests/ASTTests.cpp index 53d399eed72a1..08935d8ed7b58 100644 --- a/clang-tools-extra/clangd/unittests/ASTTests.cpp +++ b/clang-tools-extra/clangd/unittests/ASTTests.cpp @@ -179,20 +179,76 @@ TEST(GetDeducedType, KwAutoKwDecltypeExpansion) { )cpp", "Bar", }, + { + R"cpp( + // Generic lambda param. + struct Foo{}; + auto Generic = [](^auto x) { return 0; }; + int m = Generic(Foo{}); + )cpp", + "struct Foo", + }, + { + R"cpp( + // Generic lambda instantiated twice, matching deduction. + struct Foo{}; + using Bar = Foo; + auto Generic = [](^auto x, auto y) { return 0; }; + int m = Generic(Bar{}, "one"); + int n = Generic(Foo{}, 2); + )cpp", + "struct Foo", + }, + { + R"cpp( + // Generic lambda instantiated twice, conflicting deduction. 
+ struct Foo{}; + auto Generic = [](^auto y) { return 0; }; + int m = Generic("one"); + int n = Generic(2); + )cpp", + nullptr, + }, + { + R"cpp( + // Generic function param. + struct Foo{}; + int generic(^auto x) { return 0; } + int m = generic(Foo{}); + )cpp", + "struct Foo", + }, + { + R"cpp( + // More complicated param type involving auto. + template concept C = true; + struct Foo{}; + int generic(C ^auto *x) { return 0; } + const Foo *Ptr = nullptr; + int m = generic(Ptr); + )cpp", + "const struct Foo", + }, }; for (Test T : Tests) { Annotations File(T.AnnotatedCode); - auto AST = TestTU::withCode(File.code()).build(); + auto TU = TestTU::withCode(File.code()); + TU.ExtraArgs.push_back("-std=c++20"); + auto AST = TU.build(); SourceManagerForFile SM("foo.cpp", File.code()); - SCOPED_TRACE(File.code()); + SCOPED_TRACE(T.AnnotatedCode); EXPECT_FALSE(File.points().empty()); for (Position Pos : File.points()) { auto Location = sourceLocationInMainFile(SM.get(), Pos); ASSERT_TRUE(!!Location) << llvm::toString(Location.takeError()); auto DeducedType = getDeducedType(AST.getASTContext(), *Location); - ASSERT_TRUE(DeducedType); - EXPECT_EQ(DeducedType->getAsString(), T.DeducedType); + if (T.DeducedType == nullptr) { + EXPECT_FALSE(DeducedType); + } else { + ASSERT_TRUE(DeducedType); + EXPECT_EQ(DeducedType->getAsString(), T.DeducedType); + } } } } diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 2921eaa66b579..36c5c9c045dea 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -815,6 +815,12 @@ TEST(LocateSymbol, All) { } )cpp", + R"cpp(// auto lambda param where there's a single instantiation + struct [[Bar]] {}; + auto Lambda = [](^auto){ return 0; }; + int x = Lambda(Bar{}); + )cpp", + R"cpp(// decltype(auto) in function return struct [[Bar]] {}; ^decltype(auto) test() { diff --git 
a/clang-tools-extra/clangd/unittests/tweaks/ExpandAutoTypeTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/ExpandAutoTypeTests.cpp index 6d9d4362be7af..1ab4fb3fd2eaf 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/ExpandAutoTypeTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/ExpandAutoTypeTests.cpp @@ -80,8 +80,18 @@ TEST_F(ExpandAutoTypeTest, Test) { EXPECT_THAT(apply("template void x() { ^auto y = T::z(); }"), StartsWith("fail: Could not deduce type for 'auto' type")); - ExtraArgs.push_back("-std=c++17"); + ExtraArgs.push_back("-std=c++20"); EXPECT_UNAVAILABLE("template class Y;"); + + EXPECT_THAT(apply("auto X = [](^auto){};"), + StartsWith("fail: Could not deduce")); + EXPECT_EQ(apply("auto X = [](^auto){return 0;}; int Y = X(42);"), + "auto X = [](int){return 0;}; int Y = X(42);"); + EXPECT_THAT(apply("auto X = [](^auto){return 0;}; int Y = X(42) + X('c');"), + StartsWith("fail: Could not deduce")); + // FIXME: should work on constrained auto params, once SourceRange is fixed. 
+ EXPECT_UNAVAILABLE("template concept C = true;" + "auto X = [](C ^auto *){return 0;};"); } } // namespace From 071a9b751a46205dc276069dfbc0d38582736990 Mon Sep 17 00:00:00 2001 From: Kito Cheng Date: Mon, 21 Feb 2022 20:43:40 +0800 Subject: [PATCH 420/748] [NFC][RISCV] Fix path checking issue if default sysroot is given --- clang/test/Driver/riscv32-toolchain.c | 4 ++-- clang/test/Driver/riscv64-toolchain.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/Driver/riscv32-toolchain.c b/clang/test/Driver/riscv32-toolchain.c index 50859aaccd7da..5d65a2e0acd36 100644 --- a/clang/test/Driver/riscv32-toolchain.c +++ b/clang/test/Driver/riscv32-toolchain.c @@ -198,14 +198,14 @@ // C-RV32-RTLIB-COMPILERRT-ILP32: "{{.*}}clang_rt.crtend-riscv32.o" // RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree --sysroot= \ // RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ // RUN: | FileCheck -check-prefix=RESOURCE-INC %s // RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" // RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv32_tree{{.*}}riscv32-unknown-elf{{/|\\\\}}include" // RUN: %clang %s -### -no-canonical-prefixes -target riscv32 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv32_tree --sysroot= \ // RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ // RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s // NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" diff --git a/clang/test/Driver/riscv64-toolchain.c b/clang/test/Driver/riscv64-toolchain.c index 59580370c0b34..deb8d077ea9e7 100644 --- a/clang/test/Driver/riscv64-toolchain.c +++ b/clang/test/Driver/riscv64-toolchain.c @@ -154,14 +154,14 @@ // C-RV64-RTLIB-COMPILERRT-LP64: "{{.*}}clang_rt.crtend-riscv64.o" // RUN: %clang %s -### 
-no-canonical-prefixes -target riscv64 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree --sysroot= \ // RUN: -resource-dir=%s/Inputs/resource_dir 2>&1 \ // RUN: | FileCheck -check-prefix=RESOURCE-INC %s // RESOURCE-INC: "-internal-isystem" "{{.*}}/Inputs/resource_dir{{/|\\\\}}include" // RESOURCE-INC: "-internal-isystem" "{{.*}}/basic_riscv64_tree/{{.*}}riscv64-unknown-elf{{/|\\\\}}include" // RUN: %clang %s -### -no-canonical-prefixes -target riscv64 \ -// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_riscv64_tree --sysroot= \ // RUN: -resource-dir=%s/Inputs/resource_dir -nobuiltininc 2>&1 \ // RUN: | FileCheck -check-prefix=NO-RESOURCE-INC %s // NO-RESOURCE-INC-NOT: "-internal-isystem" "{{.*}}Inputs/resource_dir{{/|\\\\}}include" From 46f1e8359eb43c5510c0515ed05bbf7c76b82f89 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 21 Feb 2022 13:07:51 +0000 Subject: [PATCH 421/748] [DAG] visitBSWAP - pull out repeated SDLoc. NFC Cleanup for D120192 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 52a0330e1473c..3196ba9f7689d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9570,20 +9570,20 @@ SDValue DAGCombiner::visitABS(SDNode *N) { SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (bswap c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) - return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0); + return DAG.getNode(ISD::BSWAP, DL, VT, N0); // fold (bswap (bswap x)) -> x if (N0.getOpcode() == ISD::BSWAP) - return N0->getOperand(0); + return N0.getOperand(0); // Canonicalize bswap(bitreverse(x)) -> bitreverse(bswap(x)). 
If bitreverse // isn't supported, it will be expanded to bswap followed by a manual reversal // of bits in each byte. By placing bswaps before bitreverse, we can remove // the two bswaps if the bitreverse gets expanded. if (N0.getOpcode() == ISD::BITREVERSE && N0.hasOneUse()) { - SDLoc DL(N); SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap); } From 40d06c4ce94d4f3ba36596cee511d436b3d22ffd Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Mon, 21 Feb 2022 20:10:42 +0700 Subject: [PATCH 422/748] [SCEV][NFC] Replace contains+insert check with insert.second --- llvm/lib/Analysis/ScalarEvolution.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 613379a54a3d4..73fa48cee7bcf 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -13527,10 +13527,8 @@ void ScalarEvolution::verify() const { SmallVector Worklist(LI.begin(), LI.end()); while (!Worklist.empty()) { Loop *L = Worklist.pop_back_val(); - if (ValidLoops.contains(L)) - continue; - ValidLoops.insert(L); - Worklist.append(L->begin(), L->end()); + if (ValidLoops.insert(L).second) + Worklist.append(L->begin(), L->end()); } for (auto &KV : ValueExprMap) { #ifndef NDEBUG From 6da60647cda2fad4d93e359ef3afaea18599b2ba Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 21 Feb 2022 08:20:26 -0500 Subject: [PATCH 423/748] [Clang][Sema] Check unexpected else statement in cond-update-stmt In 'cond-update-stmt', `else` statement is not expected. This patch adds the check in Sema. 
Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D120225 --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Sema/SemaOpenMP.cpp | 9 +++++++++ clang/test/OpenMP/atomic_messages.c | 9 +++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 1719db4871ff3..0d301e76c92d7 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10526,7 +10526,7 @@ def err_omp_atomic_compare : Error< def note_omp_atomic_compare: Note< "%select{expected compound statement|expected exactly one expression statement|expected assignment statement|expected conditional operator|expect result value to be at false expression|" "expect binary operator in conditional expression|expect '<', '>' or '==' as order operator|expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'|" - "expect lvalue for result value|expect scalar value|expect integer value}0">; + "expect lvalue for result value|expect scalar value|expect integer value|unexpected 'else' statement}0">; def err_omp_atomic_several_clauses : Error< "directive '#pragma omp atomic' cannot contain more than one 'read', 'write', 'update', 'capture', or 'compare' clause">; def err_omp_several_mem_order_clauses : Error< diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index ec0d095e89950..c32609e4e32e3 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -10974,6 +10974,8 @@ class OpenMPAtomicCompareChecker { NotScalar, /// Not an integer. NotInteger, + /// 'else' statement is not expected. + UnexpectedElse, /// No error. 
NoError, }; @@ -11111,6 +11113,13 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, return false; } + if (S->getElse()) { + ErrorInfo.Error = ErrorTy::UnexpectedElse; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getElse()->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getElse()->getSourceRange(); + return false; + } + return true; } diff --git a/clang/test/OpenMP/atomic_messages.c b/clang/test/OpenMP/atomic_messages.c index 22f7be91662b5..c66cd19b5aca3 100644 --- a/clang/test/OpenMP/atomic_messages.c +++ b/clang/test/OpenMP/atomic_messages.c @@ -473,6 +473,15 @@ void compare(void) { x = e; d = e; } +// omp51-error@+7 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+6 {{unexpected 'else' statement}} +#pragma omp atomic compare + { + if (x > e) + x = e; + else + d = e; + } float fx = 0.0f; float fd = 0.0f; float fe = 0.0f; From d41bf287815419158b963b7203c8a4cb2d9e2746 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 21 Feb 2022 13:44:36 +0000 Subject: [PATCH 424/748] [X86] use-cr-result-of-dom-icmp-st.ll - add checks without -cgp-icmp-eq2icmp-st flag --- .../X86/use-cr-result-of-dom-icmp-st.ll | 837 ++++++++++++------ 1 file changed, 570 insertions(+), 267 deletions(-) diff --git a/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll index ae25f05225759..95831d506d3c0 100644 --- a/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll +++ b/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -O3 -cgp-icmp-eq2icmp-st -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -O3 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=DEFAULT +; RUN: llc -mtriple=x86_64-unknown-unknown -O3 -cgp-icmp-eq2icmp-st -verify-machineinstrs < %s | FileCheck %s --check-prefixes=EQ2ICMP ; Test cases are generated from: ; long long NAME(PARAM a, PARAM b) { @@ -15,20 +16,36 @@ target datalayout = "e-m:e-i64:64-n32:64" define i64 @ll_a_op_b__2(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a_op_b__2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: cmpq $-2, %rdx -; CHECK-NEXT: jg .LBB0_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: .LBB0_2: # %return -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_op_b__2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movq %rsi, %rcx +; DEFAULT-NEXT: movq %rdi, %rax +; DEFAULT-NEXT: shlq %cl, %rax +; DEFAULT-NEXT: cmpq $-2, %rax +; DEFAULT-NEXT: jle .LBB0_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rcx, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB0_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rcx, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_op_b__2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: movq %rdi, %rdx +; EQ2ICMP-NEXT: movl %eax, %ecx +; EQ2ICMP-NEXT: shlq %cl, %rdx +; EQ2ICMP-NEXT: cmpq $-2, %rdx +; EQ2ICMP-NEXT: jg .LBB0_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: .LBB0_2: # %return +; EQ2ICMP-NEXT: retq entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, -2 @@ -45,22 +62,39 @@ return: ; 
preds = %entry } define i64 @ll_a_op_b__1(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a_op_b__1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: js .LBB1_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB1_1: # %if.end -; CHECK-NEXT: cmpq $-1, %rdx -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_op_b__1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movq %rsi, %rcx +; DEFAULT-NEXT: movq %rdi, %rax +; DEFAULT-NEXT: shlq %cl, %rax +; DEFAULT-NEXT: testq %rax, %rax +; DEFAULT-NEXT: js .LBB1_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rcx, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB1_1: # %if.end +; DEFAULT-NEXT: cmpq $-1, %rax +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rcx, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_op_b__1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: movq %rdi, %rdx +; EQ2ICMP-NEXT: movl %eax, %ecx +; EQ2ICMP-NEXT: shlq %cl, %rdx +; EQ2ICMP-NEXT: testq %rdx, %rdx +; EQ2ICMP-NEXT: js .LBB1_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB1_1: # %if.end +; EQ2ICMP-NEXT: cmpq $-1, %rdx +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: retq entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, -1 @@ -77,21 +111,37 @@ return: ; preds = %entry } define i64 @ll_a_op_b_0(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a_op_b_0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: jle .LBB2_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: retq -; CHECK-NEXT: 
.LBB2_1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovsq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_op_b_0: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movq %rsi, %rcx +; DEFAULT-NEXT: movq %rdi, %rax +; DEFAULT-NEXT: shlq %cl, %rax +; DEFAULT-NEXT: testq %rax, %rax +; DEFAULT-NEXT: jle .LBB2_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rcx, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB2_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rcx, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_op_b_0: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: movq %rdi, %rdx +; EQ2ICMP-NEXT: movl %eax, %ecx +; EQ2ICMP-NEXT: shlq %cl, %rdx +; EQ2ICMP-NEXT: testq %rdx, %rdx +; EQ2ICMP-NEXT: jle .LBB2_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB2_1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovsq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: retq entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, 0 @@ -108,20 +158,36 @@ return: ; preds = %entry } define i64 @ll_a_op_b_1(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a_op_b_1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: cmpq $1, %rdx -; CHECK-NEXT: jg .LBB3_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: .LBB3_2: # %return -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_op_b_1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movq %rsi, %rcx +; DEFAULT-NEXT: movq %rdi, %rax +; DEFAULT-NEXT: shlq %cl, %rax +; DEFAULT-NEXT: cmpq $1, %rax +; DEFAULT-NEXT: jle .LBB3_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rcx, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB3_1: # %if.end +; 
DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rcx, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_op_b_1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: movq %rdi, %rdx +; EQ2ICMP-NEXT: movl %eax, %ecx +; EQ2ICMP-NEXT: shlq %cl, %rdx +; EQ2ICMP-NEXT: cmpq $1, %rdx +; EQ2ICMP-NEXT: jg .LBB3_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: .LBB3_2: # %return +; EQ2ICMP-NEXT: retq entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, 1 @@ -138,20 +204,36 @@ return: ; preds = %entry } define i64 @ll_a_op_b_2(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a_op_b_2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %rdx -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shlq %cl, %rdx -; CHECK-NEXT: cmpq $2, %rdx -; CHECK-NEXT: jg .LBB4_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: .LBB4_2: # %return -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_op_b_2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movq %rsi, %rcx +; DEFAULT-NEXT: movq %rdi, %rax +; DEFAULT-NEXT: shlq %cl, %rax +; DEFAULT-NEXT: cmpq $2, %rax +; DEFAULT-NEXT: jle .LBB4_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rcx, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB4_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rcx, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_op_b_2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: movq %rdi, %rdx +; EQ2ICMP-NEXT: movl %eax, %ecx +; EQ2ICMP-NEXT: shlq %cl, %rdx +; EQ2ICMP-NEXT: cmpq $2, %rdx +; EQ2ICMP-NEXT: jg .LBB4_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; 
EQ2ICMP-NEXT: .LBB4_2: # %return +; EQ2ICMP-NEXT: retq entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, 2 @@ -168,17 +250,30 @@ return: ; preds = %entry } define i64 @ll_a__2(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a__2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: cmpq $-2, %rdi -; CHECK-NEXT: jg .LBB5_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: .LBB5_2: # %return -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a__2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: cmpq $-2, %rdi +; DEFAULT-NEXT: jle .LBB5_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rsi, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB5_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rsi, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a__2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: cmpq $-2, %rdi +; EQ2ICMP-NEXT: jg .LBB5_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: .LBB5_2: # %return +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i64 %a, -2 br i1 %cmp, label %return, label %if.end @@ -194,19 +289,33 @@ return: ; preds = %entry } define i64 @ll_a__1(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a__1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: js .LBB6_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB6_1: # %if.end -; CHECK-NEXT: cmpq $-1, %rdi -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a__1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: testq %rdi, %rdi +; DEFAULT-NEXT: js .LBB6_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rsi, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: 
.LBB6_1: # %if.end +; DEFAULT-NEXT: cmpq $-1, %rdi +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rsi, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a__1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: testq %rdi, %rdi +; EQ2ICMP-NEXT: js .LBB6_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB6_1: # %if.end +; EQ2ICMP-NEXT: cmpq $-1, %rdi +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i64 %a, -1 br i1 %cmp, label %return, label %if.end @@ -222,18 +331,31 @@ return: ; preds = %entry } define i64 @ll_a_0(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a_0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: jle .LBB7_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB7_1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovsq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_0: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: testq %rdi, %rdi +; DEFAULT-NEXT: jle .LBB7_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rsi, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB7_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rsi, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_0: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: testq %rdi, %rdi +; EQ2ICMP-NEXT: jle .LBB7_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB7_1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovsq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i64 %a, 0 br i1 %cmp, label %return, label %if.end @@ -249,17 +371,30 @@ return: ; preds = %entry } define i64 @ll_a_1(i64 %a, i64 %b) { -; CHECK-LABEL: 
ll_a_1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: cmpq $1, %rdi -; CHECK-NEXT: jg .LBB8_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: .LBB8_2: # %return -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: cmpq $1, %rdi +; DEFAULT-NEXT: jle .LBB8_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rsi, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB8_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rsi, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: cmpq $1, %rdi +; EQ2ICMP-NEXT: jg .LBB8_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: .LBB8_2: # %return +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i64 %a, 1 br i1 %cmp, label %return, label %if.end @@ -275,17 +410,30 @@ return: ; preds = %entry } define i64 @ll_a_2(i64 %a, i64 %b) { -; CHECK-LABEL: ll_a_2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: cmpq $2, %rdi -; CHECK-NEXT: jg .LBB9_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %ecx -; CHECK-NEXT: cmovlq %rcx, %rax -; CHECK-NEXT: imulq %rdi, %rax -; CHECK-NEXT: .LBB9_2: # %return -; CHECK-NEXT: retq +; DEFAULT-LABEL: ll_a_2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: cmpq $2, %rdi +; DEFAULT-NEXT: jle .LBB9_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movq %rsi, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB9_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmoveq %rsi, %rax +; DEFAULT-NEXT: imulq %rdi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: ll_a_2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movq %rsi, %rax +; EQ2ICMP-NEXT: cmpq $2, %rdi +; EQ2ICMP-NEXT: 
jg .LBB9_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %ecx +; EQ2ICMP-NEXT: cmovlq %rcx, %rax +; EQ2ICMP-NEXT: imulq %rdi, %rax +; EQ2ICMP-NEXT: .LBB9_2: # %return +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i64 %a, 2 br i1 %cmp, label %return, label %if.end @@ -301,20 +449,36 @@ return: ; preds = %entry } define i64 @i_a_op_b__2(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_op_b__2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, %ecx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll %cl, %eax -; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: jg .LBB10_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %ecx -; CHECK-NEXT: imull %edi, %ecx -; CHECK-NEXT: .LBB10_2: # %return -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_op_b__2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movl %esi, %ecx +; DEFAULT-NEXT: movl %edi, %eax +; DEFAULT-NEXT: shll %cl, %eax +; DEFAULT-NEXT: cmpl $-2, %eax +; DEFAULT-NEXT: jg .LBB10_2 +; DEFAULT-NEXT: # %bb.1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %ecx, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %ecx +; DEFAULT-NEXT: .LBB10_2: # %return +; DEFAULT-NEXT: movslq %ecx, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_op_b__2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movl %esi, %ecx +; EQ2ICMP-NEXT: movl %edi, %eax +; EQ2ICMP-NEXT: shll %cl, %eax +; EQ2ICMP-NEXT: cmpl $-2, %eax +; EQ2ICMP-NEXT: jg .LBB10_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %ecx +; EQ2ICMP-NEXT: imull %edi, %ecx +; EQ2ICMP-NEXT: .LBB10_2: # %return +; EQ2ICMP-NEXT: movslq %ecx, %rax +; EQ2ICMP-NEXT: retq entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, -2 @@ -333,23 +497,42 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b__1(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_op_b__1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl 
%esi, %ecx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll %cl, %eax -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: js .LBB11_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB11_1: # %if.end -; CHECK-NEXT: cmpl $-1, %eax -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %ecx -; CHECK-NEXT: imull %edi, %ecx -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_op_b__1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movl %esi, %ecx +; DEFAULT-NEXT: movl %edi, %eax +; DEFAULT-NEXT: shll %cl, %eax +; DEFAULT-NEXT: testl %eax, %eax +; DEFAULT-NEXT: js .LBB11_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movslq %ecx, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB11_1: # %if.end +; DEFAULT-NEXT: cmpl $-1, %eax +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %ecx, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %ecx +; DEFAULT-NEXT: movslq %ecx, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_op_b__1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movl %esi, %ecx +; EQ2ICMP-NEXT: movl %edi, %eax +; EQ2ICMP-NEXT: shll %cl, %eax +; EQ2ICMP-NEXT: testl %eax, %eax +; EQ2ICMP-NEXT: js .LBB11_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: movslq %ecx, %rax +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB11_1: # %if.end +; EQ2ICMP-NEXT: cmpl $-1, %eax +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %ecx +; EQ2ICMP-NEXT: imull %edi, %ecx +; EQ2ICMP-NEXT: movslq %ecx, %rax +; EQ2ICMP-NEXT: retq entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, -1 @@ -368,22 +551,40 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b_0(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_op_b_0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, %ecx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll %cl, %eax -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: jle .LBB12_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: movslq %ecx, %rax -; 
CHECK-NEXT: retq -; CHECK-NEXT: .LBB12_1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovsl %eax, %ecx -; CHECK-NEXT: imull %edi, %ecx -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_op_b_0: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movl %esi, %ecx +; DEFAULT-NEXT: movl %edi, %eax +; DEFAULT-NEXT: shll %cl, %eax +; DEFAULT-NEXT: testl %eax, %eax +; DEFAULT-NEXT: jle .LBB12_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movslq %ecx, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB12_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %ecx, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %ecx +; DEFAULT-NEXT: movslq %ecx, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_op_b_0: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movl %esi, %ecx +; EQ2ICMP-NEXT: movl %edi, %eax +; EQ2ICMP-NEXT: shll %cl, %eax +; EQ2ICMP-NEXT: testl %eax, %eax +; EQ2ICMP-NEXT: jle .LBB12_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: movslq %ecx, %rax +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB12_1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovsl %eax, %ecx +; EQ2ICMP-NEXT: imull %edi, %ecx +; EQ2ICMP-NEXT: movslq %ecx, %rax +; EQ2ICMP-NEXT: retq entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, 0 @@ -402,20 +603,36 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b_1(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_op_b_1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, %ecx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll %cl, %eax -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: jg .LBB13_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %ecx -; CHECK-NEXT: imull %edi, %ecx -; CHECK-NEXT: .LBB13_2: # %return -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_op_b_1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movl %esi, %ecx +; DEFAULT-NEXT: movl %edi, %eax +; DEFAULT-NEXT: shll %cl, 
%eax +; DEFAULT-NEXT: cmpl $1, %eax +; DEFAULT-NEXT: jg .LBB13_2 +; DEFAULT-NEXT: # %bb.1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %ecx, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %ecx +; DEFAULT-NEXT: .LBB13_2: # %return +; DEFAULT-NEXT: movslq %ecx, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_op_b_1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movl %esi, %ecx +; EQ2ICMP-NEXT: movl %edi, %eax +; EQ2ICMP-NEXT: shll %cl, %eax +; EQ2ICMP-NEXT: cmpl $1, %eax +; EQ2ICMP-NEXT: jg .LBB13_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %ecx +; EQ2ICMP-NEXT: imull %edi, %ecx +; EQ2ICMP-NEXT: .LBB13_2: # %return +; EQ2ICMP-NEXT: movslq %ecx, %rax +; EQ2ICMP-NEXT: retq entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, 1 @@ -434,20 +651,36 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b_2(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_op_b_2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, %ecx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll %cl, %eax -; CHECK-NEXT: cmpl $2, %eax -; CHECK-NEXT: jg .LBB14_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %ecx -; CHECK-NEXT: imull %edi, %ecx -; CHECK-NEXT: .LBB14_2: # %return -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_op_b_2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: movl %esi, %ecx +; DEFAULT-NEXT: movl %edi, %eax +; DEFAULT-NEXT: shll %cl, %eax +; DEFAULT-NEXT: cmpl $2, %eax +; DEFAULT-NEXT: jg .LBB14_2 +; DEFAULT-NEXT: # %bb.1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %ecx, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %ecx +; DEFAULT-NEXT: .LBB14_2: # %return +; DEFAULT-NEXT: movslq %ecx, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_op_b_2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: movl %esi, %ecx +; EQ2ICMP-NEXT: movl %edi, %eax +; EQ2ICMP-NEXT: 
shll %cl, %eax +; EQ2ICMP-NEXT: cmpl $2, %eax +; EQ2ICMP-NEXT: jg .LBB14_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %ecx +; EQ2ICMP-NEXT: imull %edi, %ecx +; EQ2ICMP-NEXT: .LBB14_2: # %return +; EQ2ICMP-NEXT: movslq %ecx, %rax +; EQ2ICMP-NEXT: retq entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, 2 @@ -466,17 +699,30 @@ return: ; preds = %if.end, %entry } define i64 @i_a__2(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a__2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $-2, %edi -; CHECK-NEXT: jg .LBB15_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %esi -; CHECK-NEXT: imull %edi, %esi -; CHECK-NEXT: .LBB15_2: # %return -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a__2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: cmpl $-2, %edi +; DEFAULT-NEXT: jg .LBB15_2 +; DEFAULT-NEXT: # %bb.1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %esi, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %esi +; DEFAULT-NEXT: .LBB15_2: # %return +; DEFAULT-NEXT: movslq %esi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a__2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: cmpl $-2, %edi +; EQ2ICMP-NEXT: jg .LBB15_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %esi +; EQ2ICMP-NEXT: imull %edi, %esi +; EQ2ICMP-NEXT: .LBB15_2: # %return +; EQ2ICMP-NEXT: movslq %esi, %rax +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i32 %a, -2 br i1 %cmp, label %return, label %if.end @@ -494,20 +740,36 @@ return: ; preds = %if.end, %entry } define i64 @i_a__1(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a__1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: js .LBB16_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB16_1: # %if.end -; CHECK-NEXT: cmpl $-1, %edi -; CHECK-NEXT: movl 
$1, %eax -; CHECK-NEXT: cmovll %eax, %esi -; CHECK-NEXT: imull %edi, %esi -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a__1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: testl %edi, %edi +; DEFAULT-NEXT: js .LBB16_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movslq %esi, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB16_1: # %if.end +; DEFAULT-NEXT: cmpl $-1, %edi +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %esi, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %esi +; DEFAULT-NEXT: movslq %esi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a__1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: testl %edi, %edi +; EQ2ICMP-NEXT: js .LBB16_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: movslq %esi, %rax +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB16_1: # %if.end +; EQ2ICMP-NEXT: cmpl $-1, %edi +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %esi +; EQ2ICMP-NEXT: imull %edi, %esi +; EQ2ICMP-NEXT: movslq %esi, %rax +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i32 %a, -1 br i1 %cmp, label %return, label %if.end @@ -525,19 +787,34 @@ return: ; preds = %if.end, %entry } define i64 @i_a_0(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: jle .LBB17_1 -; CHECK-NEXT: # %bb.2: # %return -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: retq -; CHECK-NEXT: .LBB17_1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovsl %eax, %esi -; CHECK-NEXT: imull %edi, %esi -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_0: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: testl %edi, %edi +; DEFAULT-NEXT: jle .LBB17_1 +; DEFAULT-NEXT: # %bb.2: # %return +; DEFAULT-NEXT: movslq %esi, %rax +; DEFAULT-NEXT: retq +; DEFAULT-NEXT: .LBB17_1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %esi, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %esi +; 
DEFAULT-NEXT: movslq %esi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_0: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: testl %edi, %edi +; EQ2ICMP-NEXT: jle .LBB17_1 +; EQ2ICMP-NEXT: # %bb.2: # %return +; EQ2ICMP-NEXT: movslq %esi, %rax +; EQ2ICMP-NEXT: retq +; EQ2ICMP-NEXT: .LBB17_1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovsl %eax, %esi +; EQ2ICMP-NEXT: imull %edi, %esi +; EQ2ICMP-NEXT: movslq %esi, %rax +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i32 %a, 0 br i1 %cmp, label %return, label %if.end @@ -555,17 +832,30 @@ return: ; preds = %if.end, %entry } define i64 @i_a_1(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $1, %edi -; CHECK-NEXT: jg .LBB18_2 -; CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %esi -; CHECK-NEXT: imull %edi, %esi -; CHECK-NEXT: .LBB18_2: # %return -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_1: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: cmpl $1, %edi +; DEFAULT-NEXT: jg .LBB18_2 +; DEFAULT-NEXT: # %bb.1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %esi, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %esi +; DEFAULT-NEXT: .LBB18_2: # %return +; DEFAULT-NEXT: movslq %esi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_1: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: cmpl $1, %edi +; EQ2ICMP-NEXT: jg .LBB18_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %esi +; EQ2ICMP-NEXT: imull %edi, %esi +; EQ2ICMP-NEXT: .LBB18_2: # %return +; EQ2ICMP-NEXT: movslq %esi, %rax +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i32 %a, 1 br i1 %cmp, label %return, label %if.end @@ -583,17 +873,30 @@ return: ; preds = %if.end, %entry } define i64 @i_a_2(i32 signext %a, i32 signext %b) { -; CHECK-LABEL: i_a_2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $2, %edi -; CHECK-NEXT: jg .LBB19_2 -; 
CHECK-NEXT: # %bb.1: # %if.end -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: cmovll %eax, %esi -; CHECK-NEXT: imull %edi, %esi -; CHECK-NEXT: .LBB19_2: # %return -; CHECK-NEXT: movslq %esi, %rax -; CHECK-NEXT: retq +; DEFAULT-LABEL: i_a_2: +; DEFAULT: # %bb.0: # %entry +; DEFAULT-NEXT: cmpl $2, %edi +; DEFAULT-NEXT: jg .LBB19_2 +; DEFAULT-NEXT: # %bb.1: # %if.end +; DEFAULT-NEXT: movl $1, %eax +; DEFAULT-NEXT: cmovel %esi, %eax +; DEFAULT-NEXT: imull %edi, %eax +; DEFAULT-NEXT: movl %eax, %esi +; DEFAULT-NEXT: .LBB19_2: # %return +; DEFAULT-NEXT: movslq %esi, %rax +; DEFAULT-NEXT: retq +; +; EQ2ICMP-LABEL: i_a_2: +; EQ2ICMP: # %bb.0: # %entry +; EQ2ICMP-NEXT: cmpl $2, %edi +; EQ2ICMP-NEXT: jg .LBB19_2 +; EQ2ICMP-NEXT: # %bb.1: # %if.end +; EQ2ICMP-NEXT: movl $1, %eax +; EQ2ICMP-NEXT: cmovll %eax, %esi +; EQ2ICMP-NEXT: imull %edi, %esi +; EQ2ICMP-NEXT: .LBB19_2: # %return +; EQ2ICMP-NEXT: movslq %esi, %rax +; EQ2ICMP-NEXT: retq entry: %cmp = icmp sgt i32 %a, 2 br i1 %cmp, label %return, label %if.end From 14f143c9084fc49b45f30a199dc8a16b7506f959 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Mon, 21 Feb 2022 13:55:29 +0000 Subject: [PATCH 425/748] Fix llvm-objcopy shared lib build Fix after ddf528b7a092 ("[llvm-objcopy][COFF] Fix section name encoding", 2022-02-21) caused "undefined reference to `llvm::COFF::encodeSectionName" failures. 
--- llvm/lib/ObjCopy/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/ObjCopy/CMakeLists.txt b/llvm/lib/ObjCopy/CMakeLists.txt index 1e516394c74ae..ec1160e331c9a 100644 --- a/llvm/lib/ObjCopy/CMakeLists.txt +++ b/llvm/lib/ObjCopy/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_component_library(LLVMObjCopy intrinsics_gen LINK_COMPONENTS + BinaryFormat Object Support MC From bb850d422b6449d00c999ba4a1f2d1d68a9a2823 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 21 Feb 2022 09:07:59 -0500 Subject: [PATCH 426/748] [AArch64][RISCV][x86] add tests for funnel shift with shift logic; NFC --- llvm/test/CodeGen/AArch64/funnel-shift.ll | 161 +++++++++++++ llvm/test/CodeGen/RISCV/rv32zbp.ll | 256 ++++++++++++++++++++ llvm/test/CodeGen/X86/funnel-shift.ll | 278 ++++++++++++++++++++++ 3 files changed, 695 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll index 51dc7ce2d061d..b4b4e37b4cba5 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -343,3 +343,164 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %f } +define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_shl_fshl: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w1, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w0, w8 +; CHECK-NEXT: lsl w10, w1, w2 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: ret + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_shl_rotl: +; CHECK: // %bb.0: +; CHECK-NEXT: neg w8, w2 +; CHECK-NEXT: lsl w9, w0, w2 +; CHECK-NEXT: ror w8, w1, w8 +; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: ret + %shx = shl i32 %x, %s + %rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s) + 
%or = or i32 %rot, %shx + ret i32 %or +} + +define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_shl_fshl_commute: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w1, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w0, w8 +; CHECK-NEXT: lsl w10, w1, w2 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: ret + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} + +define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_shl_rotl_commute: +; CHECK: // %bb.0: +; CHECK-NEXT: neg w8, w2 +; CHECK-NEXT: lsl w9, w0, w2 +; CHECK-NEXT: ror w8, w1, w8 +; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: ret + %shx = shl i32 %x, %s + %rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %shx, %rot + ret i32 %or +} + +define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_lshr_fshr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w1, #1 +; CHECK-NEXT: lsr w8, w0, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: lsr w10, w1, w2 +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: ret + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_lshr_rotr: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w8, w0, w2 +; CHECK-NEXT: ror w9, w1, w2 +; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: ret + %shx = lshr i32 %x, %s + %rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %rot, %shx + ret i32 %or +} + +define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_lshr_fshr_commute: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w1, #1 +; CHECK-NEXT: lsr w8, w0, w8 +; CHECK-NEXT: lsl w9, w10, 
w9 +; CHECK-NEXT: lsr w10, w1, w2 +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: ret + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} + +define i32 @or_lshr_rotr_commute(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_lshr_rotr_commute: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w8, w0, w2 +; CHECK-NEXT: ror w9, w1, w2 +; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: ret + %shx = lshr i32 %x, %s + %rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %shx, %rot + ret i32 %or +} + +define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_shl_fshl_simplify: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w0, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w1, w8 +; CHECK-NEXT: lsl w10, w1, w2 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: ret + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: or_lshr_fshr_simplify: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w0, #1 +; CHECK-NEXT: lsr w8, w1, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: lsr w10, w1, w2 +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: ret + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll index d021b26f45612..7e113d6be7d0a 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll @@ -2823,3 +2823,259 @@ define i64 @zexth_i64(i64 %a) nounwind { %and = and i64 %a, 65535 ret i64 %and } + +define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_shl_fshl: +; RV32I: 
# %bb.0: +; RV32I-NEXT: sll a3, a1, a2 +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_shl_fshl: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: sll a3, a1, a2 +; RV32ZBP-NEXT: sll a0, a0, a2 +; RV32ZBP-NEXT: not a2, a2 +; RV32ZBP-NEXT: srli a1, a1, 1 +; RV32ZBP-NEXT: srl a1, a1, a2 +; RV32ZBP-NEXT: or a0, a0, a1 +; RV32ZBP-NEXT: or a0, a0, a3 +; RV32ZBP-NEXT: ret + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_shl_rot(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_shl_rot: +; RV32I: # %bb.0: +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: sll a3, a1, a2 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_shl_rot: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: sll a0, a0, a2 +; RV32ZBP-NEXT: rol a1, a1, a2 +; RV32ZBP-NEXT: or a0, a1, a0 +; RV32ZBP-NEXT: ret + %shx = shl i32 %x, %s + %rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %rot, %shx + ret i32 %or +} + +define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_shl_fshl_commute: +; RV32I: # %bb.0: +; RV32I-NEXT: sll a3, a1, a2 +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_shl_fshl_commute: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: sll a3, a1, a2 +; RV32ZBP-NEXT: sll a0, a0, a2 +; RV32ZBP-NEXT: not a2, a2 +; RV32ZBP-NEXT: srli a1, a1, 1 +; RV32ZBP-NEXT: srl a1, a1, a2 +; RV32ZBP-NEXT: or a0, a0, a1 +; RV32ZBP-NEXT: or a0, a3, a0 +; RV32ZBP-NEXT: ret + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or 
+} + +define i32 @or_shl_rot_commute(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_shl_rot_commute: +; RV32I: # %bb.0: +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: sll a3, a1, a2 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_shl_rot_commute: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: sll a0, a0, a2 +; RV32ZBP-NEXT: rol a1, a1, a2 +; RV32ZBP-NEXT: or a0, a0, a1 +; RV32ZBP-NEXT: ret + %shx = shl i32 %x, %s + %rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %shx, %rot + ret i32 %or +} + +define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_lshr_fshr: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a3, a1, a2 +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_lshr_fshr: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: srl a3, a1, a2 +; RV32ZBP-NEXT: srl a0, a0, a2 +; RV32ZBP-NEXT: not a2, a2 +; RV32ZBP-NEXT: slli a1, a1, 1 +; RV32ZBP-NEXT: sll a1, a1, a2 +; RV32ZBP-NEXT: or a0, a1, a0 +; RV32ZBP-NEXT: or a0, a0, a3 +; RV32ZBP-NEXT: ret + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_lshr_rotr: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: srl a3, a1, a2 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_lshr_rotr: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: srl a0, a0, a2 +; RV32ZBP-NEXT: ror a1, a1, a2 +; RV32ZBP-NEXT: or a0, a1, a0 +; RV32ZBP-NEXT: ret + %shx = lshr i32 %x, %s + %rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %rot, %shx + ret i32 %or +} + +define i32 @or_lshr_fshr_commute(i32 %x, i32 
%y, i32 %s) { +; RV32I-LABEL: or_lshr_fshr_commute: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a3, a1, a2 +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_lshr_fshr_commute: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: srl a3, a1, a2 +; RV32ZBP-NEXT: srl a0, a0, a2 +; RV32ZBP-NEXT: not a2, a2 +; RV32ZBP-NEXT: slli a1, a1, 1 +; RV32ZBP-NEXT: sll a1, a1, a2 +; RV32ZBP-NEXT: or a0, a1, a0 +; RV32ZBP-NEXT: or a0, a3, a0 +; RV32ZBP-NEXT: ret + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} + +define i32 @or_lshr_rotr_commute(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_lshr_rotr_commute: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: srl a3, a1, a2 +; RV32I-NEXT: neg a2, a2 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_lshr_rotr_commute: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: srl a0, a0, a2 +; RV32ZBP-NEXT: ror a1, a1, a2 +; RV32ZBP-NEXT: or a0, a0, a1 +; RV32ZBP-NEXT: ret + %shx = lshr i32 %x, %s + %rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %shx, %rot + ret i32 %or +} + +define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_shl_fshl_simplify: +; RV32I: # %bb.0: +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_shl_fshl_simplify: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: sll a1, a1, a2 +; RV32ZBP-NEXT: not a2, a2 +; RV32ZBP-NEXT: srli a0, a0, 1 +; RV32ZBP-NEXT: srl a0, a0, a2 +; RV32ZBP-NEXT: or a0, a1, a0 +; RV32ZBP-NEXT: or a0, a0, a1 +; RV32ZBP-NEXT: ret + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 
%s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) { +; RV32I-LABEL: or_lshr_fshr_simplify: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: or_lshr_fshr_simplify: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: srl a1, a1, a2 +; RV32ZBP-NEXT: not a2, a2 +; RV32ZBP-NEXT: slli a0, a0, 1 +; RV32ZBP-NEXT: sll a0, a0, a2 +; RV32ZBP-NEXT: or a0, a0, a1 +; RV32ZBP-NEXT: or a0, a1, a0 +; RV32ZBP-NEXT: ret + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index 2577c333c9287..49cf2684c7a82 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -1036,3 +1036,281 @@ define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind { ret void } declare dso_local void @_Z3foov() + +define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_shl_fshl: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movl %edx, %esi +; X86-SSE2-NEXT: shll %cl, %esi +; X86-SSE2-NEXT: shldl %cl, %edx, %eax +; X86-SSE2-NEXT: orl %esi, %eax +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_shl_fshl: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shll %cl, %eax +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: shldl %cl, %esi, %edi +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s) + %or = or 
i32 %fun, %shy + ret i32 %or +} + +define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_shl_rotl: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: shll %cl, %edx +; X86-SSE2-NEXT: roll %cl, %eax +; X86-SSE2-NEXT: orl %edx, %eax +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_shl_rotl: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shll %cl, %edi +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: roll %cl, %eax +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shx = shl i32 %x, %s + %rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %rot, %shx + ret i32 %or +} + +define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_shl_fshl_commute: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movl %edx, %esi +; X86-SSE2-NEXT: shll %cl, %esi +; X86-SSE2-NEXT: shldl %cl, %edx, %eax +; X86-SSE2-NEXT: orl %esi, %eax +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_shl_fshl_commute: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shll %cl, %eax +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: shldl %cl, %esi, %edi +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} + +define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_shl_rotl_commute: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; 
X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: shll %cl, %edx +; X86-SSE2-NEXT: roll %cl, %eax +; X86-SSE2-NEXT: orl %edx, %eax +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_shl_rotl_commute: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shll %cl, %edi +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: roll %cl, %eax +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shx = shl i32 %x, %s + %rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %shx, %rot + ret i32 %or +} + +define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_lshr_fshr: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movl %edx, %esi +; X86-SSE2-NEXT: shrl %cl, %esi +; X86-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-SSE2-NEXT: orl %esi, %eax +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_lshr_fshr: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shrl %cl, %eax +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: shrdl %cl, %esi, %edi +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_lshr_rotr: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: shrl %cl, %edx +; X86-SSE2-NEXT: rorl %cl, %eax +; X86-SSE2-NEXT: orl %edx, %eax +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_lshr_rotr: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: 
movl %esi, %eax +; X64-AVX2-NEXT: shrl %cl, %edi +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: rorl %cl, %eax +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shx = lshr i32 %x, %s + %rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %rot, %shx + ret i32 %or +} + +define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_lshr_fshr_commute: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movl %edx, %esi +; X86-SSE2-NEXT: shrl %cl, %esi +; X86-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-SSE2-NEXT: orl %esi, %eax +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_lshr_fshr_commute: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shrl %cl, %eax +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: shrdl %cl, %esi, %edi +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} + +define i32 @or_lshr_rotr_commute(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_lshr_rotr_commute: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: shrl %cl, %edx +; X86-SSE2-NEXT: rorl %cl, %eax +; X86-SSE2-NEXT: orl %edx, %eax +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_lshr_rotr_commute: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shrl %cl, %edi +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: rorl %cl, %eax +; X64-AVX2-NEXT: orl %edi, %eax +; X64-AVX2-NEXT: retq + %shx = lshr i32 %x, %s + %rot = 
call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s) + %or = or i32 %shx, %rot + ret i32 %or +} + +define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_shl_fshl_simplify: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl %eax, %esi +; X86-SSE2-NEXT: shll %cl, %esi +; X86-SSE2-NEXT: shldl %cl, %edx, %eax +; X86-SSE2-NEXT: orl %esi, %eax +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_shl_fshl_simplify: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shll %cl, %eax +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: shldl %cl, %edi, %esi +; X64-AVX2-NEXT: orl %esi, %eax +; X64-AVX2-NEXT: retq + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) nounwind { +; X86-SSE2-LABEL: or_lshr_fshr_simplify: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl %eax, %esi +; X86-SSE2-NEXT: shrl %cl, %esi +; X86-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-SSE2-NEXT: orl %esi, %eax +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: or_lshr_fshr_simplify: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: movl %esi, %eax +; X64-AVX2-NEXT: shrl %cl, %eax +; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-AVX2-NEXT: shrdl %cl, %edi, %esi +; X64-AVX2-NEXT: orl %esi, %eax +; X64-AVX2-NEXT: retq + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} From 
ee5580a8ebf264cdff0a9e149c21991f5e87431d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 21 Feb 2022 09:26:07 -0500 Subject: [PATCH 427/748] [InstSimplify] add tests for funnel shift with redundant shift; NFC --- llvm/test/Transforms/InstSimplify/or.ll | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/llvm/test/Transforms/InstSimplify/or.ll b/llvm/test/Transforms/InstSimplify/or.ll index 627d08e439148..07910443ad759 100644 --- a/llvm/test/Transforms/InstSimplify/or.ll +++ b/llvm/test/Transforms/InstSimplify/or.ll @@ -1039,3 +1039,84 @@ define <2 x i4> @or_nand_xor_undef_elt(<2 x i4> %x, <2 x i4> %y) { %or = or <2 x i4> %xor, %nand ret <2 x i4> %or } + +declare i32 @llvm.fshl.i32 (i32, i32, i32) +declare i32 @llvm.fshr.i32 (i32, i32, i32) + +define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: @or_shl_fshl( +; CHECK-NEXT: [[SHY:%.*]] = shl i32 [[Y:%.*]], [[S:%.*]] +; CHECK-NEXT: [[FUN:%.*]] = call i32 @llvm.fshl.i32(i32 [[Y]], i32 [[X:%.*]], i32 [[S]]) +; CHECK-NEXT: [[OR:%.*]] = or i32 [[FUN]], [[SHY]] +; CHECK-NEXT: ret i32 [[OR]] +; + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: @or_shl_fshl_commute( +; CHECK-NEXT: [[SHY:%.*]] = shl i32 [[Y:%.*]], [[S:%.*]] +; CHECK-NEXT: [[FUN:%.*]] = call i32 @llvm.fshl.i32(i32 [[Y]], i32 [[X:%.*]], i32 [[S]]) +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHY]], [[FUN]] +; CHECK-NEXT: ret i32 [[OR]] +; + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} + +define i32 @or_shl_fshl_wrong_order(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: @or_shl_fshl_wrong_order( +; CHECK-NEXT: [[SHY:%.*]] = shl i32 [[Y:%.*]], [[S:%.*]] +; CHECK-NEXT: [[FUN:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y]], i32 [[S]]) +; CHECK-NEXT: [[OR:%.*]] = or i32 [[FUN]], [[SHY]] +; CHECK-NEXT: 
ret i32 [[OR]] +; + %shy = shl i32 %y, %s + %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: @or_lshr_fshr( +; CHECK-NEXT: [[SHY:%.*]] = lshr i32 [[Y:%.*]], [[S:%.*]] +; CHECK-NEXT: [[FUN:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[Y]], i32 [[S]]) +; CHECK-NEXT: [[OR:%.*]] = or i32 [[FUN]], [[SHY]] +; CHECK-NEXT: ret i32 [[OR]] +; + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} + +define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: @or_lshr_fshr_commute( +; CHECK-NEXT: [[SHY:%.*]] = lshr i32 [[Y:%.*]], [[S:%.*]] +; CHECK-NEXT: [[FUN:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[Y]], i32 [[S]]) +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHY]], [[FUN]] +; CHECK-NEXT: ret i32 [[OR]] +; + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) + %or = or i32 %shy, %fun + ret i32 %or +} + +define i32 @or_lshr_fshr_wrong_order(i32 %x, i32 %y, i32 %s) { +; CHECK-LABEL: @or_lshr_fshr_wrong_order( +; CHECK-NEXT: [[SHY:%.*]] = lshr i32 [[Y:%.*]], [[S:%.*]] +; CHECK-NEXT: [[FUN:%.*]] = call i32 @llvm.fshr.i32(i32 [[Y]], i32 [[X:%.*]], i32 [[S]]) +; CHECK-NEXT: [[OR:%.*]] = or i32 [[FUN]], [[SHY]] +; CHECK-NEXT: ret i32 [[OR]] +; + %shy = lshr i32 %y, %s + %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s) + %or = or i32 %fun, %shy + ret i32 %or +} From 9c7ca51b2c9ec648dc69f4891000f2a11ca7698e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Feb 2022 16:22:20 -0500 Subject: [PATCH 428/748] MIR: Start diagnosing too many operands on an instruction Previously this would just assert which was annoying and didn't point to the specific instruction/operand. 
--- llvm/include/llvm/CodeGen/MachineOperand.h | 10 ++++++++++ llvm/lib/CodeGen/MIRParser/MIParser.cpp | 16 ++++++++++++++-- llvm/lib/CodeGen/MachineInstr.cpp | 8 ++------ .../CodeGen/MIR/AMDGPU/extra-imm-operand.mir | 12 ++++++++++++ .../CodeGen/MIR/AMDGPU/extra-reg-operand.mir | 12 ++++++++++++ 5 files changed, 50 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/extra-imm-operand.mir create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/extra-reg-operand.mir diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index eded28183ea25..d9e610a728995 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -460,6 +460,16 @@ class MachineOperand { return !isUndef() && !isInternalRead() && (isUse() || getSubReg()); } + /// Return true if this operand can validly be appended to an arbitrary + /// operand list. i.e. this behaves like an implicit operand. + bool isValidExcessOperand() const { + if ((isReg() && isImplicit()) || isRegMask()) + return true; + + // Debug operands + return isMetadata() || isMCSymbol(); + } + //===--------------------------------------------------------------------===// // Mutators for Register Operands //===--------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 6477965bdc210..26ae21a9b752e 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1094,11 +1094,23 @@ bool MIParser::parse(MachineInstr *&MI) { return true; } - // TODO: Check for extraneous machine operands. 
MI = MF.CreateMachineInstr(MCID, DebugLocation, /*NoImplicit=*/true); MI->setFlags(Flags); - for (const auto &Operand : Operands) + + unsigned NumExplicitOps = 0; + for (const auto &Operand : Operands) { + bool IsImplicitOp = Operand.Operand.isReg() && Operand.Operand.isImplicit(); + if (!IsImplicitOp) { + if (!MCID.isVariadic() && NumExplicitOps >= MCID.getNumOperands() && + !Operand.Operand.isValidExcessOperand()) + return error("too many operands for instruction"); + + ++NumExplicitOps; + } + MI->addOperand(MF, Operand.Operand); + } + if (assignRegisterTies(*MI, Operands)) return true; if (PreInstrSymbol) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 85b266afceefe..5e63fecd1bf6a 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -232,16 +232,12 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) { } } -#ifndef NDEBUG - bool isDebugOp = Op.getType() == MachineOperand::MO_Metadata || - Op.getType() == MachineOperand::MO_MCSymbol; // OpNo now points as the desired insertion point. Unless this is a variadic // instruction, only implicit regs are allowed beyond MCID->getNumOperands(). // RegMask operands go between the explicit and implicit operands. 
- assert((isImpReg || Op.isRegMask() || MCID->isVariadic() || - OpNo < MCID->getNumOperands() || isDebugOp) && + assert((MCID->isVariadic() || OpNo < MCID->getNumOperands() || + Op.isValidExcessOperand()) && "Trying to add an operand to a machine instr that is already done!"); -#endif MachineRegisterInfo *MRI = getRegInfo(); diff --git a/llvm/test/CodeGen/MIR/AMDGPU/extra-imm-operand.mir b/llvm/test/CodeGen/MIR/AMDGPU/extra-imm-operand.mir new file mode 100644 index 0000000000000..db484f0798fcf --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/extra-imm-operand.mir @@ -0,0 +1,12 @@ +# RUN: not llc -march=amdgcn -run-pass=none -o /dev/null %s 2>&1 | FileCheck %s + +--- +name: extra_imm_operand +body: | + bb.0: + ; CHECK: [[@LINE+3]]:18: too many operands for instruction + ; CHECK-NEXT: S_ENDPGM 0, 0 + ; CHECK_NEXT: ^ + S_ENDPGM 0, 0 + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/extra-reg-operand.mir b/llvm/test/CodeGen/MIR/AMDGPU/extra-reg-operand.mir new file mode 100644 index 0000000000000..03a6777167fa6 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/extra-reg-operand.mir @@ -0,0 +1,12 @@ +# RUN: not llc -march=amdgcn -run-pass=none -o /dev/null %s 2>&1 | FileCheck %s + +--- +name: extra_reg_operand +body: | + bb.0: + ; CHECK: [[@LINE+3]]:29: too many operands for instruction + ; S_ENDPGM 0, undef $vgpr0 + ; CHECK_NEXT: ^ + S_ENDPGM 0, undef $vgpr0 + +... From 9fc1a0dcb79afb31470751651c30e843c12e9ca5 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 21 Feb 2022 15:44:30 +0000 Subject: [PATCH 429/748] [AArch64] Alter mull shuffle(ext(..)) combine to work on buildvectors We have a combine for converting mul(dup(ext(..)), ...) into mul(ext(dup(..)), ..), for allowing more uses of smull and umull instructions. Currently it looks for vector insert and shuffle vectors to detect the element that we can convert to a vector extend. Not all cases will have a shufflevector/insert element though. 
This started by extending the recognition to buildvectors (with elements that may be individually extended). The new method seems to cover all the cases that the old method captured though, as the shuffle will eventually be lowered to buildvectors, so the old method has been removed to keep the code a little simpler. The new code detects legal build_vector(ext(a), ext(b), ..), converting them to ext(build_vector(a, b, ..)) providing all the extends/types match up. Differential Revision: https://reviews.llvm.org/D120018 --- .../Target/AArch64/AArch64ISelLowering.cpp | 74 +++++++------------ llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 6 +- .../AArch64/aarch64-matrix-umull-smull.ll | 49 ++++++------ 3 files changed, 52 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 473984c658d39..30d30e88f2740 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13448,33 +13448,17 @@ static EVT calculatePreExtendType(SDValue Extend) { } } -/// Combines a dup(sext/zext) node pattern into sext/zext(dup) +/// Combines a buildvector(sext/zext) node pattern into sext/zext(buildvector) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt -static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, - SelectionDAG &DAG) { - ShuffleVectorSDNode *ShuffleNode = - dyn_cast(VectorShuffle.getNode()); - if (!ShuffleNode) - return SDValue(); - - // Ensuring the mask is zero before continuing - if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) - return SDValue(); - - SDValue InsertVectorElt = VectorShuffle.getOperand(0); - - if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - SDValue InsertLane = InsertVectorElt.getOperand(2); - ConstantSDNode *Constant = dyn_cast(InsertLane.getNode()); - // Ensures the insert is inserting into lane 0 - if (!Constant || 
Constant->getZExtValue() != 0) +static SDValue performBuildVectorExtendCombine(SDValue BV, SelectionDAG &DAG) { + EVT VT = BV.getValueType(); + if (BV.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); - SDValue Extend = InsertVectorElt.getOperand(1); + // Use the first item in the buildvector to get the size of the extend, and + // make sure it looks valid. + SDValue Extend = BV->getOperand(0); unsigned ExtendOpcode = Extend.getOpcode(); - bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || ExtendOpcode == ISD::SIGN_EXTEND_INREG || ExtendOpcode == ISD::AssertSext; @@ -13484,30 +13468,28 @@ static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, // Restrict valid pre-extend data type EVT PreExtendType = calculatePreExtendType(Extend); - if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && - PreExtendType != MVT::i32) - return SDValue(); - - EVT TargetType = VectorShuffle.getValueType(); - EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); - if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) + if (PreExtendType.getSizeInBits() != VT.getScalarSizeInBits() / 2) return SDValue(); - SDLoc DL(VectorShuffle); - - SDValue InsertVectorNode = DAG.getNode( - InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), - DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), - DAG.getConstant(0, DL, MVT::i64)); - - std::vector ShuffleMask(TargetType.getVectorNumElements()); - - SDValue VectorShuffleNode = - DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, - DAG.getUNDEF(PreExtendVT), ShuffleMask); + // Make sure all other operands are equally extended + for (SDValue Op : drop_begin(BV->ops())) { + unsigned Opc = Op.getOpcode(); + bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || + Opc == ISD::AssertSext; + if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType) + return SDValue(); + } - return DAG.getNode(IsSExt ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, - TargetType, VectorShuffleNode); + EVT PreExtendVT = VT.changeVectorElementType(PreExtendType); + EVT PreExtendLegalType = + PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType; + SDLoc DL(BV); + SmallVector NewOps; + for (SDValue Op : BV->ops()) + NewOps.push_back( + DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, PreExtendLegalType)); + SDValue NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps); + return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV); } /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) @@ -13518,8 +13500,8 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) return SDValue(); - SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); - SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); + SDValue Op0 = performBuildVectorExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performBuildVectorExtendCombine(Mul->getOperand(1), DAG); // Neither operands have been changed, don't make any further changes if (!Op0 && !Op1) diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index bc31d41a55f43..5a57e6e82dd2e 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -156,10 +156,8 @@ entry: define <8 x i16> @nonsplat_shuffleinsert(i8 %src, <8 x i8> %b) { ; CHECK-LABEL: nonsplat_shuffleinsert: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: dup v1.8h, w8 -; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h +; CHECK-NEXT: dup v1.8b, w0 +; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b ; CHECK-NEXT: ret entry: %in = sext i8 %src to i16 diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 
4f999edf3d571..12b451f509f73 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -201,25 +201,22 @@ define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i3 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: // %vector.ph ; CHECK-NEXT: and x10, x9, #0xfffffff0 +; CHECK-NEXT: dup v0.4h, w8 ; CHECK-NEXT: add x11, x2, #32 ; CHECK-NEXT: add x12, x0, #16 ; CHECK-NEXT: mov x13, x10 -; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: dup v1.8h, w8 ; CHECK-NEXT: .LBB3_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q1, q2, [x12, #-16] +; CHECK-NEXT: ldp q2, q3, [x12, #-16] ; CHECK-NEXT: subs x13, x13, #16 ; CHECK-NEXT: add x12, x12, #32 -; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s -; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-NEXT: stp q1, q3, [x11, #-32] -; CHECK-NEXT: stp q2, q4, [x11], #64 +; CHECK-NEXT: smull2 v4.4s, v1.8h, v2.8h +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: smull2 v5.4s, v1.8h, v3.8h +; CHECK-NEXT: smull v3.4s, v0.4h, v3.4h +; CHECK-NEXT: stp q2, q4, [x11, #-32] +; CHECK-NEXT: stp q3, q5, [x11], #64 ; CHECK-NEXT: b.ne .LBB3_4 ; CHECK-NEXT: // %bb.5: // %middle.block ; CHECK-NEXT: cmp x10, x9 @@ -317,25 +314,22 @@ define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i3 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: // %vector.ph ; CHECK-NEXT: and x10, x9, #0xfffffff0 +; CHECK-NEXT: dup v0.4h, w8 ; CHECK-NEXT: add x11, x2, #32 ; CHECK-NEXT: add x12, x0, #16 ; CHECK-NEXT: mov x13, x10 -; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: dup v1.8h, w8 ; CHECK-NEXT: .LBB4_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q1, q2, [x12, 
#-16] +; CHECK-NEXT: ldp q2, q3, [x12, #-16] ; CHECK-NEXT: subs x13, x13, #16 ; CHECK-NEXT: add x12, x12, #32 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s -; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-NEXT: stp q1, q3, [x11, #-32] -; CHECK-NEXT: stp q2, q4, [x11], #64 +; CHECK-NEXT: umull2 v4.4s, v1.8h, v2.8h +; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: umull2 v5.4s, v1.8h, v3.8h +; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h +; CHECK-NEXT: stp q2, q4, [x11, #-32] +; CHECK-NEXT: stp q3, q5, [x11], #64 ; CHECK-NEXT: b.ne .LBB4_4 ; CHECK-NEXT: // %bb.5: // %middle.block ; CHECK-NEXT: cmp x10, x9 @@ -435,12 +429,13 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_4: // %vector.ph +; CHECK-NEXT: dup v2.8b, w9 ; CHECK-NEXT: and x11, x10, #0xfffffff0 -; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov x12, x11 +; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: dup v2.8h, w9 +; CHECK-NEXT: mov x12, x11 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: .LBB5_5: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp d3, d4, [x8, #-8] From 175d5fa388298934629f5cf183c7de45a672a063 Mon Sep 17 00:00:00 2001 From: fuzzypixelz Date: Mon, 21 Feb 2022 07:53:27 -0800 Subject: [PATCH 430/748] [MLIR] replace C++ function type defintion in the C API's Interfaces.h Clearly this something of a typo, and it obviously doesn't even compile. 
Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D120247 --- mlir/include/mlir-c/Interfaces.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir-c/Interfaces.h b/mlir/include/mlir-c/Interfaces.h index 7ab6b8af3a8c7..233f828b924c7 100644 --- a/mlir/include/mlir-c/Interfaces.h +++ b/mlir/include/mlir-c/Interfaces.h @@ -48,7 +48,7 @@ MLIR_CAPI_EXPORTED MlirTypeID mlirInferTypeOpInterfaceTypeID(); /// transferring ownership to the caller. The first argument is the number of /// consecutive elements pointed to by the second argument. The third argument /// is an opaque pointer forwarded to the callback by the caller. -using MlirTypesCallback = void (*)(intptr_t, MlirType *, void *); +typedef void (*MlirTypesCallback)(intptr_t, MlirType *, void *); /// Infers the return types of the operation identified by its canonical given /// the arguments that will be supplied to its generic builder. Calls `callback` From 52577cd26f26f6428c72395e7337af3fc84bc6f6 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Sun, 20 Feb 2022 15:15:31 +0300 Subject: [PATCH 431/748] [ArgPromotion] Regenerate test checks for crash.ll - removed ALL_NEWPM prefix. Rename %tmp => %temp IR values to avoid update warning. 
Reviewed by Nikita Popov Differential revision: https://reviews.llvm.org/D120207 --- .../Transforms/ArgumentPromotion/crash.ll | 92 ++++++++++--------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/llvm/test/Transforms/ArgumentPromotion/crash.ll b/llvm/test/Transforms/ArgumentPromotion/crash.ll index d55f4624e0c34..7909ef909fb49 100644 --- a/llvm/test/Transforms/ArgumentPromotion/crash.ll +++ b/llvm/test/Transforms/ArgumentPromotion/crash.ll @@ -1,23 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes -; RUN: opt -S < %s -inline -argpromotion | FileCheck %s --check-prefix=ARGPROMOTION -; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s --check-prefixes=ARGPROMOTION,ALL_NEWPM +; RUN: opt -S < %s -inline -argpromotion | FileCheck %s +; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s %S = type { %S* } ; Inlining should nuke the invoke (and any inlined calls) here even with ; argument promotion running along with it. 
define void @zot() personality i32 (...)* @wibble { -; ARGPROMOTION-LABEL: define {{[^@]+}}@zot() personality i32 (...)* @wibble -; ARGPROMOTION-NEXT: bb: -; ARGPROMOTION-NEXT: unreachable -; ARGPROMOTION: hoge.exit: -; ARGPROMOTION-NEXT: br label [[BB1:%.*]] -; ARGPROMOTION: bb1: -; ARGPROMOTION-NEXT: unreachable -; ARGPROMOTION: bb2: -; ARGPROMOTION-NEXT: [[TMP:%.*]] = landingpad { i8*, i32 } -; ARGPROMOTION-NEXT: cleanup -; ARGPROMOTION-NEXT: unreachable +; CHECK-LABEL: define {{[^@]+}}@zot() personality i32 (...)* @wibble { +; CHECK-NEXT: bb: +; CHECK-NEXT: unreachable +; CHECK: hoge.exit: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: unreachable +; CHECK: bb2: +; CHECK-NEXT: [[TEMP:%.*]] = landingpad { i8*, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: unreachable ; bb: invoke void @hoge() @@ -27,15 +27,15 @@ bb1: unreachable bb2: - %tmp = landingpad { i8*, i32 } + %temp = landingpad { i8*, i32 } cleanup unreachable } define internal void @hoge() { bb: - %tmp = call fastcc i8* @spam(i1 (i8*)* @eggs) - %tmp1 = call fastcc i8* @spam(i1 (i8*)* @barney) + %temp = call fastcc i8* @spam(i1 (i8*)* @eggs) + %temp1 = call fastcc i8* @spam(i1 (i8*)* @barney) unreachable } @@ -45,54 +45,58 @@ bb: } define internal i1 @eggs(i8* %arg) { -; ALL_NEWPM-LABEL: define {{[^@]+}}@eggs() -; ALL_NEWPM-NEXT: bb: -; ALL_NEWPM-NEXT: unreachable +; CHECK-LABEL: define {{[^@]+}}@eggs() { +; CHECK-NEXT: bb: +; CHECK-NEXT: unreachable ; bb: - %tmp = call zeroext i1 @barney(i8* %arg) + %temp = call zeroext i1 @barney(i8* %arg) unreachable } define internal i1 @barney(i8* %arg) { +; CHECK-LABEL: define {{[^@]+}}@barney() { +; CHECK-NEXT: bb: +; CHECK-NEXT: ret i1 undef +; bb: ret i1 undef } define i32 @test_inf_promote_caller(i32 %arg) { -; ARGPROMOTION-LABEL: define {{[^@]+}}@test_inf_promote_caller -; ARGPROMOTION-SAME: (i32 [[ARG:%.*]]) -; ARGPROMOTION-NEXT: bb: -; ARGPROMOTION-NEXT: [[TMP:%.*]] = alloca [[S:%.*]] -; ARGPROMOTION-NEXT: [[TMP1:%.*]] = alloca [[S]] -; 
ARGPROMOTION-NEXT: [[TMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TMP]], %S* [[TMP1]]) -; ARGPROMOTION-NEXT: ret i32 0 +; CHECK-LABEL: define {{[^@]+}}@test_inf_promote_caller +; CHECK-SAME: (i32 [[ARG:%.*]]) { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TEMP:%.*]] = alloca [[S:%.*]], align 8 +; CHECK-NEXT: [[TEMP1:%.*]] = alloca [[S]], align 8 +; CHECK-NEXT: [[TEMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TEMP]], %S* [[TEMP1]]) +; CHECK-NEXT: ret i32 0 ; bb: - %tmp = alloca %S - %tmp1 = alloca %S - %tmp2 = call i32 @test_inf_promote_callee(%S* %tmp, %S* %tmp1) + %temp = alloca %S + %temp1 = alloca %S + %temp2 = call i32 @test_inf_promote_callee(%S* %temp, %S* %temp1) ret i32 0 } define internal i32 @test_inf_promote_callee(%S* %arg, %S* %arg1) { -; ARGPROMOTION-LABEL: define {{[^@]+}}@test_inf_promote_callee -; ARGPROMOTION-SAME: (%S* [[ARG:%.*]], %S* [[ARG1:%.*]]) -; ARGPROMOTION-NEXT: bb: -; ARGPROMOTION-NEXT: [[TMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0 -; ARGPROMOTION-NEXT: [[TMP2:%.*]] = load %S*, %S** [[TMP]] -; ARGPROMOTION-NEXT: [[TMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0 -; ARGPROMOTION-NEXT: [[TMP4:%.*]] = load %S*, %S** [[TMP3]] -; ARGPROMOTION-NEXT: [[TMP5:%.*]] = call i32 @test_inf_promote_callee(%S* [[TMP4]], %S* [[TMP2]]) -; ARGPROMOTION-NEXT: ret i32 0 +; CHECK-LABEL: define {{[^@]+}}@test_inf_promote_callee +; CHECK-SAME: (%S* [[ARG:%.*]], %S* [[ARG1:%.*]]) { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TEMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP2:%.*]] = load %S*, %S** [[TEMP]], align 8 +; CHECK-NEXT: [[TEMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0 +; CHECK-NEXT: [[TEMP4:%.*]] = load %S*, %S** [[TEMP3]], align 8 +; CHECK-NEXT: [[TEMP5:%.*]] = call i32 @test_inf_promote_callee(%S* [[TEMP4]], %S* [[TEMP2]]) +; CHECK-NEXT: ret i32 0 ; bb: - %tmp = getelementptr %S, %S* %arg1, i32 0, i32 0 - %tmp2 = load %S*, %S** %tmp - %tmp3 = getelementptr %S, %S* %arg, 
i32 0, i32 0 - %tmp4 = load %S*, %S** %tmp3 - %tmp5 = call i32 @test_inf_promote_callee(%S* %tmp4, %S* %tmp2) + %temp = getelementptr %S, %S* %arg1, i32 0, i32 0 + %temp2 = load %S*, %S** %temp + %temp3 = getelementptr %S, %S* %arg, i32 0, i32 0 + %temp4 = load %S*, %S** %temp3 + %temp5 = call i32 @test_inf_promote_callee(%S* %temp4, %S* %temp2) ret i32 0 } From 4d5b020d6e0df8e34bd79154660cefd3676d21f2 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 21 Feb 2022 16:24:58 +0000 Subject: [PATCH 432/748] [ARM] Addition SSAT/USAT tests for min/max patterns. NFC --- llvm/test/CodeGen/ARM/ssat-unroll-loops.ll | 136 +++++++ llvm/test/CodeGen/ARM/ssat.ll | 336 ++++++++++++++++ llvm/test/CodeGen/ARM/usat.ll | 424 +++++++++++++++++++++ 3 files changed, 896 insertions(+) diff --git a/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll b/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll index f1b4ab2d937d7..1f7574a8cca98 100644 --- a/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll +++ b/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll @@ -121,3 +121,139 @@ while.body: ; preds = %while.body.prol.loo while.end: ; preds = %while.body, %while.body.prol.loopexit, %entry ret void } + +define void @ssat_unroll_minmax(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* nocapture writeonly %pDst, i32 %blockSize) { +; CHECK-LABEL: ssat_unroll_minmax: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: beq .LBB1_6 +; CHECK-NEXT: @ %bb.1: @ %while.body.preheader +; CHECK-NEXT: movw r12, #32768 +; CHECK-NEXT: sub lr, r3, #1 +; CHECK-NEXT: tst r3, #1 +; CHECK-NEXT: movt r12, #65535 +; CHECK-NEXT: beq .LBB1_3 +; CHECK-NEXT: @ %bb.2: @ %while.body.prol.preheader +; CHECK-NEXT: ldrsh r3, [r0], #2 +; CHECK-NEXT: ldrsh r4, [r1], #2 +; CHECK-NEXT: smulbb r3, r4, r3 +; CHECK-NEXT: asr r4, r3, #14 +; CHECK-NEXT: cmn r4, #32768 +; CHECK-NEXT: mov r4, r12 +; CHECK-NEXT: asrgt r4, r3, #14 +; CHECK-NEXT: movw r3, 
#32767 +; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: movge r4, r3 +; CHECK-NEXT: mov r3, lr +; CHECK-NEXT: strh r4, [r2], #2 +; CHECK-NEXT: .LBB1_3: @ %while.body.prol.loopexit +; CHECK-NEXT: cmp lr, #0 +; CHECK-NEXT: beq .LBB1_6 +; CHECK-NEXT: @ %bb.4: @ %while.body.preheader1 +; CHECK-NEXT: movw lr, #32767 +; CHECK-NEXT: .LBB1_5: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrsh r4, [r0] +; CHECK-NEXT: ldrsh r5, [r1] +; CHECK-NEXT: smulbb r4, r5, r4 +; CHECK-NEXT: asr r5, r4, #14 +; CHECK-NEXT: cmn r5, #32768 +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: asrgt r5, r4, #14 +; CHECK-NEXT: cmp r5, lr +; CHECK-NEXT: movge r5, lr +; CHECK-NEXT: strh r5, [r2] +; CHECK-NEXT: ldrsh r4, [r0, #2] +; CHECK-NEXT: add r0, r0, #4 +; CHECK-NEXT: ldrsh r5, [r1, #2] +; CHECK-NEXT: add r1, r1, #4 +; CHECK-NEXT: smulbb r4, r5, r4 +; CHECK-NEXT: asr r5, r4, #14 +; CHECK-NEXT: cmn r5, #32768 +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: asrgt r5, r4, #14 +; CHECK-NEXT: cmp r5, lr +; CHECK-NEXT: movge r5, lr +; CHECK-NEXT: subs r3, r3, #2 +; CHECK-NEXT: strh r5, [r2, #2] +; CHECK-NEXT: add r2, r2, #4 +; CHECK-NEXT: bne .LBB1_5 +; CHECK-NEXT: .LBB1_6: @ %while.end +; CHECK-NEXT: pop {r4, r5, r11, pc} +entry: + %cmp.not7 = icmp eq i32 %blockSize, 0 + br i1 %cmp.not7, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + %0 = add i32 %blockSize, -1 + %xtraiter = and i32 %blockSize, 1 + %lcmp.mod.not = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod.not, label %while.body.prol.loopexit, label %while.body.prol.preheader + +while.body.prol.preheader: ; preds = %while.body.preheader + %incdec.ptr.prol = getelementptr inbounds i16, i16* %pSrcA, i64 1 + %1 = load i16, i16* %pSrcA, align 2 + %conv.prol = sext i16 %1 to i32 + %incdec.ptr1.prol = getelementptr inbounds i16, i16* %pSrcB, i64 1 + %2 = load i16, i16* %pSrcB, align 2 + %conv2.prol = sext i16 %2 to i32 + %mul.prol = mul nsw i32 %conv2.prol, %conv.prol + %shr.prol = ashr i32 
%mul.prol, 14 + %3 = call i32 @llvm.smax.i32(i32 %shr.prol, i32 -32768) + %4 = call i32 @llvm.smin.i32(i32 %3, i32 32767) + %conv3.prol = trunc i32 %4 to i16 + %incdec.ptr4.prol = getelementptr inbounds i16, i16* %pDst, i64 1 + store i16 %conv3.prol, i16* %pDst, align 2 + br label %while.body.prol.loopexit + +while.body.prol.loopexit: ; preds = %while.body.prol.preheader, %while.body.preheader + %blkCnt.011.unr = phi i32 [ %blockSize, %while.body.preheader ], [ %0, %while.body.prol.preheader ] + %pSrcA.addr.010.unr = phi i16* [ %pSrcA, %while.body.preheader ], [ %incdec.ptr.prol, %while.body.prol.preheader ] + %pDst.addr.09.unr = phi i16* [ %pDst, %while.body.preheader ], [ %incdec.ptr4.prol, %while.body.prol.preheader ] + %pSrcB.addr.08.unr = phi i16* [ %pSrcB, %while.body.preheader ], [ %incdec.ptr1.prol, %while.body.prol.preheader ] + %5 = icmp eq i32 %0, 0 + br i1 %5, label %while.end, label %while.body + +while.body: ; preds = %while.body.prol.loopexit, %while.body + %blkCnt.011 = phi i32 [ %dec.1, %while.body ], [ %blkCnt.011.unr, %while.body.prol.loopexit ] + %pSrcA.addr.010 = phi i16* [ %incdec.ptr.1, %while.body ], [ %pSrcA.addr.010.unr, %while.body.prol.loopexit ] + %pDst.addr.09 = phi i16* [ %incdec.ptr4.1, %while.body ], [ %pDst.addr.09.unr, %while.body.prol.loopexit ] + %pSrcB.addr.08 = phi i16* [ %incdec.ptr1.1, %while.body ], [ %pSrcB.addr.08.unr, %while.body.prol.loopexit ] + %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.010, i64 1 + %6 = load i16, i16* %pSrcA.addr.010, align 2 + %conv = sext i16 %6 to i32 + %incdec.ptr1 = getelementptr inbounds i16, i16* %pSrcB.addr.08, i64 1 + %7 = load i16, i16* %pSrcB.addr.08, align 2 + %conv2 = sext i16 %7 to i32 + %mul = mul nsw i32 %conv2, %conv + %shr = ashr i32 %mul, 14 + %8 = call i32 @llvm.smax.i32(i32 %shr, i32 -32768) + %9 = call i32 @llvm.smin.i32(i32 %8, i32 32767) + %conv3 = trunc i32 %9 to i16 + %incdec.ptr4 = getelementptr inbounds i16, i16* %pDst.addr.09, i64 1 + store i16 %conv3, 
i16* %pDst.addr.09, align 2 + %incdec.ptr.1 = getelementptr inbounds i16, i16* %pSrcA.addr.010, i64 2 + %10 = load i16, i16* %incdec.ptr, align 2 + %conv.1 = sext i16 %10 to i32 + %incdec.ptr1.1 = getelementptr inbounds i16, i16* %pSrcB.addr.08, i64 2 + %11 = load i16, i16* %incdec.ptr1, align 2 + %conv2.1 = sext i16 %11 to i32 + %mul.1 = mul nsw i32 %conv2.1, %conv.1 + %shr.1 = ashr i32 %mul.1, 14 + %12 = call i32 @llvm.smax.i32(i32 %shr.1, i32 -32768) + %13 = call i32 @llvm.smin.i32(i32 %12, i32 32767) + %conv3.1 = trunc i32 %13 to i16 + %incdec.ptr4.1 = getelementptr inbounds i16, i16* %pDst.addr.09, i64 2 + store i16 %conv3.1, i16* %incdec.ptr4, align 2 + %dec.1 = add i32 %blkCnt.011, -2 + %cmp.not.1 = icmp eq i32 %dec.1, 0 + br i1 %cmp.not.1, label %while.end, label %while.body + +while.end: ; preds = %while.body, %while.body.prol.loopexit, %entry + ret void +} + +declare i32 @llvm.smax.i32(i32, i32) #1 +declare i32 @llvm.smin.i32(i32, i32) #1 diff --git a/llvm/test/CodeGen/ARM/ssat.ll b/llvm/test/CodeGen/ARM/ssat.ll index 436f6edc2bbbb..ff16b59489f5b 100644 --- a/llvm/test/CodeGen/ARM/ssat.ll +++ b/llvm/test/CodeGen/ARM/ssat.ll @@ -649,3 +649,339 @@ define i32 @formulated_invalid(i32 %a) { %r = and i32 %s2, 16777215 ret i32 %r } + + +define i32 @mm_sat_base_32bit(i32 %x) { +; V4T-LABEL: mm_sat_base_32bit: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI18_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: mov r1, #1065353216 +; V4T-NEXT: orr r1, r1, #-1073741824 +; V4T-NEXT: cmn r0, #8388608 +; V4T-NEXT: movle r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI18_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_sat_base_32bit: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: movw r1, #0 +; V6T2-NEXT: movt r1, #65408 +; V6T2-NEXT: cmn r0, #8388608 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: bx 
lr +entry: + %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607) + %1 = call i32 @llvm.smax.i32(i32 %0, i32 -8388608) + ret i32 %1 +} + +define i16 @mm_sat_base_16bit(i16 %x) { +; V4T-LABEL: mm_sat_base_16bit: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: mov r2, #255 +; V4T-NEXT: lsl r0, r0, #16 +; V4T-NEXT: orr r2, r2, #1792 +; V4T-NEXT: asr r1, r0, #16 +; V4T-NEXT: cmp r1, r2 +; V4T-NEXT: asrlt r2, r0, #16 +; V4T-NEXT: ldr r0, .LCPI19_0 +; V4T-NEXT: cmn r2, #2048 +; V4T-NEXT: movgt r0, r2 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI19_0: +; V4T-NEXT: .long 4294965248 @ 0xfffff800 +; +; V6T2-LABEL: mm_sat_base_16bit: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: sxth r0, r0 +; V6T2-NEXT: movw r1, #2047 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movlt r1, r0 +; V6T2-NEXT: movw r0, #63488 +; V6T2-NEXT: movt r0, #65535 +; V6T2-NEXT: cmn r1, #2048 +; V6T2-NEXT: movgt r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i16 @llvm.smin.i16(i16 %x, i16 2047) + %1 = call i16 @llvm.smax.i16(i16 %0, i16 -2048) + ret i16 %1 +} + +define i8 @mm_sat_base_8bit(i8 %x) { +; V4T-LABEL: mm_sat_base_8bit: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: lsl r1, r0, #24 +; V4T-NEXT: mov r0, #31 +; V4T-NEXT: asr r2, r1, #24 +; V4T-NEXT: cmp r2, #31 +; V4T-NEXT: asrlt r0, r1, #24 +; V4T-NEXT: cmn r0, #32 +; V4T-NEXT: mvnle r0, #31 +; V4T-NEXT: bx lr +; +; V6T2-LABEL: mm_sat_base_8bit: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: sxtb r0, r0 +; V6T2-NEXT: cmp r0, #31 +; V6T2-NEXT: movge r0, #31 +; V6T2-NEXT: cmn r0, #32 +; V6T2-NEXT: mvnle r0, #31 +; V6T2-NEXT: bx lr +entry: + %0 = call i8 @llvm.smin.i8(i8 %x, i8 31) + %1 = call i8 @llvm.smax.i8(i8 %0, i8 -32) + ret i8 %1 +} + +define i32 @mm_sat_lower_upper_1(i32 %x) { +; V4T-LABEL: mm_sat_lower_upper_1: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI21_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: mov r1, #1065353216 +; V4T-NEXT: orr r1, r1, #-1073741824 +; V4T-NEXT: cmn r0, #8388608 +; V4T-NEXT: movle r0, r1 
+; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI21_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_sat_lower_upper_1: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: movw r1, #0 +; V6T2-NEXT: movt r1, #65408 +; V6T2-NEXT: cmn r0, #8388608 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607) + %1 = call i32 @llvm.smax.i32(i32 %0, i32 -8388608) + ret i32 %1 +} + +define i32 @mm_sat_lower_upper_2(i32 %x) { +; V4T-LABEL: mm_sat_lower_upper_2: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI22_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: mov r1, #1065353216 +; V4T-NEXT: orr r1, r1, #-1073741824 +; V4T-NEXT: cmn r0, #8388608 +; V4T-NEXT: movle r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI22_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_sat_lower_upper_2: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: movw r1, #0 +; V6T2-NEXT: movt r1, #65408 +; V6T2-NEXT: cmn r0, #8388608 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607) + %1 = call i32 @llvm.smax.i32(i32 %0, i32 -8388608) + ret i32 %1 +} + +define i32 @mm_sat_upper_lower_1(i32 %x) { +; V4T-LABEL: mm_sat_upper_lower_1: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: mov r1, #1065353216 +; V4T-NEXT: cmn r0, #8388608 +; V4T-NEXT: orr r1, r1, #-1073741824 +; V4T-NEXT: movle r0, r1 +; V4T-NEXT: ldr r1, .LCPI23_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI23_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_sat_upper_lower_1: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #0 +; V6T2-NEXT: 
cmn r0, #8388608 +; V6T2-NEXT: movt r1, #65408 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 -8388608) + %1 = call i32 @llvm.smin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_sat_upper_lower_2(i32 %x) { +; V4T-LABEL: mm_sat_upper_lower_2: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: mov r1, #1065353216 +; V4T-NEXT: cmn r0, #8388608 +; V4T-NEXT: orr r1, r1, #-1073741824 +; V4T-NEXT: movle r0, r1 +; V4T-NEXT: ldr r1, .LCPI24_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI24_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_sat_upper_lower_2: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #0 +; V6T2-NEXT: cmn r0, #8388608 +; V6T2-NEXT: movt r1, #65408 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 -8388608) + %1 = call i32 @llvm.smin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_sat_upper_lower_3(i32 %x) { +; V4T-LABEL: mm_sat_upper_lower_3: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: mov r1, #1065353216 +; V4T-NEXT: cmn r0, #8388608 +; V4T-NEXT: orr r1, r1, #-1073741824 +; V4T-NEXT: movle r0, r1 +; V4T-NEXT: ldr r1, .LCPI25_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI25_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_sat_upper_lower_3: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #0 +; V6T2-NEXT: cmn r0, #8388608 +; V6T2-NEXT: movt r1, #65408 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 
@llvm.smax.i32(i32 %x, i32 -8388608) + %1 = call i32 @llvm.smin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_sat_le_ge(i32 %x) { +; V4T-LABEL: mm_sat_le_ge: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: mov r1, #1065353216 +; V4T-NEXT: cmn r0, #8388608 +; V4T-NEXT: orr r1, r1, #-1073741824 +; V4T-NEXT: movle r0, r1 +; V4T-NEXT: ldr r1, .LCPI26_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI26_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_sat_le_ge: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #0 +; V6T2-NEXT: cmn r0, #8388608 +; V6T2-NEXT: movt r1, #65408 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 -8388608) + %1 = call i32 @llvm.smin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_no_sat_incorrect_interval(i32 %x) { +; V4T-LABEL: mm_no_sat_incorrect_interval: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI27_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movle r0, r1 +; V4T-NEXT: ldr r1, .LCPI27_1 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI27_0: +; V4T-NEXT: .long 4275878552 @ 0xfedcba98 +; V4T-NEXT: .LCPI27_1: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_no_sat_incorrect_interval: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #47768 +; V6T2-NEXT: movt r1, #65244 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movle r0, r1 +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 -19088744) + %1 = call i32 @llvm.smin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) +declare i16 
@llvm.smin.i16(i16, i16) +declare i16 @llvm.smax.i16(i16, i16) +declare i8 @llvm.smin.i8(i8, i8) +declare i8 @llvm.smax.i8(i8, i8) + + diff --git a/llvm/test/CodeGen/ARM/usat.ll b/llvm/test/CodeGen/ARM/usat.ll index 84de3c9a0ecae..077aa9de317d2 100644 --- a/llvm/test/CodeGen/ARM/usat.ll +++ b/llvm/test/CodeGen/ARM/usat.ll @@ -608,3 +608,427 @@ entry: %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow ret i32 %saturateUp } + +define i32 @mm_unsigned_sat_base_32bit(i32 %x) { +; V4T-LABEL: mm_unsigned_sat_base_32bit: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI15_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movlt r1, r0 +; V4T-NEXT: bic r0, r1, r1, asr #31 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI15_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_unsigned_sat_base_32bit: +; V6: @ %bb.0: @ %entry +; V6-NEXT: ldr r1, .LCPI15_0 +; V6-NEXT: cmp r0, r1 +; V6-NEXT: movlt r1, r0 +; V6-NEXT: bic r0, r1, r1, asr #31 +; V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI15_0: +; V6-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_unsigned_sat_base_32bit: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movlt r1, r0 +; V6T2-NEXT: bic r0, r1, r1, asr #31 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607) + %1 = call i32 @llvm.smax.i32(i32 %0, i32 0) + ret i32 %1 +} + +define i16 @mm_unsigned_sat_base_16bit(i16 %x) { +; V4T-LABEL: mm_unsigned_sat_base_16bit: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: mov r2, #255 +; V4T-NEXT: lsl r0, r0, #16 +; V4T-NEXT: orr r2, r2, #1792 +; V4T-NEXT: asr r1, r0, #16 +; V4T-NEXT: cmp r1, r2 +; V4T-NEXT: asrlt r2, r0, #16 +; V4T-NEXT: bic r0, r2, r2, asr #31 +; V4T-NEXT: bx lr +; +; V6-LABEL: mm_unsigned_sat_base_16bit: +; V6: @ %bb.0: @ %entry +; V6-NEXT: mov r1, #255 +; V6-NEXT: sxth r0, r0 +; V6-NEXT: orr r1, r1, #1792 +; V6-NEXT: cmp r0, r1 +; V6-NEXT: 
movlt r1, r0 +; V6-NEXT: bic r0, r1, r1, asr #31 +; V6-NEXT: bx lr +; +; V6T2-LABEL: mm_unsigned_sat_base_16bit: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: sxth r0, r0 +; V6T2-NEXT: movw r1, #2047 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movlt r1, r0 +; V6T2-NEXT: bic r0, r1, r1, asr #31 +; V6T2-NEXT: bx lr +entry: + %0 = call i16 @llvm.smin.i16(i16 %x, i16 2047) + %1 = call i16 @llvm.smax.i16(i16 %0, i16 0) + ret i16 %1 +} + +define i8 @mm_unsigned_sat_base_8bit(i8 %x) { +; V4T-LABEL: mm_unsigned_sat_base_8bit: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: lsl r0, r0, #24 +; V4T-NEXT: mov r2, #31 +; V4T-NEXT: asr r1, r0, #24 +; V4T-NEXT: cmp r1, #31 +; V4T-NEXT: asrlt r2, r0, #24 +; V4T-NEXT: bic r0, r2, r2, asr #31 +; V4T-NEXT: bx lr +; +; V6-LABEL: mm_unsigned_sat_base_8bit: +; V6: @ %bb.0: @ %entry +; V6-NEXT: sxtb r0, r0 +; V6-NEXT: cmp r0, #31 +; V6-NEXT: movge r0, #31 +; V6-NEXT: bic r0, r0, r0, asr #31 +; V6-NEXT: bx lr +; +; V6T2-LABEL: mm_unsigned_sat_base_8bit: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: sxtb r0, r0 +; V6T2-NEXT: cmp r0, #31 +; V6T2-NEXT: movge r0, #31 +; V6T2-NEXT: bic r0, r0, r0, asr #31 +; V6T2-NEXT: bx lr +entry: + %0 = call i8 @llvm.smin.i8(i8 %x, i8 31) + %1 = call i8 @llvm.smax.i8(i8 %0, i8 0) + ret i8 %1 +} + +define i32 @mm_unsigned_sat_lower_upper_1(i32 %x) { +; V4T-LABEL: mm_unsigned_sat_lower_upper_1: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI18_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movlt r1, r0 +; V4T-NEXT: bic r0, r1, r1, asr #31 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI18_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_unsigned_sat_lower_upper_1: +; V6: @ %bb.0: @ %entry +; V6-NEXT: ldr r1, .LCPI18_0 +; V6-NEXT: cmp r0, r1 +; V6-NEXT: movlt r1, r0 +; V6-NEXT: bic r0, r1, r1, asr #31 +; V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI18_0: +; V6-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_unsigned_sat_lower_upper_1: +; V6T2: @ %bb.0: @ 
%entry +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movlt r1, r0 +; V6T2-NEXT: bic r0, r1, r1, asr #31 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607) + %1 = call i32 @llvm.smax.i32(i32 %0, i32 0) + ret i32 %1 +} + +define i32 @mm_unsigned_sat_lower_upper_2(i32 %x) { +; V4T-LABEL: mm_unsigned_sat_lower_upper_2: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI19_0 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movlt r1, r0 +; V4T-NEXT: bic r0, r1, r1, asr #31 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI19_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_unsigned_sat_lower_upper_2: +; V6: @ %bb.0: @ %entry +; V6-NEXT: ldr r1, .LCPI19_0 +; V6-NEXT: cmp r0, r1 +; V6-NEXT: movlt r1, r0 +; V6-NEXT: bic r0, r1, r1, asr #31 +; V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI19_0: +; V6-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_unsigned_sat_lower_upper_2: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movlt r1, r0 +; V6T2-NEXT: bic r0, r1, r1, asr #31 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607) + %1 = call i32 @llvm.smax.i32(i32 %0, i32 0) + ret i32 %1 +} + +define i32 @mm_unsigned_sat_upper_lower_1(i32 %x) { +; V4T-LABEL: mm_unsigned_sat_upper_lower_1: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: bic r1, r0, r0, asr #31 +; V4T-NEXT: ldr r0, .LCPI20_0 +; V4T-NEXT: cmp r1, r0 +; V4T-NEXT: movlo r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI20_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_unsigned_sat_upper_lower_1: +; V6: @ %bb.0: @ %entry +; V6-NEXT: bic r1, r0, r0, asr #31 +; V6-NEXT: ldr r0, .LCPI20_0 +; V6-NEXT: cmp r1, r0 +; V6-NEXT: movlo r0, r1 +; V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI20_0: +; V6-NEXT: 
.long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_unsigned_sat_upper_lower_1: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: bic r1, r0, r0, asr #31 +; V6T2-NEXT: movw r0, #65535 +; V6T2-NEXT: movt r0, #127 +; V6T2-NEXT: cmp r1, r0 +; V6T2-NEXT: movlo r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 0) + %1 = call i32 @llvm.umin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_unsigned_sat_upper_lower_2(i32 %x) { +; V4T-LABEL: mm_unsigned_sat_upper_lower_2: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: bic r1, r0, r0, asr #31 +; V4T-NEXT: ldr r0, .LCPI21_0 +; V4T-NEXT: cmp r1, r0 +; V4T-NEXT: movlo r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI21_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_unsigned_sat_upper_lower_2: +; V6: @ %bb.0: @ %entry +; V6-NEXT: bic r1, r0, r0, asr #31 +; V6-NEXT: ldr r0, .LCPI21_0 +; V6-NEXT: cmp r1, r0 +; V6-NEXT: movlo r0, r1 +; V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI21_0: +; V6-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_unsigned_sat_upper_lower_2: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: bic r1, r0, r0, asr #31 +; V6T2-NEXT: movw r0, #65535 +; V6T2-NEXT: movt r0, #127 +; V6T2-NEXT: cmp r1, r0 +; V6T2-NEXT: movlo r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 0) + %1 = call i32 @llvm.umin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_unsigned_sat_upper_lower_3(i32 %x) { +; V4T-LABEL: mm_unsigned_sat_upper_lower_3: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: bic r1, r0, r0, asr #31 +; V4T-NEXT: ldr r0, .LCPI22_0 +; V4T-NEXT: cmp r1, r0 +; V4T-NEXT: movlo r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI22_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_unsigned_sat_upper_lower_3: +; V6: @ %bb.0: @ %entry +; V6-NEXT: bic r1, r0, r0, asr #31 +; V6-NEXT: ldr r0, .LCPI22_0 +; V6-NEXT: cmp r1, r0 +; V6-NEXT: movlo r0, r1 +; 
V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI22_0: +; V6-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_unsigned_sat_upper_lower_3: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: bic r1, r0, r0, asr #31 +; V6T2-NEXT: movw r0, #65535 +; V6T2-NEXT: movt r0, #127 +; V6T2-NEXT: cmp r1, r0 +; V6T2-NEXT: movlo r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 0) + %1 = call i32 @llvm.umin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_no_unsigned_sat_incorrect_constant(i32 %x) { +; V4T-LABEL: mm_no_unsigned_sat_incorrect_constant: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: orr r1, r0, r0, asr #31 +; V4T-NEXT: ldr r0, .LCPI23_0 +; V4T-NEXT: cmp r1, r0 +; V4T-NEXT: movlt r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI23_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_no_unsigned_sat_incorrect_constant: +; V6: @ %bb.0: @ %entry +; V6-NEXT: orr r1, r0, r0, asr #31 +; V6-NEXT: ldr r0, .LCPI23_0 +; V6-NEXT: cmp r1, r0 +; V6-NEXT: movlt r0, r1 +; V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI23_0: +; V6-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_no_unsigned_sat_incorrect_constant: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: orr r1, r0, r0, asr #31 +; V6T2-NEXT: movw r0, #65535 +; V6T2-NEXT: movt r0, #127 +; V6T2-NEXT: cmp r1, r0 +; V6T2-NEXT: movlt r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 -1) + %1 = call i32 @llvm.smin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +define i32 @mm_no_unsigned_sat_incorrect_constant2(i32 %x) { +; V4T-LABEL: mm_no_unsigned_sat_incorrect_constant2: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: bic r1, r0, r0, asr #31 +; V4T-NEXT: mov r0, #1 +; V4T-NEXT: orr r0, r0, #8388608 +; V4T-NEXT: cmp r1, #8388608 +; V4T-NEXT: movls r0, r1 +; V4T-NEXT: bx lr +; +; V6-LABEL: mm_no_unsigned_sat_incorrect_constant2: +; V6: @ %bb.0: @ %entry +; V6-NEXT: bic r1, r0, r0, asr #31 +; 
V6-NEXT: mov r0, #1 +; V6-NEXT: orr r0, r0, #8388608 +; V6-NEXT: cmp r1, #8388608 +; V6-NEXT: movls r0, r1 +; V6-NEXT: bx lr +; +; V6T2-LABEL: mm_no_unsigned_sat_incorrect_constant2: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: bic r1, r0, r0, asr #31 +; V6T2-NEXT: movw r0, #1 +; V6T2-NEXT: movt r0, #128 +; V6T2-NEXT: cmp r1, #8388608 +; V6T2-NEXT: movls r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 0) + %1 = call i32 @llvm.umin.i32(i32 %0, i32 8388609) + ret i32 %1 +} + +define i32 @mm_no_unsigned_sat_incorrect_interval(i32 %x) { +; V4T-LABEL: mm_no_unsigned_sat_incorrect_interval: +; V4T: @ %bb.0: @ %entry +; V4T-NEXT: ldr r1, .LCPI25_0 +; V4T-NEXT: cmn r0, #4 +; V4T-NEXT: mvnle r0, #3 +; V4T-NEXT: cmp r0, r1 +; V4T-NEXT: movge r0, r1 +; V4T-NEXT: bx lr +; V4T-NEXT: .p2align 2 +; V4T-NEXT: @ %bb.1: +; V4T-NEXT: .LCPI25_0: +; V4T-NEXT: .long 8388607 @ 0x7fffff +; +; V6-LABEL: mm_no_unsigned_sat_incorrect_interval: +; V6: @ %bb.0: @ %entry +; V6-NEXT: ldr r1, .LCPI25_0 +; V6-NEXT: cmn r0, #4 +; V6-NEXT: mvnle r0, #3 +; V6-NEXT: cmp r0, r1 +; V6-NEXT: movge r0, r1 +; V6-NEXT: bx lr +; V6-NEXT: .p2align 2 +; V6-NEXT: @ %bb.1: +; V6-NEXT: .LCPI25_0: +; V6-NEXT: .long 8388607 @ 0x7fffff +; +; V6T2-LABEL: mm_no_unsigned_sat_incorrect_interval: +; V6T2: @ %bb.0: @ %entry +; V6T2-NEXT: cmn r0, #4 +; V6T2-NEXT: movw r1, #65535 +; V6T2-NEXT: mvnle r0, #3 +; V6T2-NEXT: movt r1, #127 +; V6T2-NEXT: cmp r0, r1 +; V6T2-NEXT: movge r0, r1 +; V6T2-NEXT: bx lr +entry: + %0 = call i32 @llvm.smax.i32(i32 %x, i32 -4) + %1 = call i32 @llvm.smin.i32(i32 %0, i32 8388607) + ret i32 %1 +} + +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) +declare i16 @llvm.smin.i16(i16, i16) +declare i16 @llvm.smax.i16(i16, i16) +declare i8 @llvm.smin.i8(i8, i8) +declare i8 @llvm.smax.i8(i8, i8) +declare i32 @llvm.umin.i32(i32, i32) From 3a3d9ae545925162ebbe820639cd2fe072ff4dd8 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 21 Feb 2022 11:40:01 
-0500 Subject: [PATCH 433/748] [Clang][OpenMP] Fix wrong form of 'cond-update-stmt' in atomic_ast_print.cpp In `clang/test/OpenMP/atomic_ast_print.cpp` for `atomic compare capture`, it was using 'cond-expr-stmt' instead of 'cond-update-stmt'. The spec only supports 'cond-update-stmt'. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D120252 --- clang/test/OpenMP/atomic_ast_print.cpp | 224 +++++++++++++++++-------- 1 file changed, 154 insertions(+), 70 deletions(-) diff --git a/clang/test/OpenMP/atomic_ast_print.cpp b/clang/test/OpenMP/atomic_ast_print.cpp index 7502fdc339c2a..201f62ab2117e 100644 --- a/clang/test/OpenMP/atomic_ast_print.cpp +++ b/clang/test/OpenMP/atomic_ast_print.cpp @@ -47,9 +47,9 @@ T foo(T argc) { #pragma omp atomic compare { a = a == b ? c : a; } #pragma omp atomic compare capture - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture { v = a == b; if (v) a = c; } #endif @@ -76,9 +76,9 @@ T foo(T argc) { #pragma omp atomic compare seq_cst { a = a == b ? c : a; } #pragma omp atomic compare capture seq_cst - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare seq_cst capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture seq_cst { v = a == b; if (v) a = c; } #endif @@ -105,9 +105,9 @@ T foo(T argc) { #pragma omp atomic compare acq_rel { a = a == b ? c : a; } #pragma omp atomic compare capture acq_rel - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare acq_rel capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture acq_rel { v = a == b; if (v) a = c; } #endif @@ -134,9 +134,9 @@ T foo(T argc) { #pragma omp atomic compare acquire { a = a == b ? 
c : a; } #pragma omp atomic compare capture acquire - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare acquire capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture acquire { v = a == b; if (v) a = c; } #endif @@ -163,9 +163,9 @@ T foo(T argc) { #pragma omp atomic compare release { a = a == b ? c : a; } #pragma omp atomic compare capture release - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare release capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture release { v = a == b; if (v) a = c; } #endif @@ -192,9 +192,9 @@ T foo(T argc) { #pragma omp atomic compare relaxed { a = a == b ? c : a; } #pragma omp atomic compare capture relaxed - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare relaxed capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture relaxed { v = a == b; if (v) a = c; } #endif @@ -221,9 +221,9 @@ T foo(T argc) { #pragma omp atomic compare hint(6) { a = a == b ? c : a; } #pragma omp atomic compare capture hint(6) - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare hint(6) capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture hint(6) { v = a == b; if (v) a = c; } #endif @@ -261,12 +261,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { @@ -304,12 +308,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare seq_cst capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst // CHECK-51-NEXT: { @@ -347,12 +355,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare acq_rel capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel // CHECK-51-NEXT: { @@ -390,12 +402,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture acquire // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare acquire capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture acquire // CHECK-51-NEXT: { @@ -433,12 +449,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture release // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare release capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture release // CHECK-51-NEXT: { @@ -476,12 +496,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare relaxed capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed // CHECK-51-NEXT: { @@ -519,12 +543,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare hint(6) capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) // CHECK-51-NEXT: { @@ -563,12 +591,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { @@ -606,12 +638,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare seq_cst capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst // CHECK-51-NEXT: { @@ -649,12 +685,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare acq_rel capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel // CHECK-51-NEXT: { @@ -692,12 +732,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture acquire // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare acquire capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture acquire // CHECK-51-NEXT: { @@ -735,12 +779,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture release // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare release capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture release // CHECK-51-NEXT: { @@ -778,12 +826,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare relaxed capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? 
b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed // CHECK-51-NEXT: { @@ -821,12 +873,16 @@ T foo(T argc) { // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a > b ? b : a; +// CHECK-51-NEXT: if (a > b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare hint(6) capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; -// CHECK-51-NEXT: a = a < b ? b : a; +// CHECK-51-NEXT: if (a < b) { +// CHECK-51-NEXT: a = b; +// CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) // CHECK-51-NEXT: { @@ -864,9 +920,9 @@ int main(int argc, char **argv) { #pragma omp atomic compare { a = a == b ? c : a; } #pragma omp atomic compare capture - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture { v = a == b; if (v) a = c; } #endif @@ -893,9 +949,9 @@ int main(int argc, char **argv) { #pragma omp atomic compare seq_cst { a = a == b ? c : a; } #pragma omp atomic compare capture seq_cst - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare seq_cst capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture seq_cst { v = a == b; if (v) a = c; } #endif @@ -922,9 +978,9 @@ int main(int argc, char **argv) { #pragma omp atomic compare acq_rel { a = a == b ? c : a; } #pragma omp atomic compare capture acq_rel - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare acq_rel capture - { v = a; a = a < b ? 
b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture acq_rel { v = a == b; if (v) a = c; } #endif @@ -951,9 +1007,9 @@ int main(int argc, char **argv) { #pragma omp atomic compare acquire { a = a == b ? c : a; } #pragma omp atomic compare capture acquire - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare acquire capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture acquire { v = a == b; if (v) a = c; } #endif @@ -980,9 +1036,9 @@ int main(int argc, char **argv) { #pragma omp atomic compare release { a = a == b ? c : a; } #pragma omp atomic compare capture release - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare release capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture release { v = a == b; if (v) a = c; } #endif @@ -1009,9 +1065,9 @@ int main(int argc, char **argv) { #pragma omp atomic compare relaxed { a = a == b ? c : a; } #pragma omp atomic compare capture relaxed - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare relaxed capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture relaxed { v = a == b; if (v) a = c; } #endif @@ -1038,9 +1094,9 @@ int main(int argc, char **argv) { #pragma omp atomic compare hint(6) { a = a == b ? c : a; } #pragma omp atomic compare capture hint(6) - { v = a; a = a > b ? b : a; } + { v = a; if (a > b) { a = b; } } #pragma omp atomic compare hint(6) capture - { v = a; a = a < b ? b : a; } + { v = a; if (a < b) { a = b; } } #pragma omp atomic compare capture hint(6) { v = a == b; if (v) a = c; } #endif @@ -1074,12 +1130,16 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a > b ? 
b : a; + // CHECK-51-NEXT: if (a > b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: if (a < b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture // CHECK-51-NEXT: { @@ -1117,12 +1177,16 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: if (a > b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare seq_cst capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: if (a < b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture seq_cst // CHECK-51-NEXT: { @@ -1160,12 +1224,16 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: if (a > b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare acq_rel capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: if (a < b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture acq_rel // CHECK-51-NEXT: { @@ -1203,12 +1271,16 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: #pragma omp atomic compare capture acquire // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a > b ? 
b : a; + // CHECK-51-NEXT: if (a > b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare acquire capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: if (a < b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture acquire // CHECK-51-NEXT: { @@ -1246,12 +1318,16 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: #pragma omp atomic compare capture release // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: if (a > b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare release capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: if (a < b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture release // CHECK-51-NEXT: { @@ -1289,12 +1365,16 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a > b ? b : a; + // CHECK-51-NEXT: if (a > b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare relaxed capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: if (a < b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture relaxed // CHECK-51-NEXT: { @@ -1332,12 +1412,16 @@ int main(int argc, char **argv) { // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a > b ? 
b : a; + // CHECK-51-NEXT: if (a > b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare hint(6) capture // CHECK-51-NEXT: { // CHECK-51-NEXT: v = a; - // CHECK-51-NEXT: a = a < b ? b : a; + // CHECK-51-NEXT: if (a < b) { + // CHECK-51-NEXT: a = b; + // CHECK-51-NEXT: } // CHECK-51-NEXT: } // CHECK-51-NEXT: #pragma omp atomic compare capture hint(6) // CHECK-51-NEXT: { From 7662d1687b09505106718d412a40a0db8149e0bf Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 21 Feb 2022 16:54:02 +0000 Subject: [PATCH 434/748] [MemCpyOpt] Check all access for MemoryUses in writtenBetween. Currently writtenBetween can miss clobbers of Loc between End and Start, if End is a MemoryUse. To guarantee we see all write clobbers of Loc between Start and End for MemoryUses, restrict to Start and End being in the same block and check all accesses between them. This fixes 2 mis-compiles illustrated in llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D119929 --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 24 +++++++++++++++---- .../memcpy-byval-forwarding-clobbers.ll | 6 ++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 6698db26626b7..e9688854ae0ea 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -352,9 +352,25 @@ static bool accessedBetween(AliasAnalysis &AA, MemoryLocation Loc, // Check for mod of Loc between Start and End, excluding both boundaries. // Start and End can be in different blocks. 
-static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc, - const MemoryUseOrDef *Start, +static bool writtenBetween(MemorySSA *MSSA, AliasAnalysis &AA, + MemoryLocation Loc, const MemoryUseOrDef *Start, const MemoryUseOrDef *End) { + if (isa(End)) { + // For MemoryUses, getClobberingMemoryAccess may skip non-clobbering writes. + // Manually check read accesses between Start and End, if they are in the + // same block, for clobbers. Otherwise assume Loc is clobbered. + return Start->getBlock() != End->getBlock() || + any_of( + make_range(std::next(Start->getIterator()), End->getIterator()), + [&AA, Loc](const MemoryAccess &Acc) { + if (isa(&Acc)) + return false; + Instruction *AccInst = + cast(&Acc)->getMemoryInst(); + return isModSet(AA.getModRefInfo(AccInst, Loc)); + }); + } + // TODO: Only walk until we hit Start. MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( End->getDefiningAccess(), Loc); @@ -1118,7 +1134,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // then we could still perform the xform by moving M up to the first memcpy. // TODO: It would be sufficient to check the MDep source up to the memcpy // size of M, rather than MDep. - if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + if (writtenBetween(MSSA, *AA, MemoryLocation::getForSource(MDep), MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M))) return false; @@ -1557,7 +1573,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { // *b = 42; // foo(*a) // It would be invalid to transform the second memcpy into foo(*b). 
- if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep), + if (writtenBetween(MSSA, *AA, MemoryLocation::getForSource(MDep), MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB))) return false; diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll index f2eae367dae4d..818034275a453 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-byval-forwarding-clobbers.ll @@ -13,7 +13,6 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noa ; %a.2's lifetime ends before the call to @check. Cannot replace ; %a.1 with %a.2 in the call to @check. -; FIXME: Find lifetime.end, prevent optimization. define i1 @alloca_forwarding_lifetime_end_clobber() { ; CHECK-LABEL: @alloca_forwarding_lifetime_end_clobber( ; CHECK-NEXT: entry: @@ -26,7 +25,7 @@ define i1 @alloca_forwarding_lifetime_end_clobber() { ; CHECK-NEXT: store i8 0, i8* [[BC_A_2]], align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC_A_1]], i8* [[BC_A_2]], i64 8, i1 false) ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[BC_A_2]]) -; CHECK-NEXT: [[CALL:%.*]] = call i1 @check(i64* byval(i64) align 8 [[A_2]]) +; CHECK-NEXT: [[CALL:%.*]] = call i1 @check(i64* byval(i64) align 8 [[A_1]]) ; CHECK-NEXT: ret i1 [[CALL]] ; entry: @@ -46,7 +45,6 @@ entry: ; There is a call clobbering %a.2 before the call to @check. Cannot replace ; %a.1 with %a.2 in the call to @check. -; FIXME: Find clobber, prevent optimization. 
define i1 @alloca_forwarding_call_clobber() { ; CHECK-LABEL: @alloca_forwarding_call_clobber( ; CHECK-NEXT: entry: @@ -59,7 +57,7 @@ define i1 @alloca_forwarding_call_clobber() { ; CHECK-NEXT: store i8 0, i8* [[BC_A_2]], align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC_A_1]], i8* [[BC_A_2]], i64 8, i1 false) ; CHECK-NEXT: call void @clobber(i8* [[BC_A_2]]) -; CHECK-NEXT: [[CALL:%.*]] = call i1 @check(i64* byval(i64) align 8 [[A_2]]) +; CHECK-NEXT: [[CALL:%.*]] = call i1 @check(i64* byval(i64) align 8 [[A_1]]) ; CHECK-NEXT: ret i1 [[CALL]] ; entry: From ea7be7e32d9f43e59dad9a7ef020feb0f0168125 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 21 Feb 2022 18:37:19 +0100 Subject: [PATCH 435/748] [MLIR][PDL] Fix C++20 build. concept is a new keyword. NFC. --- mlir/lib/Rewrite/ByteCode.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Rewrite/ByteCode.cpp b/mlir/lib/Rewrite/ByteCode.cpp index d6a07f9067fe4..c8bc206268a94 100644 --- a/mlir/lib/Rewrite/ByteCode.cpp +++ b/mlir/lib/Rewrite/ByteCode.cpp @@ -1542,12 +1542,12 @@ void ByteCodeExecutor::executeCreateOperation(PatternRewriter &rewriter, } // Handle the case where the operation has inferred types. - InferTypeOpInterface::Concept *concept = + InferTypeOpInterface::Concept *inferInterface = state.name.getRegisteredInfo()->getInterface(); // TODO: Handle failure. state.types.clear(); - if (failed(concept->inferReturnTypes( + if (failed(inferInterface->inferReturnTypes( state.getContext(), state.location, state.operands, state.attributes.getDictionary(state.getContext()), state.regions, state.types))) From c31ef42530713048e4b14d61e25b63af8d1084f8 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 21 Feb 2022 13:29:37 -0500 Subject: [PATCH 436/748] Revert "[ArgPromotion] Regenerate test checks for crash.ll - removed ALL_NEWPM prefix." This reverts commit 52577cd26f26f6428c72395e7337af3fc84bc6f6. 
Breaks check-llvm, see comments on https://reviews.llvm.org/D120207 --- .../Transforms/ArgumentPromotion/crash.ll | 92 +++++++++---------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/llvm/test/Transforms/ArgumentPromotion/crash.ll b/llvm/test/Transforms/ArgumentPromotion/crash.ll index 7909ef909fb49..d55f4624e0c34 100644 --- a/llvm/test/Transforms/ArgumentPromotion/crash.ll +++ b/llvm/test/Transforms/ArgumentPromotion/crash.ll @@ -1,23 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes -; RUN: opt -S < %s -inline -argpromotion | FileCheck %s -; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s +; RUN: opt -S < %s -inline -argpromotion | FileCheck %s --check-prefix=ARGPROMOTION +; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s --check-prefixes=ARGPROMOTION,ALL_NEWPM %S = type { %S* } ; Inlining should nuke the invoke (and any inlined calls) here even with ; argument promotion running along with it. 
define void @zot() personality i32 (...)* @wibble { -; CHECK-LABEL: define {{[^@]+}}@zot() personality i32 (...)* @wibble { -; CHECK-NEXT: bb: -; CHECK-NEXT: unreachable -; CHECK: hoge.exit: -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: unreachable -; CHECK: bb2: -; CHECK-NEXT: [[TEMP:%.*]] = landingpad { i8*, i32 } -; CHECK-NEXT: cleanup -; CHECK-NEXT: unreachable +; ARGPROMOTION-LABEL: define {{[^@]+}}@zot() personality i32 (...)* @wibble +; ARGPROMOTION-NEXT: bb: +; ARGPROMOTION-NEXT: unreachable +; ARGPROMOTION: hoge.exit: +; ARGPROMOTION-NEXT: br label [[BB1:%.*]] +; ARGPROMOTION: bb1: +; ARGPROMOTION-NEXT: unreachable +; ARGPROMOTION: bb2: +; ARGPROMOTION-NEXT: [[TMP:%.*]] = landingpad { i8*, i32 } +; ARGPROMOTION-NEXT: cleanup +; ARGPROMOTION-NEXT: unreachable ; bb: invoke void @hoge() @@ -27,15 +27,15 @@ bb1: unreachable bb2: - %temp = landingpad { i8*, i32 } + %tmp = landingpad { i8*, i32 } cleanup unreachable } define internal void @hoge() { bb: - %temp = call fastcc i8* @spam(i1 (i8*)* @eggs) - %temp1 = call fastcc i8* @spam(i1 (i8*)* @barney) + %tmp = call fastcc i8* @spam(i1 (i8*)* @eggs) + %tmp1 = call fastcc i8* @spam(i1 (i8*)* @barney) unreachable } @@ -45,58 +45,54 @@ bb: } define internal i1 @eggs(i8* %arg) { -; CHECK-LABEL: define {{[^@]+}}@eggs() { -; CHECK-NEXT: bb: -; CHECK-NEXT: unreachable +; ALL_NEWPM-LABEL: define {{[^@]+}}@eggs() +; ALL_NEWPM-NEXT: bb: +; ALL_NEWPM-NEXT: unreachable ; bb: - %temp = call zeroext i1 @barney(i8* %arg) + %tmp = call zeroext i1 @barney(i8* %arg) unreachable } define internal i1 @barney(i8* %arg) { -; CHECK-LABEL: define {{[^@]+}}@barney() { -; CHECK-NEXT: bb: -; CHECK-NEXT: ret i1 undef -; bb: ret i1 undef } define i32 @test_inf_promote_caller(i32 %arg) { -; CHECK-LABEL: define {{[^@]+}}@test_inf_promote_caller -; CHECK-SAME: (i32 [[ARG:%.*]]) { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TEMP:%.*]] = alloca [[S:%.*]], align 8 -; CHECK-NEXT: [[TEMP1:%.*]] = alloca [[S]], align 8 -; CHECK-NEXT: 
[[TEMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TEMP]], %S* [[TEMP1]]) -; CHECK-NEXT: ret i32 0 +; ARGPROMOTION-LABEL: define {{[^@]+}}@test_inf_promote_caller +; ARGPROMOTION-SAME: (i32 [[ARG:%.*]]) +; ARGPROMOTION-NEXT: bb: +; ARGPROMOTION-NEXT: [[TMP:%.*]] = alloca [[S:%.*]] +; ARGPROMOTION-NEXT: [[TMP1:%.*]] = alloca [[S]] +; ARGPROMOTION-NEXT: [[TMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TMP]], %S* [[TMP1]]) +; ARGPROMOTION-NEXT: ret i32 0 ; bb: - %temp = alloca %S - %temp1 = alloca %S - %temp2 = call i32 @test_inf_promote_callee(%S* %temp, %S* %temp1) + %tmp = alloca %S + %tmp1 = alloca %S + %tmp2 = call i32 @test_inf_promote_callee(%S* %tmp, %S* %tmp1) ret i32 0 } define internal i32 @test_inf_promote_callee(%S* %arg, %S* %arg1) { -; CHECK-LABEL: define {{[^@]+}}@test_inf_promote_callee -; CHECK-SAME: (%S* [[ARG:%.*]], %S* [[ARG1:%.*]]) { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TEMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0 -; CHECK-NEXT: [[TEMP2:%.*]] = load %S*, %S** [[TEMP]], align 8 -; CHECK-NEXT: [[TEMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0 -; CHECK-NEXT: [[TEMP4:%.*]] = load %S*, %S** [[TEMP3]], align 8 -; CHECK-NEXT: [[TEMP5:%.*]] = call i32 @test_inf_promote_callee(%S* [[TEMP4]], %S* [[TEMP2]]) -; CHECK-NEXT: ret i32 0 +; ARGPROMOTION-LABEL: define {{[^@]+}}@test_inf_promote_callee +; ARGPROMOTION-SAME: (%S* [[ARG:%.*]], %S* [[ARG1:%.*]]) +; ARGPROMOTION-NEXT: bb: +; ARGPROMOTION-NEXT: [[TMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0 +; ARGPROMOTION-NEXT: [[TMP2:%.*]] = load %S*, %S** [[TMP]] +; ARGPROMOTION-NEXT: [[TMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0 +; ARGPROMOTION-NEXT: [[TMP4:%.*]] = load %S*, %S** [[TMP3]] +; ARGPROMOTION-NEXT: [[TMP5:%.*]] = call i32 @test_inf_promote_callee(%S* [[TMP4]], %S* [[TMP2]]) +; ARGPROMOTION-NEXT: ret i32 0 ; bb: - %temp = getelementptr %S, %S* %arg1, i32 0, i32 0 - %temp2 = load %S*, %S** %temp - %temp3 = getelementptr %S, %S* 
%arg, i32 0, i32 0 - %temp4 = load %S*, %S** %temp3 - %temp5 = call i32 @test_inf_promote_callee(%S* %temp4, %S* %temp2) + %tmp = getelementptr %S, %S* %arg1, i32 0, i32 0 + %tmp2 = load %S*, %S** %tmp + %tmp3 = getelementptr %S, %S* %arg, i32 0, i32 0 + %tmp4 = load %S*, %S** %tmp3 + %tmp5 = call i32 @test_inf_promote_callee(%S* %tmp4, %S* %tmp2) ret i32 0 } From 2d653b7e5b351b152b2c2bddef93c75f84042e15 Mon Sep 17 00:00:00 2001 From: Casey Carter Date: Wed, 29 Dec 2021 14:26:30 -0800 Subject: [PATCH 437/748] [libcxx][test] array and basic_string_view iterators are not portably pointers Fixup tests that believe them to be so. Most notably including some heavy refactoring in `std/iterators/iterator.primitives/iterator.traits/cxx20_iterator_traits.compile.pass.cpp`, which now detects pointers and validates that `iterator_concept` is present only for pointers. Differential Revision: https://reviews.llvm.org/D117368 --- .../cxx20_iterator_traits.compile.pass.cpp | 221 +++++++----------- .../iterator.cust.move/iter_move.pass.cpp | 12 +- .../from_iterator_sentinel.pass.cpp | 37 +-- .../iterator_sentinel.pass.cpp | 33 +-- .../format.parse.ctx/advance_to.pass.cpp | 8 +- .../format.parse.ctx/begin.pass.cpp | 2 +- .../format.parse.ctx/ctor.pass.cpp | 4 +- .../format.parse.ctx/end.pass.cpp | 2 +- 8 files changed, 140 insertions(+), 179 deletions(-) diff --git a/libcxx/test/std/iterators/iterator.primitives/iterator.traits/cxx20_iterator_traits.compile.pass.cpp b/libcxx/test/std/iterators/iterator.primitives/iterator.traits/cxx20_iterator_traits.compile.pass.cpp index ed0b4210b95c2..8857b5895e3be 100644 --- a/libcxx/test/std/iterators/iterator.primitives/iterator.traits/cxx20_iterator_traits.compile.pass.cpp +++ b/libcxx/test/std/iterators/iterator.primitives/iterator.traits/cxx20_iterator_traits.compile.pass.cpp @@ -53,169 +53,116 @@ constexpr bool has_iterator_concept_v = requires { typename Traits::iterator_concept; }; -template -constexpr bool testIOIterator() { - using 
Traits = std::iterator_traits; - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(!has_iterator_concept_v); - - return true; -} - -template -constexpr bool testConstWithoutConcept() { - using Traits = std::iterator_traits; - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(!has_iterator_concept_v); - - return true; -} - -template -constexpr bool testConstWithConcept() { - using Traits = std::iterator_traits; - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - - return true; -} - -template -constexpr bool testWithoutConcept() { - using Traits = std::iterator_traits; - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(!has_iterator_concept_v); - - return true; -} - -template -constexpr bool testWithConcept() { - using Traits = std::iterator_traits; - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - - return true; -} - -template -constexpr bool testWithoutConcept() { +template +constexpr bool test() { using Traits = std::iterator_traits; static_assert(std::same_as); static_assert(std::same_as); static_assert(std::same_as); static_assert(std::same_as); static_assert(std::same_as); - static_assert(!has_iterator_concept_v); + if constexpr (std::is_pointer_v) { + static_assert(std::same_as); + } else { + static_assert(!has_iterator_concept_v); + } return true; } -template -constexpr bool testWithConcept() { - using Traits = std::iterator_traits; - 
static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); - static_assert(std::same_as); +template +constexpr bool testIOIterator() { + return test(); +} - return true; +template +constexpr bool testConst() { + return test(); +} + +template +constexpr bool testMutable() { + return test(); } // Standard types. -// These tests depend on implementation details of libc++, -// e.g. that std::array::iterator is a raw pointer type but std::string::iterator is not. -// The Standard does not specify whether iterator_traits::iterator_concept exists for any particular non-pointer type. +// The Standard does not specify whether iterator_traits::iterator_concept +// exists for any particular non-pointer type, we assume it is present +// only for pointers. // -static_assert(testWithConcept::iterator, int, std::random_access_iterator_tag, std::contiguous_iterator_tag>()); -static_assert(testConstWithConcept::const_iterator, int, std::random_access_iterator_tag, std::contiguous_iterator_tag>()); -static_assert(testWithoutConcept()); -static_assert(testConstWithoutConcept()); -static_assert(testConstWithConcept()); -static_assert(testConstWithConcept()); -static_assert(testWithoutConcept::iterator, int, std::random_access_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, int, std::random_access_iterator_tag>()); - -static_assert(testWithoutConcept::iterator, int, std::random_access_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, int, std::random_access_iterator_tag>()); -static_assert(testWithoutConcept::iterator, int, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, int, std::forward_iterator_tag>()); -static_assert(testWithoutConcept::iterator, int, std::bidirectional_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, int, std::bidirectional_iterator_tag>()); - 
-static_assert(testWithoutConcept::iterator, std::pair, std::bidirectional_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, std::pair, std::bidirectional_iterator_tag>()); -static_assert(testWithoutConcept::iterator, std::pair, std::bidirectional_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, std::pair, std::bidirectional_iterator_tag>()); - -static_assert(testConstWithoutConcept::iterator, int, std::bidirectional_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, int, std::bidirectional_iterator_tag>()); -static_assert(testConstWithoutConcept::iterator, int, std::bidirectional_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, int, std::bidirectional_iterator_tag>()); - -static_assert(testWithoutConcept::iterator, std::pair, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, std::pair, std::forward_iterator_tag>()); -static_assert(testWithoutConcept::local_iterator, std::pair, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_local_iterator, std::pair, std::forward_iterator_tag>()); -static_assert(testWithoutConcept::iterator, std::pair, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, std::pair, std::forward_iterator_tag>()); -static_assert(testWithoutConcept::local_iterator, std::pair, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_local_iterator, std::pair, std::forward_iterator_tag>()); - -static_assert(testConstWithoutConcept::iterator, int, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_iterator, int, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::local_iterator, int, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_local_iterator, int, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::iterator, int, std::forward_iterator_tag>()); 
-static_assert(testConstWithoutConcept::const_iterator, int, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::local_iterator, int, std::forward_iterator_tag>()); -static_assert(testConstWithoutConcept::const_local_iterator, int, std::forward_iterator_tag>()); - -static_assert(testWithoutConcept, int, std::random_access_iterator_tag>()); +static_assert(testMutable::iterator, std::random_access_iterator_tag, int>()); +static_assert(testConst::const_iterator, std::random_access_iterator_tag, int>()); +static_assert(testMutable()); +static_assert(testConst()); +static_assert(testConst()); +static_assert(testConst()); +static_assert(testMutable::iterator, std::random_access_iterator_tag, int>()); +static_assert(testConst::const_iterator, std::random_access_iterator_tag, int>()); + +static_assert(testMutable::iterator, std::random_access_iterator_tag, int>()); +static_assert(testConst::const_iterator, std::random_access_iterator_tag, int>()); +static_assert(testMutable::iterator, std::forward_iterator_tag, int>()); +static_assert(testConst::const_iterator, std::forward_iterator_tag, int>()); +static_assert(testMutable::iterator, std::bidirectional_iterator_tag, int>()); +static_assert(testConst::const_iterator, std::bidirectional_iterator_tag, int>()); + +static_assert(testMutable::iterator, std::bidirectional_iterator_tag, std::pair>()); +static_assert(testConst::const_iterator, std::bidirectional_iterator_tag, std::pair>()); +static_assert(testMutable::iterator, std::bidirectional_iterator_tag, std::pair>()); +static_assert(testConst::const_iterator, std::bidirectional_iterator_tag, std::pair>()); + +static_assert(testConst::iterator, std::bidirectional_iterator_tag, int>()); +static_assert(testConst::const_iterator, std::bidirectional_iterator_tag, int>()); +static_assert(testConst::iterator, std::bidirectional_iterator_tag, int>()); +static_assert(testConst::const_iterator, std::bidirectional_iterator_tag, int>()); + +#ifdef _MSVC_STL_VERSION 
+using unordered_iterator_category = std::bidirectional_iterator_tag; +#else // ^^^ MSVC STL / other vvv +using unordered_iterator_category = std::forward_iterator_tag; +#endif // _MSVC_STL_VERSION + +static_assert(testMutable::iterator, unordered_iterator_category, std::pair>()); +static_assert(testConst::const_iterator, unordered_iterator_category, std::pair>()); +static_assert(testMutable::local_iterator, unordered_iterator_category, std::pair>()); +static_assert(testConst::const_local_iterator, unordered_iterator_category, std::pair>()); +static_assert(testMutable::iterator, unordered_iterator_category, std::pair>()); +static_assert(testConst::const_iterator, unordered_iterator_category, std::pair>()); +static_assert(testMutable::local_iterator, unordered_iterator_category, std::pair>()); +static_assert(testConst::const_local_iterator, unordered_iterator_category, std::pair>()); + +static_assert(testConst::iterator, unordered_iterator_category, int>()); +static_assert(testConst::const_iterator, unordered_iterator_category, int>()); +static_assert(testConst::local_iterator, unordered_iterator_category, int>()); +static_assert(testConst::const_local_iterator, unordered_iterator_category, int>()); +static_assert(testConst::iterator, unordered_iterator_category, int>()); +static_assert(testConst::const_iterator, unordered_iterator_category, int>()); +static_assert(testConst::local_iterator, unordered_iterator_category, int>()); +static_assert(testConst::const_local_iterator, unordered_iterator_category, int>()); + +static_assert(testMutable, std::random_access_iterator_tag, int>()); static_assert(testIOIterator>, std::output_iterator_tag>()); static_assert(testIOIterator>, std::output_iterator_tag>()); static_assert(testIOIterator>, std::output_iterator_tag>()); -static_assert(testConstWithoutConcept, int, std::input_iterator_tag>()); +static_assert(testConst, std::input_iterator_tag, int>()); #if !defined(TEST_HAS_NO_LOCALIZATION) 
-static_assert(testWithoutConcept, char, long long, char, char*, std::input_iterator_tag>()); -static_assert(testWithoutConcept, int, std::ptrdiff_t, int&&, int*, std::random_access_iterator_tag>()); +// libc++-specific since pointer type is unspecified: +LIBCPP_STATIC_ASSERT(test, std::input_iterator_tag, char, long long, char, char*>()); +static_assert(test, std::random_access_iterator_tag, int, std::ptrdiff_t, int&&, int*>()); static_assert(testIOIterator, std::output_iterator_tag>()); static_assert(testIOIterator, std::output_iterator_tag>()); -static_assert(testConstWithoutConcept()); -static_assert(testConstWithoutConcept()); +static_assert(testConst()); +static_assert(testConst()); #endif // !TEST_HAS_NO_LOCALIZATION #ifndef TEST_HAS_NO_FILESYSTEM_LIBRARY -static_assert(testWithoutConcept()); -static_assert(testWithoutConcept()); +static_assert(test()); +static_assert(test()); #endif // Local test iterators. diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.move/iter_move.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.move/iter_move.pass.cpp index 39ecab6641e54..b53caead03f6e 100644 --- a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.move/iter_move.pass.cpp +++ b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.move/iter_move.pass.cpp @@ -51,8 +51,8 @@ class iterator_wrapper { I base_ = I{}; }; -template -constexpr void unqualified_lookup_move(I first_, I last_, I result_first_, I result_last_) { +template +constexpr void unqualified_lookup_move(It first_, It last_, Out result_first_, Out result_last_) { auto first = ::check_unqualified_lookup::unqualified_lookup_wrapper{std::move(first_)}; auto last = ::check_unqualified_lookup::unqualified_lookup_wrapper{std::move(last_)}; auto result_first = ::check_unqualified_lookup::unqualified_lookup_wrapper{std::move(result_first_)}; @@ -65,8 +65,8 @@ constexpr void 
unqualified_lookup_move(I first_, I last_, I result_first_, I res } } -template -constexpr void lvalue_move(I first_, I last_, I result_first_, I result_last_) { +template +constexpr void lvalue_move(It first_, It last_, Out result_first_, Out result_last_) { auto first = iterator_wrapper{std::move(first_)}; auto last = ::iterator_wrapper{std::move(last_)}; auto result_first = iterator_wrapper{std::move(result_first_)}; @@ -80,8 +80,8 @@ constexpr void lvalue_move(I first_, I last_, I result_first_, I result_last_) { } } -template -constexpr void rvalue_move(I first_, I last_, I result_first_, I result_last_) { +template +constexpr void rvalue_move(It first_, It last_, Out result_first_, Out result_last_) { auto first = iterator_wrapper{std::move(first_)}; auto last = iterator_wrapper{std::move(last_)}; auto result_first = iterator_wrapper{std::move(result_first_)}; diff --git a/libcxx/test/std/strings/string.view/string.view.cons/from_iterator_sentinel.pass.cpp b/libcxx/test/std/strings/string.view/string.view.cons/from_iterator_sentinel.pass.cpp index 931eac89aede3..4aae72d33cd8d 100644 --- a/libcxx/test/std/strings/string.view/string.view.cons/from_iterator_sentinel.pass.cpp +++ b/libcxx/test/std/strings/string.view/string.view.cons/from_iterator_sentinel.pass.cpp @@ -20,25 +20,33 @@ #include "make_string.h" #include "test_iterators.h" -template -constexpr void test() { - auto val = MAKE_STRING_VIEW(CharT, "test"); - auto sv = std::basic_string_view(val.begin(), Sentinel(val.end())); - ASSERT_SAME_TYPE(decltype(sv), std::basic_string_view); - assert(sv.size() == val.size()); +template +constexpr void test_construction(std::basic_string_view val) { + auto sv = std::basic_string_view(It(val.data()), Sentinel(It(val.data() + val.size()))); assert(sv.data() == val.data()); + assert(sv.size() == val.size()); +} + +template +constexpr void test_with_char() { + const auto val = MAKE_STRING_VIEW(CharT, "test"); + test_construction(val); + test_construction(val); + 
test_construction(val); + test_construction>(val); + test_construction, contiguous_iterator>(val); + test_construction, sized_sentinel>>(val); } constexpr bool test() { - test(); + test_with_char(); #ifndef TEST_HAS_NO_WIDE_CHARACTERS - test(); + test_with_char(); #endif - test(); - test(); - test(); - test(); - test>(); + test_with_char(); + test_with_char(); + test_with_char(); + return true; } @@ -54,7 +62,7 @@ template void test_throwing() { auto val = MAKE_STRING_VIEW(CharT, "test"); try { - (void)std::basic_string_view(val.begin(), ThrowingSentinel()); + (void)std::basic_string_view(val.data(), ThrowingSentinel()); assert(false); } catch (int i) { assert(i == 42); @@ -89,4 +97,3 @@ int main(int, char**) { return 0; } - diff --git a/libcxx/test/std/strings/string.view/string.view.deduct/iterator_sentinel.pass.cpp b/libcxx/test/std/strings/string.view/string.view.deduct/iterator_sentinel.pass.cpp index e0ff5f6c0cd53..e39917528cf79 100644 --- a/libcxx/test/std/strings/string.view/string.view.deduct/iterator_sentinel.pass.cpp +++ b/libcxx/test/std/strings/string.view/string.view.deduct/iterator_sentinel.pass.cpp @@ -20,25 +20,33 @@ #include "test_macros.h" #include "test_iterators.h" -template -constexpr void test() { - auto val = MAKE_STRING_VIEW(CharT, "test"); - auto sv = std::basic_string_view(val.begin(), Sentinel(val.end())); +template +constexpr void test_ctad(std::basic_string_view val) { + auto sv = std::basic_string_view(It(val.data()), Sentinel(It(val.data() + val.size()))); ASSERT_SAME_TYPE(decltype(sv), std::basic_string_view); - assert(sv.size() == val.size()); assert(sv.data() == val.data()); + assert(sv.size() == val.size()); +} + +template +constexpr void test_with_char() { + const auto val = MAKE_STRING_VIEW(CharT, "test"); + test_ctad(val); + test_ctad(val); + test_ctad(val); + test_ctad>(val); + test_ctad, contiguous_iterator>(val); + test_ctad, sized_sentinel>>(val); } constexpr void test() { - test(); + test_with_char(); #ifndef 
TEST_HAS_NO_WIDE_CHARACTERS - test(); + test_with_char(); #endif - test(); - test(); - test(); - test(); - test>(); + test_with_char(); + test_with_char(); + test_with_char(); } int main(int, char**) { @@ -46,4 +54,3 @@ int main(int, char**) { return 0; } - diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp index 7bb0d3e26610c..8a42a547f9452 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp @@ -26,10 +26,10 @@ constexpr void test(const CharT* fmt) { std::basic_format_parse_context context(fmt); context.advance_to(context.begin() + 1); - assert(context.begin() == &fmt[1]); + assert(std::to_address(context.begin()) == fmt + 1); context.advance_to(context.begin() + 1); - assert(context.begin() == &fmt[2]); + assert(std::to_address(context.begin()) == fmt + 2); context.advance_to(context.begin() + 1); assert(context.begin() == context.end()); @@ -39,10 +39,10 @@ constexpr void test(const CharT* fmt) { std::basic_format_parse_context context(view); context.advance_to(context.begin() + 1); - assert(std::to_address(context.begin()) == std::to_address(view.begin()) + 1); + assert(std::to_address(context.begin()) == fmt + 1); context.advance_to(context.begin() + 1); - assert(std::to_address(context.begin()) == std::to_address(view.begin()) + 2); + assert(std::to_address(context.begin()) == fmt + 2); context.advance_to(context.begin() + 1); assert(context.begin() == context.end()); diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp index fd209e611c6bc..d01f25625d582 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp +++ 
b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp @@ -24,7 +24,7 @@ template constexpr void test(const CharT* fmt) { { std::basic_format_parse_context context(fmt); - assert(context.begin() == &fmt[0]); + assert(std::to_address(context.begin()) == &fmt[0]); ASSERT_NOEXCEPT(context.begin()); } { diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp index 195f07742a016..771fc7131beff 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp @@ -47,8 +47,8 @@ constexpr void test(const CharT* fmt) { { std::basic_format_parse_context context(fmt); - assert(context.begin() == &fmt[0]); - assert(context.end() == &fmt[3]); + assert(std::to_address(context.begin()) == &fmt[0]); + assert(std::to_address(context.end()) == &fmt[3]); } { std::basic_string_view view{fmt}; diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp index 9a878ef42ba18..9e7ca7e391c48 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp @@ -24,7 +24,7 @@ template constexpr void test(const CharT* fmt) { { std::basic_format_parse_context context(fmt); - assert(context.end() == &fmt[3]); + assert(std::to_address(context.end()) == &fmt[3]); ASSERT_NOEXCEPT(context.end()); } { From e2855e17601e8a193bf07b0533be69dbf85b811c Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 21 Feb 2022 11:47:02 -0500 Subject: [PATCH 438/748] [Clang][OpenMP] Add Sema support for atomic compare capture This patch adds Sema support for `atomic compare capture`. 
Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D120200 --- .../clang/Basic/DiagnosticSemaKinds.td | 8 +- clang/lib/Sema/SemaOpenMP.cpp | 495 ++++++++++++++++-- clang/test/OpenMP/atomic_messages.c | 195 +++++++ clang/test/OpenMP/atomic_messages.cpp | 4 +- 4 files changed, 667 insertions(+), 35 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0d301e76c92d7..1854c8e522b82 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10523,10 +10523,16 @@ def err_omp_atomic_compare : Error< " '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}'," " 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type," " and 'ordop' is one of '<' or '>'.">; +def err_omp_atomic_compare_capture : Error< + "the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}'," + " '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}'," + " 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x', 'r', and 'v' are lvalue expressions with scalar type, 'expr', 'e', and 'd' are expressions with scalar type," + " and 'ordop' is one of '<' or '>'.">; def note_omp_atomic_compare: Note< "%select{expected compound statement|expected exactly one expression statement|expected assignment statement|expected conditional operator|expect result value to be at false expression|" "expect binary operator in conditional expression|expect '<', '>' or '==' as order operator|expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 
'expr ordop x'|" - "expect lvalue for result value|expect scalar value|expect integer value|unexpected 'else' statement}0">; + "expect lvalue for result value|expect scalar value|expect integer value|unexpected 'else' statement|expect '==' operator|expect an assignment statement 'v = x'|" + "expect a 'if' statement|expect no more than two statements|expect a compound statement|expect 'else' statement|expect a form 'r = x == e; if (r) ...'}0">; def err_omp_atomic_several_clauses : Error< "directive '#pragma omp atomic' cannot contain more than one 'read', 'write', 'update', 'capture', or 'compare' clause">; def err_omp_several_mem_order_clauses : Error< diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index c32609e4e32e3..43386c1ef8edb 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -10976,6 +10976,20 @@ class OpenMPAtomicCompareChecker { NotInteger, /// 'else' statement is not expected. UnexpectedElse, + /// Not an equality operator. + NotEQ, + /// Invalid assignment (not v == x). + InvalidAssignment, + /// Not if statement + NotIfStmt, + /// More than two statements in a compund statement. + MoreThanTwoStmts, + /// Not a compound statement. + NotCompoundStmt, + /// No else statement. + NoElse, + /// Not 'if (r)'. + InvalidCondition, /// No error. NoError, }; @@ -10999,7 +11013,7 @@ class OpenMPAtomicCompareChecker { Expr *getCond() const { return C; } bool isXBinopExpr() const { return IsXBinopExpr; } -private: +protected: /// Reference to ASTContext ASTContext &ContextRef; /// 'x' lvalue part of the source atomic expression. @@ -11026,6 +11040,35 @@ class OpenMPAtomicCompareChecker { /// Check if all captured values have right type. 
bool checkType(ErrorInfoTy &ErrorInfo) const; + + static bool CheckValue(const Expr *E, ErrorInfoTy &ErrorInfo, + bool ShouldBeLValue) { + if (ShouldBeLValue && !E->isLValue()) { + ErrorInfo.Error = ErrorTy::XNotLValue; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = E->getSourceRange(); + return false; + } + + if (!E->isInstantiationDependent()) { + QualType QTy = E->getType(); + if (!QTy->isScalarType()) { + ErrorInfo.Error = ErrorTy::NotScalar; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = E->getSourceRange(); + return false; + } + + if (!QTy->isIntegerType()) { + ErrorInfo.Error = ErrorTy::NotInteger; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = E->getSourceRange(); + return false; + } + } + + return true; + } }; bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, @@ -11215,41 +11258,13 @@ bool OpenMPAtomicCompareChecker::checkType(ErrorInfoTy &ErrorInfo) const { // 'x' and 'e' cannot be nullptr assert(X && E && "X and E cannot be nullptr"); - auto CheckValue = [&ErrorInfo](const Expr *E, bool ShouldBeLValue) { - if (ShouldBeLValue && !E->isLValue()) { - ErrorInfo.Error = ErrorTy::XNotLValue; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = E->getSourceRange(); - return false; - } - - if (!E->isInstantiationDependent()) { - QualType QTy = E->getType(); - if (!QTy->isScalarType()) { - ErrorInfo.Error = ErrorTy::NotScalar; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = E->getSourceRange(); - return false; - } - - if (!QTy->isIntegerType()) { - ErrorInfo.Error = ErrorTy::NotInteger; - ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = E->getExprLoc(); - ErrorInfo.ErrorRange = ErrorInfo.NoteRange = E->getSourceRange(); - return false; - } - } - - return true; - }; - - 
if (!CheckValue(X, true)) + if (!CheckValue(X, ErrorInfo, true)) return false; - if (!CheckValue(E, false)) + if (!CheckValue(E, ErrorInfo, false)) return false; - if (D && !CheckValue(D, false)) + if (D && !CheckValue(D, ErrorInfo, false)) return false; return true; @@ -11297,6 +11312,413 @@ bool OpenMPAtomicCompareChecker::checkStmt( return checkType(ErrorInfo); } + +class OpenMPAtomicCompareCaptureChecker final + : public OpenMPAtomicCompareChecker { +public: + OpenMPAtomicCompareCaptureChecker(Sema &S) : OpenMPAtomicCompareChecker(S) {} + + Expr *getV() const { return V; } + Expr *getR() const { return R; } + bool isFailOnly() const { return IsFailOnly; } + + /// Check if statement \a S is valid for atomic compare capture. + bool checkStmt(Stmt *S, ErrorInfoTy &ErrorInfo); + +private: + bool checkType(ErrorInfoTy &ErrorInfo); + + // NOTE: Form 3, 4, 5 in the following comments mean the 3rd, 4th, and 5th + // form of 'conditional-update-capture-atomic' structured block on the v5.2 + // spec p.p. 82: + // (1) { v = x; cond-update-stmt } + // (2) { cond-update-stmt v = x; } + // (3) if(x == e) { x = d; } else { v = x; } + // (4) { r = x == e; if(r) { x = d; } } + // (5) { r = x == e; if(r) { x = d; } else { v = x; } } + + /// Check if it is valid 'if(x == e) { x = d; } else { v = x; }' (form 3) + bool checkForm3(IfStmt *S, ErrorInfoTy &ErrorInfo); + + /// Check if it is valid '{ r = x == e; if(r) { x = d; } }', + /// or '{ r = x == e; if(r) { x = d; } else { v = x; } }' (form 4 and 5) + bool checkForm45(Stmt *S, ErrorInfoTy &ErrorInfo); + + /// 'v' lvalue part of the source atomic expression. + Expr *V = nullptr; + /// 'r' lvalue part of the source atomic expression. + Expr *R = nullptr; + /// If 'v' is only updated when the comparison fails. 
+ bool IsFailOnly = false; +}; + +bool OpenMPAtomicCompareCaptureChecker::checkType(ErrorInfoTy &ErrorInfo) { + if (!OpenMPAtomicCompareChecker::checkType(ErrorInfo)) + return false; + + if (V && !CheckValue(V, ErrorInfo, true)) + return false; + + if (R && !CheckValue(R, ErrorInfo, true)) + return false; + + return true; +} + +bool OpenMPAtomicCompareCaptureChecker::checkForm3(IfStmt *S, + ErrorInfoTy &ErrorInfo) { + IsFailOnly = true; + + auto *Then = S->getThen(); + if (auto *CS = dyn_cast(Then)) { + if (CS->body_empty()) { + ErrorInfo.Error = ErrorTy::NoStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = CS->getSourceRange(); + return false; + } + if (CS->size() > 1) { + ErrorInfo.Error = ErrorTy::MoreThanOneStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = CS->getSourceRange(); + return false; + } + Then = CS->body_front(); + } + + auto *BO = dyn_cast(Then); + if (!BO) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Then->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Then->getSourceRange(); + return false; + } + if (BO->getOpcode() != BO_Assign) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = BO->getExprLoc(); + ErrorInfo.NoteLoc = BO->getOperatorLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = BO->getSourceRange(); + return false; + } + + X = BO->getLHS(); + D = BO->getRHS(); + + auto *Cond = dyn_cast(S->getCond()); + if (!Cond) { + ErrorInfo.Error = ErrorTy::NotABinaryOp; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getCond()->getSourceRange(); + return false; + } + if (Cond->getOpcode() != BO_EQ) { + ErrorInfo.Error = ErrorTy::NotEQ; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + 
return false; + } + + if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { + E = Cond->getRHS(); + } else if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { + E = Cond->getLHS(); + } else { + ErrorInfo.Error = ErrorTy::InvalidComparison; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Cond->getSourceRange(); + return false; + } + + C = Cond; + + if (!S->getElse()) { + ErrorInfo.Error = ErrorTy::NoElse; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getSourceRange(); + return false; + } + + auto *Else = S->getElse(); + if (auto *CS = dyn_cast(Else)) { + if (CS->body_empty()) { + ErrorInfo.Error = ErrorTy::NoStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = CS->getSourceRange(); + return false; + } + if (CS->size() > 1) { + ErrorInfo.Error = ErrorTy::MoreThanOneStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getSourceRange(); + return false; + } + Else = CS->body_front(); + } + + auto *ElseBO = dyn_cast(Else); + if (!ElseBO) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Else->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Else->getSourceRange(); + return false; + } + if (ElseBO->getOpcode() != BO_Assign) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ElseBO->getExprLoc(); + ErrorInfo.NoteLoc = ElseBO->getOperatorLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = ElseBO->getSourceRange(); + return false; + } + + if (!checkIfTwoExprsAreSame(ContextRef, X, ElseBO->getRHS())) { + ErrorInfo.Error = ErrorTy::InvalidAssignment; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = ElseBO->getRHS()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = + ElseBO->getRHS()->getSourceRange(); + return false; + } + + V = 
ElseBO->getLHS(); + + return checkType(ErrorInfo); +} + +bool OpenMPAtomicCompareCaptureChecker::checkForm45(Stmt *S, + ErrorInfoTy &ErrorInfo) { + // We don't check here as they should be already done before call this + // function. + auto *CS = cast(S); + assert(CS->size() == 2 && "CompoundStmt size is not expected"); + auto *S1 = cast(CS->body_front()); + auto *S2 = cast(CS->body_back()); + assert(S1->getOpcode() == BO_Assign && "unexpected binary operator"); + + if (!checkIfTwoExprsAreSame(ContextRef, S1->getLHS(), S2->getCond())) { + ErrorInfo.Error = ErrorTy::InvalidCondition; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S2->getCond()->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S1->getLHS()->getSourceRange(); + return false; + } + + R = S1->getLHS(); + + auto *Then = S2->getThen(); + if (auto *ThenCS = dyn_cast(Then)) { + if (ThenCS->body_empty()) { + ErrorInfo.Error = ErrorTy::NoStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = ThenCS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = ThenCS->getSourceRange(); + return false; + } + if (ThenCS->size() > 1) { + ErrorInfo.Error = ErrorTy::MoreThanOneStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = ThenCS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = ThenCS->getSourceRange(); + return false; + } + Then = ThenCS->body_front(); + } + + auto *ThenBO = dyn_cast(Then); + if (!ThenBO) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S2->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S2->getSourceRange(); + return false; + } + if (ThenBO->getOpcode() != BO_Assign) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ThenBO->getExprLoc(); + ErrorInfo.NoteLoc = ThenBO->getOperatorLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = ThenBO->getSourceRange(); + return false; + } + + X = ThenBO->getLHS(); + D = ThenBO->getRHS(); + + auto *BO = cast(S1->getRHS()->IgnoreImpCasts()); + if (BO->getOpcode() != 
BO_EQ) { + ErrorInfo.Error = ErrorTy::NotEQ; + ErrorInfo.ErrorLoc = BO->getExprLoc(); + ErrorInfo.NoteLoc = BO->getOperatorLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = BO->getSourceRange(); + return false; + } + + C = BO; + + if (checkIfTwoExprsAreSame(ContextRef, X, BO->getLHS())) { + E = BO->getRHS(); + } else if (checkIfTwoExprsAreSame(ContextRef, X, BO->getRHS())) { + E = BO->getLHS(); + } else { + ErrorInfo.Error = ErrorTy::InvalidComparison; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = BO->getExprLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = BO->getSourceRange(); + return false; + } + + if (S2->getElse()) { + IsFailOnly = true; + + auto *Else = S2->getElse(); + if (auto *ElseCS = dyn_cast(Else)) { + if (ElseCS->body_empty()) { + ErrorInfo.Error = ErrorTy::NoStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = ElseCS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = ElseCS->getSourceRange(); + return false; + } + if (ElseCS->size() > 1) { + ErrorInfo.Error = ErrorTy::MoreThanOneStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = ElseCS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = ElseCS->getSourceRange(); + return false; + } + Else = ElseCS->body_front(); + } + + auto *ElseBO = dyn_cast(Else); + if (!ElseBO) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Else->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = Else->getSourceRange(); + return false; + } + if (ElseBO->getOpcode() != BO_Assign) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ElseBO->getExprLoc(); + ErrorInfo.NoteLoc = ElseBO->getOperatorLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = ElseBO->getSourceRange(); + return false; + } + if (!checkIfTwoExprsAreSame(ContextRef, X, ElseBO->getRHS())) { + ErrorInfo.Error = ErrorTy::InvalidAssignment; + ErrorInfo.ErrorLoc = ElseBO->getRHS()->getExprLoc(); + ErrorInfo.NoteLoc = X->getExprLoc(); + ErrorInfo.ErrorRange = 
ElseBO->getRHS()->getSourceRange(); + ErrorInfo.NoteRange = X->getSourceRange(); + return false; + } + + V = ElseBO->getLHS(); + } + + return checkType(ErrorInfo); +} + +bool OpenMPAtomicCompareCaptureChecker::checkStmt(Stmt *S, + ErrorInfoTy &ErrorInfo) { + // if(x == e) { x = d; } else { v = x; } + if (auto *IS = dyn_cast(S)) + return checkForm3(IS, ErrorInfo); + + auto *CS = dyn_cast(S); + if (!CS) { + ErrorInfo.Error = ErrorTy::NotCompoundStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = S->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = S->getSourceRange(); + return false; + } + if (CS->body_empty()) { + ErrorInfo.Error = ErrorTy::NoStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = CS->getSourceRange(); + return false; + } + + // { if(x == e) { x = d; } else { v = x; } } + if (CS->size() == 1) { + auto *IS = dyn_cast(CS->body_front()); + if (!IS) { + ErrorInfo.Error = ErrorTy::NotIfStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CS->body_front()->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = + CS->body_front()->getSourceRange(); + return false; + } + + return checkForm3(IS, ErrorInfo); + } else if (CS->size() == 2) { + auto *S1 = CS->body_front(); + auto *S2 = CS->body_back(); + + Stmt *UpdateStmt = nullptr; + Stmt *CondUpdateStmt = nullptr; + + if (auto *BO = dyn_cast(S1)) { + // { v = x; cond-update-stmt } or form 45. + UpdateStmt = S1; + CondUpdateStmt = S2; + // Check if form 45. 
+ if (dyn_cast(BO->getRHS()->IgnoreImpCasts()) && + dyn_cast(S2)) + return checkForm45(CS, ErrorInfo); + } else { + // { cond-update-stmt v = x; } + UpdateStmt = S2; + CondUpdateStmt = S1; + } + + auto CheckCondUpdateStmt = [this, &ErrorInfo](Stmt *CUS) { + auto *IS = dyn_cast(CUS); + if (!IS) { + ErrorInfo.Error = ErrorTy::NotIfStmt; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CUS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = CUS->getSourceRange(); + return false; + } + + if (!checkCondUpdateStmt(IS, ErrorInfo)) + return false; + + return true; + }; + + // CheckUpdateStmt has to be called *after* CheckCondUpdateStmt. + auto CheckUpdateStmt = [this, &ErrorInfo](Stmt *US) { + auto *BO = dyn_cast(US); + if (!BO) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = US->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = US->getSourceRange(); + return false; + } + if (BO->getOpcode() != BO_Assign) { + ErrorInfo.Error = ErrorTy::NotAnAssignment; + ErrorInfo.ErrorLoc = BO->getExprLoc(); + ErrorInfo.NoteLoc = BO->getOperatorLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = BO->getSourceRange(); + return false; + } + if (!checkIfTwoExprsAreSame(ContextRef, this->X, BO->getRHS())) { + ErrorInfo.Error = ErrorTy::InvalidAssignment; + ErrorInfo.ErrorLoc = BO->getRHS()->getExprLoc(); + ErrorInfo.NoteLoc = this->X->getExprLoc(); + ErrorInfo.ErrorRange = BO->getRHS()->getSourceRange(); + ErrorInfo.NoteRange = this->X->getSourceRange(); + return false; + } + + this->V = BO->getLHS(); + + return true; + }; + + if (!CheckCondUpdateStmt(CondUpdateStmt)) + return false; + if (!CheckUpdateStmt(UpdateStmt)) + return false; + } else { + ErrorInfo.Error = ErrorTy::MoreThanTwoStmts; + ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = CS->getBeginLoc(); + ErrorInfo.ErrorRange = ErrorInfo.NoteRange = CS->getSourceRange(); + return false; + } + + return checkType(ErrorInfo); +} } // namespace StmtResult 
Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, @@ -11794,6 +12216,15 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, UE = V = E = X = nullptr; } else if (AtomicKind == OMPC_compare) { if (IsCompareCapture) { + OpenMPAtomicCompareCaptureChecker::ErrorInfoTy ErrorInfo; + OpenMPAtomicCompareCaptureChecker Checker(*this); + if (!Checker.checkStmt(Body, ErrorInfo)) { + Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare_capture) + << ErrorInfo.ErrorRange; + Diag(ErrorInfo.NoteLoc, diag::note_omp_atomic_compare) + << ErrorInfo.Error << ErrorInfo.NoteRange; + return StmtError(); + } // TODO: We don't set X, D, E, etc. here because in code gen we will emit // error directly. } else { diff --git a/clang/test/OpenMP/atomic_messages.c b/clang/test/OpenMP/atomic_messages.c index c66cd19b5aca3..4066d7518c628 100644 --- a/clang/test/OpenMP/atomic_messages.c +++ b/clang/test/OpenMP/atomic_messages.c @@ -500,4 +500,199 @@ void compare(void) { fx = fe; } } + +void compare_capture(void) { + int x = 0; + int d = 0; + int e = 0; + int v = 0; + int r = 0; +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected compound statement}} +#pragma omp atomic compare capture + if (x == e) {} +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x 
== e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected exactly one expression statement}} +#pragma omp atomic compare capture + if (x == e) { + x = d; + v = x; + } +// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+3 {{expected assignment statement}} +#pragma omp atomic compare capture + if (x == e) { + bbar(); + } +// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+3 {{expected assignment statement}} +#pragma omp atomic compare capture + if (x == e) { + x += d; + } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement 
of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect binary operator in conditional expression}} +#pragma omp atomic compare capture + if (ffoo()) { + x = d; + } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect '==' operator}} +#pragma omp atomic compare capture + if (x > e) { + x = d; + } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop 
expr', or 'expr ordop x'}} +#pragma omp atomic compare capture + if (d == e) { + x = d; + } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect 'else' statement}} +#pragma omp atomic compare capture + if (x == e) { + x = d; + } +// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+4 {{expected compound statement}} +#pragma omp atomic compare capture + if (x == e) { + x = d; + } else { + } +// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar 
type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+4 {{expected exactly one expression statement}} +#pragma omp atomic compare capture + if (x == e) { + x = d; + } else { + v = x; + d = e; + } +// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+5 {{expected assignment statement}} +#pragma omp atomic compare capture + if (x == e) { + x = d; + } else { + bbar(); + } +// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+5 {{expected assignment statement}} +#pragma omp atomic compare capture + if (x == e) { + x = d; + } else { + v += x; + } +// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 
'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+5 {{expect an assignment statement 'v = x'}} +#pragma omp atomic compare capture + if (x == e) { + x = d; + } else { + v = d; + } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected compound statement}} +#pragma omp atomic compare capture + {} +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect a compound statement}} +#pragma omp atomic compare capture + x = x > e ? 
e : x; +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect a 'if' statement}} +#pragma omp atomic compare capture + { x = x > e ? e : x; } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect a form 'r = x == e; if (r) ...'}} +#pragma omp atomic compare capture + { r = x == e; if (x == d) { x = e; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is 
one of '<' or '>'.}} +// omp51-note@+2 {{expected assignment statement}} +#pragma omp atomic compare capture + { r = x == e; if (r) { bbar(); } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected assignment statement}} +#pragma omp atomic compare capture + { r = x == e; if (r) { x += d; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected compound statement}} +#pragma omp atomic compare capture + { r = x == e; if (r) {} } +// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) 
{x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+4 {{expected exactly one expression statement}} +#pragma omp atomic compare capture + { + r = x == e; + if (r) { + x = d; + v = x; + } + } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect '==' operator}} +#pragma omp atomic compare capture + { r = x > e; if (r) { x = d; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}} +#pragma omp atomic compare capture + { r = d == e; if (r) { x = d; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = 
d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected compound statement}} +#pragma omp atomic compare capture + { r = x == e; if (r) { x = d; } else {} } +// omp51-error@+7 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+6 {{expected exactly one expression statement}} +#pragma omp atomic compare capture + { + r = x == e; + if (r) { + x = d; + } else { + v = x; + d = e; + } + } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected assignment statement}} +#pragma omp atomic compare capture + { r = x == e; if (r) { x = d; } else { bbar(); } } 
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected assignment statement}} +#pragma omp atomic compare capture + { r = x == e; if (r) { x = d; } else { v += x; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect an assignment statement 'v = x'}} +#pragma omp atomic compare capture + { r = x == e; if (r) { x = d; } else { v = d; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are 
expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected assignment statement}} +#pragma omp atomic compare capture + { v += x; if (x == e) { x = d; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expected assignment statement}} +#pragma omp atomic compare capture + { if (x == e) { x = d; } v += x; } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect an assignment statement 'v = x'}} +#pragma omp atomic compare capture + { v = d; if (x == e) { x = d; } } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x 
ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect an assignment statement 'v = x'}} +#pragma omp atomic compare capture + { if (x == e) { x = d; } v = d; } +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect a 'if' statement}} +#pragma omp atomic compare capture + { v = x; bbar(); } + + float fv; +// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}} +// omp51-note@+2 {{expect integer value}} +#pragma omp atomic compare capture + { fv = x; if (x == e) { x = d; } } +} #endif diff --git a/clang/test/OpenMP/atomic_messages.cpp b/clang/test/OpenMP/atomic_messages.cpp index 700c86da1a588..23fd24bfcf118 100644 --- a/clang/test/OpenMP/atomic_messages.cpp +++ b/clang/test/OpenMP/atomic_messages.cpp @@ 
-928,7 +928,7 @@ T mixed() { } int mixed() { - int a, b = 0; + int a, v, b = 0; // expected-error@+2 {{directive '#pragma omp atomic' cannot contain more than one 'read', 'write', 'update', 'capture', or 'compare' clause}} // expected-note@+1 {{'read' clause used here}} #pragma omp atomic read write @@ -957,7 +957,7 @@ int mixed() { // expected-error@+2 {{directive '#pragma omp atomic' cannot contain more than one 'compare' clause}} // expected-error@+1 {{directive '#pragma omp atomic' cannot contain more than one 'capture' clause}} #pragma omp atomic compare compare capture capture - a = b; + { v = a; if (a > b) a = b; } #endif // expected-note@+1 {{in instantiation of function template specialization 'mixed' requested here}} return mixed(); From 7a837d38bdff1f40317e1c21b4fa65f14b1d2822 Mon Sep 17 00:00:00 2001 From: Kristof Beyls Date: Wed, 16 Feb 2022 14:53:18 +0100 Subject: [PATCH 439/748] Create office hours documentation. Differential Revision: https://reviews.llvm.org/D120036 --- llvm/docs/GettingInvolved.rst | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index a1a047d34c90a..5d0ed6299f0b4 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -223,6 +223,37 @@ writing, the following sync-ups are organized: `gcal `__ - `Minutes/docs `__ + +Office hours +------------ + +A number of experienced LLVM contributors make themselves available for a chat +on a regular schedule, to anyone who is looking for some guidance. Please find +the list of who is available when, through which medium, and what their area of +expertise is. Don't be shy to dial in! + +Of course, people take time off from time to time, so if you dial in and you +don't find anyone present, chances are they happen to be off that day. + +.. list-table:: LLVM office hours + :widths: 15 40 15 15 15 + :header-rows: 1 + + * - Name + - In-scope topics + - When? + - Where? 
+ - Languages + * - Kristof Beyls + - General questions on how to contribute to LLVM; organizing meetups; + submitting talks; and other general LLVM-related topics. Arm/AArch64 + codegen. + - Every 2nd and 4th Wednesday of the month at 9.30am CET, for 30 minutes. + `ics `__ + - `Jitsi `__ + - English, Flemish, Dutch + + IRC --- From 90d240553d1fe5b97c3481d564c78d16e23d9236 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 21 Feb 2022 11:52:57 -0800 Subject: [PATCH 440/748] [RISCV] Teach shouldSinkOperands to sink splat operands of vp.fma intrinsics. Reviewed By: frasercrmck Differential Revision: https://reviews.llvm.org/D120167 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1 + .../CodeGen/RISCV/rvv/sink-splat-operands.ll | 90 +++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ca11d0d431ffe..4395c139b7220 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1287,6 +1287,7 @@ bool RISCVTargetLowering::shouldSinkOperands( if (auto *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { case Intrinsic::fma: + case Intrinsic::vp_fma: return Operand == 0 || Operand == 1; // FIXME: Our patterns can only match vx/vf instructions when the splat // it on the RHS, because TableGen doesn't recognize our VP operations diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 5bfa79e02e437..dd318c9792465 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -3726,3 +3726,93 @@ vector.body: ; preds = %vector.body, %entry for.cond.cleanup: ; preds = %vector.body ret void } + +declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fma(float* noalias nocapture %a, float* nocapture readonly %b, float %x, <4 x 
i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fma: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.w.x ft0, a2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB65_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsetvli zero, a3, e32, m1, tu, mu +; CHECK-NEXT: vfmadd.vf v8, ft0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, -4 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a2, .LBB65_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = getelementptr inbounds float, float* %b, i64 %index + %3 = bitcast float* %2 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %3, align 4 + %4 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + %5 = bitcast float* %0 to <4 x float>* + store <4 x float> %4, <4 x float>* %5, align 4 + %index.next = add nuw i64 %index, 4 + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_vp_fma_commute(float* noalias nocapture %a, float* nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fma_commute: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: fmv.w.x ft0, a2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB66_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsetvli zero, a3, e32, m1, tu, mu +; CHECK-NEXT: vfmadd.vf v8, ft0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a2, a2, -4 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a2, .LBB66_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = getelementptr inbounds float, float* %b, i64 %index + %3 = bitcast float* %2 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %3, align 4 + %4 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + %5 = bitcast float* %0 to <4 x float>* + store <4 x float> %4, <4 x float>* %5, align 4 + %index.next = add nuw i64 %index, 4 + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} From f9c3310d32c62b28c10084a0104563aeeecc06ec Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 26 Mar 2021 12:43:25 -0700 Subject: [PATCH 441/748] [OPENMP]Fix PR49366: crash on VLAs in task untied regions. 
We need to capture the local variables into a record in task untied regions but clang does not support record with VLA data members. Differential Revision: https://reviews.llvm.org/D99436 --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 ++ clang/include/clang/Sema/Sema.h | 4 ++++ clang/lib/Sema/SemaOpenMP.cpp | 16 ++++++++++++++++ clang/lib/Sema/SemaType.cpp | 3 +++ clang/test/OpenMP/task_messages.cpp | 8 +++++++- clang/test/OpenMP/taskloop_loop_messages.cpp | 9 ++++++++- 6 files changed, 40 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 1854c8e522b82..11fcd5ff5a323 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10893,6 +10893,8 @@ def err_omp_clause_requires_dispatch_construct : Error< "'%0' clause requires 'dispatch' context selector">; def err_omp_append_args_with_varargs : Error< "'append_args' is not allowed with varargs functions">; +def err_openmp_vla_in_task_untied : Error< + "variable length arrays are not supported in OpenMP tasking regions with 'untied' clause">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index dfa12ad40b72a..2d47a20711817 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10688,6 +10688,10 @@ class Sema final { void finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, const FunctionDecl *Callee, SourceLocation Loc); + + /// Return true if currently in OpenMP task with untied clause context. + bool isInOpenMPTaskUntiedContext() const; + /// Return true inside OpenMP declare target region. 
bool isInOpenMPDeclareTargetContext() const { return !DeclareTargetNesting.empty(); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 43386c1ef8edb..ad8d304ef43c3 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -176,6 +176,7 @@ class DSAStackTy { bool HasMutipleLoops = false; const Decl *PossiblyLoopCounter = nullptr; bool NowaitRegion = false; + bool UntiedRegion = false; bool CancelRegion = false; bool LoopStart = false; bool BodyComplete = false; @@ -851,6 +852,15 @@ class DSAStackTy { return Parent->NowaitRegion; return false; } + /// Marks current region as untied (it has a 'untied' clause). + void setUntiedRegion(bool IsUntied = true) { + getTopOfStack().UntiedRegion = IsUntied; + } + /// Return true if current region is untied. + bool isUntiedRegion() const { + const SharingMapTy *Top = getTopOfStackOrNull(); + return Top ? Top->UntiedRegion : false; + } /// Marks parent region as cancel region. void setParentCancelRegion(bool Cancel = true) { if (SharingMapTy *Parent = getSecondOnStackOrNull()) @@ -2158,6 +2168,11 @@ unsigned Sema::getOpenMPNestingLevel() const { return DSAStack->getNestingLevel(); } +bool Sema::isInOpenMPTaskUntiedContext() const { + return isOpenMPTaskingDirective(DSAStack->getCurrentDirective()) && + DSAStack->isUntiedRegion(); +} + bool Sema::isInOpenMPTargetExecutionDirective() const { return (isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) && !DSAStack->isClauseParsingMode()) || @@ -16232,6 +16247,7 @@ OMPClause *Sema::ActOnOpenMPNowaitClause(SourceLocation StartLoc, OMPClause *Sema::ActOnOpenMPUntiedClause(SourceLocation StartLoc, SourceLocation EndLoc) { + DSAStack->setUntiedRegion(); return new (Context) OMPUntiedClause(StartLoc, EndLoc); } diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 74969749e54ae..35d4c386211e5 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2458,6 +2458,9 @@ QualType 
Sema::BuildArrayType(QualType T, ArrayType::ArraySizeModifier ASM, } else if (isSFINAEContext()) { VLADiag = diag::err_vla_in_sfinae; VLAIsError = true; + } else if (getLangOpts().OpenMP && isInOpenMPTaskUntiedContext()) { + VLADiag = diag::err_openmp_vla_in_task_untied; + VLAIsError = true; } else { VLADiag = diag::ext_vla; VLAIsError = false; diff --git a/clang/test/OpenMP/task_messages.cpp b/clang/test/OpenMP/task_messages.cpp index 13cbfb6c45693..86a3f0d481316 100644 --- a/clang/test/OpenMP/task_messages.cpp +++ b/clang/test/OpenMP/task_messages.cpp @@ -173,7 +173,7 @@ int main(int argc, char **argv) { int &b = a; S sa; S &sb = sa; - int r; + int r; // expected-note {{declared here}} #pragma omp task { // expected-warning {{extra tokens at the end of '#pragma omp task' are ignored}} foo(); #pragma omp task( // expected-warning {{extra tokens at the end of '#pragma omp task' are ignored}} @@ -330,6 +330,12 @@ int main(int argc, char **argv) { // expected-error@+1 {{directive '#pragma omp task' cannot contain more than one 'mergeable' clause}} #pragma omp task mergeable mergeable ++r; +// expected-error@+4 {{variable length arrays are not supported in OpenMP tasking regions with 'untied' clause}} +// expected-note@+3 {{read of non-const variable 'r' is not allowed in a constant expression}} +#pragma omp task untied + { + int array[r]; + } volatile omp_event_handle_t evt; omp_event_handle_t sevt; const omp_event_handle_t cevt = evt; diff --git a/clang/test/OpenMP/taskloop_loop_messages.cpp b/clang/test/OpenMP/taskloop_loop_messages.cpp index b3b24e96abc9d..677bb3c012fae 100644 --- a/clang/test/OpenMP/taskloop_loop_messages.cpp +++ b/clang/test/OpenMP/taskloop_loop_messages.cpp @@ -691,7 +691,7 @@ void test_loop_break() { void test_loop_eh() { const int N = 100; - float a[N], b[N], c[N]; + float a[N], b[N], c[N]; // expected-note {{declared here}} #pragma omp parallel #pragma omp taskloop for (int i = 0; i < 10; i++) { @@ -729,6 +729,13 @@ void test_loop_eh() { 
void g() { throw 0; } }; } +// expected-error@+5 {{variable length arrays are not supported in OpenMP tasking regions with 'untied' clause}} +// expected-note@+4 {{read of non-constexpr variable 'c' is not allowed in a constant expression}} +#pragma omp taskloop untied + { + for (int i = 0; i < 10; ++i) + int array[(int)c[0]]; + } } void test_loop_firstprivate_lastprivate() { From 3fa2e66c10aadac1d209afadba34d90c9bd95221 Mon Sep 17 00:00:00 2001 From: George Koehler Date: Mon, 21 Feb 2022 15:15:53 -0500 Subject: [PATCH 442/748] [libunwind] Further fix for 32-bit PowerPC processors without AltiVec https://reviews.llvm.org/D91906 did most of the work necessary to fix libunwind on 32-bit PowerPC processors without AltiVec, but there was one more piece necessary. Reviewed By: luporl Differential Revision: https://reviews.llvm.org/D120197 --- libunwind/src/UnwindRegistersSave.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S index 9566bb0335fee..b39489235ce63 100644 --- a/libunwind/src/UnwindRegistersSave.S +++ b/libunwind/src/UnwindRegistersSave.S @@ -603,9 +603,11 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) stw 30,128(3) stw 31,132(3) +#if defined(__ALTIVEC__) // save VRSave register mfspr 0, 256 stw 0, 156(3) +#endif // save CR registers mfcr 0 stw 0, 136(3) From df0c16ce00629dd55d85ece34403535c39c1da9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Mon, 21 Feb 2022 20:27:47 +0100 Subject: [PATCH 443/748] [NFC][DAGCombine] Use isOperandOf() in combineCarryDiamond Pre-commit for https://reviews.llvm.org/D118362. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3196ba9f7689d..6c5286277ba9d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3146,12 +3146,12 @@ static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI, if (Opcode != ISD::UADDO && Opcode != ISD::USUBO) return SDValue(); - // Canonicalize the add/sub of A and B as Carry0 and the add/sub of the - // carry/borrow in as Carry1. (The top and middle uaddo nodes respectively in - // the above ASCII art.) - if (Carry1.getOperand(0) != Carry0.getValue(0) && - Carry1.getOperand(1) != Carry0.getValue(0)) + // Canonicalize the add/sub of A and B (the top node in the above ASCII art) + // as Carry0 and the add/sub of the carry in as Carry1 (the middle node). + if (Carry1.getNode()->isOperandOf(Carry0.getNode())) std::swap(Carry0, Carry1); + + // Check if nodes are connected in expected way. if (Carry1.getOperand(0) != Carry0.getValue(0) && Carry1.getOperand(1) != Carry0.getValue(0)) return SDValue(); From 9f8cb68570d886025df36445ae04d4e16e32a128 Mon Sep 17 00:00:00 2001 From: Arjun P Date: Mon, 21 Feb 2022 19:04:12 +0000 Subject: [PATCH 444/748] [MLIR][Presburger] Support finding integer lexmin in IntegerPolyhedron Note: this does not yet support PresburgerSets. 
Reviewed By: Groverkss Differential Revision: https://reviews.llvm.org/D120239 --- .../mlir/Analysis/Presburger/Fraction.h | 9 ++- .../Analysis/Presburger/IntegerPolyhedron.h | 7 ++ .../mlir/Analysis/Presburger/Simplex.h | 23 +++++++ .../Analysis/Presburger/IntegerPolyhedron.cpp | 20 ++++++ mlir/lib/Analysis/Presburger/Simplex.cpp | 69 +++++++++++++++++-- .../Presburger/IntegerPolyhedronTest.cpp | 62 +++++++++++++++-- 6 files changed, 176 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/Analysis/Presburger/Fraction.h b/mlir/include/mlir/Analysis/Presburger/Fraction.h index 0f8ee6e01636b..c1ff333f6e441 100644 --- a/mlir/include/mlir/Analysis/Presburger/Fraction.h +++ b/mlir/include/mlir/Analysis/Presburger/Fraction.h @@ -25,7 +25,7 @@ namespace mlir { /// representable by 64-bit integers. struct Fraction { /// Default constructor initializes the represented rational number to zero. - Fraction() {} + Fraction() = default; /// Construct a Fraction from a numerator and denominator. Fraction(int64_t oNum, int64_t oDen) : num(oNum), den(oDen) { @@ -35,6 +35,13 @@ struct Fraction { } } + // Return the value of the fraction as an integer. This should only be called + // when the fraction's value is really an integer. + int64_t getAsInteger() const { + assert(num % den == 0 && "Get as integer called on non-integral fraction!"); + return num / den; + } + /// The numerator and denominator, respectively. The denominator is always /// positive. 
int64_t num{0}, den{1}; diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h index 1a786d89f27b8..711a34950e753 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h @@ -212,6 +212,13 @@ class IntegerPolyhedron : public PresburgerLocalSpace { presburger_utils::MaybeOptimum> getRationalLexMin() const; + /// Same as above, but returns lexicographically minimal integer point. + /// Note: this should be used only when the lexmin is really required. + /// For a generic integer sampling operation, findIntegerSample is more + /// robust and should be preferred. + presburger_utils::MaybeOptimum> + getIntegerLexMin() const; + /// Swap the posA^th identifier with the posB^th identifier. virtual void swapId(unsigned posA, unsigned posB); diff --git a/mlir/include/mlir/Analysis/Presburger/Simplex.h b/mlir/include/mlir/Analysis/Presburger/Simplex.h index 10600064710dc..d5e14f717e925 100644 --- a/mlir/include/mlir/Analysis/Presburger/Simplex.h +++ b/mlir/include/mlir/Analysis/Presburger/Simplex.h @@ -265,6 +265,10 @@ class SimplexBase { /// Returns the unknown associated with row. Unknown &unknownFromRow(unsigned row); + /// Add a new row to the tableau and the associated data structures. The row + /// is initialized to zero. + unsigned addZeroRow(bool makeRestricted = false); + /// Add a new row to the tableau and the associated data structures. /// The new row is considered to be a constraint; the new Unknown lives in /// con. @@ -436,6 +440,12 @@ class LexSimplex : public SimplexBase { /// Return the lexicographically minimum rational solution to the constraints. presburger_utils::MaybeOptimum> getRationalLexMin(); + /// Return the lexicographically minimum integer solution to the constraints. + /// + /// Note: this should be used only when the lexmin is really needed. 
To obtain + /// any integer sample, use Simplex::findIntegerSample as that is more robust. + presburger_utils::MaybeOptimum> getIntegerLexMin(); + protected: /// Returns the current sample point, which may contain non-integer (rational) /// coordinates. Returns an empty optimum when the tableau is empty. @@ -446,6 +456,15 @@ class LexSimplex : public SimplexBase { presburger_utils::MaybeOptimum> getRationalSample() const; + /// Given a row that has a non-integer sample value, add an inequality such + /// that this fractional sample value is cut away from the polytope. The added + /// inequality will be such that no integer points are removed. + /// + /// Returns whether the cut constraint could be enforced, i.e. failure if the + /// cut made the polytope empty, and success if it didn't. Failure status + /// indicates that the polytope didn't have any integer points. + LogicalResult addCut(unsigned row); + /// Undo the addition of the last constraint. This is only called while /// rolling back. void undoLastConstraint() final; @@ -460,6 +479,10 @@ class LexSimplex : public SimplexBase { /// Otherwise, return an empty optional. Optional maybeGetViolatedRow() const; + /// Get a row corresponding to a var that has a non-integral sample value, if + /// one exists. Otherwise, return an empty optional. + Optional maybeGetNonIntegeralVarRow() const; + /// Given two potential pivot columns for a row, return the one that results /// in the lexicographically smallest sample vector. 
unsigned getLexMinPivotColumn(unsigned row, unsigned colA, diff --git a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp index ce0f339967a52..5e26149303e6e 100644 --- a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp @@ -92,6 +92,26 @@ IntegerPolyhedron::getRationalLexMin() const { return maybeLexMin; } +MaybeOptimum> +IntegerPolyhedron::getIntegerLexMin() const { + assert(getNumSymbolIds() == 0 && "Symbols are not supported!"); + MaybeOptimum> maybeLexMin = + LexSimplex(*this).getIntegerLexMin(); + + if (!maybeLexMin.isBounded()) + return maybeLexMin.getKind(); + + // The Simplex returns the lexmin over all the variables including locals. But + // locals are not actually part of the space and should not be returned in the + // result. Since the locals are placed last in the list of identifiers, they + // will be minimized last in the lexmin. So simply truncating out the locals + // from the end of the answer gives the desired lexmin over the dimensions. + assert(maybeLexMin->size() == getNumIds() && + "Incorrect number of vars in lexMin!"); + maybeLexMin->resize(getNumDimAndSymbolIds()); + return maybeLexMin; +} + unsigned IntegerPolyhedron::insertDimId(unsigned pos, unsigned num) { return insertId(IdKind::SetDim, pos, num); } diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp index 285fa91f34a07..79ccae57573e5 100644 --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -59,13 +59,7 @@ Simplex::Unknown &SimplexBase::unknownFromRow(unsigned row) { return unknownFromIndex(rowUnknown[row]); } -/// Add a new row to the tableau corresponding to the given constant term and -/// list of coefficients. The coefficients are specified as a vector of -/// (variable index, coefficient) pairs. 
-unsigned SimplexBase::addRow(ArrayRef coeffs, bool makeRestricted) { - assert(coeffs.size() == var.size() + 1 && - "Incorrect number of coefficients!"); - +unsigned SimplexBase::addZeroRow(bool makeRestricted) { ++nRow; // If the tableau is not big enough to accomodate the extra row, we extend it. if (nRow >= tableau.getNumRows()) @@ -77,6 +71,17 @@ unsigned SimplexBase::addRow(ArrayRef coeffs, bool makeRestricted) { tableau.fillRow(nRow - 1, 0); tableau(nRow - 1, 0) = 1; + return con.size() - 1; +} + +/// Add a new row to the tableau corresponding to the given constant term and +/// list of coefficients. The coefficients are specified as a vector of +/// (variable index, coefficient) pairs. +unsigned SimplexBase::addRow(ArrayRef coeffs, bool makeRestricted) { + assert(coeffs.size() == var.size() + 1 && + "Incorrect number of coefficients!"); + + addZeroRow(makeRestricted); tableau(nRow - 1, 1) = coeffs.back(); if (usingBigM) { // When the lexicographic pivot rule is used, instead of the variables @@ -164,6 +169,56 @@ MaybeOptimum> LexSimplex::getRationalLexMin() { return getRationalSample(); } +LogicalResult LexSimplex::addCut(unsigned row) { + int64_t denom = tableau(row, 0); + addZeroRow(/*makeRestricted=*/true); + tableau(nRow - 1, 0) = denom; + tableau(nRow - 1, 1) = -mod(-tableau(row, 1), denom); + tableau(nRow - 1, 2) = 0; // M has all factors in it. + for (unsigned col = 3; col < nCol; ++col) + tableau(nRow - 1, col) = mod(tableau(row, col), denom); + return moveRowUnknownToColumn(nRow - 1); +} + +Optional LexSimplex::maybeGetNonIntegeralVarRow() const { + for (const Unknown &u : var) { + if (u.orientation == Orientation::Column) + continue; + // If the sample value is of the form (a/d)M + b/d, we need b to be + // divisible by d. We assume M is very large and contains all possible + // factors and is divisible by everything. 
+ unsigned row = u.pos; + if (tableau(row, 1) % tableau(row, 0) != 0) + return row; + } + return {}; +} + +MaybeOptimum> LexSimplex::getIntegerLexMin() { + while (!empty) { + restoreRationalConsistency(); + if (empty) + return OptimumKind::Empty; + + if (Optional maybeRow = maybeGetNonIntegeralVarRow()) { + // Failure occurs when the polytope is integer empty. + if (failed(addCut(*maybeRow))) + return OptimumKind::Empty; + continue; + } + + MaybeOptimum> sample = getRationalSample(); + assert(!sample.isEmpty() && "If we reached here the sample should exist!"); + if (sample.isUnbounded()) + return OptimumKind::Unbounded; + return llvm::to_vector<8>(llvm::map_range( + *sample, [](const Fraction &f) { return f.getAsInteger(); })); + } + + // Polytope is integer empty. + return OptimumKind::Empty; +} + bool LexSimplex::rowIsViolated(unsigned row) const { if (tableau(row, 2) < 0) return true; diff --git a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp index d7e9b967136b5..fffbf7527f994 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp @@ -8,6 +8,7 @@ #include "mlir/Analysis/Presburger/IntegerPolyhedron.h" #include "./Utils.h" +#include "mlir/Analysis/Presburger/Simplex.h" #include "mlir/IR/MLIRContext.h" #include @@ -36,29 +37,53 @@ makeSetFromConstraints(unsigned ids, ArrayRef> ineqs, return set; } +static void dump(ArrayRef vec) { + for (int64_t x : vec) + llvm::errs() << x << ' '; + llvm::errs() << '\n'; +} + /// If fn is TestFunction::Sample (default): -/// If hasSample is true, check that findIntegerSample returns a valid sample -/// for the IntegerPolyhedron poly. -/// If hasSample is false, check that findIntegerSample returns None. +/// +/// If hasSample is true, check that findIntegerSample returns a valid sample +/// for the IntegerPolyhedron poly. 
Also check that getIntegerLexmin finds a +/// non-empty lexmin. +/// +/// If hasSample is false, check that findIntegerSample returns None and +/// getIntegerLexMin returns Empty. /// /// If fn is TestFunction::Empty, check that isIntegerEmpty returns the /// opposite of hasSample. static void checkSample(bool hasSample, const IntegerPolyhedron &poly, TestFunction fn = TestFunction::Sample) { Optional> maybeSample; + MaybeOptimum> maybeLexMin; switch (fn) { case TestFunction::Sample: maybeSample = poly.findIntegerSample(); + maybeLexMin = poly.getIntegerLexMin(); + if (!hasSample) { EXPECT_FALSE(maybeSample.hasValue()); if (maybeSample.hasValue()) { - for (auto x : *maybeSample) - llvm::errs() << x << ' '; - llvm::errs() << '\n'; + llvm::errs() << "findIntegerSample gave sample: "; + dump(*maybeSample); + } + + EXPECT_TRUE(maybeLexMin.isEmpty()); + if (maybeLexMin.isBounded()) { + llvm::errs() << "getIntegerLexMin gave sample: "; + dump(*maybeLexMin); } } else { ASSERT_TRUE(maybeSample.hasValue()); EXPECT_TRUE(poly.containsPoint(*maybeSample)); + + ASSERT_FALSE(maybeLexMin.isEmpty()); + if (maybeLexMin.isUnbounded()) + EXPECT_TRUE(Simplex(poly).isUnbounded()); + if (maybeLexMin.isBounded()) + EXPECT_TRUE(poly.containsPoint(*maybeLexMin)); } break; case TestFunction::Empty: @@ -1138,6 +1163,31 @@ TEST(IntegerPolyhedronTest, getRationalLexMin) { parsePoly("(x) : (2*x >= 0, -x - 1 >= 0)", &context)); } +void expectIntegerLexMin(const IntegerPolyhedron &poly, ArrayRef min) { + auto lexMin = poly.getIntegerLexMin(); + ASSERT_TRUE(lexMin.isBounded()); + EXPECT_EQ(ArrayRef(*lexMin), min); +} + +void expectNoIntegerLexMin(OptimumKind kind, const IntegerPolyhedron &poly) { + ASSERT_NE(kind, OptimumKind::Bounded) + << "Use expectRationalLexMin for bounded min"; + EXPECT_EQ(poly.getRationalLexMin().getKind(), kind); +} + +TEST(IntegerPolyhedronTest, getIntegerLexMin) { + MLIRContext context; + expectIntegerLexMin(parsePoly("(x, y, z) : (2*x + 13 >= 0, 4*y - 3*x - 2 >= " + "0, 
11*z + 5*y - 3*x + 7 >= 0)", + &context), + {-6, -4, 0}); + // Similar to above but no lower bound on z. + expectNoIntegerLexMin(OptimumKind::Unbounded, + parsePoly("(x, y, z) : (2*x + 13 >= 0, 4*y - 3*x - 2 " + ">= 0, -11*z + 5*y - 3*x + 7 >= 0)", + &context)); +} + static void expectComputedVolumeIsValidOverapprox(const IntegerPolyhedron &poly, Optional trueVolume, From 865c7ab421a6620e3ae7a45943db4eebc7717daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 18 Jan 2022 09:33:08 +0000 Subject: [PATCH 445/748] [libcxx] [test] Fix moneypunct grouping tests on Windows For grouping strings, "\3" and "\3\3" are equivalent. Differential Revision: https://reviews.llvm.org/D120091 --- .../grouping.pass.cpp | 28 ++++++++++++------- .../locale.numpunct.byname/grouping.pass.cpp | 11 ++++++-- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp index 3dae97f873dd8..dc1e3424553a5 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/grouping.pass.cpp @@ -11,8 +11,6 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-WINDOWS-FIXME - // REQUIRES: locale.en_US.UTF-8 // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ru_RU.UTF-8 @@ -89,22 +87,27 @@ int main(int, char**) } #endif +#ifdef _WIN32 + std::string us_grouping = "\3"; +#else + std::string us_grouping = "\3\3"; +#endif { Fnf f(LOCALE_en_US_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == us_grouping); } { Fnt f(LOCALE_en_US_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == us_grouping); } #ifndef TEST_HAS_NO_WIDE_CHARACTERS { Fwf 
f(LOCALE_en_US_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == us_grouping); } { Fwt f(LOCALE_en_US_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == us_grouping); } #endif @@ -127,22 +130,27 @@ int main(int, char**) } #endif +#ifdef _WIN32 + std::string ru_grouping = "\3"; +#else + std::string ru_grouping = "\3\3"; +#endif { Fnf f(LOCALE_ru_RU_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == ru_grouping); } { Fnt f(LOCALE_ru_RU_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == ru_grouping); } #ifndef TEST_HAS_NO_WIDE_CHARACTERS { Fwf f(LOCALE_ru_RU_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == ru_grouping); } { Fwt f(LOCALE_ru_RU_UTF_8, 1); - assert(f.grouping() == "\3\3"); + assert(f.grouping() == ru_grouping); } #endif diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp index b7223ebcb68a1..7f9dbc9ded618 100644 --- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/grouping.pass.cpp @@ -9,7 +9,6 @@ // NetBSD does not support LC_NUMERIC at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-WINDOWS-FIXME // XFAIL: LIBCXX-AIX-FIXME // REQUIRES: locale.en_US.UTF-8 @@ -49,19 +48,27 @@ int main(int, char**) { typedef char C; const std::numpunct& np = std::use_facet >(l); +#ifdef _WIN32 + assert(np.grouping() == "\3"); +#else assert(np.grouping() == "\3\3"); +#endif } #ifndef TEST_HAS_NO_WIDE_CHARACTERS { typedef wchar_t C; const std::numpunct& np = std::use_facet >(l); +#ifdef _WIN32 + assert(np.grouping() == "\3"); +#else assert(np.grouping() == "\3\3"); +#endif } #endif } { std::locale l(LOCALE_fr_FR_UTF_8); -#if defined(TEST_HAS_GLIBC) +#if 
defined(TEST_HAS_GLIBC) || defined(_WIN32) const char* const group = "\3"; #else const char* const group = "\x7f"; From bcee450937061139de4f75b4deaa06dee69aaf25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 13 Jan 2022 10:28:41 +0000 Subject: [PATCH 446/748] [libcxx] [test] Fix the thousands_sep expectation for fr_FR locales on Windows Windows uses U+00A0 NO-BREAK SPACE as thousands separator in the fr_FR locale. Differential Revision: https://reviews.llvm.org/D120090 --- .../get_long_double_fr_FR.pass.cpp | 14 +++++++++----- .../put_long_double_fr_FR.pass.cpp | 14 +++++++++----- .../thousands_sep.pass.cpp | 6 ++++-- .../locale.numpunct.byname/thousands_sep.pass.cpp | 5 +++-- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp index b90677d854de8..16fc12feac62b 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp @@ -11,7 +11,6 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-WINDOWS-FIXME // XFAIL: LIBCXX-AIX-FIXME // REQUIRES: locale.fr_FR.UTF-8 @@ -53,11 +52,12 @@ class my_facetw : Fw(refs) {} }; -// GLIBC 2.27 and newer use U2027 (narrow non-breaking space) as a thousands sep. -// this function converts the spaces in string inputs to that character if need -// be. FreeBSD's locale data also uses U2027 since 2018. +// GLIBC 2.27 and newer use U+202F NARROW NO-BREAK SPACE as a thousands separator. 
+// This function converts the spaces in string inputs to U+202F if need +// be. FreeBSD's locale data also uses U+202F, since 2018. +// Windows uses U+00A0 NO-BREAK SPACE. static std::wstring convert_thousands_sep(std::wstring const& in) { -#if defined(_CS_GNU_LIBC_VERSION) || defined(__FreeBSD__) +#if defined(_CS_GNU_LIBC_VERSION) || defined(__FreeBSD__) || defined(_WIN32) #if defined(_CS_GNU_LIBC_VERSION) if (glibc_version_less_than("2.27")) return in; @@ -72,7 +72,11 @@ static std::wstring convert_thousands_sep(std::wstring const& in) { continue; } assert(in[I] == L' '); +#if defined(_WIN32) + out.push_back(L'\u00A0'); +#else out.push_back(L'\u202F'); +#endif } return out; #else diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp index 492ad1379e2ac..becb46e091590 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp @@ -11,7 +11,6 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-WINDOWS-FIXME // XFAIL: LIBCXX-AIX-FIXME // REQUIRES: locale.fr_FR.UTF-8 @@ -53,11 +52,12 @@ class my_facetw : Fw(refs) {} }; -// GLIBC 2.27 and newer use U2027 (narrow non-breaking space) as a thousands sep. -// this function converts the spaces in string inputs to that character if need -// be. FreeBSD's locale data also uses U2027 since 2018. +// GLIBC 2.27 and newer use U+202F NARROW NO-BREAK SPACE as a thousands separator. +// This function converts the spaces in string inputs to U+202F if need +// be. FreeBSD's locale data also uses U+202F, since 2018. 
+// Windows uses U+00A0 NO-BREAK SPACE. static std::wstring convert_thousands_sep(std::wstring const& in) { -#if defined(_CS_GNU_LIBC_VERSION) || defined(__FreeBSD__) +#if defined(_CS_GNU_LIBC_VERSION) || defined(__FreeBSD__) || defined(_WIN32) #if defined(_CS_GNU_LIBC_VERSION) if (glibc_version_less_than("2.27")) return in; @@ -74,7 +74,11 @@ static std::wstring convert_thousands_sep(std::wstring const& in) { continue; } assert(in[I] == L' '); +#if defined(_WIN32) + out.push_back(L'\u00A0'); +#else out.push_back(L'\u202F'); +#endif } return out; #else diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp index 4953625916485..17f25d78f0b1c 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.moneypunct.byname/thousands_sep.pass.cpp @@ -9,8 +9,6 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-WINDOWS-FIXME - // REQUIRES: locale.en_US.UTF-8 // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ru_RU.UTF-8 @@ -114,6 +112,8 @@ int main(int, char**) #ifndef TEST_HAS_NO_WIDE_CHARACTERS #if defined(_CS_GNU_LIBC_VERSION) const wchar_t fr_sep = glibc_version_less_than("2.27") ? L' ' : L'\u202F'; +#elif defined(_WIN32) + const wchar_t fr_sep = L'\u00A0'; #else const wchar_t fr_sep = L' '; #endif @@ -144,6 +144,8 @@ int main(int, char**) // FIXME libc++ specifically works around \u00A0 by translating it into // a regular space. const wchar_t wsep = glibc_version_less_than("2.27") ? 
L'\u00A0' : L'\u202F'; +# elif defined(_WIN32) + const wchar_t wsep = L'\u00A0'; # else const wchar_t wsep = L' '; # endif diff --git a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp index 3e82e9f81b92b..b3539880253be 100644 --- a/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/facet.numpunct/locale.numpunct.byname/thousands_sep.pass.cpp @@ -9,7 +9,6 @@ // NetBSD does not support LC_NUMERIC at the moment // XFAIL: netbsd -// XFAIL: LIBCXX-WINDOWS-FIXME // XFAIL: LIBCXX-AIX-FIXME // REQUIRES: locale.en_US.UTF-8 @@ -64,7 +63,7 @@ int main(int, char**) // The below tests work around GLIBC's use of U202F as LC_NUMERIC thousands_sep. std::locale l(LOCALE_fr_FR_UTF_8); { -#if defined(_CS_GNU_LIBC_VERSION) +#if defined(_CS_GNU_LIBC_VERSION) || defined(_WIN32) const char sep = ' '; #else const char sep = ','; @@ -77,6 +76,8 @@ int main(int, char**) { #if defined(_CS_GNU_LIBC_VERSION) const wchar_t wsep = glibc_version_less_than("2.27") ? L' ' : L'\u202f'; +#elif defined(_WIN32) + const wchar_t wsep = L'\u00A0'; #else const wchar_t wsep = L','; #endif From e1191965da38b5f9e8ce29becfef9e35c3730109 Mon Sep 17 00:00:00 2001 From: Groverkss Date: Tue, 22 Feb 2022 02:27:26 +0530 Subject: [PATCH 447/748] [MLIR][Presburger] Add support for IntegerRelation This patch adds a class to represent a relation in Presburger Library. This patch only adds the skeleton class. Functionality from IntegerPolyhedron will be moved to IntegerRelation in later patches to make it easier to review. This patch is a part of a series of patches adding support for relations in Presburger Library. 
Reviewed By: arjunp Differential Revision: https://reviews.llvm.org/D120156 --- .../Analysis/Presburger/IntegerPolyhedron.h | 107 ++++++++++++------ .../Analysis/Presburger/PresburgerSpace.h | 27 ++++- 2 files changed, 95 insertions(+), 39 deletions(-) diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h index 711a34950e753..834e3b0edce18 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h @@ -21,37 +21,86 @@ namespace mlir { -/// An integer polyhedron is the set of solutions to a list of affine -/// constraints over n integer-valued variables/identifiers. Affine constraints -/// can be inequalities or equalities in the form: +/// An IntegerRelation is a PresburgerLocalSpace subject to affine constraints. +/// Affine constraints can be inequalities or equalities in the form: /// /// Inequality: c_0*x_0 + c_1*x_1 + .... + c_{n-1}*x_{n-1} + c_n >= 0 /// Equality : c_0*x_0 + c_1*x_1 + .... + c_{n-1}*x_{n-1} + c_n == 0 /// -/// where c_0, c_1, ..., c_n are integers. +/// where c_0, c_1, ..., c_n are integers and n is the total number of +/// identifiers in the space. /// -/// Such a set corresponds to the set of integer points lying in a convex -/// polyhedron. For example, consider the set: (x, y) : (1 <= x <= 7, x = 2y). -/// This set contains the points (2, 1), (4, 2), and (6, 3). +/// Such a relation corresponds to the set of integer points lying in a convex +/// polyhedron. For example, consider the relation: +/// (x) -> (y) : (1 <= x <= 7, x = 2y) +/// These can be thought of as points in the polyhedron: +/// (x, y) : (1 <= x <= 7, x = 2y) +/// This relation contains the pairs (2, 1), (4, 2), and (6, 3). 
/// -/// The integer-valued variables are distinguished into 3 types of: +/// Since IntegerRelation makes a distinction between dimensions, IdKind::Range +/// and IdKind::Domain should be used to refer to dimension identifiers. +class IntegerRelation : public PresburgerLocalSpace { +public: + /// Constructs a relation reserving memory for the specified number + /// of constraints and identifiers. + IntegerRelation(unsigned numReservedInequalities, + unsigned numReservedEqualities, unsigned numReservedCols, + unsigned numDomain, unsigned numRange, unsigned numSymbols, + unsigned numLocals) + : PresburgerLocalSpace(numDomain, numRange, numSymbols, numLocals), + equalities(0, getNumIds() + 1, numReservedEqualities, numReservedCols), + inequalities(0, getNumIds() + 1, numReservedInequalities, + numReservedCols) { + assert(numReservedCols >= getNumIds() + 1); + } + + /// Constructs a relation with the specified number of dimensions and symbols. + IntegerRelation(unsigned numDomain = 0, unsigned numRange = 0, + unsigned numSymbols = 0, unsigned numLocals = 0) + : IntegerRelation(/*numReservedInequalities=*/0, + /*numReservedEqualities=*/0, + /*numReservedCols=*/numDomain + numRange + numSymbols + + numLocals + 1, + numDomain, numRange, numSymbols, numLocals) {} + +protected: + /// Constructs a set reserving memory for the specified number + /// of constraints and identifiers. This constructor should not be used + /// directly to create a relation and should only be used to create Sets. + IntegerRelation(unsigned numReservedInequalities, + unsigned numReservedEqualities, unsigned numReservedCols, + unsigned numDims, unsigned numSymbols, unsigned numLocals) + : PresburgerLocalSpace(numDims, numSymbols, numLocals), + equalities(0, getNumIds() + 1, numReservedEqualities, numReservedCols), + inequalities(0, getNumIds() + 1, numReservedInequalities, + numReservedCols) { + assert(numReservedCols >= getNumIds() + 1); + } + + /// Coefficients of affine equalities (in == 0 form). 
+ Matrix equalities; + + /// Coefficients of affine inequalities (in >= 0 form). + Matrix inequalities; +}; + +/// An IntegerPolyhedron is a PresburgerLocalSpace subject to affine +/// constraints. Affine constraints can be inequalities or equalities in the +/// form: +/// +/// Inequality: c_0*x_0 + c_1*x_1 + .... + c_{n-1}*x_{n-1} + c_n >= 0 +/// Equality : c_0*x_0 + c_1*x_1 + .... + c_{n-1}*x_{n-1} + c_n == 0 /// -/// Dimension: Ordinary variables over which the set is represented. +/// where c_0, c_1, ..., c_n are integers and n is the total number of +/// identifiers in the space. /// -/// Symbol: Symbol variables correspond to fixed but unknown values. -/// Mathematically, an integer polyhedron with symbolic variables is like a -/// family of integer polyhedra indexed by the symbolic variables. +/// An IntegerPolyhedron is similar to a IntegerRelation but it does not make a +/// distinction between Domain and Range identifiers. Internally, +/// IntegerPolyhedron is implemented as a IntegerRelation with zero domain ids. /// -/// Local: Local variables correspond to existentially quantified variables. For -/// example, consider the set: (x) : (exists q : 1 <= x <= 7, x = 2q). An -/// assignment to symbolic and dimension variables is valid if there exists some -/// assignment to the local variable `q` satisfying these constraints. For this -/// example, the set is equivalent to {2, 4, 6}. Mathematically, existential -/// quantification can be thought of as the result of projection. In this -/// example, `q` is existentially quantified. This can be thought of as the -/// result of projecting out `q` from the previous example, i.e. we obtained {2, -/// 4, 6} by projecting out the second dimension from {(2, 1), (4, 2), (6, 2)}. -class IntegerPolyhedron : public PresburgerLocalSpace { +/// Since IntegerPolyhedron does not make a distinction between dimensions, +/// IdKind::SetDim should be used to refer to dimension identifiers. 
+class IntegerPolyhedron : public IntegerRelation { public: /// All derived classes of IntegerPolyhedron. enum class Kind { @@ -66,12 +115,8 @@ class IntegerPolyhedron : public PresburgerLocalSpace { IntegerPolyhedron(unsigned numReservedInequalities, unsigned numReservedEqualities, unsigned numReservedCols, unsigned numDims, unsigned numSymbols, unsigned numLocals) - : PresburgerLocalSpace(numDims, numSymbols, numLocals), - equalities(0, getNumIds() + 1, numReservedEqualities, numReservedCols), - inequalities(0, getNumIds() + 1, numReservedInequalities, - numReservedCols) { - assert(numReservedCols >= getNumIds() + 1); - } + : IntegerRelation(numReservedInequalities, numReservedEqualities, + numReservedCols, numDims, numSymbols, numLocals) {} /// Constructs a constraint system with the specified number of /// dimensions and symbols. @@ -516,12 +561,6 @@ class IntegerPolyhedron : public PresburgerLocalSpace { // don't expect an identifier to have more than 32 lower/upper/equality // constraints. This is conservatively set low and can be raised if needed. constexpr static unsigned kExplosionFactor = 32; - - /// Coefficients of affine equalities (in == 0 form). - Matrix equalities; - - /// Coefficients of affine inequalities (in >= 0 form). - Matrix inequalities; }; } // namespace mlir diff --git a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h index a7a93980d4aed..bff48f6a5662d 100644 --- a/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h +++ b/mlir/include/mlir/Analysis/Presburger/PresburgerSpace.h @@ -21,8 +21,8 @@ namespace mlir { class PresburgerLocalSpace; -/// PresburgerSpace is a tuple of identifiers with information about what kind -/// they correspond to. The identifiers can be split into three types: +/// PresburgerSpace is the space of all possible values of a tuple of integer +/// valued variables/identifiers. 
Each identifier has one of the three types: /// /// Dimension: Ordinary variables over which the space is represented. /// @@ -31,18 +31,35 @@ class PresburgerLocalSpace; /// family of spaces indexed by the symbolic identifiers. /// /// Local: Local identifiers correspond to existentially quantified variables. +/// For example, consider the space: `(x, exists q)` where x is a dimension +/// identifier and q is a local identifier. Let us put the constraints: +/// `1 <= x <= 7, x = 2q` +/// on this space to get the set: +/// `(x) : (exists q : q <= x <= 7, x = 2q)`. +/// An assignment to symbolic and dimension variables is valid if there +/// exists some assignment to the local variable `q` satisfying these +/// constraints. For this example, the set is equivalent to {2, 4, 6}. +/// Mathematically, existential quantification can be thought of as the result +/// of projection. In this example, `q` is existentially quantified. This can be +/// thought of as the result of projecting out `q` from the previous example, +/// i.e. we obtained {2, 4, 6} by projecting out the second dimension from +/// {(2, 1), (4, 2), (6, 2)}. /// /// Dimension identifiers are further divided into Domain and Range identifiers /// to support building relations. /// /// Spaces with distinction between domain and range identifiers should use /// IdKind::Domain and IdKind::Range to refer to domain and range identifiers. +/// Identifiers for such spaces are stored in the following order: +/// [Domain, Range, Symbols, Locals] /// /// Spaces with no distinction between domain and range identifiers should use -/// IdKind::SetDim to refer to dimension identifiers. +/// IdKind::SetDim to refer to dimension identifiers. Identifiers for such +/// spaces are stored in the following order: +/// [SetDim, Symbol, Locals] /// -/// PresburgerSpace does not support identifiers of kind Local. See -/// PresburgerLocalSpace for an extension that supports Local ids. 
+/// PresburgerSpace does not allow identifiers of kind Local. See +/// PresburgerLocalSpace for an extension that does allow local identifiers. class PresburgerSpace { friend PresburgerLocalSpace; From eec3488cf1d84d3503d5a8535b64374b79287bb9 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Mon, 21 Feb 2022 13:23:39 -0800 Subject: [PATCH 448/748] [CMake][Fuchsia] Disable assertions and analyzer for stage 1 We don't need these in the first stage compiler and disabling these helps a bit with the compile time and runtime performance. Differential Revision: https://reviews.llvm.org/D120280 --- clang/cmake/caches/Fuchsia.cmake | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake index a531f9f1c10d8..8e9e44d5917ed 100644 --- a/clang/cmake/caches/Fuchsia.cmake +++ b/clang/cmake/caches/Fuchsia.cmake @@ -6,8 +6,8 @@ set(PACKAGE_VENDOR Fuchsia CACHE STRING "") set(LLVM_ENABLE_PROJECTS "clang;clang-tools-extra;lld;llvm;polly" CACHE STRING "") -set(LLVM_ENABLE_BACKTRACES OFF CACHE BOOL "") set(LLVM_ENABLE_DIA_SDK OFF CACHE BOOL "") +set(LLVM_ENABLE_LIBXML2 OFF CACHE BOOL "") set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "") set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "") @@ -29,13 +29,14 @@ if(NOT APPLE) endif() set(CLANG_DEFAULT_RTLIB compiler-rt CACHE STRING "") set(CLANG_ENABLE_ARCMT OFF CACHE BOOL "") -set(CLANG_ENABLE_STATIC_ANALYZER ON CACHE BOOL "") +set(CLANG_ENABLE_STATIC_ANALYZER OFF CACHE BOOL "") set(CLANG_PLUGIN_SUPPORT OFF CACHE BOOL "") set(ENABLE_LINKER_BUILD_ID ON CACHE BOOL "") set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL "") -set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") +set(LLVM_ENABLE_ASSERTIONS OFF CACHE BOOL "") +set(LLVM_ENABLE_BACKTRACES OFF CACHE BOOL "") set(CMAKE_BUILD_TYPE Release CACHE STRING "") if(APPLE) set(CMAKE_OSX_DEPLOYMENT_TARGET "10.13" CACHE STRING "") From 6766ece133b7c6aa8643d5e348488ab2fad5d13b 
Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 21 Feb 2022 15:19:23 -0500 Subject: [PATCH 449/748] [x86] add tests for rmw add with cmov; NFC --- llvm/test/CodeGen/X86/add-cmov.ll | 102 +++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll index 4bb9bc9146488..a47cad269da96 100644 --- a/llvm/test/CodeGen/X86/add-cmov.ll +++ b/llvm/test/CodeGen/X86/add-cmov.ll @@ -136,7 +136,7 @@ define i64 @select_max32_2_i64(i64 %offset, i64 %x) { ; CHECK-LABEL: select_max32_2_i64: ; CHECK: # %bb.0: ; CHECK-NEXT: leaq 2(%rdi), %rax -; CHECK-NEXT: 2147483647(%rdi), %rcx +; CHECK-NEXT: leaq 2147483647(%rdi), %rcx ; CHECK-NEXT: cmpq $41, %rsi ; CHECK-NEXT: cmovneq %rcx, %rax ; CHECK-NEXT: retq @@ -210,7 +210,7 @@ define i32 @select_20_43_i32(i32 %offset, i64 %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: leal 43(%rdi), %ecx -; CHECK-NEXT: 20(%rdi), %eax +; CHECK-NEXT: leal 20(%rdi), %eax ; CHECK-NEXT: cmpq $42, %rsi ; CHECK-NEXT: cmovll %ecx, %eax ; CHECK-NEXT: retq @@ -473,3 +473,101 @@ define void @complex_lea_alt8(i1 %b, i16* readnone %ptr, i64 %idx) { store i16 %dec, i16* %sel, align 4 ret void } + +define i32 @loadfold_select_const_arms(i32* %x, i1 %y) { +; CHECK-LABEL: loadfold_select_const_arms: +; CHECK: # %bb.0: +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: leal -10(%rax), %ecx +; CHECK-NEXT: addl $10, %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %cond = select i1 %y, i32 10, i32 -10 + %t0 = load i32, i32* %x, align 4 + %add = add nsw i32 %t0, %cond + ret i32 %add +} + +define void @rmw_add(i32* %x, i1 %y, i32 %z, i32 %w) { +; CHECK-LABEL: rmw_add: +; CHECK: # %bb.0: +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovel %ecx, %edx +; CHECK-NEXT: addl %edx, (%rdi) +; CHECK-NEXT: retq + %cond = select i1 %y, i32 %z, i32 
%w + %t0 = load i32, i32* %x, align 4 + %add = add nsw i32 %t0, %cond + store i32 %add, i32* %x, align 4 + ret void +} + +define void @rmw_add_select_const_arm(i32* %x, i1 %y, i32 %z) { +; CHECK-LABEL: rmw_add_select_const_arm: +; CHECK: # %bb.0: +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: movl $-10, %eax +; CHECK-NEXT: cmovnel %edx, %eax +; CHECK-NEXT: addl %eax, (%rdi) +; CHECK-NEXT: retq + %cond = select i1 %y, i32 %z, i32 -10 + %t0 = load i32, i32* %x, align 4 + %add = add nsw i32 %t0, %cond + store i32 %add, i32* %x, align 4 + ret void +} + +define void @rmw_select_const_arms(i32* %x, i1 %y) { +; CHECK-LABEL: rmw_select_const_arms: +; CHECK: # %bb.0: +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: leal -10(%rax), %ecx +; CHECK-NEXT: addl $10, %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: retq + %cond = select i1 %y, i32 10, i32 -10 + %t0 = load i32, i32* %x, align 4 + %add = add nsw i32 %t0, %cond + store i32 %add, i32* %x, align 4 + ret void +} + +define i32 @rmw_select_const_arms_extra_load_use(i32* %x, i1 %y) { +; CHECK-LABEL: rmw_select_const_arms_extra_load_use: +; CHECK: # %bb.0: +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: leal -10(%rax), %ecx +; CHECK-NEXT: leal 10(%rax), %edx +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovel %ecx, %edx +; CHECK-NEXT: movl %edx, (%rdi) +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %cond = select i1 %y, i32 10, i32 -10 + %t0 = load i32, i32* %x, align 4 + %add = add nsw i32 %t0, %cond + store i32 %add, i32* %x, align 4 + ret i32 %t0 +} + +define i32 @rmw_select_const_arms_extra_add_use(i32* %x, i1 %y) { +; CHECK-LABEL: rmw_select_const_arms_extra_add_use: +; CHECK: # %bb.0: +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: leal -10(%rax), %ecx +; CHECK-NEXT: addl $10, %eax +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: # kill: def $eax killed $eax 
killed $rax +; CHECK-NEXT: retq + %cond = select i1 %y, i32 10, i32 -10 + %t0 = load i32, i32* %x, align 4 + %add = add nsw i32 %t0, %cond + store i32 %add, i32* %x, align 4 + ret i32 %add +} From 807766be3a89e2d4c9c935db2edd8c665f4d7567 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 21 Feb 2022 22:48:36 +0100 Subject: [PATCH 450/748] [libc++][ranges] Add ranges::min_max_result Reviewed By: Quuxplusone, #libc Spies: libcxx-commits, mgorny Differential Revision: https://reviews.llvm.org/D119751 --- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__algorithm/min_max_result.h | 56 +++++++++++++ libcxx/include/algorithm | 10 ++- libcxx/include/module.modulemap | 1 + .../min_max_result.module.verify.cpp | 15 ++++ .../min_max_result.pass.cpp | 84 +++++++++++++++++++ 6 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 libcxx/include/__algorithm/min_max_result.h create mode 100644 libcxx/test/libcxx/diagnostics/detail.headers/algorithm/min_max_result.module.verify.cpp create mode 100644 libcxx/test/std/algorithms/algorithms.results/min_max_result.pass.cpp diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 72d06fc3a7532..3e5baa2d33d28 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -48,6 +48,7 @@ set(files __algorithm/merge.h __algorithm/min.h __algorithm/min_element.h + __algorithm/min_max_result.h __algorithm/minmax.h __algorithm/minmax_element.h __algorithm/mismatch.h diff --git a/libcxx/include/__algorithm/min_max_result.h b/libcxx/include/__algorithm/min_max_result.h new file mode 100644 index 0000000000000..d20d98a521d8c --- /dev/null +++ b/libcxx/include/__algorithm/min_max_result.h @@ -0,0 +1,56 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_MIN_MAX_RESULT_H +#define _LIBCPP___ALGORITHM_MIN_MAX_RESULT_H + +#include <__concepts/convertible_to.h> +#include <__config> +#include <__utility/move.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if!defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + +namespace ranges { + +template +struct min_max_result { + [[no_unique_address]] _T1 min; + [[no_unique_address]] _T1 max; + + template + requires convertible_to + _LIBCPP_HIDE_FROM_ABI constexpr operator min_max_result<_T2>() const & { + return {min, max}; + } + + template + requires convertible_to<_T1, _T2> + _LIBCPP_HIDE_FROM_ABI constexpr operator min_max_result<_T2>() && { + return {std::move(min), std::move(max)}; + } +}; + +} // namespace ranges + +#endif + +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm index b35ad674cbe77..9149995caef7a 100644 --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -20,17 +20,20 @@ namespace std namespace ranges { template - struct in_fun_result; // since C++20 + struct in_fun_result; // since C++20 template - struct in_in_result; // since C++20 + struct in_in_result; // since C++20 template - struct in_in_out_result; // since C++20 + struct in_in_out_result; // since C++20 template struct in_out_out_result; // since C++20 + template + struct min_max_result; // since C++20 + template S, class Proj = identity, indirect_strict_weak_order> Comp = ranges::less> // since C++20 constexpr I min_element(I first, S last, Comp comp = {}, Proj proj = {}); @@ -738,6 +741,7 @@ template #include <__algorithm/merge.h> #include <__algorithm/min.h> #include 
<__algorithm/min_element.h> +#include <__algorithm/min_max_result.h> #include <__algorithm/minmax.h> #include <__algorithm/minmax_element.h> #include <__algorithm/mismatch.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 890b7fef933ad..846da2f77a2d0 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -270,6 +270,7 @@ module std [system] { module merge { private header "__algorithm/merge.h" } module min { private header "__algorithm/min.h" } module min_element { private header "__algorithm/min_element.h" } + module min_max_result { private header "__algorithm/min_max_result.h" } module minmax { private header "__algorithm/minmax.h" } module minmax_element { private header "__algorithm/minmax_element.h" } module mismatch { private header "__algorithm/mismatch.h" } diff --git a/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/min_max_result.module.verify.cpp b/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/min_max_result.module.verify.cpp new file mode 100644 index 0000000000000..69270642bdfac --- /dev/null +++ b/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/min_max_result.module.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: modules-build + +// WARNING: This test was generated by 'generate_private_header_tests.py' +// and should not be edited manually. 
+ +// expected-error@*:* {{use of private header from outside its module: '__algorithm/min_max_result.h'}} +#include <__algorithm/min_max_result.h> diff --git a/libcxx/test/std/algorithms/algorithms.results/min_max_result.pass.cpp b/libcxx/test/std/algorithms/algorithms.results/min_max_result.pass.cpp new file mode 100644 index 0000000000000..b2d6081b6a88e --- /dev/null +++ b/libcxx/test/std/algorithms/algorithms.results/min_max_result.pass.cpp @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// struct min_max_result; + +#include +#include +#include + +#include "MoveOnly.h" + +template +struct ConvertibleFrom { + constexpr ConvertibleFrom(T c) : content{c} {} + T content; +}; + +struct A { + explicit A(int); +}; +static_assert(!std::is_constructible_v, std::ranges::min_max_result>); + +struct B { + B(const int&); + B(int&&); +}; +static_assert(std::is_constructible_v, std::ranges::min_max_result>); +static_assert(std::is_constructible_v, std::ranges::min_max_result&>); +static_assert(std::is_constructible_v, const std::ranges::min_max_result>); +static_assert(std::is_constructible_v, const std::ranges::min_max_result&>); + +struct C { + C(int&); +}; +static_assert(!std::is_constructible_v, std::ranges::min_max_result&>); + +static_assert(std::is_convertible_v&, std::ranges::min_max_result>); +static_assert(std::is_convertible_v&, std::ranges::min_max_result>); +static_assert(std::is_convertible_v&&, std::ranges::min_max_result>); +static_assert(std::is_convertible_v&&, 
std::ranges::min_max_result>); + +struct NotConvertible {}; +static_assert(!std::is_convertible_v, std::ranges::min_max_result>); + +constexpr bool test() { + { + std::ranges::min_max_result res{10, 1}; + assert(res.min == 10); + assert(res.max == 1); + std::ranges::min_max_result> res2 = res; + assert(res2.min.content == 10); + assert(res2.max.content == 1); + } + { + std::ranges::min_max_result res{MoveOnly{}, MoveOnly{}}; + assert(res.min.get() == 1); + assert(res.max.get() == 1); + [[maybe_unused]] auto res2 = static_cast>(std::move(res)); + assert(res.min.get() == 0); + assert(res.max.get() == 0); + } + auto [min, max] = std::ranges::min_max_result{1, 2}; + assert(min == 1); + assert(max == 2); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} From b6ca853b323dfdb7e3111a483d3f8bb4e15681d8 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 21 Feb 2022 21:52:37 +0000 Subject: [PATCH 451/748] [gn build] Port 807766be3a89 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 8767544d79e0c..fcf08806812f4 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -103,6 +103,7 @@ if (current_toolchain == default_toolchain) { "__algorithm/merge.h", "__algorithm/min.h", "__algorithm/min_element.h", + "__algorithm/min_max_result.h", "__algorithm/minmax.h", "__algorithm/minmax_element.h", "__algorithm/mismatch.h", From c7b43b01dcfab173982bddd5268df35a8d54813f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 21 Feb 2022 22:56:14 +0100 Subject: [PATCH 452/748] [libc++] Replace [[no_unique_addredd]] with _LIBCPP_NO_UNIQUE_ADDRESS in __algorithm/min_max_result.h --- libcxx/include/__algorithm/min_max_result.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/libcxx/include/__algorithm/min_max_result.h b/libcxx/include/__algorithm/min_max_result.h index d20d98a521d8c..1d56a741f5bc5 100644 --- a/libcxx/include/__algorithm/min_max_result.h +++ b/libcxx/include/__algorithm/min_max_result.h @@ -29,8 +29,8 @@ namespace ranges { template struct min_max_result { - [[no_unique_address]] _T1 min; - [[no_unique_address]] _T1 max; + _LIBCPP_NO_UNIQUE_ADDRESS _T1 min; + _LIBCPP_NO_UNIQUE_ADDRESS _T1 max; template requires convertible_to From 68f4131c94d444c8c5d9c860a402df11cab34c76 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 21 Feb 2022 23:07:02 +0100 Subject: [PATCH 453/748] [libc++][ranges] Add ranges::in_found_result Reviewed By: Quuxplusone, #libc Spies: libcxx-commits, mgorny Differential Revision: https://reviews.llvm.org/D119763 --- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__algorithm/in_found_result.h | 49 ++++++++++ libcxx/include/algorithm | 4 + libcxx/include/module.modulemap | 1 + .../in_found_result.module.verify.cpp | 15 ++++ .../in_found_result.pass.cpp | 89 +++++++++++++++++++ 6 files changed, 159 insertions(+) create mode 100644 libcxx/include/__algorithm/in_found_result.h create mode 100644 libcxx/test/libcxx/diagnostics/detail.headers/algorithm/in_found_result.module.verify.cpp create mode 100644 libcxx/test/std/algorithms/algorithms.results/in_found_result.pass.cpp diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 3e5baa2d33d28..ce7954b0440f1 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -26,6 +26,7 @@ set(files __algorithm/generate.h __algorithm/generate_n.h __algorithm/half_positive.h + __algorithm/in_found_result.h __algorithm/in_fun_result.h __algorithm/in_in_out_result.h __algorithm/in_in_result.h diff --git a/libcxx/include/__algorithm/in_found_result.h b/libcxx/include/__algorithm/in_found_result.h new file mode 100644 index 0000000000000..08ebf2fbcc1ac --- /dev/null +++ 
b/libcxx/include/__algorithm/in_found_result.h @@ -0,0 +1,49 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_IN_FOUND_RESULT_H +#define _LIBCPP___ALGORITHM_IN_FOUND_RESULT_H + +#include <__concepts/convertible_to.h> +#include <__config> +#include <__utility/move.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + +_LIBCPP_BEGIN_NAMESPACE_STD + +namespace ranges { +template +struct in_found_result { + _LIBCPP_NO_UNIQUE_ADDRESS _I1 in; + bool found; + + template + requires convertible_to + _LIBCPP_HIDE_FROM_ABI constexpr operator in_found_result<_I2>() const & { + return {in, found}; + } + + template + requires convertible_to<_I1, _I2> + _LIBCPP_HIDE_FROM_ABI constexpr operator in_found_result<_I2>() && { + return {std::move(in), found}; + } +}; +} // namespace ranges + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_HAS_NO_CONCEPTS + +#endif // _LIBCPP___ALGORITHM_IN_FOUND_RESULT_H diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm index 9149995caef7a..06314bdc66fb0 100644 --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -34,6 +34,9 @@ namespace ranges { template struct min_max_result; // since C++20 + template + struct in_found_result; // since C++20 + template S, class Proj = identity, indirect_strict_weak_order> Comp = ranges::less> // since C++20 constexpr I min_element(I first, S last, Comp comp = {}, Proj proj = {}); @@ -719,6 +722,7 @@ template #include <__algorithm/generate.h> #include <__algorithm/generate_n.h> #include 
<__algorithm/half_positive.h> +#include <__algorithm/in_found_result.h> #include <__algorithm/in_fun_result.h> #include <__algorithm/in_in_out_result.h> #include <__algorithm/in_in_result.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 846da2f77a2d0..ecfe080d325a0 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -248,6 +248,7 @@ module std [system] { module generate { private header "__algorithm/generate.h" } module generate_n { private header "__algorithm/generate_n.h" } module half_positive { private header "__algorithm/half_positive.h" } + module in_found_result { private header "__algorithm/in_found_result.h" } module in_fun_result { private header "__algorithm/in_fun_result.h" } module in_in_out_result { private header "__algorithm/in_in_out_result.h" } module in_in_result { private header "__algorithm/in_in_result.h" } diff --git a/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/in_found_result.module.verify.cpp b/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/in_found_result.module.verify.cpp new file mode 100644 index 0000000000000..8c6b4acf96e1c --- /dev/null +++ b/libcxx/test/libcxx/diagnostics/detail.headers/algorithm/in_found_result.module.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: modules-build + +// WARNING: This test was generated by 'generate_private_header_tests.py' +// and should not be edited manually. 
+ +// expected-error@*:* {{use of private header from outside its module: '__algorithm/in_found_result.h'}} +#include <__algorithm/in_found_result.h> diff --git a/libcxx/test/std/algorithms/algorithms.results/in_found_result.pass.cpp b/libcxx/test/std/algorithms/algorithms.results/in_found_result.pass.cpp new file mode 100644 index 0000000000000..2d923f017f170 --- /dev/null +++ b/libcxx/test/std/algorithms/algorithms.results/in_found_result.pass.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// struct in_found_result; + +#include +#include +#include + +#include "MoveOnly.h" + +template +struct ConvertibleFrom { + constexpr ConvertibleFrom(T c) : content{c} {} + T content; +}; + +struct A { + explicit A(int); +}; +static_assert(!std::is_constructible_v, std::ranges::in_found_result>); + +struct B { + B(const int&); + B(int&&); +}; +static_assert(std::is_constructible_v, std::ranges::in_found_result>); +static_assert(std::is_constructible_v, std::ranges::in_found_result&>); +static_assert(std::is_constructible_v, const std::ranges::in_found_result>); +static_assert(std::is_constructible_v, const std::ranges::in_found_result&>); + +struct C { + C(int&); +}; +static_assert(!std::is_constructible_v, std::ranges::in_found_result&>); + +static_assert(std::is_convertible_v&, std::ranges::in_found_result>); +static_assert(std::is_convertible_v&, std::ranges::in_found_result>); +static_assert(std::is_convertible_v&&, std::ranges::in_found_result>); +static_assert(std::is_convertible_v&&, 
std::ranges::in_found_result>); + +struct NotConvertible {}; +static_assert(!std::is_convertible_v, std::ranges::in_found_result>); + +static_assert(std::is_same_v::in), int>); +static_assert(std::is_same_v::found), bool>); + +constexpr bool test() { + { + std::ranges::in_found_result res{10, true}; + assert(res.in == 10); + assert(res.found == true); + std::ranges::in_found_result> res2 = res; + assert(res2.in.content == 10); + assert(res2.found); + } + { + std::ranges::in_found_result res{MoveOnly{}, false}; + assert(res.in.get() == 1); + assert(!res.found); + auto res2 = std::move(res); + assert(res2.in.get() == 1); + assert(!res2.found); + assert(res.in.get() == 0); + assert(!res.found); + } + auto [in, found] = std::ranges::in_found_result{2, false}; + assert(in == 2); + assert(!found); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} From 7f019317b612043e30efbe410290b4e4a720c409 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 21 Feb 2022 22:08:44 +0000 Subject: [PATCH 454/748] [gn build] Port 68f4131c94d4 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index fcf08806812f4..8daaf0889a0ac 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -81,6 +81,7 @@ if (current_toolchain == default_toolchain) { "__algorithm/generate.h", "__algorithm/generate_n.h", "__algorithm/half_positive.h", + "__algorithm/in_found_result.h", "__algorithm/in_fun_result.h", "__algorithm/in_in_out_result.h", "__algorithm/in_in_result.h", From 3a17a817735708a401b1c73bc486d13f4f0215b5 Mon Sep 17 00:00:00 2001 From: Vang Thao Date: Mon, 21 Feb 2022 14:14:25 -0800 Subject: [PATCH 455/748] [AMDGPU] Regenerate tests to include -NEXT. NFC. 
--- .../AMDGPU/debug-value-scheduler-crash.mir | 108 +++++++++--------- ...ssert-dead-def-subreg-use-other-subreg.mir | 49 ++++---- 2 files changed, 83 insertions(+), 74 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir index 1da9cc14327a1..83a7321ea2984 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -22,57 +22,63 @@ frameInfo: body: | ; CHECK-LABEL: name: could_not_use_debug_inst_to_query_mi2mimap ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec - ; CHECK: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: bb.1: - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: DBG_VALUE - ; CHECK: DBG_VALUE - ; CHECK: DBG_VALUE - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: S_BRANCH %bb.3 - ; CHECK: bb.3: - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] - ; CHECK: %16:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK: %17:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK: %18:vgpr_32 = 
nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec - ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec - ; CHECK: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: %21:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec - ; CHECK: %22:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK: dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 %22, [[DEF13]], implicit $mode, implicit $exec - ; CHECK: dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 %21, [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec - ; CHECK: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; CHECK: $sgpr4 = IMPLICIT_DEF - ; CHECK: $vgpr0 = COPY [[DEF11]] - ; CHECK: $vgpr0 = COPY [[V_MOV_B32_e32_]] - ; CHECK: $vgpr1 = COPY [[DEF7]] - ; CHECK: $vgpr0 = COPY %16 - ; CHECK: $vgpr1 = COPY %17 - ; CHECK: $vgpr2 = COPY %18 - ; CHECK: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu_highregs, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 - ; CHECK: %25:vgpr_32 = nofpexcept V_ADD_F32_e32 %9, [[DEF8]], implicit $mode, implicit $exec - ; CHECK: %25:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], %25, implicit $mode, implicit $exec - ; CHECK: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec - ; CHECK: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec - ; CHECK: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec - ; CHECK: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec - ; CHECK: S_ENDPGM 0 + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + 
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 1082130432, [[DEF1]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: %16:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: %17:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: %18:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1092616192, implicit $exec + ; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: %21:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec + ; CHECK-NEXT: %22:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], 
[[DEF7]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 %22, [[DEF13]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 %21, [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = COPY [[DEF11]] + ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF7]] + ; CHECK-NEXT: $vgpr0 = COPY %16 + ; CHECK-NEXT: $vgpr1 = COPY %17 + ; CHECK-NEXT: $vgpr2 = COPY %18 + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu_highregs, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 + ; CHECK-NEXT: %25:vgpr_32 = nofpexcept V_ADD_F32_e32 %9, [[DEF8]], implicit $mode, implicit $exec + ; CHECK-NEXT: %25:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], %25, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index 46ce3930ce5c6..aa6d2782525c9 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -18,29 +18,32 @@ machineFunctionInfo: body: | ; 
CHECK-LABEL: name: multi_def_dead_reg_subreg_check ; CHECK: bb.0: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $sgpr6_sgpr7 - ; CHECK: undef %0.sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_]], implicit $exec - ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[COPY:%[0-9]+]]:vreg_512 = COPY %0 - ; CHECK: bb.1: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), align 8, addrspace 5) - ; CHECK: dead %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec - ; CHECK: dead %8:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec - ; CHECK: dead %9:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; CHECK: undef %11.sub1:vreg_512 = COPY [[COPY]].sub1 - ; CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_LO16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1 - ; CHECK: %11.sub0:vreg_512 = COPY [[COPY]].sub0 - ; CHECK: %11.sub3:vreg_512 = COPY [[COPY]].sub3 - ; CHECK: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]] - ; CHECK: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]] - ; CHECK: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11 - ; CHECK: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec - ; CHECK: S_BRANCH %bb.1 + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %0.sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, implicit $exec :: (store (s32), align 8, addrspace 5) + ; CHECK-NEXT: dead %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec + ; CHECK-NEXT: dead %8:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: dead %9:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; CHECK-NEXT: undef %11.sub1:vreg_512 = COPY [[COPY]].sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_LO16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1 + ; CHECK-NEXT: %11.sub0:vreg_512 = COPY [[COPY]].sub0 + ; CHECK-NEXT: %11.sub3:vreg_512 = COPY [[COPY]].sub3 + ; CHECK-NEXT: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]] + ; CHECK-NEXT: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11 + ; CHECK-NEXT: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $sgpr6_sgpr7 From 13681ad6540f949e48230c5fb295ee9603c6be69 Mon Sep 17 00:00:00 2001 From: Michael Gottesman Date: Mon, 21 Feb 2022 12:07:31 -0800 Subject: [PATCH 456/748] [move-function] Make test more generally by removing unneeded line. 
Otherwise this is can be sensitive in the face of changes in register names. I also gardened the test case a little to make it look a little nicer. Reviewed By: fhahn Differential Revision: https://reviews.llvm.org/D120276 --- .../Coroutines/coro-debug-dbg.addr-swift.ll | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.addr-swift.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.addr-swift.ll index 38d50696f417f..23b38e138c131 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.addr-swift.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.addr-swift.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: define swifttailcc void @"$s10async_args14withGenericArgyyxnYalF"(%swift.context* swiftasync %0, %swift.opaque* noalias %1, %swift.type* %T){{.*}} { ; CHECK: call void @llvm.dbg.declare( -; CHECK: llvm.dbg.addr +; CHECK: call void @llvm.dbg.addr( ; CHECK-NOT: llvm.dbg.value ; CHECK-NOT: llvm.dbg.addr ; CHECK-NOT: llvm.dbg.declare @@ -19,17 +19,16 @@ ; CHECK-LABEL: define internal swifttailcc void @"$s10async_args14withGenericArgyyxnYalFTY0_"(i8* swiftasync %0) ; CHECK: entryresume.0 -; CHECK-NEXT: %.debug -; CHECK-NEXT: call void @llvm.dbg.declare( -; CHECK: llvm.dbg.addr +; CHECK: call void @llvm.dbg.declare( +; CHECK: call void @llvm.dbg.addr( ; CHECK: musttail call swifttailcc void @"$s10async_args10forceSplityyYaF"(%swift.context* swiftasync ; CHECK-NEXT: ret void ; CHECK-NEXT: } ; CHECK: define internal swifttailcc void @"$s10async_args14withGenericArgyyxnYalFTQ1_"(i8* swiftasync %0) -; CHECK: llvm.dbg.declare -; CHECK: llvm.dbg.addr -; CHECK: llvm.dbg.value(metadata %swift.opaque** undef, +; CHECK: call void @llvm.dbg.declare +; CHECK: call void @llvm.dbg.addr +; CHECK: call void @llvm.dbg.value(metadata %swift.opaque** undef, ; CHECK: ret void ; CHECK-NEXT: } From f7dfc5d1af6cfd1f9b6f2ac2d11f6074e8425ba7 Mon Sep 17 00:00:00 2001 From: Zakk Chen Date: Mon, 21 Feb 2022 14:31:14 -0800 
Subject: [PATCH 457/748] [RISCV] Optimize tail agnostic vmv.s.x which don't need to select tail value. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D120250 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 ++ llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4395c139b7220..a54ae084cfe6b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4757,6 +4757,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue VL = getVLOperand(Op); SDValue SplattedVal = splatSplitI64WithVL(DL, VT, SDValue(), Scalar, VL, DAG); + if (Op.getOperand(1).isUndef()) + return SplattedVal; SDValue SplattedIdx = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, DAG.getUNDEF(VT), DAG.getConstant(0, DL, MVT::i32), VL); diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll index 65b225ede7152..e7a11c6a930dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll @@ -1124,9 +1124,6 @@ define @intrinsic_vmv.s.x_x_nxv1i64(i64 %0, iXLen %1) nounwin ; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vid.v v9 -; RV32-NEXT: vmseq.vi v0, v9, 0 -; RV32-NEXT: vmerge.vvm v8, v8, v8, v0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; From dc0981562e520a95e264a1fbe4596022d6055343 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 21 Feb 2022 11:22:11 -0800 Subject: [PATCH 458/748] [AMDGPU] Remove redundand check in the SILoadStoreOptimizer Differential Revision: https://reviews.llvm.org/D120268 --- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp 
b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index d041c831b6db0..ed2b957e28d9a 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -747,8 +747,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && - CI.CPol == Paired.CPol && - (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); + CI.CPol == Paired.CPol; } // If the offset in elements doesn't fit in 8-bits, we might be able to use From d97f997eb79d91b2872ac13619f49cb3a7120781 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Wed, 16 Feb 2022 12:50:31 -0800 Subject: [PATCH 459/748] [MachineOutliner][AArch64] NFC: Split MBBs into "outlinable ranges" We found a case in the Swift benchmarks where the MachineOutliner introduces about a 20% compile time overhead in comparison to building without the MachineOutliner. The origin of this slowdown is that the benchmark has long blocks which incur lots of LRU checks for lots of candidates. Imagine a case like this: ``` bb: i1 i2 i3 ... i123456 ``` Now imagine that all of the outlining candidates appear early in the block, and that something like, say, NZCV is defined at the end of the block. The outliner has to check liveness for certain registers across all candidates, because outlining from areas where those registers are used is unsafe at call boundaries. This is fairly wasteful because in the previously-described case, the outlining candidates will never appear in an area where those registers are live. To avoid this, precalculate areas where we will consider outlining from. Anything outside of these areas is mapped to illegal and not included in the outlining search space. This allows us to reduce the size of the outliner's suffix tree as well, giving us a potential memory win. 
By precalculating areas, we can also optimize other checks too, like whether or not LR is live across an outlining candidate. Doing all of this is about a 16% compile time improvement on the case. This is likely useful for other targets (e.g. ARM + RISCV) as well, but for now, this only implements the AArch64 path. The original "is the MBB safe" method still works as before. --- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 19 ++ llvm/lib/CodeGen/MachineOutliner.cpp | 67 ++++--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 175 +++++++++---------- llvm/lib/Target/AArch64/AArch64InstrInfo.h | 9 +- 4 files changed, 145 insertions(+), 125 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 12cd21617b0d4..a3209af8b2352 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1940,6 +1940,25 @@ class TargetInstrInfo : public MCInstrInfo { virtual bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const; + /// Optional target hook which partitions \p MBB into outlinable ranges for + /// instruction mapping purposes. Each range is defined by two iterators: + /// [start, end). + /// + /// Ranges are expected to be ordered top-down. That is, ranges closer to the + /// top of the block should come before ranges closer to the end of the block. + /// + /// Ranges cannot overlap. + /// + /// If an entire block is mappable, then its range is [MBB.begin(), MBB.end()) + /// + /// All instructions not present in an outlinable range are considered + /// illegal. + virtual SmallVector< + std::pair> + getOutlinableRanges(MachineBasicBlock &MBB, unsigned &Flags) const { + return {std::make_pair(MBB.begin(), MBB.end())}; + } + /// Insert a custom frame for outlined functions. 
virtual void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 7ce655dce8e34..e74264276d4c1 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -258,6 +258,10 @@ struct InstructionMapper { if (!TII.isMBBSafeToOutlineFrom(MBB, Flags)) return; + auto Ranges = TII.getOutlinableRanges(MBB, Flags); + if (Ranges.empty()) + return; + // Store info for the MBB for later outlining. MBBFlagsMap[&MBB] = Flags; @@ -280,34 +284,47 @@ struct InstructionMapper { std::vector UnsignedVecForMBB; std::vector InstrListForMBB; - for (MachineBasicBlock::iterator Et = MBB.end(); It != Et; ++It) { - // Keep track of where this instruction is in the module. - switch (TII.getOutliningType(It, Flags)) { - case InstrType::Illegal: + for (auto &Range : Ranges) { + auto RangeStart = Range.first; + auto RangeEnd = Range.second; + // Everything outside of an outlinable range is illegal. + for (; It != RangeStart; ++It) mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, InstrListForMBB); - break; - - case InstrType::Legal: - mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, - NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB); - break; - - case InstrType::LegalTerminator: - mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, - NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB); - // The instruction also acts as a terminator, so we have to record that - // in the string. - mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, + assert(It != MBB.end() && "Should still have instructions?"); + // `It` is now positioned at the beginning of a range of instructions + // which may be outlinable. Check if each instruction is known to be safe. + for (; It != RangeEnd; ++It) { + // Keep track of where this instruction is in the module. 
+ switch (TII.getOutliningType(It, Flags)) { + case InstrType::Illegal: + mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, + InstrListForMBB); + break; + + case InstrType::Legal: + mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, + NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB); - break; - - case InstrType::Invisible: - // Normally this is set by mapTo(Blah)Unsigned, but we just want to - // skip this instruction. So, unset the flag here. - ++NumInvisible; - AddedIllegalLastTime = false; - break; + break; + + case InstrType::LegalTerminator: + mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, + NumLegalInBlock, UnsignedVecForMBB, + InstrListForMBB); + // The instruction also acts as a terminator, so we have to record + // that in the string. + mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, + InstrListForMBB); + break; + + case InstrType::Invisible: + // Normally this is set by mapTo(Blah)Unsigned, but we just want to + // skip this instruction. So, unset the flag here. + ++NumInvisible; + AddedIllegalLastTime = false; + break; + } } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 84469dd257cab..d160a33b529e2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6783,48 +6783,11 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Properties about candidate MBBs that hold for all of them. unsigned FlagsSetInAll = 0xF; - - // Compute liveness information for each candidate, and set FlagsSetInAll. 
std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; }); - // According to the AArch64 Procedure Call Standard, the following are - // undefined on entry/exit from a function call: - // - // * Registers x16, x17, (and thus w16, w17) - // * Condition codes (and thus the NZCV register) - // - // Because if this, we can't outline any sequence of instructions where - // one - // of these registers is live into/across it. Thus, we need to delete - // those - // candidates. - auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { - // If the unsafe registers in this block are all dead, then we don't need - // to compute liveness here. - if (C.Flags & UnsafeRegsDead) - return false; - return C.isAnyUnavailableAcrossOrOutOfSeq( - {AArch64::W16, AArch64::W17, AArch64::NZCV}, TRI); - }; - - // Are there any candidates where those registers are live? - if (!(FlagsSetInAll & UnsafeRegsDead)) { - // Erase every candidate that violates the restrictions above. (It could be - // true that we have viable candidates, so it's not worth bailing out in - // the case that, say, 1 out of 20 candidates violate the restructions.) - llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); - - // If the sequence doesn't have enough candidates left, then we're done. - if (RepeatedSequenceLocs.size() < 2) - return outliner::OutlinedFunction(); - } - - // At this point, we have only "safe" candidates to outline. Figure out - // frame + call instruction information. - unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); // Helper lambda which sets call information for every candidate. @@ -6952,6 +6915,10 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Check if we have to save LR. for (outliner::Candidate &C : RepeatedSequenceLocs) { + bool LRAvailable = + (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere) + ? 
C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) + : true; // If we have a noreturn caller, then we're going to be conservative and // say that we have to save LR. If we don't have a ret at the end of the // block, then we can't reason about liveness accurately. @@ -6962,7 +6929,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); // Is LR available? If so, we don't need a save. - if (C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) && !IsNoReturn) { + if (LRAvailable && !IsNoReturn) { NumBytesNoStackCalls += 4; C.setCallInfo(MachineOutlinerNoLRSave, 4); CandidatesWithoutStackFixups.push_back(C); @@ -7134,72 +7101,88 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( return true; } -bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, - unsigned &Flags) const { - if (!TargetInstrInfo::isMBBSafeToOutlineFrom(MBB, Flags)) - return false; - // Check if LR is available through all of the MBB. If it's not, then set - // a flag. +SmallVector> +AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, + unsigned &Flags) const { assert(MBB.getParent()->getRegInfo().tracksLiveness() && - "Suitable Machine Function for outlining must track liveness"); - LiveRegUnits LRU(getRegisterInfo()); + "Must track liveness!"); + SmallVector< + std::pair> + Ranges; - std::for_each(MBB.rbegin(), MBB.rend(), - [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); + // The range [RangeBegin, RangeEnd). + MachineBasicBlock::instr_iterator RangeEnd = MBB.instr_end(); + MachineBasicBlock::instr_iterator RangeBegin = RangeEnd; + unsigned RangeLen = 0; - // Check if each of the unsafe registers are available... 
- bool W16AvailableInBlock = LRU.available(AArch64::W16); - bool W17AvailableInBlock = LRU.available(AArch64::W17); - bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); + // According to the AArch64 Procedure Call Standard, the following are + // undefined on entry/exit from a function call: + // + // * Registers x16, x17, (and thus w16, w17) + // * Condition codes (and thus the NZCV register) + // + // If any of these registers are used inside or live across an outlined + // function, then they may be modified later, either by the compiler or + // some other tool (like the linker). + // + // To avoid outlining in these situations, partition each block into ranges + // where these registers are dead. We will only outline from those ranges. + LiveRegUnits LRU(getRegisterInfo()); + auto AreAllUnsafeRegsDead = [&LRU]() { + return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) && + LRU.available(AArch64::NZCV); + }; - // If all of these are dead (and not live out), we know we don't have to check - // them later. - if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) - Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; + // We need to know if LR is live across an outlining boundary later on in + // order to decide how we'll create the outlined call, frame, etc. + // + // It's pretty expensive to check this for *every candidate* within a block. + // That's some potentially n^2 behaviour, since in the worst case, we'd need + // to compute liveness from the end of the block for O(n) candidates within + // the block. + // + // So, to improve the average case, let's keep track of liveness from the end + // of the block to the beginning of *every outlinable range*. If we know that + // LR is available in every range we could outline from, then we know that + // we don't need to check liveness for any candidate within that range. + bool LRAvailableEverywhere = true; - // Now, add the live outs to the set. + // Compute liveness bottom-up. 
LRU.addLiveOuts(MBB); - - // If any of these registers is available in the MBB, but also a live out of - // the block, then we know outlining is unsafe. - if (W16AvailableInBlock && !LRU.available(AArch64::W16)) - return false; - if (W17AvailableInBlock && !LRU.available(AArch64::W17)) - return false; - if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) - return false; - - // Check if there's a call inside this MachineBasicBlock. If there is, then - // set a flag. - if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) - Flags |= MachineOutlinerMBBFlags::HasCalls; - - MachineFunction *MF = MBB.getParent(); - - // In the event that we outline, we may have to save LR. If there is an - // available register in the MBB, then we'll always save LR there. Check if - // this is true. - bool CanSaveLR = false; - const AArch64RegisterInfo *ARI = static_cast( - MF->getSubtarget().getRegisterInfo()); - - // Check if there is an available register across the sequence that we can - // use. - for (unsigned Reg : AArch64::GPR64RegClass) { - if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && - Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { - CanSaveLR = true; - break; + for (auto &MI : make_range(MBB.instr_rbegin(), MBB.instr_rend())) { + LRU.stepBackward(MI); + // If we are in a range where all of the unsafe registers are dead, then + // update the beginning of the range. Also try to precalculate some stuff + // for getOutliningCandidateInfo. + if (AreAllUnsafeRegsDead()) { + if (MI.isCall()) + Flags |= MachineOutlinerMBBFlags::HasCalls; + LRAvailableEverywhere &= LRU.available(AArch64::LR); + RangeBegin = MI.getIterator(); + ++RangeLen; + continue; } - } - - // Check if we have a register we can save LR to, and if LR was used - // somewhere. If both of those things are true, then we need to evaluate the - // safety of outlining stack instructions later. 
- if (!CanSaveLR && !LRU.available(AArch64::LR)) + // At least one unsafe register is not dead. We do not want to outline at + // this point. If it is long enough to outline from, save the range + // [RangeBegin, RangeEnd). + if (RangeLen > 1) + Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); + // Start a new range where RangeEnd is the first known unsafe point. + RangeLen = 0; + RangeBegin = MI.getIterator(); + RangeEnd = MI.getIterator(); + } + // Above loop misses the last (or only) range. + if (AreAllUnsafeRegsDead() && RangeLen > 1) + Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); + if (Ranges.empty()) + return Ranges; + // We found the ranges bottom-up. Mapping expects the top-down. Reverse + // the order. + std::reverse(Ranges.begin(), Ranges.end()); + if (!LRAvailableEverywhere) Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; - - return true; + return Ranges; } outliner::InstrType diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 55b1813f0b301..677e1443191cd 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -280,10 +280,11 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { bool OutlineFromLinkOnceODRs) const override; outliner::OutlinedFunction getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const override; - outliner::InstrType - getOutliningType(MachineBasicBlock::iterator &MIT, unsigned Flags) const override; - bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, - unsigned &Flags) const override; + outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT, + unsigned Flags) const override; + SmallVector< + std::pair> + getOutlinableRanges(MachineBasicBlock &MBB, unsigned &Flags) const override; void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override; MachineBasicBlock::iterator From 
294072e10b9949323bf20695085723158f7c873c Mon Sep 17 00:00:00 2001 From: Lian Wang Date: Tue, 22 Feb 2022 02:20:32 +0000 Subject: [PATCH 460/748] [RISCV] Add more tests for SHLFI and UNSHFLI aliaes in Zbp extension RV32/RV64: zip.n/zip2.b/zip.b/zip4.h/zip2.h/zip.h unzip.n/unzip2.b/unzip.b/unzip4.h/unzip2.h/unzip.h Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D120241 --- llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll | 108 ++++++++++ llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll | 216 +++++++++++++++++++ 2 files changed, 324 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll index 816a27b2be4da..4f5ccca74b2cb 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll @@ -83,6 +83,60 @@ define i32 @shfl32_demandedbits(i32 %a, i32 %b) nounwind { ret i32 %tmp } +define i32 @zipni32(i32 %a) nounwind { +; RV32ZBP-LABEL: zipni32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip.n a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 1) + ret i32 %tmp +} + +define i32 @zip2bi32(i32 %a) nounwind { +; RV32ZBP-LABEL: zip2bi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip2.b a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 2) + ret i32 %tmp +} + +define i32 @zipbi32(i32 %a) nounwind { +; RV32ZBP-LABEL: zipbi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip.b a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 3) + ret i32 %tmp +} + +define i32 @zip4hi32(i32 %a) nounwind { +; RV32ZBP-LABEL: zip4hi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip4.h a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 4) + ret i32 %tmp +} + +define i32 @zip2hi32(i32 %a) nounwind { +; RV32ZBP-LABEL: zip2hi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip2.h a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 6) + ret i32 %tmp +} + +define i32 
@ziphi32(i32 %a) nounwind { +; RV32ZBP-LABEL: ziphi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: zip.h a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 7) + ret i32 %tmp +} + define i32 @shfli32(i32 %a) nounwind { ; RV32ZBP-LABEL: shfli32: ; RV32ZBP: # %bb.0: @@ -149,6 +203,60 @@ define i32 @unshfl32_demandedbits(i32 %a, i32 %b) nounwind { ret i32 %tmp } +define i32 @unzipni32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzipni32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip.n a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 1) + ret i32 %tmp +} + +define i32 @unzip2bi32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzip2bi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip2.b a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 2) + ret i32 %tmp +} + +define i32 @unzipbi32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzipbi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip.b a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 3) + ret i32 %tmp +} + +define i32 @unzip4hi32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzip4hi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip4.h a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 4) + ret i32 %tmp +} + +define i32 @unzip2hi32(i32 %a) nounwind { +; RV32ZBP-LABEL: unzip2hi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip2.h a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 6) + ret i32 %tmp +} + +define i32 @unziphi32(i32 %a) nounwind { +; RV32ZBP-LABEL: unziphi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: unzip.h a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 7) + ret i32 %tmp +} + define i32 @unshfli32(i32 %a) nounwind { ; RV32ZBP-LABEL: unshfli32: ; RV32ZBP: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll index 5a1736a250e57..b236fb6f060e8 100644 --- 
a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll @@ -91,6 +91,60 @@ define signext i32 @shfl32_demandedbits(i32 signext %a, i32 signext %b, i32 sign ret i32 %tmp } +define signext i32 @zipni32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zipni32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.n a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 1) + ret i32 %tmp +} + +define signext i32 @zip2bi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zip2bi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip2.b a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 2) + ret i32 %tmp +} + +define signext i32 @zipbi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zipbi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.b a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 3) + ret i32 %tmp +} + +define signext i32 @zip4hi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zip4hi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip4.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 4) + ret i32 %tmp +} + +define signext i32 @zip2hi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: zip2hi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip2.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 6) + ret i32 %tmp +} + +define signext i32 @ziphi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: ziphi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.shfl.i32(i32 %a, i32 7) + ret i32 %tmp +} + define signext i32 @shfli32(i32 signext %a) nounwind { ; RV64ZBP-LABEL: shfli32: ; RV64ZBP: # %bb.0: @@ -159,6 +213,60 @@ define signext i32 @unshfl32_demandedbits(i32 signext %a, i32 signext %b, i32 si ret i32 %tmp } +define signext i32 @unzipni32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzipni32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.n a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 
@llvm.riscv.unshfl.i32(i32 %a, i32 1) + ret i32 %tmp +} + +define signext i32 @unzip2bi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzip2bi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip2.b a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 2) + ret i32 %tmp +} + +define signext i32 @unzipbi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzipbi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.b a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 3) + ret i32 %tmp +} + +define signext i32 @unzip4hi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzip4hi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip4.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 4) + ret i32 %tmp +} + +define signext i32 @unzip2hi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unzip2hi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip2.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 6) + ret i32 %tmp +} + +define signext i32 @unziphi32(i32 signext %a) nounwind { +; RV64ZBP-LABEL: unziphi32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.unshfl.i32(i32 %a, i32 7) + ret i32 %tmp +} + define signext i32 @unshfli32(i32 signext %a) nounwind { ; RV64ZBP-LABEL: unshfli32: ; RV64ZBP: # %bb.0: @@ -359,6 +467,60 @@ define i64 @zipi64(i64 %a) nounwind { ret i64 %tmp } +define i64 @zipni64(i64 %a) nounwind { +; RV64ZBP-LABEL: zipni64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.n a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 1) + ret i64 %tmp +} + +define i64 @zip2bi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip2bi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip2.b a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 2) + ret i64 %tmp +} + +define i64 @zipbi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zipbi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.b a0, a0 +; RV64ZBP-NEXT: ret 
+ %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 3) + ret i64 %tmp +} + +define i64 @zip4hi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip4hi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip4.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 4) + ret i64 %tmp +} + +define i64 @zip2hi64(i64 %a) nounwind { +; RV64ZBP-LABEL: zip2hi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip2.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 6) + ret i64 %tmp +} + +define i64 @ziphi64(i64 %a) nounwind { +; RV64ZBP-LABEL: ziphi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: zip.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.shfl.i64(i64 %a, i64 7) + ret i64 %tmp +} + declare i64 @llvm.riscv.unshfl.i64(i64 %a, i64 %b) define i64 @unshfl64(i64 %a, i64 %b) nounwind { @@ -452,6 +614,60 @@ define i64 @unzipi64(i64 %a) nounwind { ret i64 %tmp } +define i64 @unzipni64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzipni64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.n a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 1) + ret i64 %tmp +} + +define i64 @unzip2bi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzip2bi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip2.b a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 2) + ret i64 %tmp +} + +define i64 @unzipbi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzipbi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.b a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 3) + ret i64 %tmp +} + +define i64 @unzip4hi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzip4hi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip4.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 4) + ret i64 %tmp +} + +define i64 @unzip2hi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unzip2hi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip2.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 6) + ret i64 
%tmp +} + +define i64 @unziphi64(i64 %a) nounwind { +; RV64ZBP-LABEL: unziphi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: unzip.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.unshfl.i64(i64 %a, i64 7) + ret i64 %tmp +} + declare i64 @llvm.riscv.xperm.n.i64(i64 %a, i64 %b) define i64 @xpermn64(i64 %a, i64 %b) nounwind { From 456ffd7a225f908ca61d747be21439f6182ea40c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 21 Feb 2022 12:40:16 -0500 Subject: [PATCH 461/748] [OpenMP] Ensure offloading sections do not have SHF_ALLOC flag We use offloading sections in the new Clang driver scheme to embed device code into the host. We later use these sections to link the device image, after which point they are completely unused and should not be loaded into memory if they are still in the executable. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D120275 --- llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 3 ++- llvm/test/CodeGen/X86/offload_sections.ll | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/offload_sections.ll diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 0892c74a7a24b..348470bd7687d 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -446,7 +446,8 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) { /*AddSegmentInfo=*/false) || Name == getInstrProfSectionName(IPSK_covfun, Triple::ELF, /*AddSegmentInfo=*/false) || - Name == ".llvmbc" || Name == ".llvmcmd") + Name == ".llvmbc" || Name == ".llvmcmd" || + Name.startswith(".llvm.offloading.")) return SectionKind::getMetadata(); if (Name.empty() || Name[0] != '.') return K; diff --git a/llvm/test/CodeGen/X86/offload_sections.ll b/llvm/test/CodeGen/X86/offload_sections.ll new file mode 100644 index 0000000000000..dcd6dcb7cf3fc --- /dev/null +++ 
b/llvm/test/CodeGen/X86/offload_sections.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +@llvm.embedded.object = hidden constant [1 x i8] c"\00", section ".llvm.offloading.dummy" +@llvm.compiler.used = appending global [1 x i8*] [i8* getelementptr inbounds ([1 x i8], [1 x i8]* @llvm.embedded.object, i32 0, i32 0)], section "llvm.metadata" + +; CHECK-DAG: .section .llvm.offloading.dummy,"" From 14101f48d205b6cbf65b28c469d898e90e3995d2 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Mon, 21 Feb 2022 18:46:12 -0800 Subject: [PATCH 462/748] [LLDB] Remove recursive include of GDBRemoteCommunicationServerCommon.h GDBRemoteCommunicationServerCommon.h includes itself, removing this include. Differential Revision: https://reviews.llvm.org/D120105 --- .../Process/gdb-remote/GDBRemoteCommunicationServerCommon.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.h index 029972348ef01..f696cb5c61c66 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.h @@ -15,7 +15,6 @@ #include "lldb/lldb-private-forward.h" #include "GDBRemoteCommunicationServer.h" -#include "GDBRemoteCommunicationServerCommon.h" class StringExtractorGDBRemote; From b4990ac33015200b74d830beaea2883d313ac16c Mon Sep 17 00:00:00 2001 From: Tanya Lattner Date: Mon, 21 Feb 2022 18:58:48 -0800 Subject: [PATCH 463/748] Update references to the mailing lists that have moved to Discourse. 
--- llvm/docs/GettingInvolved.rst | 33 +++++++++++---------------------- llvm/docs/Security.rst | 6 +++--- llvm/docs/Statepoints.rst | 3 +-- llvm/docs/SupportPolicy.rst | 4 ++-- llvm/docs/index.rst | 2 +- 5 files changed, 18 insertions(+), 30 deletions(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 5d0ed6299f0b4..f7cd97b19e9db 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -96,21 +96,18 @@ Information about LLVM's development process. :doc:`Release notes for the current release ` This describes new features, known bugs, and other limitations. -.. _mailing-lists: +.. _lists-forums: -Mailing Lists -------------- +Forums & Mailing Lists +---------------------- -If you can't find what you need in these docs, try consulting the mailing -lists. In addition to the traditional mailing lists there is also a -`Discourse server `_ available. +If you can't find what you need in these docs, try consulting the +Discourse forums. There are also commit mailing lists for all commits to the LLVM Project. -`Developer's List (llvm-dev)`__ - This list is for people who want to be included in technical discussions of - LLVM. People post to this list when they have questions about writing code - for or using the LLVM tools. It is relatively low volume. +`LLVM Discourse`__ + The forums for all things LLVM and related sub-projects. There are categories and subcategories for a wide variety of areas within LLVM. You can also view tags or search for a specific topic. - .. __: http://lists.llvm.org/mailman/listinfo/llvm-dev + .. __: https://discourse.llvm.org/ `Commits Archive (llvm-commits)`__ This list contains all commit messages that are made when LLVM developers @@ -127,18 +124,10 @@ lists. In addition to the traditional mailing lists there is also a .. 
__: http://lists.llvm.org/pipermail/llvm-bugs/ -`Test Results Archive (llvm-testresults)`__ - A message is automatically sent to this list by every active nightly tester - when it completes. As such, this list gets email several times each day, - making it a high volume list. +`LLVM Announcements`__ + If you just want project wide announcements such as releases, developers meetings, or blog posts, then you should check out the Announcement category on LLVM Discourse. - .. __: http://lists.llvm.org/pipermail/llvm-testresults/ - -`LLVM Announcements List (llvm-announce)`__ - This is a low volume list that provides important announcements regarding - LLVM. It gets email about once a month. - - .. __: http://lists.llvm.org/mailman/listinfo/llvm-announce + .. __: https://discourse.llvm.org/c/announce/46 .. _online-sync-ups: diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst index 04cf5cabf8793..009cdafcb6381 100644 --- a/llvm/docs/Security.rst +++ b/llvm/docs/Security.rst @@ -22,7 +22,7 @@ How to report a security issue? To report a security issue in the LLVM Project, please `open a new issue`_ in the LLVM project page, on the chromium issue tracker. Be sure to use the "Security bug report" template. -We aim to acknowledge your report within two business days since you first reach out. If you do not receive any response by then, you can escalate by sending a message to the `llvm-dev mailing list`_ asking to get in touch with someone from the LLVM Security Group. **The escalation mailing list is public**: avoid discussing or mentioning the specific issue when posting on it. +We aim to acknowledge your report within two business days since you first reach out. If you do not receive any response by then, you can escalate by posting on the `Discourse forums`_ asking to get in touch with someone from the LLVM Security Group. **The escalation mailing list is public**: avoid discussing or mentioning the specific issue when posting on it. 
Group Composition @@ -177,7 +177,7 @@ We also occasionally need to discuss logistics of the LLVM Security Group itself * Propose member removal. * Suggest policy changes. -We often have these discussions publicly, in our :ref:`monthly public sync-up call ` and on public LLVM mailing lists. For internal or confidential discussions, we also use a private mailing list. +We often have these discussions publicly, in our :ref:`monthly public sync-up call ` and on the Discourse forums. For internal or confidential discussions, we also use a private mailing list. Process ======= @@ -230,6 +230,6 @@ The parts of the LLVM Project which are currently treated as non-security sensit .. _open a new issue: https://bugs.chromium.org/p/llvm/issues/entry .. _chromium issue tracker: https://crbug.com .. _GitHub security: https://help.github.com/en/articles/about-maintainer-security-advisories -.. _llvm-dev mailing list: https://lists.llvm.org/mailman/listinfo/llvm-dev +.. _Discourse forums: https://discourse.llvm.org .. _MITRE: https://cve.mitre.org .. _example nomination is available here: https://reviews.llvm.org/D99232 diff --git a/llvm/docs/Statepoints.rst b/llvm/docs/Statepoints.rst index 15b4406761ef6..25f0a093c458c 100644 --- a/llvm/docs/Statepoints.rst +++ b/llvm/docs/Statepoints.rst @@ -803,7 +803,6 @@ tracked by performing a `bugzilla search `_ for [Statepoint] in the summary field. When filing new bugs, please use this tag so that interested parties see the newly filed bug. As -with most LLVM features, design discussions take place on `llvm-dev -`_, and patches +with most LLVM features, design discussions take place on the `Discourse forums `_ and patches should be sent to `llvm-commits `_ for review. diff --git a/llvm/docs/SupportPolicy.rst b/llvm/docs/SupportPolicy.rst index 2d68e4801793a..0766522a9c578 100644 --- a/llvm/docs/SupportPolicy.rst +++ b/llvm/docs/SupportPolicy.rst @@ -201,8 +201,8 @@ about it. 
In that sense, code will never be removed outright without a series of steps are taken. A minimum set of steps should be: - #. A proposal for removal / deactivation should be made to the developers' - mailing lists (``llvm-dev``, ``cfe-dev``, ``lldb-dev``, etc), with a clear + #. A proposal for removal / deactivation should be made to the Discourse forums + (under the appropriate category), with a clear statement of the maintenance costs imposed and the alternatives, if applicable. #. There must be enough consensus on the list that removal is warranted, and no diff --git a/llvm/docs/index.rst b/llvm/docs/index.rst index 12e30adae43ed..d1e1e785b0efb 100644 --- a/llvm/docs/index.rst +++ b/llvm/docs/index.rst @@ -83,7 +83,7 @@ LLVM welcomes contributions of all kinds. To learn more, see the following artic * :doc:`GettingInvolved` * :ref:`development-process` -* :ref:`mailing-lists` +* :ref:`lists-forums` * :ref:`meetups-social-events` * :ref:`community-proposals` From 95fed2b267ee87e6cba72ebe73f0d6fdab37f995 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 21 Feb 2022 23:11:13 -0500 Subject: [PATCH 464/748] [Driver][OpenBSD] Pass sysroot to the linker --- clang/lib/Driver/ToolChains/OpenBSD.cpp | 3 +++ clang/test/Driver/openbsd.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index bcd54bedfa897..7f19587f5f824 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -123,6 +123,9 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, // handled somewhere else. 
Args.ClaimAllArgs(options::OPT_w); + if (!D.SysRoot.empty()) + CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); + if (ToolChain.getArch() == llvm::Triple::mips64) CmdArgs.push_back("-EB"); else if (ToolChain.getArch() == llvm::Triple::mips64el) diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c index d6d5ae994e67e..da35d0441eb80 100644 --- a/clang/test/Driver/openbsd.c +++ b/clang/test/Driver/openbsd.c @@ -54,6 +54,12 @@ // CHECK-MIPS64EL-LD: clang{{.*}}" "-cc1" "-triple" "mips64el-unknown-openbsd" // CHECK-MIPS64EL-LD: ld{{.*}}" "-EL" "-e" "__start" "--eh-frame-hdr" "-Bdynamic" "-dynamic-linker" "{{.*}}ld.so" "-o" "a.out" "{{.*}}crt0.o" "{{.*}}crtbegin.o" "-L{{.*}}" "{{.*}}.o" "-lcompiler_rt" "-lc" "-lcompiler_rt" "{{.*}}crtend.o" +// Check that --sysroot is passed to the linker +// RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd %s -### 2>&1 \ +// RUN: --sysroot=%S/Inputs/basic_netbsd_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-SYSROOT %s +// CHECK-LD-SYSROOT: ld{{.*}}" "--sysroot=[[SYSROOT:[^"]+]]" + // Check passing options to the assembler for various OpenBSD targets // RUN: %clang -target amd64-pc-openbsd -m32 -### -no-integrated-as -c %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK-AMD64-M32 %s From 77e60bc42c48e16d646488d43210b1630cd4db49 Mon Sep 17 00:00:00 2001 From: owenca Date: Mon, 7 Feb 2022 22:58:50 -0800 Subject: [PATCH 465/748] [clang-format] Add option to insert braces after control statements Adds a new option InsertBraces to insert the optional braces after if, else, for, while, and do in C++. 
Differential Revision: https://reviews.llvm.org/D120217 --- clang/docs/ClangFormatStyleOptions.rst | 33 ++++ clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Format/Format.h | 32 ++++ clang/lib/Format/Format.cpp | 51 +++++- clang/lib/Format/FormatToken.h | 6 + clang/lib/Format/UnwrappedLineParser.cpp | 69 +++++--- clang/lib/Format/UnwrappedLineParser.h | 1 + clang/unittests/Format/FormatTest.cpp | 197 +++++++++++++++++++++++ 8 files changed, 373 insertions(+), 19 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 0cddf022ead3c..2cb67fa5492e3 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -2756,6 +2756,39 @@ the configuration (without a prefix: ``Auto``). LoooooooooooooooooooooooooooooooooooooooongReturnType LoooooooooooooooooooooooooooooooongFunctionDeclaration(); +**InsertBraces** (``Boolean``) :versionbadge:`clang-format 15` + Insert braces after control statements (``if``, ``else``, ``for``, ``do``, + and ``while``) in C++ unless the control statements are inside macro + definitions or the braces would enclose preprocessor directives. + + .. warning:: + + Setting this option to `true` could lead to incorrect code formatting due + to clang-format's lack of complete semantic information. As such, extra + care should be taken to review code changes made by this option. + + .. code-block:: c++ + + false: true: + + if (isa(D)) vs. if (isa(D)) { + handleFunctionDecl(D); handleFunctionDecl(D); + else if (isa(D)) } else if (isa(D)) { + handleVarDecl(D); handleVarDecl(D); + else } else { + return; return; + } + + while (i--) vs. while (i--) { + for (auto *A : D.attrs()) for (auto *A : D.attrs()) { + handleAttr(A); handleAttr(A); + } + } + + do vs. 
do { + --i; --i; + while (i); } while (i); + **InsertTrailingCommas** (``TrailingCommaStyle``) :versionbadge:`clang-format 12` If set to ``TCS_Wrapped`` will insert trailing commas in container literals (arrays and objects) that wrap across multiple lines. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 43a2cf98e7c8b..499b065fe6e07 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -188,6 +188,9 @@ clang-format - Changed ``BreakBeforeConceptDeclarations`` from ``Boolean`` to an enum. +- Option ``InsertBraces`` has been added to insert optional braces after control + statements. + libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index d4a479e7c5120..484438306b358 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2571,6 +2571,38 @@ struct FormatStyle { /// \version 3.7 bool IndentWrappedFunctionNames; + /// Insert braces after control statements (``if``, ``else``, ``for``, ``do``, + /// and ``while``) in C++ unless the control statements are inside macro + /// definitions or the braces would enclose preprocessor directives. + /// \warning + /// Setting this option to `true` could lead to incorrect code formatting due + /// to clang-format's lack of complete semantic information. As such, extra + /// care should be taken to review code changes made by this option. + /// \endwarning + /// \code + /// false: true: + /// + /// if (isa(D)) vs. if (isa(D)) { + /// handleFunctionDecl(D); handleFunctionDecl(D); + /// else if (isa(D)) } else if (isa(D)) { + /// handleVarDecl(D); handleVarDecl(D); + /// else } else { + /// return; return; + /// } + /// + /// while (i--) vs. while (i--) { + /// for (auto *A : D.attrs()) for (auto *A : D.attrs()) { + /// handleAttr(A); handleAttr(A); + /// } + /// } + /// + /// do vs. 
do { + /// --i; --i; + /// while (i); } while (i); + /// \endcode + /// \version 15 + bool InsertBraces; + /// A vector of prefixes ordered by the desired groups for Java imports. /// /// One group's prefix can be a subset of another - the longest prefix is diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index bc3f0c93426bf..ec6574b33a8cf 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -768,6 +768,7 @@ template <> struct MappingTraits { IO.mapOptional("IndentWidth", Style.IndentWidth); IO.mapOptional("IndentWrappedFunctionNames", Style.IndentWrappedFunctionNames); + IO.mapOptional("InsertBraces", Style.InsertBraces); IO.mapOptional("InsertTrailingCommas", Style.InsertTrailingCommas); IO.mapOptional("JavaImportGroups", Style.JavaImportGroups); IO.mapOptional("JavaScriptQuotes", Style.JavaScriptQuotes); @@ -1223,6 +1224,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.IndentWrappedFunctionNames = false; LLVMStyle.IndentWidth = 2; LLVMStyle.PPIndentWidth = -1; + LLVMStyle.InsertBraces = false; LLVMStyle.InsertTrailingCommas = FormatStyle::TCS_None; LLVMStyle.JavaScriptQuotes = FormatStyle::JSQS_Leave; LLVMStyle.JavaScriptWrapImports = true; @@ -1661,7 +1663,7 @@ ParseError validateQualifierOrder(FormatStyle *Style) { return ParseError::DuplicateQualifierSpecified; } - // Ensure the list has 'type' in it + // Ensure the list has 'type' in it. 
auto type = std::find(Style->QualifierOrder.begin(), Style->QualifierOrder.end(), "type"); if (type == Style->QualifierOrder.end()) @@ -1821,6 +1823,48 @@ class BracesRemover : public TokenAnalyzer { } }; +class BracesInserter : public TokenAnalyzer { +public: + BracesInserter(const Environment &Env, const FormatStyle &Style) + : TokenAnalyzer(Env, Style) {} + + std::pair + analyze(TokenAnnotator &Annotator, + SmallVectorImpl &AnnotatedLines, + FormatTokenLexer &Tokens) override { + AffectedRangeMgr.computeAffectedLines(AnnotatedLines); + tooling::Replacements Result; + insertBraces(AnnotatedLines, Result); + return {Result, 0}; + } + +private: + void insertBraces(SmallVectorImpl &Lines, + tooling::Replacements &Result) { + const auto &SourceMgr = Env.getSourceManager(); + for (AnnotatedLine *Line : Lines) { + insertBraces(Line->Children, Result); + if (!Line->Affected) + continue; + for (FormatToken *Token = Line->First; Token && !Token->Finalized; + Token = Token->Next) { + if (Token->BraceCount == 0) + continue; + std::string Brace; + if (Token->BraceCount < 0) { + assert(Token->BraceCount == -1); + Brace = '{'; + } else { + Brace = std::string(Token->BraceCount, '}'); + } + Token->BraceCount = 0; + const auto Start = Token->Tok.getEndLoc(); + cantFail(Result.add(tooling::Replacement(SourceMgr, Start, 0, Brace))); + } + } + } +}; + class JavaScriptRequoter : public TokenAnalyzer { public: JavaScriptRequoter(const Environment &Env, const FormatStyle &Style) @@ -3133,6 +3177,11 @@ reformat(const FormatStyle &Style, StringRef Code, }); } + if (Style.isCpp() && Style.InsertBraces) + Passes.emplace_back([&](const Environment &Env) { + return BracesInserter(Env, Expanded).process(); + }); + if (Style.isCpp() && Style.RemoveBracesLLVM) Passes.emplace_back([&](const Environment &Env) { return BracesRemover(Env, Expanded).process(); diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 6b7d475232b0e..5f05986addf6a 100644 --- 
a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -489,6 +489,12 @@ struct FormatToken { /// Is optional and can be removed. bool Optional = false; + /// Number of optional braces to be inserted after this token: + /// -1: a single left brace + /// 0: no braces + /// >0: number of right braces + int8_t BraceCount = 0; + /// If this token starts a block, this contains all the unwrapped lines /// in it. SmallVector Children; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 4c5ab5346b7dd..7d29afb0c042c 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2300,6 +2300,53 @@ void UnwrappedLineParser::keepAncestorBraces() { NestedTooDeep.push_back(false); } +static FormatToken *getLastNonComment(const UnwrappedLine &Line) { + for (const auto &Token : llvm::reverse(Line.Tokens)) + if (Token.Tok->isNot(tok::comment)) + return Token.Tok; + + return nullptr; +} + +void UnwrappedLineParser::parseUnbracedBody(bool CheckEOF) { + FormatToken *Tok = nullptr; + + if (Style.InsertBraces && !Line->InPPDirective && !Line->Tokens.empty() && + PreprocessorDirectives.empty()) { + Tok = getLastNonComment(*Line); + assert(Tok); + if (Tok->BraceCount < 0) { + assert(Tok->BraceCount == -1); + Tok = nullptr; + } else { + Tok->BraceCount = -1; + } + } + + addUnwrappedLine(); + ++Line->Level; + parseStructuralElement(); + + if (Tok) { + assert(!Line->InPPDirective); + Tok = nullptr; + for (const auto &L : llvm::reverse(*CurrentLines)) { + if (!L.InPPDirective) { + Tok = getLastNonComment(L); + if (Tok) + break; + } + } + assert(Tok); + ++Tok->BraceCount; + } + + if (CheckEOF && FormatTok->is(tok::eof)) + addUnwrappedLine(); + + --Line->Level; +} + static void markOptionalBraces(FormatToken *LeftBrace) { if (!LeftBrace) return; @@ -2354,10 +2401,7 @@ FormatToken *UnwrappedLineParser::parseIfThenElse(IfStmtKind *IfKind, else NeedsUnwrappedLine = true; } else { - 
addUnwrappedLine(); - ++Line->Level; - parseStructuralElement(); - --Line->Level; + parseUnbracedBody(); } bool KeepIfBraces = false; @@ -2403,12 +2447,7 @@ FormatToken *UnwrappedLineParser::parseIfThenElse(IfStmtKind *IfKind, if (IsPrecededByComment) --Line->Level; } else { - addUnwrappedLine(); - ++Line->Level; - parseStructuralElement(); - if (FormatTok->is(tok::eof)) - addUnwrappedLine(); - --Line->Level; + parseUnbracedBody(/*CheckEOF=*/true); } } else { if (Style.RemoveBracesLLVM) @@ -2654,10 +2693,7 @@ void UnwrappedLineParser::parseForOrWhileLoop() { } addUnwrappedLine(); } else { - addUnwrappedLine(); - ++Line->Level; - parseStructuralElement(); - --Line->Level; + parseUnbracedBody(); } if (Style.RemoveBracesLLVM) @@ -2676,10 +2712,7 @@ void UnwrappedLineParser::parseDoWhile() { if (Style.BraceWrapping.BeforeWhile) addUnwrappedLine(); } else { - addUnwrappedLine(); - ++Line->Level; - parseStructuralElement(); - --Line->Level; + parseUnbracedBody(); } if (Style.RemoveBracesLLVM) diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 52f7618d9beab..b2a2ae1bedc17 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -119,6 +119,7 @@ class UnwrappedLineParser { void parseParens(TokenType AmpAmpTokenType = TT_Unknown); void parseSquare(bool LambdaIntroducer = false); void keepAncestorBraces(); + void parseUnbracedBody(bool CheckEOF = false); FormatToken *parseIfThenElse(IfStmtKind *IfKind, bool KeepBraces = false); void parseTryCatch(); void parseForOrWhileLoop(); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index f6810766d83db..51f6239bf2100 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -19474,6 +19474,7 @@ TEST_F(FormatTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL_FIELD(IndentRequiresClause, "IndentRequires"); CHECK_PARSE_BOOL(IndentRequiresClause); 
CHECK_PARSE_BOOL(IndentWrappedFunctionNames); + CHECK_PARSE_BOOL(InsertBraces); CHECK_PARSE_BOOL(KeepEmptyLinesAtTheStartOfBlocks); CHECK_PARSE_BOOL(ObjCSpaceAfterProperty); CHECK_PARSE_BOOL(ObjCSpaceBeforeProtocolList); @@ -24300,6 +24301,202 @@ TEST_F(FormatTest, ShortTemplatedArgumentLists) { verifyFormat("template struct Foo {};", Style); } +TEST_F(FormatTest, InsertBraces) { + FormatStyle Style = getLLVMStyle(); + Style.InsertBraces = true; + + verifyFormat("// clang-format off\n" + "// comment\n" + "if (a) f();\n" + "// clang-format on\n" + "if (b) {\n" + " g();\n" + "}", + "// clang-format off\n" + "// comment\n" + "if (a) f();\n" + "// clang-format on\n" + "if (b) g();", + Style); + + verifyFormat("if (a) {\n" + " switch (b) {\n" + " case 1:\n" + " c = 0;\n" + " break;\n" + " default:\n" + " c = 1;\n" + " }\n" + "}", + "if (a)\n" + " switch (b) {\n" + " case 1:\n" + " c = 0;\n" + " break;\n" + " default:\n" + " c = 1;\n" + " }", + Style); + + verifyFormat("for (auto node : nodes) {\n" + " if (node) {\n" + " break;\n" + " }\n" + "}", + "for (auto node : nodes)\n" + " if (node)\n" + " break;", + Style); + + verifyFormat("for (auto node : nodes) {\n" + " if (node)\n" + "}", + "for (auto node : nodes)\n" + " if (node)", + Style); + + verifyFormat("do {\n" + " --a;\n" + "} while (a);", + "do\n" + " --a;\n" + "while (a);", + Style); + + verifyFormat("if (i) {\n" + " ++i;\n" + "} else {\n" + " --i;\n" + "}", + "if (i)\n" + " ++i;\n" + "else {\n" + " --i;\n" + "}", + Style); + + verifyFormat("void f() {\n" + " while (j--) {\n" + " while (i) {\n" + " --i;\n" + " }\n" + " }\n" + "}", + "void f() {\n" + " while (j--)\n" + " while (i)\n" + " --i;\n" + "}", + Style); + + verifyFormat("f({\n" + " if (a) {\n" + " g();\n" + " }\n" + "});", + "f({\n" + " if (a)\n" + " g();\n" + "});", + Style); + + verifyFormat("if (a) {\n" + " f();\n" + "} else if (b) {\n" + " g();\n" + "} else {\n" + " h();\n" + "}", + "if (a)\n" + " f();\n" + "else if (b)\n" + " g();\n" + "else\n" + " 
h();", + Style); + + verifyFormat("if (a) {\n" + " f();\n" + "}\n" + "// comment\n" + "/* comment */", + "if (a)\n" + " f();\n" + "// comment\n" + "/* comment */", + Style); + + verifyFormat("if (a) {\n" + " // foo\n" + " // bar\n" + " f();\n" + "}", + "if (a)\n" + " // foo\n" + " // bar\n" + " f();", + Style); + + verifyFormat("if (a) { // comment\n" + " // comment\n" + " f();\n" + "}", + "if (a) // comment\n" + " // comment\n" + " f();", + Style); + + verifyFormat("if (a) {\n" + " f();\n" + "}\n" + "#undef A\n" + "#undef B", + "if (a)\n" + " f();\n" + "#undef A\n" + "#undef B", + Style); + + verifyFormat("if (a)\n" + "#ifdef A\n" + " f();\n" + "#else\n" + " g();\n" + "#endif", + Style); + + verifyFormat("#if 0\n" + "#elif 1\n" + "#endif\n" + "void f() {\n" + " if (a) {\n" + " g();\n" + " }\n" + "}", + "#if 0\n" + "#elif 1\n" + "#endif\n" + "void f() {\n" + " if (a) g();\n" + "}", + Style); + + Style.ColumnLimit = 15; + + verifyFormat("#define A \\\n" + " if (a) \\\n" + " f();", + Style); + + verifyFormat("if (a + b >\n" + " c) {\n" + " f();\n" + "}", + "if (a + b > c)\n" + " f();", + Style); +} + TEST_F(FormatTest, RemoveBraces) { FormatStyle Style = getLLVMStyle(); Style.RemoveBracesLLVM = true; From 289b725051cfb4a7167936db89583aa6b8a12d18 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 21 Feb 2022 23:44:18 -0500 Subject: [PATCH 466/748] [Driver][OpenBSD] Test tweaking and clean up --- clang/test/Driver/openbsd.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c index da35d0441eb80..04a46f2862e71 100644 --- a/clang/test/Driver/openbsd.c +++ b/clang/test/Driver/openbsd.c @@ -1,14 +1,10 @@ -// RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd %s -### 2>&1 \ -// RUN: | FileCheck --check-prefix=CHECK-LD %s -// CHECK-LD: clang{{.*}}" "-cc1" "-triple" "i686-pc-openbsd" -// CHECK-LD: ld{{.*}}" "-e" "__start" "--eh-frame-hdr" "-Bdynamic" "-dynamic-linker" 
"{{.*}}ld.so" "-o" "a.out" "{{.*}}crt0.o" "{{.*}}crtbegin.o" "{{.*}}.o" "-lcompiler_rt" "-lc" "-lcompiler_rt" "{{.*}}crtend.o" - // Check for --eh-frame-hdr being passed with static linking // RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd -static %s -### 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-LD-STATIC-EH %s // CHECK-LD-STATIC-EH: clang{{.*}}" "-cc1" "-triple" "i686-pc-openbsd" // CHECK-LD-STATIC-EH: ld{{.*}}" "-e" "__start" "--eh-frame-hdr" "-Bstatic" "-o" "a.out" "{{.*}}rcrt0.o" "{{.*}}crtbegin.o" "{{.*}}.o" "-lcompiler_rt" "-lc" "-lcompiler_rt" "{{.*}}crtend.o" +// Check for profiling variants of libraries when linking and -nopie // RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd -pg -pthread %s -### 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PG %s // CHECK-PG: clang{{.*}}" "-cc1" "-triple" "i686-pc-openbsd" @@ -56,7 +52,7 @@ // Check that --sysroot is passed to the linker // RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd %s -### 2>&1 \ -// RUN: --sysroot=%S/Inputs/basic_netbsd_tree \ +// RUN: --sysroot=%S/Inputs/basic_openbsd_tree \ // RUN: | FileCheck --check-prefix=CHECK-LD-SYSROOT %s // CHECK-LD-SYSROOT: ld{{.*}}" "--sysroot=[[SYSROOT:[^"]+]]" @@ -86,11 +82,6 @@ // CHECK-MIPS64EL: as{{.*}}" "-mabi" "64" "-EL" // CHECK-MIPS64EL-PIC: as{{.*}}" "-mabi" "64" "-EL" "-KPIC" -// Check that the integrated assembler is enabled for SPARC -// RUN: %clang -target sparc64-unknown-openbsd -### -c %s 2>&1 \ -// RUN: | FileCheck -check-prefix=CHECK-IAS %s -// CHECK-IAS-NOT: "-no-integrated-as" - // Check linking against correct startup code when (not) using PIE // RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd %s -### 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK-PIE %s From 8d9eeb03b3e9c800843659f243242f262d7bd786 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Tue, 22 Feb 2022 00:20:46 -0500 Subject: [PATCH 467/748] [Driver][OpenBSD] Add comments for C++ tests --- clang/test/Driver/openbsd.cpp | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/clang/test/Driver/openbsd.cpp b/clang/test/Driver/openbsd.cpp index 23c365d28e7ed..417783b8d5a2b 100644 --- a/clang/test/Driver/openbsd.cpp +++ b/clang/test/Driver/openbsd.cpp @@ -1,3 +1,4 @@ +// Check libraries used when linking C++ // RUN: %clangxx %s -### -o %t.o -target amd64-pc-openbsd 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-CXX %s // RUN: %clangxx %s -### -o %t.o -target i686-pc-openbsd 2>&1 \ @@ -8,6 +9,7 @@ // RUN: | FileCheck --check-prefix=CHECK-CXX %s // CHECK-CXX: "-lc++" "-lc++abi" "-lpthread" "-lm" +// Check for profiling variants of libraries when linking C++ // RUN: %clangxx %s -### -pg -o %t.o -target amd64-pc-openbsd 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PG-CXX %s // RUN: %clangxx %s -### -pg -o %t.o -target i686-pc-openbsd 2>&1 \ From 5acd9c49a85fb07d8cc04803c27e7aa1fb8c0211 Mon Sep 17 00:00:00 2001 From: jacquesguan Date: Wed, 19 Jan 2022 15:32:06 +0800 Subject: [PATCH 468/748] [RISCV] Add patterns for vector widening integer reduction instructions Add patterns for vector widening integer reduction instructions. 
Differential Revision: https://reviews.llvm.org/D117643 --- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 47 + .../RISCV/rvv/fixed-vectors-reduction-int.ll | 1244 ++++++++++++++++- .../CodeGen/RISCV/rvv/vreductions-int-rv32.ll | 294 ++++ .../CodeGen/RISCV/rvv/vreductions-int-rv64.ll | 270 ++++ .../CodeGen/RISCV/rvv/vreductions-int-vp.ll | 210 +++ 5 files changed, 2063 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 9e84cae445a9b..52e7af0cb20aa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -685,6 +685,46 @@ multiclass VPatBinarySDNode_V_WX { defm : VPatBinarySDNodeExt_V_WX; } +multiclass VPatWidenReductionVL { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + defvar wti_m1 = !cast("VI"#wti.SEW#"M1"); + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))), + VR:$rs2, (vti.Mask true_mask), VLOpFrag)), + (!cast(instruction_name#"_VS_"#vti.LMul.MX) + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>; + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1))), + VR:$rs2, (vti.Mask V0), VLOpFrag)), + (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_MASK") + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + } +} + +multiclass VPatWidenReductionVL_Ext_VL { + foreach vtiToWti = AllWidenableIntVectors in { + defvar vti = vtiToWti.Vti; + defvar wti = vtiToWti.Wti; + defvar wti_m1 = !cast("VI"#wti.SEW#"M1"); + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)), + VR:$rs2, (vti.Mask true_mask), VLOpFrag)), + 
(!cast(instruction_name#"_VS_"#vti.LMul.MX) + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>; + def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge), + (wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)), + VR:$rs2, (vti.Mask V0), VLOpFrag)), + (!cast(instruction_name#"_VS_"#vti.LMul.MX#"_MASK") + (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), + (wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + } +} + //===----------------------------------------------------------------------===// // Patterns. //===----------------------------------------------------------------------===// @@ -1082,6 +1122,13 @@ defm : VPatReductionVL; defm : VPatReductionVL; defm : VPatReductionVL; defm : VPatReductionVL; + +// 15.2. Vector Widening Integer Reduction Instructions +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL_Ext_VL; +defm : VPatWidenReductionVL; +defm : VPatWidenReductionVL_Ext_VL; } // Predicates = [HasVInstructions] // 15.3. 
Vector Single-Width Floating-Point Reduction Instructions diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 4844db616b6fb..77d72335dc81e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -173,6 +173,36 @@ define i16 @vreduce_add_v1i16(<1 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v1i16(<1 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i8>, <1 x i8>* %x + %e = sext <1 x i8> %v to <1 x i16> + %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v1i16(<1 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i8>, <1 x i8>* %x + %e = zext <1 x i8> %v to <1 x i16> + %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) define i16 @vreduce_add_v2i16(<2 x i16>* %x) { @@ -189,6 +219,42 @@ define i16 @vreduce_add_v2i16(<2 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v2i16(<2 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, 
v8 +; CHECK-NEXT: ret + %v = load <2 x i8>, <2 x i8>* %x + %e = sext <2 x i8> %v to <2 x i16> + %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v2i16(<2 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <2 x i8>, <2 x i8>* %x + %e = zext <2 x i8> %v to <2 x i16> + %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) define i16 @vreduce_add_v4i16(<4 x i16>* %x) { @@ -205,6 +271,42 @@ define i16 @vreduce_add_v4i16(<4 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v4i16(<4 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <4 x i8>, <4 x i8>* %x + %e = sext <4 x i8> %v to <4 x i16> + %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v4i16(<4 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, 
v8 +; CHECK-NEXT: ret + %v = load <4 x i8>, <4 x i8>* %x + %e = zext <4 x i8> %v to <4 x i16> + %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) define i16 @vreduce_add_v8i16(<8 x i16>* %x) { @@ -221,6 +323,42 @@ define i16 @vreduce_add_v8i16(<8 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v8i16(<8 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i8>, <8 x i8>* %x + %e = sext <8 x i8> %v to <8 x i16> + %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v8i16(<8 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i8>, <8 x i8>* %x + %e = zext <8 x i8> %v to <8 x i16> + %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) define i16 @vreduce_add_v16i16(<16 x i16>* %x) { @@ -237,6 +375,42 @@ define i16 @vreduce_add_v16i16(<16 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v16i16(<16 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; 
CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <16 x i8>, <16 x i8>* %x + %e = sext <16 x i8> %v to <16 x i16> + %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v16i16(<16 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <16 x i8>, <16 x i8>* %x + %e = zext <16 x i8> %v to <16 x i16> + %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) define i16 @vreduce_add_v32i16(<32 x i16>* %x) { @@ -256,6 +430,44 @@ define i16 @vreduce_add_v32i16(<32 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v32i16(<32 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i8>, <32 x i8>* %x + %e = sext <32 x i8> %v to <32 x i16> + %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v32i16(<32 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; 
CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i8>, <32 x i8>* %x + %e = zext <32 x i8> %v to <32 x i16> + %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>) define i16 @vreduce_add_v64i16(<64 x i16>* %x) { @@ -275,6 +487,44 @@ define i16 @vreduce_add_v64i16(<64 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v64i16(<64 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i8>, <64 x i8>* %x + %e = sext <64 x i8> %v to <64 x i16> + %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v64i16(<64 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i8>, <64 x i8>* %x + %e = zext <64 x i8> %v to <64 x i16> + %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>) define i16 
@vreduce_add_v128i16(<128 x i16>* %x) { @@ -297,6 +547,52 @@ define i16 @vreduce_add_v128i16(<128 x i16>* %x) { ret i16 %red } +define i16 @vwreduce_add_v128i16(<128 x i8>* %x) { +; CHECK-LABEL: vwreduce_add_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwadd.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <128 x i8>, <128 x i8>* %x + %e = sext <128 x i8> %v to <128 x i16> + %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e) + ret i16 %red +} + +define i16 @vwreduce_uadd_v128i16(<128 x i8>* %x) { +; CHECK-LABEL: vwreduce_uadd_v128i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vwaddu.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <128 x i8>, <128 x i8>* %x + %e = zext <128 x i8> %v to <128 x i16> + %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e) + ret i16 %red +} + declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>) define i32 @vreduce_add_v1i32(<1 x i32>* %x) { @@ -311,6 +607,36 @@ define i32 @vreduce_add_v1i32(<1 x i32>* %x) { ret i32 %red } +define i32 @vwreduce_add_v1i32(<1 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v1i32: +; CHECK: 
# %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i16>, <1 x i16>* %x + %e = sext <1 x i16> %v to <1 x i32> + %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v1i32(<1 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.x.s a0, v9 +; CHECK-NEXT: ret + %v = load <1 x i16>, <1 x i16>* %x + %e = zext <1 x i16> %v to <1 x i32> + %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) define i32 @vreduce_add_v2i32(<2 x i32>* %x) { @@ -327,6 +653,42 @@ define i32 @vreduce_add_v2i32(<2 x i32>* %x) { ret i32 %red } +define i32 @vwreduce_add_v2i32(<2 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <2 x i16>, <2 x i16>* %x + %e = sext <2 x i16> %v to <2 x i32> + %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v2i32(<2 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs 
v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <2 x i16>, <2 x i16>* %x + %e = zext <2 x i16> %v to <2 x i32> + %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) define i32 @vreduce_add_v4i32(<4 x i32>* %x) { @@ -343,6 +705,42 @@ define i32 @vreduce_add_v4i32(<4 x i32>* %x) { ret i32 %red } +define i32 @vwreduce_add_v4i32(<4 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <4 x i16>, <4 x i16>* %x + %e = sext <4 x i16> %v to <4 x i32> + %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v4i32(<4 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <4 x i16>, <4 x i16>* %x + %e = zext <4 x i16> %v to <4 x i32> + %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @vreduce_add_v8i32(<8 x i32>* %x) { @@ -359,8 +757,44 @@ define i32 @vreduce_add_v8i32(<8 x i32>* %x) { ret i32 %red } -declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) - +define i32 @vwreduce_add_v8i32(<8 x i16>* %x) { +; CHECK-LABEL: 
vwreduce_add_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i16>, <8 x i16>* %x + %e = sext <8 x i16> %v to <8 x i32> + %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v8i32(<8 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <8 x i16>, <8 x i16>* %x + %e = zext <8 x i16> %v to <8 x i32> + %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e) + ret i32 %red +} + +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) + define i32 @vreduce_add_v16i32(<16 x i32>* %x) { ; CHECK-LABEL: vreduce_add_v16i32: ; CHECK: # %bb.0: @@ -375,6 +809,42 @@ define i32 @vreduce_add_v16i32(<16 x i32>* %x) { ret i32 %red } +define i32 @vwreduce_add_v16i32(<16 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <16 x i16>, <16 x i16>* %x + %e = sext <16 x i16> %v to <16 x i32> + %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e) 
+ ret i32 %red +} + +define i32 @vwreduce_uadd_v16i32(<16 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <16 x i16>, <16 x i16>* %x + %e = zext <16 x i16> %v to <16 x i32> + %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) define i32 @vreduce_add_v32i32(<32 x i32>* %x) { @@ -394,6 +864,44 @@ define i32 @vreduce_add_v32i32(<32 x i32>* %x) { ret i32 %red } +define i32 @vwreduce_add_v32i32(<32 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i16>, <32 x i16>* %x + %e = sext <32 x i16> %v to <32 x i32> + %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v32i32(<32 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <32 x i16>, <32 x i16>* %x 
+ %e = zext <32 x i16> %v to <32 x i32> + %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) define i32 @vreduce_add_v64i32(<64 x i32>* %x) { @@ -416,6 +924,52 @@ define i32 @vreduce_add_v64i32(<64 x i32>* %x) { ret i32 %red } +define i32 @vwreduce_add_v64i32(<64 x i16>* %x) { +; CHECK-LABEL: vwreduce_add_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwadd.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i16>, <64 x i16>* %x + %e = sext <64 x i16> %v to <64 x i32> + %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_v64i32(<64 x i16>* %x) { +; CHECK-LABEL: vwreduce_uadd_v64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vwaddu.vv v24, v8, v16 +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %v = load <64 x i16>, <64 x i16>* %x + %e = zext <64 x i16> %v to <64 x i32> + %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e) + ret i32 %red +} + declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>) define i64 @vreduce_add_v1i64(<1 x i64>* 
%x) { @@ -440,6 +994,60 @@ define i64 @vreduce_add_v1i64(<1 x i64>* %x) { ret i64 %red } +define i64 @vwreduce_add_v1i64(<1 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v1i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vsext.vf2 v9, v8 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsrl.vx v8, v9, a0 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %v = load <1 x i32>, <1 x i32>* %x + %e = sext <1 x i32> %v to <1 x i64> + %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v1i64(<1 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v1i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsrl.vx v8, v9, a0 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %v = load <1 x i32>, <1 x i32>* %x + %e = zext <1 x i32> %v to <1 x i64> + %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) define i64 @vreduce_add_v2i64(<2 x i64>* %x) { @@ -469,6 +1077,74 @@ define i64 @vreduce_add_v2i64(<2 x i64>* %x) { ret i64 %red } +define i64 @vwreduce_add_v2i64(<2 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v2i64: +; 
RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <2 x i32>, <2 x i32>* %x + %e = sext <2 x i32> %v to <2 x i64> + %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v2i64(<2 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: 
vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <2 x i32>, <2 x i32>* %x + %e = zext <2 x i32> %v to <2 x i64> + %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) define i64 @vreduce_add_v4i64(<4 x i64>* %x) { @@ -498,6 +1174,74 @@ define i64 @vreduce_add_v4i64(<4 x i64>* %x) { ret i64 %red } +define i64 @vwreduce_add_v4i64(<4 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v9 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v9 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <4 x i32>, <4 x i32>* %x + %e = sext <4 x i32> %v to <4 x i64> + %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v4i64(<4 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v9 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; 
RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, zero +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v9 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <4 x i32>, <4 x i32>* %x + %e = zext <4 x i32> %v to <4 x i64> + %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) define i64 @vreduce_add_v8i64(<8 x i64>* %x) { @@ -527,6 +1271,74 @@ define i64 @vreduce_add_v8i64(<8 x i64>* %x) { ret i64 %red } +define i64 @vwreduce_add_v8i64(<8 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v10 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v10, zero +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v10 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <8 x i32>, <8 x i32>* %x + %e = sext <8 x i32> %v to <8 x i64> + %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v8i64(<8 x i32>* %x) { +; 
RV32-LABEL: vwreduce_uadd_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v10, zero +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v10 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v10, zero +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v10 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <8 x i32>, <8 x i32>* %x + %e = zext <8 x i32> %v to <8 x i64> + %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) define i64 @vreduce_add_v16i64(<16 x i64>* %x) { @@ -556,6 +1368,74 @@ define i64 @vreduce_add_v16i64(<16 x i64>* %x) { ret i64 %red } +define i64 @vwreduce_add_v16i64(<16 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwredsum.vs v8, v8, v12 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; 
RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v12, zero +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwredsum.vs v8, v8, v12 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <16 x i32>, <16 x i32>* %x + %e = sext <16 x i32> %v to <16 x i64> + %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v16i64(<16 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v12, zero +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwredsumu.vs v8, v8, v12 +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v12, zero +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwredsumu.vs v8, v8, v12 +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <16 x i32>, <16 x i32>* %x + %e = zext <16 x i32> %v to <16 x i64> + %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>) define i64 @vreduce_add_v32i64(<32 x i64>* %x) { @@ -591,6 +1471,88 @@ define i64 @vreduce_add_v32i64(<32 x i64>* %x) { ret i64 %red } +define i64 @vwreduce_add_v32i64(<32 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 
16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwadd.vv v24, v8, v16 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v8, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vredsum.vs v8, v24, v8 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwadd.vv v24, v8, v16 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vredsum.vs v8, v24, v8 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <32 x i32>, <32 x i32>* %x + %e = sext <32 x i32> %v to <32 x i64> + %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v32i64(<32 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vwaddu.vv v24, v8, v16 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmv.s.x v8, zero +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV32-NEXT: vredsum.vs v8, v24, v8 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_uadd_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, 
e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: vwaddu.vv v24, v8, v16 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v8, zero +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: vredsum.vs v8, v24, v8 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret + %v = load <32 x i32>, <32 x i32>* %x + %e = zext <32 x i32> %v to <32 x i64> + %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>) define i64 @vreduce_add_v64i64(<64 x i64>* %x) nounwind { @@ -638,6 +1600,284 @@ define i64 @vreduce_add_v64i64(<64 x i64>* %x) nounwind { ret i64 %red } +define i64 @vwreduce_add_v64i64(<64 x i32>* %x) { +; RV32-LABEL: vwreduce_add_v64i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: 
addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwadd.vv v0, v24, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwadd.vv v0, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vredsum.vs v8, v8, v16 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add sp, sp, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwreduce_add_v64i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; 
RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwadd.vv v0, v24, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwadd.vv v0, v8, v16 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vv v8, v0, v8 +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add sp, sp, a1 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = load <64 x i32>, <64 x i32>* %x + %e = sext <64 x i32> %v to <64 x i64> + %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_v64i64(<64 x i32>* %x) { +; RV32-LABEL: vwreduce_uadd_v64i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; 
RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV32-NEXT: vslidedown.vi v24, v8, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwaddu.vv v0, v24, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vwaddu.vv v0, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vredsum.vs v8, v8, v16 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: add sp, sp, a2 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; 
+; RV64-LABEL: vwreduce_uadd_v64i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwaddu.vv v0, v24, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vwaddu.vv v0, v8, v16 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vv v8, v0, v8 
+; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add sp, sp, a1 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = load <64 x i32>, <64 x i32>* %x + %e = zext <64 x i32> %v to <64 x i64> + %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e) + ret i64 %red +} + declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>) define i8 @vreduce_and_v1i8(<1 x i8>* %x) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll index b9e60a32cb926..645c5b09f7cc5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv32.ll @@ -382,6 +382,36 @@ define signext i16 @vreduce_add_nxv1i16( %v) { ret i16 %red } +define signext i16 @vwreduce_add_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv1i16() define signext i16 @vreduce_umax_nxv1i16( %v) { @@ -505,6 +535,36 @@ define signext i16 @vreduce_add_nxv2i16( %v) { ret i16 %red } +define signext i16 @vwreduce_add_nxv2i8( 
%v) { +; CHECK-LABEL: vwreduce_add_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv2i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv2i16() define signext i16 @vreduce_umax_nxv2i16( %v) { @@ -628,6 +688,36 @@ define signext i16 @vreduce_add_nxv4i16( %v) { ret i16 %red } +define signext i16 @vwreduce_add_nxv4i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv4i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) + ret i16 %red +} + 
declare i16 @llvm.vector.reduce.umax.nxv4i16() define signext i16 @vreduce_umax_nxv4i16( %v) { @@ -751,6 +841,36 @@ define i32 @vreduce_add_nxv1i32( %v) { ret i32 %red } +define i32 @vwreduce_add_nxv1i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv1i32( %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_nxv1i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv1i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv1i32() define i32 @vreduce_umax_nxv1i32( %v) { @@ -874,6 +994,36 @@ define i32 @vreduce_add_nxv2i32( %v) { ret i32 %red } +define i32 @vwreduce_add_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: 
vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv2i32() define i32 @vreduce_umax_nxv2i32( %v) { @@ -997,6 +1147,36 @@ define i32 @vreduce_add_nxv4i32( %v) { ret i32 %red } +define i32 @vwreduce_add_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + +define i32 @vwreduce_uadd_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv4i32() define i32 @vreduce_umax_nxv4i32( %v) { @@ -1124,6 +1304,44 @@ define i64 @vreduce_add_nxv1i64( %v) { ret i64 %red } +define i64 @vwreduce_add_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) 
+ ret i64 %red +} + +define i64 @vwreduce_uadd_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv1i64() define i64 @vreduce_umax_nxv1i64( %v) { @@ -1292,6 +1510,44 @@ define i64 @vreduce_add_nxv2i64( %v) { ret i64 %red } +define i64 @vwreduce_add_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret i64 %red +} + declare i64 
@llvm.vector.reduce.umax.nxv2i64() define i64 @vreduce_umax_nxv2i64( %v) { @@ -1460,6 +1716,44 @@ define i64 @vreduce_add_nxv4i64( %v) { ret i64 %red } +define i64 @vwreduce_add_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv4i64() define i64 @vreduce_umax_nxv4i64( %v) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll index 04e8ddf72cc40..439c9d4bb58d5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-rv64.ll @@ -382,6 +382,36 @@ define signext i16 @vreduce_add_nxv1i16( %v) { ret i16 %red } +define signext i16 @vwreduce_add_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: 
vsetvli a0, zero, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv1i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv1i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv1i16() define signext i16 @vreduce_umax_nxv1i16( %v) { @@ -505,6 +535,36 @@ define signext i16 @vreduce_add_nxv2i16( %v) { ret i16 %red } +define signext i16 @vwreduce_add_nxv2i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv2i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv2i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv2i16() define signext i16 @vreduce_umax_nxv2i16( %v) { @@ -628,6 +688,36 @@ define signext i16 @vreduce_add_nxv4i16( %v) { 
ret i16 %red } +define signext i16 @vwreduce_add_nxv4i8( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) + ret i16 %red +} + +define signext i16 @vwreduce_uadd_nxv4i8( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i16 @llvm.vector.reduce.add.nxv4i16( %e) + ret i16 %red +} + declare i16 @llvm.vector.reduce.umax.nxv4i16() define signext i16 @vreduce_umax_nxv4i16( %v) { @@ -751,6 +841,36 @@ define signext i32 @vreduce_add_nxv1i32( %v) { ret i32 %red } +define signext i32 @vwreduce_add_nxv1i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv1i32( %e) + ret i32 %red +} + +define signext i32 @vwreduce_uadd_nxv1i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call 
i32 @llvm.vector.reduce.add.nxv1i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv1i32() define signext i32 @vreduce_umax_nxv1i32( %v) { @@ -874,6 +994,36 @@ define signext i32 @vreduce_add_nxv2i32( %v) { ret i32 %red } +define signext i32 @vwreduce_add_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + +define signext i32 @vwreduce_uadd_nxv2i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv2i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv2i32() define signext i32 @vreduce_umax_nxv2i32( %v) { @@ -997,6 +1147,36 @@ define signext i32 @vreduce_add_nxv4i32( %v) { ret i32 %red } +define signext i32 @vwreduce_add_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + +define signext i32 @vwreduce_uadd_nxv4i16( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: 
vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i32 @llvm.vector.reduce.add.nxv4i32( %e) + ret i32 %red +} + declare i32 @llvm.vector.reduce.umax.nxv4i32() define signext i32 @vreduce_umax_nxv4i32( %v) { @@ -1120,6 +1300,36 @@ define i64 @vreduce_add_nxv1i64( %v) { ret i64 %red } +define i64 @vwreduce_add_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv1i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv1i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv1i64() define i64 @vreduce_umax_nxv1i64( %v) { @@ -1244,6 +1454,36 @@ define i64 @vreduce_add_nxv2i64( %v) { ret i64 %red } +define i64 @vwreduce_add_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret 
i64 %red +} + +define i64 @vwreduce_uadd_nxv2i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv2i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv2i64() define i64 @vreduce_umax_nxv2i64( %v) { @@ -1368,6 +1608,36 @@ define i64 @vreduce_add_nxv4i64( %v) { ret i64 %red } +define i64 @vwreduce_add_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_add_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = sext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + +define i64 @vwreduce_uadd_nxv4i32( %v) { +; CHECK-LABEL: vwreduce_uadd_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vwredsumu.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret + %e = zext %v to + %red = call i64 @llvm.vector.reduce.add.nxv4i64( %e) + ret i64 %red +} + declare i64 @llvm.vector.reduce.umax.nxv4i64() define i64 @vreduce_umax_nxv4i64( %v) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll index 9f57f9770753d..4cd93960f9422 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll @@ -1353,6 +1353,76 @@ define signext i64 @vpreduce_add_nxv1i64(i64 signext %s, %v, 
ret i64 %r } +define signext i64 @vpwreduce_add_nxv1i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_add_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, mf2, tu, mu +; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_add_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = sext %v to + %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + +define signext i64 @vpwreduce_uadd_nxv1i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_uadd_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, mf2, tu, mu +; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_uadd_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli 
zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = sext %v to + %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + declare i64 @llvm.vp.reduce.umax.nxv1i64(i64, , , i32) define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { @@ -1625,6 +1695,76 @@ define signext i64 @vpreduce_add_nxv2i64(i64 signext %s, %v, ret i64 %r } +define signext i64 @vwpreduce_add_nxv2i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vwpreduce_add_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m1, tu, mu +; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwpreduce_add_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = sext %v to + %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + +define signext i64 @vwpreduce_uadd_nxv2i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vwpreduce_uadd_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; 
RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m1, tu, mu +; RV32-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v9, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vwpreduce_uadd_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vwredsum.vs v9, v8, v9, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v9 +; RV64-NEXT: ret + %e = sext %v to + %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + declare i64 @llvm.vp.reduce.umax.nxv2i64(i64, , , i32) define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { @@ -1897,6 +2037,76 @@ define signext i64 @vpreduce_add_nxv4i64(i64 signext %s, %v, ret i64 %r } +define signext i64 @vpwreduce_add_nxv4i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_add_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, mu +; RV32-NEXT: vwredsum.vs v10, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v10, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_add_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu 
+; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV64-NEXT: vwredsum.vs v10, v8, v10, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: ret + %e = sext %v to + %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + +define signext i64 @vpwreduce_uadd_nxv4i32(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpwreduce_uadd_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, mu +; RV32-NEXT: vwredsumu.vs v10, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v8, v10, a1 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpwreduce_uadd_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV64-NEXT: vwredsumu.vs v10, v8, v10, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: ret + %e = zext %v to + %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, %e, %m, i32 %evl) + ret i64 %r +} + declare i64 @llvm.vp.reduce.umax.nxv4i64(i64, , , i32) define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { From d2dacde5d8a313a81e8fd6f5134541377e5a244a Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 22 Feb 2022 17:14:03 +0900 Subject: [PATCH 469/748] [mlir][bufferize][NFC] Rename `comprehensive-function-bufferize` to `one-shot-bufferize` The related functionality is moved over to the bufferization dialect. Test cases are cleaned up a bit. 
Differential Revision: https://reviews.llvm.org/D120191 --- .../Transforms/OneShotAnalysis.h | 5 +- .../Dialect/Bufferization/Transforms/Passes.h | 10 ++ .../Bufferization/Transforms/Passes.td | 82 +++++++++++ mlir/include/mlir/InitAllDialects.h | 10 ++ .../Bufferization/Transforms/Bufferize.cpp | 77 ++++++++++ .../Transforms/OneShotAnalysis.cpp | 6 +- .../one-shot-bufferize-compat.mlir} | 4 +- .../one-shot-bufferize-partial.mlir} | 121 +++++++++------ .../Transforms/one-shot-bufferize.mlir} | 35 +---- .../comprehensive-module-bufferize.mlir | 32 ++++ mlir/test/lib/Dialect/Linalg/CMakeLists.txt | 1 - .../Linalg/TestComprehensiveBufferize.cpp | 138 ------------------ mlir/tools/mlir-opt/mlir-opt.cpp | 2 - 13 files changed, 297 insertions(+), 226 deletions(-) rename mlir/test/Dialect/{Linalg/comprehensive-function-bufferize-compat.mlir => Bufferization/Transforms/one-shot-bufferize-compat.mlir} (82%) rename mlir/test/Dialect/{Linalg/comprehensive-module-bufferize-partial.mlir => Bufferization/Transforms/one-shot-bufferize-partial.mlir} (58%) rename mlir/test/Dialect/{Linalg/comprehensive-function-bufferize.mlir => Bufferization/Transforms/one-shot-bufferize.mlir} (50%) delete mode 100644 mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h index 8e9e09663ec13..8641bc1702712 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h @@ -164,9 +164,8 @@ class AnalysisBufferizationState : public BufferizationState { LogicalResult analyzeOp(Operation *op, AnalysisBufferizationState &state); /// Run One-Shot Bufferize on the given op: Analysis + Bufferization -LogicalResult -runOneShotBufferize(Operation *op, - std::unique_ptr options); +LogicalResult runOneShotBufferize(Operation *op, + const 
AnalysisBufferizationOptions &options); } // namespace bufferization } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h index 29c0d7f741d93..1aa40da14bcdb 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h @@ -5,6 +5,7 @@ namespace mlir { namespace bufferization { +struct AnalysisBufferizationOptions; //===----------------------------------------------------------------------===// // Passes @@ -29,6 +30,15 @@ std::unique_ptr createBufferResultsToOutParamsPass(); /// bufferization.to_tensor and bufferization.to_memref operations. std::unique_ptr> createFinalizingBufferizePass(); +/// Create a pass that bufferizes all ops that implement BufferizableOpInterface +/// with One-Shot Bufferize. +std::unique_ptr createOneShotBufferizePass(); + +/// Create a pass that bufferizes all ops that implement BufferizableOpInterface +/// with One-Shot Bufferize and the specified bufferization options. +std::unique_ptr +createOneShotBufferizePass(const AnalysisBufferizationOptions &options); + /// Creates a pass that promotes heap-based allocations to stack-based ones. /// Only buffers smaller than the provided size are promoted. /// Dynamic shaped buffers are promoted up to the given rank. 
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index dbee453bdec89..fc92a0a74445e 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -149,6 +149,88 @@ def FinalizingBufferize : Pass<"finalizing-bufferize", "FuncOp"> { let constructor = "mlir::bufferization::createFinalizingBufferizePass()"; } +def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> { + let summary = "One-Shot Bufferize"; + let description = [{ + This pass bufferizes all ops that implement `BufferizableOpInterface`. It + first performs an inplacability analysis on SSA use-def chains of tensor + values to determine which OpOperands may bufferize in-place, i.e., without + inserting a buffer copy. It then rewrites the IR, inserting a buffer + allocation and copy for each OpOperand that was decided to bufferize + out-of-place. + + One-Shot Bufferize (and `BufferizableOpInterface`) was designed for ops that + are in destination-passing style. When bufferizing such ops, it is possible + to reuse the buffer of a tensor OpOperand for a tensor OpResult. In essence, + a possible destination of an operation is already passed as an SSA value. + + `tensor.insert` is an example for an op in destination-passing style. E.g., + when bufferizing `%t0 = tensor.insert %f into %dest[%idx]`, `buffer(%t0)` is + identical to `buffer(%dest)` in the absence of RaW conflicts. As a counter + example, `tensor.generate` is not in destination-passing style and always + results in a new buffer allocation. + + One-Shot Bufferize deallocates all buffers that it allocates. Yielding newly + allocated buffers from a block is not supported yet and such IR will be + rejected. For testing purposes and compatibility with partial bufferization, + One-Shot Bufferize can be run with `allow-return-memref=1 create-dealloc=0` + to allow such IR. 
+ + One-Shot Bufferize will by default reject IR that contains non-bufferizable + op, i.e., ops that do not implemement BufferizableOpInterface. Such IR can + be allowed with `allow-unknown-ops=1`. In that case, to_memref and to_tensor + ops will be generated at the bufferization boundary. This is useful for + compatibility with existing partial bufferization passes: These can + bufferize the remaining IR after running One-Shot Bufferize. + + Note: Running One-Shot Bufferize after a partial bufferization pass is + currently not supported. Running partial bufferization passes after running + One-Shot Bufferize is supported and the recommended way to gradually + migrate from partial bufferization to One-Shot Bufferize. + + With `dialect-filter`, bufferization can be restricted to a set of dialects. + If no filter is specified, all ops that implement `BufferizableOpInterface` + are bufferized. Ops from the `std` dialect are an exception: These ops are + always ignored, even if no filter is specified. When specifying a dialect + filter and `allow-unknown-ops` is not turned on, bufferization would fail + when encountering an op that is not included in the filter (even if it is + bufferizable). + + For testing/debugging purposes, `test-analysis-only=1 print-conflicts=1` + prints analysis results and explains why an OpOperand was decided to + bufferize out-of-place. This is useful for understanding why One-Shot + Bufferize chose to insert a certain buffer copy. 
+ }]; + let options = [ + Option<"allowReturnMemref", "allow-return-memref", "bool", + /*default=*/"false", + "Allows the return of memrefs (for testing purposes only)">, + Option<"allowUnknownOps", "allow-unknown-ops", "bool", + /*default=*/"false", + "Allows unknown (not bufferizable) ops in the input IR.">, + Option<"analysisFuzzerSeed", "analysis-fuzzer-seed", "unsigned", + /*default=*/"0", + "Test only: Analyze ops in random order with a given seed (fuzzer)">, + Option<"createDeallocs", "create-deallocs", "bool", /*default=*/"true", + "Specify if buffers should be deallocated. For compatibility with " + "core bufferization passes.">, + ListOption<"dialectFilter", "dialect-filter", "std::string", + "Restrict bufferization to ops from these dialects.", + "llvm::cl::MiscFlags::CommaSeparated">, + Option<"fullyDynamicLayoutMaps", "fully-dynamic-layout-maps", "bool", + /*default=*/"true", + "Generate MemRef types with dynamic offset+strides by default.">, + Option<"testAnalysisOnly", "test-analysis-only", "bool", + /*default=*/"false", + "Test only: Only run inplaceability analysis and annotate IR">, + Option<"printConflicts", "print-conflicts", "bool", + /*default=*/"false", + "Test only: Annotate IR with RaW conflicts. 
Requires " + "test-analysis-only.">, + ]; + let constructor = "mlir::bufferization::createOneShotBufferizePass()"; +} + def PromoteBuffersToStack : Pass<"promote-buffers-to-stack", "FuncOp"> { let summary = "Promotes heap-based allocations to automatically managed " "stack-based allocations"; diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index 980bed05b1dc0..7cc54c00aaa39 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -17,6 +17,7 @@ #include "mlir/Dialect/AMX/AMXDialect.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" +#include "mlir/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/ArmNeon/ArmNeonDialect.h" #include "mlir/Dialect/ArmSVE/ArmSVEDialect.h" #include "mlir/Dialect/Async/IR/Async.h" @@ -30,6 +31,7 @@ #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/OpenACC/OpenACC.h" @@ -37,6 +39,7 @@ #include "mlir/Dialect/PDL/IR/PDL.h" #include "mlir/Dialect/PDLInterp/IR/PDLInterp.h" #include "mlir/Dialect/Quant/QuantOps.h" +#include "mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" #include "mlir/Dialect/Shape/IR/Shape.h" @@ -45,8 +48,10 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h" #include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" +#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" #include 
"mlir/Dialect/X86Vector/X86VectorDialect.h" #include "mlir/IR/Dialect.h" @@ -88,8 +93,13 @@ inline void registerAllDialects(DialectRegistry ®istry) { tosa::TosaDialect, x86vector::X86VectorDialect>(); // clang-format on + arith::registerBufferizableOpInterfaceExternalModels(registry); + linalg::registerBufferizableOpInterfaceExternalModels(registry); + scf::registerBufferizableOpInterfaceExternalModels(registry); + tensor::registerBufferizableOpInterfaceExternalModels(registry); tensor::registerInferTypeOpInterfaceExternalModels(registry); tensor::registerTilingOpInterfaceExternalModels(registry); + vector::registerBufferizableOpInterfaceExternalModels(registry); } /// Append all the MLIR dialects to the registry contained in the given context. diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 01b22264e5bad..472a0932707b8 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -11,9 +11,13 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" +#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" #include "mlir/Dialect/Bufferization/Transforms/Passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Operation.h" +#include "mlir/Pass/PassManager.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/Passes.h" using namespace mlir; using namespace mlir::bufferization; @@ -144,8 +148,81 @@ struct FinalizingBufferizePass signalPassFailure(); } }; + +struct OneShotBufferizePass + : public OneShotBufferizeBase { + using OneShotBufferizeBase::OneShotBufferizeBase; + + explicit OneShotBufferizePass(const AnalysisBufferizationOptions &options) + : options(options) {} + + void getDependentDialects(DialectRegistry ®istry) 
const override { + registry.insert(); + } + + void runOnOperation() override { + AnalysisBufferizationOptions opt; + if (!options) { + // Make new bufferization options if none were provided when creating the + // pass. + opt.allowReturnMemref = allowReturnMemref; + opt.allowUnknownOps = allowUnknownOps; + opt.analysisFuzzerSeed = analysisFuzzerSeed; + opt.createDeallocs = createDeallocs; + opt.fullyDynamicLayoutMaps = fullyDynamicLayoutMaps; + opt.printConflicts = printConflicts; + opt.testAnalysisOnly = testAnalysisOnly; + + BufferizationOptions::OpFilterEntry::FilterFn filterFn = + [&](Operation *op) { + // Disallow non-std dialect ops. I.e., no ops related to function + // calls. + if (op->getDialect()->getNamespace() == + StandardOpsDialect::getDialectNamespace()) + return false; + // Filter may be specified via options. + if (this->dialectFilter.hasValue()) + return llvm::find(this->dialectFilter, + op->getDialect()->getNamespace()) != + this->dialectFilter.end(); + // No filter specified: All other ops are allowed. 
+ return true; + }; + opt.allowOperationInFilter(filterFn); + } else { + opt = *options; + } + + ModuleOp moduleOp = getOperation(); + if (failed(runOneShotBufferize(moduleOp, opt))) { + signalPassFailure(); + return; + } + + if (opt.testAnalysisOnly) + return; + + OpPassManager cleanupPipeline("builtin.module"); + cleanupPipeline.addPass(createCanonicalizerPass()); + cleanupPipeline.addPass(createCSEPass()); + cleanupPipeline.addPass(createLoopInvariantCodeMotionPass()); + (void)runPipeline(cleanupPipeline, moduleOp); + } + +private: + llvm::Optional options; +}; } // namespace +std::unique_ptr mlir::bufferization::createOneShotBufferizePass() { + return std::make_unique(); +} + +std::unique_ptr mlir::bufferization::createOneShotBufferizePass( + const AnalysisBufferizationOptions &options) { + return std::make_unique(options); +} + std::unique_ptr> mlir::bufferization::createFinalizingBufferizePass() { return std::make_unique(); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp index 6232e9ae7cba0..3e643aae57451 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -799,11 +799,11 @@ LogicalResult bufferization::analyzeOp(Operation *op, } LogicalResult bufferization::runOneShotBufferize( - Operation *op, std::unique_ptr options) { - AnalysisBufferizationState state(op, *options); + Operation *op, const AnalysisBufferizationOptions &options) { + AnalysisBufferizationState state(op, options); if (failed(analyzeOp(op, state))) return failure(); - if (options->testAnalysisOnly) + if (options.testAnalysisOnly) return success(); return bufferizeOp(op, state); } diff --git a/mlir/test/Dialect/Linalg/comprehensive-function-bufferize-compat.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-compat.mlir similarity index 82% rename from 
mlir/test/Dialect/Linalg/comprehensive-function-bufferize-compat.mlir rename to mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-compat.mlir index 6a07a96c0aee1..395547239d73b 100644 --- a/mlir/test/Dialect/Linalg/comprehensive-function-bufferize-compat.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-compat.mlir @@ -1,10 +1,10 @@ // RUN: mlir-opt %s \ -// RUN: -test-comprehensive-function-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \ +// RUN: -one-shot-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \ // RUN: -split-input-file | \ // RUN: FileCheck %s --check-prefix=CHECK-NODEALLOC // RUN: mlir-opt %s \ -// RUN: -test-comprehensive-function-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \ +// RUN: -one-shot-bufferize="allow-return-memref allow-unknown-ops create-deallocs=0" \ // RUN: -buffer-deallocation | \ // RUN: FileCheck %s --check-prefix=CHECK-BUFFERDEALLOC diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir similarity index 58% rename from mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir rename to mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir index 0ea8b59adb9ef..a806835c2848d 100644 --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir @@ -1,30 +1,28 @@ -// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s // Test bufferization using memref types that have no layout map. 
-// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops fully-dynamic-layout-maps=0" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref allow-unknown-ops fully-dynamic-layout-maps=0" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP // Run fuzzer with different seeds. -// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null -// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null -// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null -// RUN: mlir-opt %s -allow-unregistered-dialect -test-comprehensive-function-bufferize="dialect-filter=tensor allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR -// RUN: mlir-opt %s -allow-unregistered-dialect -test-comprehensive-function-bufferize="dialect-filter=scf allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-SCF +// RUN: mlir-opt %s 
-allow-unregistered-dialect -one-shot-bufferize="dialect-filter=tensor allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=scf allow-unknown-ops allow-return-memref" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-SCF // CHECK: #[[$MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> // CHECK-LABEL: func @use_of_unknown_op_1( -// CHECK-SAME: %[[m1:.*]]: memref +// CHECK-SAME: %[[t1:.*]]: tensor // CHECK-NO-LAYOUT-MAP-LABEL: func @use_of_unknown_op_1( -// CHECK-NO-LAYOUT-MAP-SAME: %[[m1:.*]]: memref) -func @use_of_unknown_op_1(%t1: tensor {linalg.inplaceable = true}) +// CHECK-NO-LAYOUT-MAP-SAME: %[[t1:.*]]: tensor +func @use_of_unknown_op_1(%t1: tensor) -> vector<5xf32> { // ToTensorOp is generated because the function is bufferized and has a // memref block argument. - // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] : memref - // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]]) - // CHECK-NO-LAYOUT-MAP: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] : memref - // CHECK-NO-LAYOUT-MAP: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]]) + // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]]) + // CHECK-NO-LAYOUT-MAP: %[[dummy:.*]] = "test.dummy_op"(%[[t1]]) %0 = "test.dummy_op"(%t1) : (tensor) -> tensor %idx = arith.constant 0 : index @@ -40,36 +38,34 @@ func @use_of_unknown_op_1(%t1: tensor {linalg.inplaceable = true}) // ----- // CHECK-LABEL: func @use_of_unknown_op_2( -// CHECK-SAME: %[[m1:.*]]: memref {linalg.inplaceable = true}) - -> tensor { - // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] - - // CHECK: %[[dummy1:.*]] = "test.dummy_op"(%[[m1_tensor]]) +// CHECK-SAME: %[[t1:.*]]: tensor +func @use_of_unknown_op_2(%t1: tensor) -> tensor { + // CHECK: %[[dummy1:.*]] = "test.dummy_op"(%[[t1]]) %0 = "test.dummy_op"(%t1) : (tensor) -> tensor // CHECK: %[[dummy2:.*]] = 
"test.another_dummy_op"(%[[dummy1]]) %1 = "test.another_dummy_op"(%0) : (tensor) -> tensor - // CHECK: %[[dummy2_memref:.*]] = bufferization.to_memref %[[dummy2]] - // CHECK: return %[[dummy2_memref]] + // CHECK: return %[[dummy2]] return %1 : tensor } // ----- +// CHECK: #[[$MAP2:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + // CHECK-LABEL: func @use_of_unknown_op_3( -// CHECK-SAME: %[[m1:.*]]: memref {linalg.inplaceable = true}) +// CHECK-SAME: %[[t1:.*]]: tensor +func @use_of_unknown_op_3(%t1: tensor) -> (vector<5xf32>, vector<5xf32>) { %idx = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] + // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] // CHECK: %[[v1:.*]] = vector.transfer_read %[[m1]] %1 = vector.transfer_read %t1[%idx], %cst : tensor, vector<5xf32> - // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]]) + // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]]) %0 = "test.dummy_op"(%t1) : (tensor) -> tensor - // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] + // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : memref // CHECK: %[[v2:.*]] = vector.transfer_read %[[dummy_memref]] %2 = vector.transfer_read %0[%idx], %cst : tensor, vector<5xf32> @@ -80,14 +76,13 @@ func @use_of_unknown_op_3(%t1: tensor {linalg.inplaceable = true}) // ----- // CHECK-LABEL: func @use_of_unknown_op_4( -// CHECK-SAME: %[[m1:.*]]: memref {linalg.inplaceable = true}) +// CHECK-SAME: %[[t1:.*]]: tensor +func @use_of_unknown_op_4(%t1: tensor) -> (vector<5xf32>, tensor) { %idx = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] - // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]]) + // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]]) %0 = "test.dummy_op"(%t1) : (tensor) -> tensor // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] @@ -97,40 +92,39 @@ func 
@use_of_unknown_op_4(%t1: tensor {linalg.inplaceable = true}) // CHECK: %[[another_dummy:.*]] = "test.another_dummy_op"(%[[dummy]]) %2 = "test.another_dummy_op"(%0) : (tensor) -> tensor - // CHECK: %[[another_dummy_memref:.*]] = bufferization.to_memref %[[another_dummy]] - // CHECK: return %[[v1]], %[[another_dummy_memref]] + // CHECK: return %[[v1]], %[[another_dummy]] return %1, %2 : vector<5xf32>, tensor } // ----- // CHECK-LABEL: func @use_of_bufferizable_op_in_unbufferizable_op -// CHECK-SAME: %[[m1:.*]]: memref func @use_of_bufferizable_op_in_unbufferizable_op( %t1: tensor, %o: index, %s: index) -> (tensor, tensor) { + // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] // CHECK: %[[subview:.*]] = memref.subview %[[m1]] %0 = tensor.extract_slice %t1[%o][%s][1] : tensor to tensor // CHECK: %[[subview_tensor:.*]] = bufferization.to_tensor %[[subview]] // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[subview_tensor]]) %1 = "test.dummy_op"(%0) : (tensor) -> tensor - // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] - // CHECK: return %[[subview]], %[[dummy_memref]] + // CHECK: return %[[subview_tensor]], %[[dummy]] return %0, %1 : tensor, tensor } // ----- // CHECK-LABEL: func @unused_unknown_op( -// CHECK-SAME: %[[m1:.*]]: memref func @unused_unknown_op(%t1 : tensor) -> vector<5xf32> { %idx = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - // ToTensorOp is inserted to pass in the result of the above bufferized op. 
- // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] + + // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] // CHECK: vector.transfer_read %[[m1]] %1 = vector.transfer_read %t1[%idx], %cst : tensor, vector<5xf32> - // CHECK: "test.dummy_op"(%[[m1_tensor]]) + // CHECK: "test.dummy_op"(%[[t1]]) "test.dummy_op"(%t1) : (tensor) -> () return %1 : vector<5xf32> @@ -138,25 +132,60 @@ func @unused_unknown_op(%t1 : tensor) -> vector<5xf32> { // ----- +// CHECK: #[[$MAP3:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK-LABEL: func @unknown_op_may_read( +func @unknown_op_may_read(%v: vector<5xf32>) + -> (tensor<10xf32>, tensor<10xf32>) { + %idx = arith.constant 0 : index + %cst = arith.constant 5.0 : f32 + + // One alloc for the init_tensor, another one because the transfer_write + // bufferizes out-of-place. + // CHECK: %[[m1:.*]] = memref.alloc() {{.*}} : memref<10xf32> + // CHECK: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<10xf32> + // CHECK: %[[alloc_casted:.*]] = memref.cast %[[alloc]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]> + // CHECK: %[[m1_casted:.*]] = memref.cast %[[m1]] : memref<10xf32> to memref<10xf32, #[[$MAP3]]> + %t1 = linalg.init_tensor [10] : tensor<10xf32> + + // CHECK: linalg.fill(%{{.*}}, %[[m1]]) + // CHECK: %[[filled_tensor:.*]] = bufferization.to_tensor %[[m1_casted]] + %filled = linalg.fill(%cst, %t1) : f32, tensor<10xf32> -> tensor<10xf32> + + // The transfer_write is out-of-place because "dummy_op" may read. 
+ // CHECK: memref.copy %[[m1]], %[[alloc]] + // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] + // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc_casted]] + %1 = vector.transfer_write %v, %filled[%idx] : vector<5xf32>, tensor<10xf32> + + // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[filled_tensor]]) + %2 = "test.dummy_op"(%filled) : (tensor<10xf32>) -> (tensor<10xf32>) + + // CHECK: memref.dealloc %[[alloc]] + // CHECK: memref.dealloc %[[m1]] + // CHECK: return %[[alloc_tensor]], %[[dummy]] + return %1, %2 : tensor<10xf32>, tensor<10xf32> +} + +// ----- + // CHECK-LABEL: func @unknown_op_not_writable -// CHECK-SAME: %[[m1:.*]]: memref func @unknown_op_not_writable( %t1 : tensor, %v : vector<5xf32>, %idx : index) -> tensor { - // CHECK: %[[m1_tensor:.*]] = bufferization.to_tensor %[[m1]] - // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[m1_tensor]]) + // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]]) // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] %0 = "test.dummy_op"(%t1) : (tensor) -> (tensor) // The result of an unknown op is not writable. Always generate a copy. - // Note: This copy is essential for partial bufferization. Otherwise, we could - // introducing a RaW conflict. 
// CHECK: %[[dim:.*]] = tensor.dim %[[dummy]] // CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) // CHECK: memref.copy %[[dummy_memref]], %[[alloc]] // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] %1 = vector.transfer_write %v, %0[%idx] : vector<5xf32>, tensor - // CHECK: return %[[alloc]] + // CHECK: %[[alloc_tensor:.*]] = bufferization.to_tensor %[[alloc]] + // CHECK: return %[[alloc_tensor]] return %1 : tensor } diff --git a/mlir/test/Dialect/Linalg/comprehensive-function-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir similarity index 50% rename from mlir/test/Dialect/Linalg/comprehensive-function-bufferize.mlir rename to mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir index 1a3b266ee4b80..0ea283fc9f6cc 100644 --- a/mlir/test/Dialect/Linalg/comprehensive-function-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir @@ -1,9 +1,9 @@ -// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s // Run fuzzer with different seeds. 
-// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null -// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null -// RUN: mlir-opt %s -test-comprehensive-function-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-memref test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null // CHECK-LABEL: func @use_tensor_func_arg( // CHECK-SAME: %[[A:.*]]: tensor @@ -68,31 +68,4 @@ func @empty_func() -> () { return } -// ----- -// CHECK-LABEL: func @rank_reducing -func @rank_reducing( - %i: index, %j: index, - %arg0: tensor<8x18x32xf32>) - -> tensor { - %c1 = arith.constant 1 : index - %c6 = arith.constant 6 : index - %c8 = arith.constant 8 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32> - %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor - %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32> - %5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor) { - %7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7) - %8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32> - %9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) { - %11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32> - %12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 
8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32> - scf.yield %12 : tensor<1x6x8xf32> - } - %10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor - scf.yield %10 : tensor - } - return %5: tensor -} diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir index f0a48aafbcdb9..ba6da08a097a8 100644 --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -1355,3 +1355,35 @@ func @write_after_select_read_one( // CHECK: return %[[f]], %[[select]] return %f, %w : f32, tensor } + +// ----- + +// A regression test to make sure that we handle rank-reducing extract_slice +// correctly. + +// CHECK-LABEL: func @rank_reducing +func @rank_reducing( + %i: index, %j: index, + %arg0: tensor<8x18x32xf32>) + -> tensor { + %c1 = arith.constant 1 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32> + %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor + %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32> + %5 = scf.for %arg7 = %c0 to %c32 step %c8 iter_args(%arg8 = %1) -> (tensor) { + %7 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg7) + %8 = tensor.extract_slice %arg0[%i, %j, %arg7] [1, 6, 8] [1, 1, 1] : tensor<8x18x32xf32> to tensor<1x6x8xf32> + %9 = scf.for %arg9 = %c0 to %c6 step %c1 iter_args(%arg10 = %2) -> (tensor<1x6x8xf32>) { + %11 = tensor.extract_slice %8[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x6x8xf32> to tensor<1x1x8xf32> + %12 = tensor.insert_slice %11 into %arg10[0, %arg9, 0] [1, 1, 8] [1, 1, 1] : tensor<1x1x8xf32> into tensor<1x6x8xf32> + scf.yield %12 : tensor<1x6x8xf32> + } + %10 = tensor.insert_slice %9 into %arg8[%7, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] : tensor<1x6x8xf32> into tensor + 
scf.yield %10 : tensor + } + return %5: tensor +} diff --git a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt index 51996a7df576b..c74fb756b785f 100644 --- a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt @@ -1,6 +1,5 @@ # Exclude tests from libMLIR.so add_mlir_library(MLIRLinalgTestPasses - TestComprehensiveBufferize.cpp TestLinalgCodegenStrategy.cpp TestLinalgDistribution.cpp TestLinalgElementwiseFusion.cpp diff --git a/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp b/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp deleted file mode 100644 index f4e9b871398fe..0000000000000 --- a/mlir/test/lib/Dialect/Linalg/TestComprehensiveBufferize.cpp +++ /dev/null @@ -1,138 +0,0 @@ -//===- TestComprehensiveBufferize.cpp - Test Comprehensive Bufferize ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements logic for testing Comprehensive Bufferize. 
-// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" -#include "mlir/Dialect/Arithmetic/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" -#include "mlir/Dialect/Bufferization/IR/Bufferization.h" -#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" -#include "mlir/Dialect/Linalg/ComprehensiveBufferize/AffineInterfaceImpl.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Linalg/Passes.h" -#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/SCF/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Transforms/Passes.h" - -using namespace mlir; -using namespace mlir::linalg; -using namespace mlir::linalg::comprehensive_bufferize; -using namespace mlir::bufferization; - -namespace { -/// A helper struct for FunctionBufferize and ModuleBufferize. Both passes are -/// mostly identical. 
-struct TestComprehensiveFunctionBufferize - : public PassWrapper> { - StringRef getArgument() const final { - return "test-comprehensive-function-bufferize"; - } - - StringRef getDescription() const final { - return "Test Comprehensive Bufferize of FuncOps (body only)."; - } - - TestComprehensiveFunctionBufferize() = default; - TestComprehensiveFunctionBufferize( - const TestComprehensiveFunctionBufferize &pass) - : PassWrapper(pass) {} - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - affine_ext::registerBufferizableOpInterfaceExternalModels(registry); - arith::registerBufferizableOpInterfaceExternalModels(registry); - linalg::registerBufferizableOpInterfaceExternalModels(registry); - scf::registerBufferizableOpInterfaceExternalModels(registry); - tensor::registerBufferizableOpInterfaceExternalModels(registry); - vector::registerBufferizableOpInterfaceExternalModels(registry); - } - - void runOnOperation() override; - - Option allowReturnMemref{ - *this, "allow-return-memref", - llvm::cl::desc("Allow returning/yielding memrefs from functions/blocks"), - llvm::cl::init(false)}; - Option allowUnknownOps{ - *this, "allow-unknown-ops", - llvm::cl::desc( - "Allows the return of memrefs (for testing purposes only)"), - llvm::cl::init(false)}; - Option testAnalysisOnly{ - *this, "test-analysis-only", - llvm::cl::desc( - "Only runs inplaceability analysis (for testing purposes only)"), - llvm::cl::init(false)}; - Option analysisFuzzerSeed{ - *this, "analysis-fuzzer-seed", - llvm::cl::desc("Analyze ops in random order with a given seed (fuzzer)"), - llvm::cl::init(0)}; - ListOption dialectFilter{ - *this, "dialect-filter", - llvm::cl::desc("Bufferize only ops from the specified dialects"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; - Option fullyDynamicLayoutMaps{ - *this, "fully-dynamic-layout-maps", - llvm::cl::desc("Use fully dynamic layout maps on memref types"), - llvm::cl::init(true)}; - Option 
createDeallocs{ - *this, "create-deallocs", - llvm::cl::desc("Specify if buffers should be deallocated"), - llvm::cl::init(true)}; -}; -} // namespace - -void TestComprehensiveFunctionBufferize::runOnOperation() { - auto options = std::make_unique(); - options->allowReturnMemref = allowReturnMemref; - options->allowUnknownOps = allowUnknownOps; - options->testAnalysisOnly = testAnalysisOnly; - options->analysisFuzzerSeed = analysisFuzzerSeed; - options->fullyDynamicLayoutMaps = fullyDynamicLayoutMaps; - options->createDeallocs = createDeallocs; - - if (dialectFilter.hasValue()) { - options->hasFilter = true; - for (const std::string &dialectNamespace : dialectFilter) - options->allowDialectInFilter(dialectNamespace); - } - - Operation *op = getOperation(); - if (failed(runOneShotBufferize(op, std::move(options)))) - return; - - if (testAnalysisOnly) - return; - - OpPassManager cleanupPipeline("builtin.func"); - cleanupPipeline.addPass(createCanonicalizerPass()); - cleanupPipeline.addPass(createCSEPass()); - cleanupPipeline.addPass(createLoopInvariantCodeMotionPass()); - (void)this->runPipeline(cleanupPipeline, op); -} - -namespace mlir { -namespace test { -void registerTestComprehensiveFunctionBufferize() { - PassRegistration(); -} -} // namespace test -} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 647fffaf240d3..f0791d3e31537 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -64,7 +64,6 @@ void registerTestAffineLoopParametricTilingPass(); void registerTestAliasAnalysisPass(); void registerTestBuiltinAttributeInterfaces(); void registerTestCallGraphPass(); -void registerTestComprehensiveFunctionBufferize(); void registerTestConstantFold(); void registerTestGpuSerializeToCubinPass(); void registerTestGpuSerializeToHsacoPass(); @@ -159,7 +158,6 @@ void registerTestPasses() { #if MLIR_ROCM_CONVERSIONS_ENABLED mlir::test::registerTestGpuSerializeToHsacoPass(); #endif - 
mlir::test::registerTestComprehensiveFunctionBufferize(); mlir::test::registerTestDecomposeCallGraphTypes(); mlir::test::registerTestDataLayoutQuery(); mlir::test::registerTestDominancePass(); From 3b4268686965c6479dd29471287f2d4bc9cfcfaa Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 22 Feb 2022 17:39:04 +0900 Subject: [PATCH 470/748] [mlir][bufferize] Do not assert destination passing style for non-bufferizable ops --- mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp index 3e643aae57451..340f28526b2d2 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -707,7 +707,8 @@ assertDestinationPassingStyle(Operation *op, BufferizationState &state, LogicalResult status = success(); DominanceInfo domInfo(op); op->walk([&](Operation *returnOp) { - if (!isRegionReturnLike(returnOp)) + if (!isRegionReturnLike(returnOp) || + !state.getOptions().isOpAllowed(returnOp)) return WalkResult::advance(); for (OpOperand &returnValOperand : returnOp->getOpOperands()) { From 48dc980847b25384511276e37c02c7edcf9e3d3a Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Mon, 21 Feb 2022 22:42:35 +0100 Subject: [PATCH 471/748] [Format] Remove unused LineContainsContinuedForLoopSection. 
NFC Differential Revision: https://reviews.llvm.org/D120282 --- clang/lib/Format/ContinuationIndenter.cpp | 3 --- clang/lib/Format/ContinuationIndenter.h | 6 ------ 2 files changed, 9 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index f4a755268eae8..ec268e74fd97e 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -253,7 +253,6 @@ LineState ContinuationIndenter::getInitialState(unsigned FirstIndent, State.Stack.push_back(ParenState(/*Tok=*/nullptr, FirstIndent, FirstIndent, /*AvoidBinPacking=*/false, /*NoLineBreak=*/false)); - State.LineContainsContinuedForLoopSection = false; State.NoContinuation = false; State.StartOfStringLiteral = 0; State.StartOfLineLevel = 0; @@ -343,8 +342,6 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { return true; if (CurrentState.BreakBeforeClosingParen && Current.is(tok::r_paren)) return true; - if (Previous.is(tok::semi) && State.LineContainsContinuedForLoopSection) - return true; if (Style.Language == FormatStyle::LK_ObjC && Style.ObjCBreakBeforeNestedBlockParam && Current.ObjCSelectorNameParts > 1 && diff --git a/clang/lib/Format/ContinuationIndenter.h b/clang/lib/Format/ContinuationIndenter.h index 0eb53cbd02937..494a9727d5edc 100644 --- a/clang/lib/Format/ContinuationIndenter.h +++ b/clang/lib/Format/ContinuationIndenter.h @@ -419,9 +419,6 @@ struct LineState { /// The token that needs to be next formatted. FormatToken *NextToken; - /// \c true if this line contains a continued for-loop section. - bool LineContainsContinuedForLoopSection; - /// \c true if \p NextToken should not continue this line. 
bool NoContinuation; @@ -468,9 +465,6 @@ struct LineState { return NextToken < Other.NextToken; if (Column != Other.Column) return Column < Other.Column; - if (LineContainsContinuedForLoopSection != - Other.LineContainsContinuedForLoopSection) - return LineContainsContinuedForLoopSection; if (NoContinuation != Other.NoContinuation) return NoContinuation; if (StartOfLineLevel != Other.StartOfLineLevel) From f8d72100323b127b39fed5f5890b86d81b8a0335 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 22 Feb 2022 09:53:32 +0100 Subject: [PATCH 472/748] [GlobalStatus] Keep Visited set in isSafeToDestroyConstant() Constants cannot be cyclic, but they can be tree-like. Keep a visited set to ensure we do not degenerate to exponential run-time. This fixes the problem reported in https://reviews.llvm.org/D117223#3335482, though I haven't been able to construct a concise test case for the issue. This requires a combination of dead constants and the kind of constant expression tree that textual IR cannot represent (because the textual representation, unlike the in-memory representation, is also exponential in size). --- llvm/lib/Transforms/Utils/GlobalStatus.cpp | 30 ++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp index c1c5f5cc879f5..3ba920cd5878a 100644 --- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -38,22 +38,26 @@ static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) { } /// It is safe to destroy a constant iff it is only used by constants itself. -/// Note that constants cannot be cyclic, so this test is pretty easy to -/// implement recursively. -/// +/// Note that while constants cannot be cyclic, they can be tree-like, so we +/// should keep a visited set to avoid exponential runtime. 
bool llvm::isSafeToDestroyConstant(const Constant *C) { - if (isa(C)) - return false; - - if (isa(C)) - return false; + SmallVector Worklist; + SmallPtrSet Visited; + Worklist.push_back(C); + while (!Worklist.empty()) { + const Constant *C = Worklist.pop_back_val(); + if (!Visited.insert(C).second) + continue; + if (isa(C) || isa(C)) + return false; - for (const User *U : C->users()) - if (const Constant *CU = dyn_cast(U)) { - if (!isSafeToDestroyConstant(CU)) + for (const User *U : C->users()) { + if (const Constant *CU = dyn_cast(U)) + Worklist.push_back(CU); + else return false; - } else - return false; + } + } return true; } From 650aec687eb54aeeb3cef4a41f2dbaa49ef3e358 Mon Sep 17 00:00:00 2001 From: tyb0807 Date: Thu, 3 Feb 2022 22:30:41 +0000 Subject: [PATCH 473/748] [ARM][AArch64] Add missing v8.x checks Summary: This patch adds checks that were missing in clang for Armv8.5/6/7-A. These include: * ACLE macro defines for AArch32. * Handling of crypto and SM4, SHA and AES feature flags on clang's driver. 
Reviewers: dmgreen, SjoerdMeijer, tmatheson Differential Revision: https://reviews.llvm.org/D116153 --- clang/lib/Basic/Targets/ARM.cpp | 1 + clang/lib/Driver/ToolChains/Arch/AArch64.cpp | 3 ++ .../Preprocessor/aarch64-target-features.c | 33 +++++++++++++++++-- clang/test/Preprocessor/arm-target-features.c | 17 ++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 9c9d198e8f324..b2f61cff81c95 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -955,6 +955,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, case llvm::ARM::ArchKind::ARMV8_4A: case llvm::ARM::ArchKind::ARMV8_5A: case llvm::ARM::ArchKind::ARMV8_6A: + case llvm::ARM::ArchKind::ARMV8_7A: case llvm::ARM::ArchKind::ARMV8_8A: case llvm::ARM::ArchKind::ARMV9A: case llvm::ARM::ArchKind::ARMV9_1A: diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index 6e3e3d04bbe3a..f9557bac5fcdc 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -393,6 +393,9 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, } if (std::find(ItBegin, ItEnd, "+v8.4a") != ItEnd || + std::find(ItBegin, ItEnd, "+v8.5a") != ItEnd || + std::find(ItBegin, ItEnd, "+v8.6a") != ItEnd || + std::find(ItBegin, ItEnd, "+v8.7a") != ItEnd || std::find(ItBegin, ItEnd, "+v8.8a") != ItEnd || std::find(ItBegin, ItEnd, "+v9a") != ItEnd || std::find(ItBegin, ItEnd, "+v9.1a") != ItEnd || diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 6de0657d09bc4..833d75b7e5b9e 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -294,7 +294,7 @@ // CHECK-MCPU-CARMEL: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+fp-armv8" 
"-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fullfp16" "-target-feature" "+ras" "-target-feature" "+lse" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+aes" // RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s -// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+v8.5a" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+dotprod" "-target-feature" "+fp16fml" "-target-feature" "+ras" "-target-feature" "+lse" "-target-feature" "+rdm" "-target-feature" "+rcpc" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+fullfp16" "-target-feature" "+sha2" "-target-feature" "+aes" +// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+v8.5a" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+dotprod" "-target-feature" "+fp16fml" "-target-feature" "+ras" "-target-feature" "+lse" "-target-feature" "+rdm" "-target-feature" "+rcpc" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+fullfp16" "-target-feature" "+sm4" "-target-feature" "+sha3" "-target-feature" "+sha2" "-target-feature" "+aes" // RUN: %clang -target x86_64-apple-macosx -arch arm64_32 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64_32 %s // CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+v8.3a" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fullfp16" "-target-feature" "+ras" "-target-feature" "+lse" "-target-feature" "+rdm" "-target-feature" "+rcpc" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+sha2" "-target-feature" "+aes" @@ -390,7 +390,13 @@ // Check +crypto: // // RUN: %clang -target aarch64 -march=armv8.4a+crypto -### -c %s 
2>&1 | FileCheck -check-prefix=CHECK-CRYPTO84 %s -// CHECK-CRYPTO84: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.4a" "-target-feature" "+crypto" "-target-feature" "+sm4" "-target-feature" "+sha3" "-target-feature" "+sha2" "-target-feature" "+aes" +// RUN: %clang -target aarch64 -march=armv8.5a+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO85 %s +// RUN: %clang -target aarch64 -march=armv8.6a+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO86 %s +// RUN: %clang -target aarch64 -march=armv8.7a+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO87 %s +// CHECK-CRYPTO84: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.4a"{{.*}} "-target-feature" "+crypto" "-target-feature" "+sm4" "-target-feature" "+sha3" "-target-feature" "+sha2" "-target-feature" "+aes" +// CHECK-CRYPTO85: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.5a"{{.*}} "-target-feature" "+crypto" "-target-feature" "+sm4" "-target-feature" "+sha3" "-target-feature" "+sha2" "-target-feature" "+aes" +// CHECK-CRYPTO86: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.6a"{{.*}} "-target-feature" "+crypto" "-target-feature" "+sm4" "-target-feature" "+sha3" "-target-feature" "+sha2" "-target-feature" "+aes" +// CHECK-CRYPTO87: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.7a"{{.*}} "-target-feature" "+crypto" "-target-feature" "+sm4" "-target-feature" "+sha3" "-target-feature" "+sha2" "-target-feature" "+aes" // // Check -crypto: // @@ -528,3 +534,26 @@ // RUN: %clang -target aarch64-arm-none-eabi -march=armv9.3-a+mops -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-MOPS %s // CHECK-MOPS: __ARM_FEATURE_MOPS 1 // CHECK-NOMOPS-NOT: __ARM_FEATURE_MOPS 1 + +// ================== Check default macros for Armv8.1-A and later +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.1-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-BEFORE-V83,CHECK-BEFORE-V85 %s +// RUN: 
%clang -target aarch64-arm-none-eabi -march=armv8.2-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-BEFORE-V83,CHECK-BEFORE-V85 %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.3-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-BEFORE-V85 %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-BEFORE-V85 %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.5-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.6-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.7-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.8-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv9-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv9.1-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv9.2-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv9.3-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// CHECK-V81-OR-LATER: __ARM_FEATURE_ATOMICS 1 +// CHECK-V83-OR-LATER: __ARM_FEATURE_COMPLEX 1 +// CHECK-V81-OR-LATER: 
__ARM_FEATURE_CRC32 1 +// CHECK-V85-OR-LATER: __ARM_FEATURE_FRINT 1 +// CHECK-V83-OR-LATER: __ARM_FEATURE_JCVT 1 +// CHECK-V81-OR-LATER: __ARM_FEATURE_QRDMX 1 +// CHECK-BEFORE-V83-NOT: __ARM_FEATURE_COMPLEX 1 +// CHECK-BEFORE-V83-NOT: __ARM_FEATURE_JCVT 1 +// CHECK-BEFORE-V85-NOT: __ARM_FEATURE_FRINT 1 diff --git a/clang/test/Preprocessor/arm-target-features.c b/clang/test/Preprocessor/arm-target-features.c index d1a313b675cdf..3a1f06041d35d 100644 --- a/clang/test/Preprocessor/arm-target-features.c +++ b/clang/test/Preprocessor/arm-target-features.c @@ -938,3 +938,20 @@ // CHECK-SHA2-NOT: #define __ARM_FEATURE_AES 1 // CHECK-SHA2-NOT: #define __ARM_FEATURE_CRYPTO 1 // CHECK-SHA2: #define __ARM_FEATURE_SHA2 1 + +// ================== Check default macros for Armv8.1-A and later +// RUN: %clang -target arm-arm-none-eabi -march=armv8.1-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-BEFORE-V83 %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8.2-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-BEFORE-V83 %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8.3-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8.5-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8.6-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8.7-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv8.8-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang 
-target arm-arm-none-eabi -march=armv9-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv9.1-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv9.2-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// RUN: %clang -target arm-arm-none-eabi -march=armv9.3-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER %s +// CHECK-V83-OR-LATER: __ARM_FEATURE_COMPLEX 1 +// CHECK-V81-OR-LATER: __ARM_FEATURE_QRDMX 1 +// CHECK-BEFORE-V83-NOT: __ARM_FEATURE_COMPLEX 1 From 47eff645d8e873ba531014751c1c06a716a367e9 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 21 Feb 2022 10:38:08 +0000 Subject: [PATCH 474/748] [InstCombine] Bail out of load-store forwarding for scalable vector types This patch fixes an invalid TypeSize->uint64_t implicit conversion in FoldReinterpretLoadFromConst. If the size of the constant is scalable we bail out of the optimisation for now. Tests added here: Transforms/InstCombine/load-store-forward.ll Differential Revision: https://reviews.llvm.org/D120240 --- llvm/lib/Analysis/ConstantFolding.cpp | 9 +- .../InstCombine/load-store-forward.ll | 169 ++++++++++++++++++ 2 files changed, 175 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index fff46bfff5d27..14b277a94ed7d 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -589,14 +589,17 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy, if (BytesLoaded > 32 || BytesLoaded == 0) return nullptr; - int64_t InitializerSize = DL.getTypeAllocSize(C->getType()).getFixedSize(); - // If we're not accessing anything in this constant, the result is undefined. 
if (Offset <= -1 * static_cast(BytesLoaded)) return UndefValue::get(IntType); + // TODO: We should be able to support scalable types. + TypeSize InitializerSize = DL.getTypeAllocSize(C->getType()); + if (InitializerSize.isScalable()) + return nullptr; + // If we're not accessing anything in this constant, the result is undefined. - if (Offset >= InitializerSize) + if (Offset >= InitializerSize.getFixedValue()) return UndefValue::get(IntType); unsigned char RawBytes[32] = {0}; diff --git a/llvm/test/Transforms/InstCombine/load-store-forward.ll b/llvm/test/Transforms/InstCombine/load-store-forward.ll index a30c0089a1051..eee47e87a1ca3 100644 --- a/llvm/test/Transforms/InstCombine/load-store-forward.ll +++ b/llvm/test/Transforms/InstCombine/load-store-forward.ll @@ -120,3 +120,172 @@ define i32 @vec_store_load_overlap(i32* %p) { %load = load i32, i32* %p5, align 2 ret i32 %load } + +define i32 @load_i32_store_nxv4i32(i32* %a) { +; CHECK-LABEL: @load_i32_store_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + %0 = bitcast i32* %a to * + store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * %0, align 16 + %1 = load i32, i32* %a, align 4 + ret i32 %1 +} + +define i64 @load_i64_store_nxv8i8(i8* %a) { +; CHECK-LABEL: @load_i64_store_nxv8i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[A2:%.*]] = bitcast i8* [[A]] to i64* +; CHECK-NEXT: [[LOAD:%.*]] = load i64, i64* [[A2]], align 8 +; CHECK-NEXT: ret i64 [[LOAD]] +; +entry: + %0 = bitcast i8* %a to * + store shufflevector ( insertelement ( poison, i8 1, i32 
0), poison, zeroinitializer), * %0, align 16 + %a2 = bitcast i8* %a to i64* + %load = load i64, i64* %a2, align 8 + ret i64 %load +} + +define i64 @load_i64_store_nxv4i32(i32* %a) { +; CHECK-LABEL: @load_i64_store_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[A2:%.*]] = bitcast i32* [[A]] to i64* +; CHECK-NEXT: [[LOAD:%.*]] = load i64, i64* [[A2]], align 8 +; CHECK-NEXT: ret i64 [[LOAD]] +; +entry: + %0 = bitcast i32* %a to * + store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * %0, align 16 + %a2 = bitcast i32* %a to i64* + %load = load i64, i64* %a2, align 8 + ret i64 %load +} + +define i8 @load_i8_store_nxv4i32(i32* %a) { +; CHECK-LABEL: @load_i8_store_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[A2:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[A2]], align 1 +; CHECK-NEXT: ret i8 [[LOAD]] +; +entry: + %0 = bitcast i32* %a to * + store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * %0, align 16 + %a2 = bitcast i32* %a to i8* + %load = load i8, i8* %a2, align 1 + ret i8 %load +} + +define float @load_f32_store_nxv4f32(float* %a) { +; CHECK-LABEL: @load_f32_store_nxv4f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[A]], align 4 +; CHECK-NEXT: ret float [[TMP1]] +; +entry: + %0 = bitcast float* %a to * + store shufflevector ( insertelement ( poison, float 1.0, i64 0), poison, 
zeroinitializer), * %0, align 16 + %1 = load float, float* %a, align 4 + ret float %1 +} + +define i32 @load_i32_store_nxv4f32(float* %a) { +; CHECK-LABEL: @load_i32_store_nxv4f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[A2:%.*]] = bitcast float* [[A]] to i32* +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* [[A2]], align 4 +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %0 = bitcast float* %a to * + store shufflevector ( insertelement ( poison, float 1.0, i64 0), poison, zeroinitializer), * %0, align 16 + %a2 = bitcast float* %a to i32* + %load = load i32, i32* %a2, align 4 + ret i32 %load +} + +define <4 x i32> @load_v4i32_store_nxv4i32(i32* %a) { +; CHECK-LABEL: @load_v4i32_store_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; +entry: + %0 = bitcast i32* %a to * + store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * %0, align 16 + %1 = bitcast i32* %a to <4 x i32>* + %2 = load <4 x i32>, <4 x i32>* %1, align 16 + ret <4 x i32> %2 +} + +define <4 x i16> @load_v4i16_store_nxv4i32(i32* %a) { +; CHECK-LABEL: @load_v4i16_store_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 16 +; CHECK-NEXT: ret <4 x i16> [[TMP2]] 
+; +entry: + %0 = bitcast i32* %a to * + store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * %0, align 16 + %1 = bitcast i32* %a to <4 x i16>* + %2 = load <4 x i16>, <4 x i16>* %1, align 16 + ret <4 x i16> %2 +} + +; Loaded data type exceeds the known minimum size of the store. +define i64 @load_i64_store_nxv4i8(i8* %a) { +; CHECK-LABEL: @load_i64_store_nxv4i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[A2:%.*]] = bitcast i8* [[A]] to i64* +; CHECK-NEXT: [[LOAD:%.*]] = load i64, i64* [[A2]], align 8 +; CHECK-NEXT: ret i64 [[LOAD]] +; +entry: + %0 = bitcast i8* %a to * + store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), * %0, align 16 + %a2 = bitcast i8* %a to i64* + %load = load i64, i64* %a2, align 8 + ret i64 %load +} + +; Loaded data size is unknown - we cannot guarantee it won't +; exceed the store size. +define @load_nxv4i8_store_nxv4i32(i32* %a) { +; CHECK-LABEL: @load_nxv4i8_store_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to * +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * [[TMP0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to * +; CHECK-NEXT: [[TMP2:%.*]] = load , * [[TMP1]], align 16 +; CHECK-NEXT: ret [[TMP2]] +; +entry: + %0 = bitcast i32* %a to * + store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), * %0, align 16 + %1 = bitcast i32* %a to * + %2 = load , * %1, align 16 + ret %2 +} From 321a39b7556d016c79c0ff84484e5ee8243a4f3d Mon Sep 17 00:00:00 2001 From: Sunho Kim Date: Tue, 22 Feb 2022 09:26:47 +0000 Subject: [PATCH 475/748] [NFC][AARCH64] Add test cases for negation of select Add tests to demonstrate new dag combine pattern. 
Differential Revision: https://reviews.llvm.org/D120214 --- llvm/test/CodeGen/AArch64/neg-selects.ll | 81 ++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/neg-selects.ll diff --git a/llvm/test/CodeGen/AArch64/neg-selects.ll b/llvm/test/CodeGen/AArch64/neg-selects.ll new file mode 100644 index 0000000000000..e0f0efdcc2d3f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neg-selects.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-none-eabi %s -o - | FileCheck %s + +define i32 @neg_select_neg(i32 %a, i32 %b, i1 %bb) { +; CHECK-LABEL: neg_select_neg: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: csel w0, w0, w1, ne +; CHECK-NEXT: ret + %nega = sub i32 0, %a + %negb = sub i32 0, %b + %sel = select i1 %bb, i32 %nega, i32 %negb + %res = sub i32 0, %sel + ret i32 %res +} + +define i32 @negneg_select_nega(i32 %a, i32 %b, i1 %bb) { +; CHECK-LABEL: negneg_select_nega: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: csneg w0, w1, w0, eq +; CHECK-NEXT: ret + %nega = sub i32 0, %a + %sel = select i1 %bb, i32 %nega, i32 %b + %nsel = sub i32 0, %sel + %res = sub i32 0, %nsel + ret i32 %res +} + +define i32 @neg_select_nega(i32 %a, i32 %b, i1 %bb) { +; CHECK-LABEL: neg_select_nega: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: csneg w8, w1, w0, eq +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %nega = sub i32 0, %a + %sel = select i1 %bb, i32 %nega, i32 %b + %res = sub i32 0, %sel + ret i32 %res +} + +define i32 @neg_select_negb(i32 %a, i32 %b, i1 %bb) { +; CHECK-LABEL: neg_select_negb: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: csneg w8, w0, w1, ne +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %negb = sub i32 0, %b + %sel = select i1 %bb, i32 %a, i32 %negb + %res = sub i32 0, %sel + ret i32 %res +} + +define i32 @neg_select_ab(i32 %a, i32 %b, i1 %bb) { +; CHECK-LABEL: 
neg_select_ab: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: csel w8, w0, w1, ne +; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: ret + %sel = select i1 %bb, i32 %a, i32 %b + %res = sub i32 0, %sel + ret i32 %res +} + +define i32 @neg_select_nega_with_use(i32 %a, i32 %b, i1 %bb) { +; CHECK-LABEL: neg_select_nega_with_use: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w2, #0x1 +; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: csneg w9, w1, w0, eq +; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: ret + %nega = sub i32 0, %a + %sel = select i1 %bb, i32 %nega, i32 %b + %nsel = sub i32 0, %sel + %res = add i32 %nsel, %nega + ret i32 %res +} From dc0657277f2ff53f4e3ab01d856bcc576867ba9e Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 22 Feb 2022 09:36:52 +0000 Subject: [PATCH 476/748] Fix warning introduced by 47eff645d8e873ba531014751c1c06a716a367e9 --- llvm/lib/Analysis/ConstantFolding.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 14b277a94ed7d..2a9ff10c5c271 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -599,7 +599,7 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy, return nullptr; // If we're not accessing anything in this constant, the result is undefined. - if (Offset >= InitializerSize.getFixedValue()) + if (Offset >= (int64_t)InitializerSize.getFixedValue()) return UndefValue::get(IntType); unsigned char RawBytes[32] = {0}; From 912bba5ae25aea1da32ee022a51f063477e5e0f0 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 16 Feb 2022 15:47:16 +0000 Subject: [PATCH 477/748] [libcxx][CI] Set Arm triples to match native clang build's default We were using: armv8-linux-gnueabihf But for a native clang build the default target is: armv8l-linux-gnueabihf (ditto for v7) Add the "l" to the target triples and update the one test that is unsupported to look for the various possible names. 
armv(7 or 8)(m or l, optionally) The UNSUPPORTED does not include aarch64 because aarch64 Linux (and others that follow Arm's AAPCS64) use quad precision for long double where arm64 (darwin) does not: https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#811arithmetic-types Reviewed By: rovka Differential Revision: https://reviews.llvm.org/D119948 --- libcxx/cmake/caches/Armv7Arm.cmake | 2 +- libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake | 2 +- libcxx/cmake/caches/Armv8Arm.cmake | 2 +- libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake | 2 +- .../cmp/cmp.alg/strong_order_long_double.verify.cpp | 3 ++- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/libcxx/cmake/caches/Armv7Arm.cmake b/libcxx/cmake/caches/Armv7Arm.cmake index 8b2b54eba13ce..0e9dc10e9d41f 100644 --- a/libcxx/cmake/caches/Armv7Arm.cmake +++ b/libcxx/cmake/caches/Armv7Arm.cmake @@ -1,4 +1,4 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv7-linux-gnueabihf" CACHE STRING "") +set(LIBCXX_TARGET_TRIPLE "armv7l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-marm" CACHE STRING "") set(CMAKE_C_FLAGS "-marm" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake b/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake index 67ec43b93f207..61cd3bf7376ea 100644 --- a/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake +++ b/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake @@ -1,5 +1,5 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv7-linux-gnueabihf" CACHE STRING "") +set(LIBCXX_TARGET_TRIPLE "armv7l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-mthumb" CACHE STRING "") set(CMAKE_C_FLAGS "-mthumb" CACHE STRING "") set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Armv8Arm.cmake b/libcxx/cmake/caches/Armv8Arm.cmake index 55dfa908b3d01..eee2eb46da56d 100644 
--- a/libcxx/cmake/caches/Armv8Arm.cmake +++ b/libcxx/cmake/caches/Armv8Arm.cmake @@ -1,4 +1,4 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv8-linux-gnueabihf" CACHE STRING "") +set(LIBCXX_TARGET_TRIPLE "armv8l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-marm" CACHE STRING "") set(CMAKE_C_FLAGS "-marm" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake b/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake index fb1d10efaddce..9c2f90661ef8d 100644 --- a/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake +++ b/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake @@ -1,5 +1,5 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv8-linux-gnueabihf" CACHE STRING "") +set(LIBCXX_TARGET_TRIPLE "armv8l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-mthumb" CACHE STRING "") set(CMAKE_C_FLAGS "-mthumb" CACHE STRING "") set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") diff --git a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp index b35a50e0a5894..3d777d9d81b60 100644 --- a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp +++ b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp @@ -10,7 +10,8 @@ // UNSUPPORTED: libcpp-no-concepts // The following platforms have sizeof(long double) == sizeof(double), so this test doesn't apply to them. -// UNSUPPORTED: target={{arm64|armv8|armv7|armv7m|powerpc|powerpc64}}-{{.+}} +// This test does apply to aarch64 where Arm's AAPCS64 is followed. There they are different sizes. 
+// UNSUPPORTED: target={{arm64|armv(7|8)(l|m)?|powerpc|powerpc64}}-{{.+}} // UNSUPPORTED: target=x86_64-pc-windows-{{.+}} // From 9c720250d1bb2443d56d3dc8f8841af0dac0001a Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Feb 2022 11:57:26 +0000 Subject: [PATCH 478/748] [libcxx][ci] Switch to CMAKE_CXX_COMPILER_TARGET for Arm bots As suggested by the cmake warning: CMake Warning at <...>/llvm-project/libcxx-ci/libcxx/CMakeLists.txt:289 (message): LIBCXX_TARGET_TRIPLE is deprecated, please use CMAKE_CXX_COMPILER_TARGET instead Depends on D119948 Differential Revision: https://reviews.llvm.org/D120038 --- libcxx/cmake/caches/AArch64.cmake | 2 +- libcxx/cmake/caches/Armv7Arm.cmake | 2 +- libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake | 2 +- libcxx/cmake/caches/Armv8Arm.cmake | 2 +- libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libcxx/cmake/caches/AArch64.cmake b/libcxx/cmake/caches/AArch64.cmake index 33356a7ee0215..fa802d3de63f0 100644 --- a/libcxx/cmake/caches/AArch64.cmake +++ b/libcxx/cmake/caches/AArch64.cmake @@ -1,2 +1,2 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "aarch64-linux-gnu" CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET "aarch64-linux-gnu" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv7Arm.cmake b/libcxx/cmake/caches/Armv7Arm.cmake index 0e9dc10e9d41f..4d18d08fefcd2 100644 --- a/libcxx/cmake/caches/Armv7Arm.cmake +++ b/libcxx/cmake/caches/Armv7Arm.cmake @@ -1,4 +1,4 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv7l-linux-gnueabihf" CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET "armv7l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-marm" CACHE STRING "") set(CMAKE_C_FLAGS "-marm" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake b/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake index 61cd3bf7376ea..71173af106b63 100644 --- 
a/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake +++ b/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake @@ -1,5 +1,5 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv7l-linux-gnueabihf" CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET "armv7l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-mthumb" CACHE STRING "") set(CMAKE_C_FLAGS "-mthumb" CACHE STRING "") set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Armv8Arm.cmake b/libcxx/cmake/caches/Armv8Arm.cmake index eee2eb46da56d..5055582fdafc0 100644 --- a/libcxx/cmake/caches/Armv8Arm.cmake +++ b/libcxx/cmake/caches/Armv8Arm.cmake @@ -1,4 +1,4 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv8l-linux-gnueabihf" CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET "armv8l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-marm" CACHE STRING "") set(CMAKE_C_FLAGS "-marm" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake b/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake index 9c2f90661ef8d..316edd3149066 100644 --- a/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake +++ b/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake @@ -1,5 +1,5 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") -set(LIBCXX_TARGET_TRIPLE "armv8l-linux-gnueabihf" CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET "armv8l-linux-gnueabihf" CACHE STRING "") set(CMAKE_CXX_FLAGS "-mthumb" CACHE STRING "") set(CMAKE_C_FLAGS "-mthumb" CACHE STRING "") set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") From d6a9eec2382559aeae3bb87761afa1b6d351e9a5 Mon Sep 17 00:00:00 2001 From: Sunho Kim Date: Tue, 22 Feb 2022 09:59:36 +0000 Subject: [PATCH 479/748] [AARCH64][DAGCombine] Add combine for negation of CSEL absolute value pattern. This folds a negation through a csel, which can come up during the lowering of negative abs. Fixes https://github.com/llvm/llvm-project/issues/51558. 
Differential Revision: https://reviews.llvm.org/D112204 --- .../Target/AArch64/AArch64ISelLowering.cpp | 45 +++++++++++++++++++ llvm/test/CodeGen/AArch64/neg-abs.ll | 6 +-- llvm/test/CodeGen/AArch64/neg-selects.ll | 6 +-- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 30d30e88f2740..5678029be376e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14893,6 +14893,49 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { Dot.getOperand(2)); } +static bool isNegatedInteger(SDValue Op) { + return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)); +} + +static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Zero = DAG.getConstant(0, DL, VT); + return DAG.getNode(ISD::SUB, DL, VT, Zero, Op); +} + +// Try to fold +// +// (neg (csel X, Y)) -> (csel (neg X), (neg Y)) +// +// The folding helps csel to be matched with csneg without generating +// redundant neg instruction, which includes negation of the csel expansion +// of abs node lowered by lowerABS. +static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) { + if (!isNegatedInteger(SDValue(N, 0))) + return SDValue(); + + SDValue CSel = N->getOperand(1); + if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse()) + return SDValue(); + + SDValue N0 = CSel.getOperand(0); + SDValue N1 = CSel.getOperand(1); + + // If both of them is not negations, it's not worth the folding as it + // introduces two additional negations while reducing one negation. 
+ if (!isNegatedInteger(N0) && !isNegatedInteger(N1)) + return SDValue(); + + SDValue N0N = getNegatedInteger(N0, DAG); + SDValue N1N = getNegatedInteger(N1, DAG); + + SDLoc DL(N); + EVT VT = CSel.getValueType(); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2), + CSel.getOperand(3)); +} + // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. They are normally matched by // patterns like: @@ -14956,6 +14999,8 @@ static SDValue performAddSubCombine(SDNode *N, return Val; if (SDValue Val = performAddCSelIntoCSinc(N, DAG)) return Val; + if (SDValue Val = performNegCSelCombine(N, DAG)) + return Val; return performAddSubLongCombine(N, DCI, DAG); } diff --git a/llvm/test/CodeGen/AArch64/neg-abs.ll b/llvm/test/CodeGen/AArch64/neg-abs.ll index 71a320e98398a..7f691c9b694c4 100644 --- a/llvm/test/CodeGen/AArch64/neg-abs.ll +++ b/llvm/test/CodeGen/AArch64/neg-abs.ll @@ -8,8 +8,7 @@ define i64 @neg_abs64(i64 %x) { ; CHECK-LABEL: neg_abs64: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp x0, #0 -; CHECK-NEXT: cneg x8, x0, mi -; CHECK-NEXT: neg x0, x8 +; CHECK-NEXT: cneg x0, x0, pl ; CHECK-NEXT: ret %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true) %neg = sub nsw i64 0, %abs @@ -22,8 +21,7 @@ define i32 @neg_abs32(i32 %x) { ; CHECK-LABEL: neg_abs32: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: cneg w8, w0, mi -; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: cneg w0, w0, pl ; CHECK-NEXT: ret %abs = tail call i32 @llvm.abs.i32(i32 %x, i1 true) %neg = sub nsw i32 0, %abs diff --git a/llvm/test/CodeGen/AArch64/neg-selects.ll b/llvm/test/CodeGen/AArch64/neg-selects.ll index e0f0efdcc2d3f..114300a79b909 100644 --- a/llvm/test/CodeGen/AArch64/neg-selects.ll +++ b/llvm/test/CodeGen/AArch64/neg-selects.ll @@ -31,8 +31,7 @@ define i32 @neg_select_nega(i32 %a, i32 %b, i1 %bb) { ; CHECK-LABEL: neg_select_nega: ; CHECK: // %bb.0: ; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: csneg w8, w1, w0, eq -; 
CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: csneg w0, w0, w1, ne ; CHECK-NEXT: ret %nega = sub i32 0, %a %sel = select i1 %bb, i32 %nega, i32 %b @@ -44,8 +43,7 @@ define i32 @neg_select_negb(i32 %a, i32 %b, i1 %bb) { ; CHECK-LABEL: neg_select_negb: ; CHECK: // %bb.0: ; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: csneg w8, w0, w1, ne -; CHECK-NEXT: neg w0, w8 +; CHECK-NEXT: csneg w0, w1, w0, eq ; CHECK-NEXT: ret %negb = sub i32 0, %b %sel = select i1 %bb, i32 %a, i32 %negb From c9cc8035eb4fbf5d20a73ec7e0870e6195012a71 Mon Sep 17 00:00:00 2001 From: Iain Sandoe Date: Mon, 29 Nov 2021 08:05:56 +0000 Subject: [PATCH 480/748] [C++20][Modules][2/8] Add enumerations for partition modules and stream them. This is an initial enabling patch for module partition support. We add enumerations for partition interfaces/implementations. This means that the module kind enumeration now occupies three bits, so the AST streamer is adjusted for this. Adding one bit there seems preferable to trying to overload the meanings of existing kinds (and we will also want to add a C++20 header unit case later). Differential Revision: https://reviews.llvm.org/D114714 --- clang/include/clang/Basic/Module.h | 15 +++++++++++++-- clang/lib/AST/Decl.cpp | 2 ++ clang/lib/Sema/SemaModule.cpp | 2 ++ clang/lib/Serialization/ASTWriter.cpp | 2 +- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index de7857347bc2e..b05d3c654e726 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -106,9 +106,15 @@ class Module { /// of header files. ModuleMapModule, - /// This is a C++ Modules TS module interface unit. + /// This is a C++20 module interface unit. ModuleInterfaceUnit, + /// This is a C++ 20 module partition interface. + ModulePartitionInterface, + + /// This is a C++ 20 module partition implementation. 
+ ModulePartitionImplementation, + /// This is a fragment of the global module within some C++ module. GlobalModuleFragment, @@ -150,7 +156,9 @@ class Module { /// Does this Module scope describe part of the purview of a named C++ module? bool isModulePurview() const { - return Kind == ModuleInterfaceUnit || Kind == PrivateModuleFragment; + return Kind == ModuleInterfaceUnit || Kind == ModulePartitionInterface || + Kind == ModulePartitionImplementation || + Kind == PrivateModuleFragment; } /// Does this Module scope describe a fragment of the global module within @@ -506,6 +514,9 @@ class Module { Parent->SubModules.push_back(this); } + /// Is this a module partition. + bool isModulePartition() const { return Name.find(':') != std::string::npos; } + /// Retrieve the full name of this module, including the path from /// its top-level module. /// \param AllowStringLiterals If \c true, components that might not be diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 030da7f55fac4..82c4412296dbc 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -1550,6 +1550,8 @@ Module *Decl::getOwningModuleForLinkage(bool IgnoreLinkage) const { return nullptr; case Module::ModuleInterfaceUnit: + case Module::ModulePartitionInterface: + case Module::ModulePartitionImplementation: return M; case Module::GlobalModuleFragment: { diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 9bed3cb769f70..bd5b900e5d389 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -261,6 +261,8 @@ Sema::ActOnPrivateModuleFragmentDecl(SourceLocation ModuleLoc, : ModuleScopes.back().Module->Kind) { case Module::ModuleMapModule: case Module::GlobalModuleFragment: + case Module::ModulePartitionImplementation: + case Module::ModulePartitionInterface: Diag(PrivateLoc, diag::err_private_module_fragment_not_module); return nullptr; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 
a126e12bcbd99..cf42e529a8d6a 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -2674,7 +2674,7 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) { Abbrev->Add(BitCodeAbbrevOp(SUBMODULE_DEFINITION)); Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ID Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Parent - Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // Kind + Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Kind Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsFramework Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsExplicit Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsSystem From 034ec9d708cbcb1393c496212013e2b3c05e4dd1 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 22 Feb 2022 09:40:05 +0000 Subject: [PATCH 481/748] [StructurizeCFG] Precommit test case for D120312 --- .../StructurizeCFG/invert-condition.ll | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/llvm/test/Transforms/StructurizeCFG/invert-condition.ll b/llvm/test/Transforms/StructurizeCFG/invert-condition.ll index c5db5ad0e4d3e..5b6f1d8545175 100644 --- a/llvm/test/Transforms/StructurizeCFG/invert-condition.ll +++ b/llvm/test/Transforms/StructurizeCFG/invert-condition.ll @@ -28,3 +28,39 @@ bb2: ; preds = %bb2, %bb bb5: ; preds = %bb2 ret void } + +; FIXME: StructurizeCFG modifies I5 in-place without updating the use of I5 in +; the phi instruction. 
+define void @invert_condition_phi(i32 %arg) { +; CHECK-LABEL: @invert_condition_phi( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[I5:%.*]] = icmp ne i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[I5]], label [[IF1:%.*]], label [[ENDIF1:%.*]] +; CHECK: if1: +; CHECK-NEXT: br label [[ENDIF1]] +; CHECK: endif1: +; CHECK-NEXT: [[I7:%.*]] = phi i1 [ [[I5]], [[MAIN_BODY:%.*]] ], [ false, [[IF1]] ] +; CHECK-NEXT: [[I7_INV:%.*]] = xor i1 [[I7]], true +; CHECK-NEXT: br i1 [[I7_INV]], label [[IF4:%.*]], label [[ENDIF4:%.*]] +; CHECK: if4: +; CHECK-NEXT: br label [[ENDIF4]] +; CHECK: endif4: +; CHECK-NEXT: ret void +; +main_body: + %i5 = icmp eq i32 %arg, 0 + br i1 %i5, label %endif1, label %if1 + +if1: + br label %endif1 + +endif1: + %i7 = phi i1 [ false, %if1 ], [ %i5, %main_body ] + br i1 %i7, label %endif4, label %if4 + +if4: + br label %endif4 + +endif4: + ret void +} From e7e17b30d02d4f0035fef92850d529f16849c6f0 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Tue, 22 Feb 2022 10:15:40 +0000 Subject: [PATCH 482/748] [OpenCL] opencl-c.h: use uint/ulong consistently Most places already seem to use the short spelling instead of 'unsigned int/long', so perform the following substitutions: s/unsigned int /uint /g s/unsigned long /ulong /g This simplifies completeness comparisons against OpenCLBuiltins.td. Differential Revision: https://reviews.llvm.org/D120032 --- clang/lib/Headers/opencl-c.h | 154 +++++++++++++++++------------------ 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h index 6c9c3cacf3ec6..18c1c317e100f 100644 --- a/clang/lib/Headers/opencl-c.h +++ b/clang/lib/Headers/opencl-c.h @@ -12919,28 +12919,28 @@ void __ovld prefetch(const __global half16 *p, size_t num_elements); * pointed by p. The function returns old. 
*/ int __ovld atomic_add(volatile __global int *p, int val); -unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_add(volatile __global uint *p, uint val); int __ovld atomic_add(volatile __local int *p, int val); -unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_add(volatile __local uint *p, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_add(volatile int *p, int val); -unsigned int __ovld atomic_add(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_add(volatile uint *p, uint val); #endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_add(volatile __global int *p, int val); -unsigned int __ovld atom_add(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_add(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_base_atomics) int __ovld atom_add(volatile __local int *p, int val); -unsigned int __ovld atom_add(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_add(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_base_atomics) long __ovld atom_add(volatile __global long *p, long val); -unsigned long __ovld atom_add(volatile __global unsigned long *p, unsigned long val); +ulong __ovld atom_add(volatile __global ulong *p, ulong val); long __ovld atom_add(volatile __local long *p, long val); -unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_add(volatile __local ulong *p, ulong val); #endif /** @@ -12949,28 +12949,28 @@ unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long v * returns old. 
*/ int __ovld atomic_sub(volatile __global int *p, int val); -unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_sub(volatile __global uint *p, uint val); int __ovld atomic_sub(volatile __local int *p, int val); -unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_sub(volatile __local uint *p, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_sub(volatile int *p, int val); -unsigned int __ovld atomic_sub(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_sub(volatile uint *p, uint val); #endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_sub(volatile __global int *p, int val); -unsigned int __ovld atom_sub(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_sub(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_base_atomics) int __ovld atom_sub(volatile __local int *p, int val); -unsigned int __ovld atom_sub(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_sub(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_base_atomics) long __ovld atom_sub(volatile __global long *p, long val); -unsigned long __ovld atom_sub(volatile __global unsigned long *p, unsigned long val); +ulong __ovld atom_sub(volatile __global ulong *p, ulong val); long __ovld atom_sub(volatile __local long *p, long val); -unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_sub(volatile __local ulong *p, ulong val); #endif /** @@ -12979,31 +12979,31 @@ unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long v * value. 
*/ int __ovld atomic_xchg(volatile __global int *p, int val); -unsigned int __ovld atomic_xchg(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_xchg(volatile __global uint *p, uint val); int __ovld atomic_xchg(volatile __local int *p, int val); -unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_xchg(volatile __local uint *p, uint val); float __ovld atomic_xchg(volatile __global float *p, float val); float __ovld atomic_xchg(volatile __local float *p, float val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_xchg(volatile int *p, int val); -unsigned int __ovld atomic_xchg(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_xchg(volatile uint *p, uint val); float __ovld atomic_xchg(volatile float *p, float val); #endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_xchg(volatile __global int *p, int val); -unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_xchg(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_base_atomics) int __ovld atom_xchg(volatile __local int *p, int val); -unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_xchg(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_base_atomics) long __ovld atom_xchg(volatile __global long *p, long val); long __ovld atom_xchg(volatile __local long *p, long val); -unsigned long __ovld atom_xchg(volatile __global unsigned long *p, unsigned long val); -unsigned long __ovld atom_xchg(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_xchg(volatile __global ulong *p, ulong val); +ulong __ovld atom_xchg(volatile __local ulong *p, ulong val); #endif /** @@ -13013,28 +13013,28 @@ unsigned long __ovld atom_xchg(volatile __local unsigned long *p, unsigned long * pointed by p. The function returns old. 
*/ int __ovld atomic_inc(volatile __global int *p); -unsigned int __ovld atomic_inc(volatile __global unsigned int *p); +uint __ovld atomic_inc(volatile __global uint *p); int __ovld atomic_inc(volatile __local int *p); -unsigned int __ovld atomic_inc(volatile __local unsigned int *p); +uint __ovld atomic_inc(volatile __local uint *p); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_inc(volatile int *p); -unsigned int __ovld atomic_inc(volatile unsigned int *p); +uint __ovld atomic_inc(volatile uint *p); #endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_inc(volatile __global int *p); -unsigned int __ovld atom_inc(volatile __global unsigned int *p); +uint __ovld atom_inc(volatile __global uint *p); #endif #if defined(cl_khr_local_int32_base_atomics) int __ovld atom_inc(volatile __local int *p); -unsigned int __ovld atom_inc(volatile __local unsigned int *p); +uint __ovld atom_inc(volatile __local uint *p); #endif #if defined(cl_khr_int64_base_atomics) long __ovld atom_inc(volatile __global long *p); -unsigned long __ovld atom_inc(volatile __global unsigned long *p); +ulong __ovld atom_inc(volatile __global ulong *p); long __ovld atom_inc(volatile __local long *p); -unsigned long __ovld atom_inc(volatile __local unsigned long *p); +ulong __ovld atom_inc(volatile __local ulong *p); #endif /** @@ -13044,28 +13044,28 @@ unsigned long __ovld atom_inc(volatile __local unsigned long *p); * pointed by p. The function returns old. 
*/ int __ovld atomic_dec(volatile __global int *p); -unsigned int __ovld atomic_dec(volatile __global unsigned int *p); +uint __ovld atomic_dec(volatile __global uint *p); int __ovld atomic_dec(volatile __local int *p); -unsigned int __ovld atomic_dec(volatile __local unsigned int *p); +uint __ovld atomic_dec(volatile __local uint *p); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_dec(volatile int *p); -unsigned int __ovld atomic_dec(volatile unsigned int *p); +uint __ovld atomic_dec(volatile uint *p); #endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_dec(volatile __global int *p); -unsigned int __ovld atom_dec(volatile __global unsigned int *p); +uint __ovld atom_dec(volatile __global uint *p); #endif #if defined(cl_khr_local_int32_base_atomics) int __ovld atom_dec(volatile __local int *p); -unsigned int __ovld atom_dec(volatile __local unsigned int *p); +uint __ovld atom_dec(volatile __local uint *p); #endif #if defined(cl_khr_int64_base_atomics) long __ovld atom_dec(volatile __global long *p); -unsigned long __ovld atom_dec(volatile __global unsigned long *p); +ulong __ovld atom_dec(volatile __global ulong *p); long __ovld atom_dec(volatile __local long *p); -unsigned long __ovld atom_dec(volatile __local unsigned long *p); +ulong __ovld atom_dec(volatile __local ulong *p); #endif /** @@ -13076,28 +13076,28 @@ unsigned long __ovld atom_dec(volatile __local unsigned long *p); * returns old. 
*/ int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val); -unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val); +uint __ovld atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val); int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val); -unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val); +uint __ovld atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val); -unsigned int __ovld atomic_cmpxchg(volatile unsigned int *p, unsigned int cmp, unsigned int val); +uint __ovld atomic_cmpxchg(volatile uint *p, uint cmp, uint val); #endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val); -unsigned int __ovld atom_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val); +uint __ovld atom_cmpxchg(volatile __global uint *p, uint cmp, uint val); #endif #if defined(cl_khr_local_int32_base_atomics) int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val); -unsigned int __ovld atom_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val); +uint __ovld atom_cmpxchg(volatile __local uint *p, uint cmp, uint val); #endif #if defined(cl_khr_int64_base_atomics) long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val); -unsigned long __ovld atom_cmpxchg(volatile __global unsigned long *p, unsigned long cmp, unsigned long val); +ulong __ovld atom_cmpxchg(volatile __global ulong *p, ulong cmp, ulong val); long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val); -unsigned long __ovld atom_cmpxchg(volatile __local unsigned long *p, unsigned long cmp, unsigned long val); +ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val); #endif /** @@ -13108,28 +13108,28 @@ unsigned long __ovld 
atom_cmpxchg(volatile __local unsigned long *p, unsigned lo * returns old. */ int __ovld atomic_min(volatile __global int *p, int val); -unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_min(volatile __global uint *p, uint val); int __ovld atomic_min(volatile __local int *p, int val); -unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_min(volatile __local uint *p, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_min(volatile int *p, int val); -unsigned int __ovld atomic_min(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_min(volatile uint *p, uint val); #endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_min(volatile __global int *p, int val); -unsigned int __ovld atom_min(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_min(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_extended_atomics) int __ovld atom_min(volatile __local int *p, int val); -unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_min(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_extended_atomics) long __ovld atom_min(volatile __global long *p, long val); -unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val); +ulong __ovld atom_min(volatile __global ulong *p, ulong val); long __ovld atom_min(volatile __local long *p, long val); -unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_min(volatile __local ulong *p, ulong val); #endif /** @@ -13140,28 +13140,28 @@ unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long v * returns old. 
*/ int __ovld atomic_max(volatile __global int *p, int val); -unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_max(volatile __global uint *p, uint val); int __ovld atomic_max(volatile __local int *p, int val); -unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_max(volatile __local uint *p, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_max(volatile int *p, int val); -unsigned int __ovld atomic_max(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_max(volatile uint *p, uint val); #endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_max(volatile __global int *p, int val); -unsigned int __ovld atom_max(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_max(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_extended_atomics) int __ovld atom_max(volatile __local int *p, int val); -unsigned int __ovld atom_max(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_max(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_extended_atomics) long __ovld atom_max(volatile __global long *p, long val); -unsigned long __ovld atom_max(volatile __global unsigned long *p, unsigned long val); +ulong __ovld atom_max(volatile __global ulong *p, ulong val); long __ovld atom_max(volatile __local long *p, long val); -unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_max(volatile __local ulong *p, ulong val); #endif /** @@ -13171,28 +13171,28 @@ unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long v * pointed by p. The function returns old. 
*/ int __ovld atomic_and(volatile __global int *p, int val); -unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_and(volatile __global uint *p, uint val); int __ovld atomic_and(volatile __local int *p, int val); -unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_and(volatile __local uint *p, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_and(volatile int *p, int val); -unsigned int __ovld atomic_and(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_and(volatile uint *p, uint val); #endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_and(volatile __global int *p, int val); -unsigned int __ovld atom_and(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_and(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_extended_atomics) int __ovld atom_and(volatile __local int *p, int val); -unsigned int __ovld atom_and(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_and(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_extended_atomics) long __ovld atom_and(volatile __global long *p, long val); -unsigned long __ovld atom_and(volatile __global unsigned long *p, unsigned long val); +ulong __ovld atom_and(volatile __global ulong *p, ulong val); long __ovld atom_and(volatile __local long *p, long val); -unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_and(volatile __local ulong *p, ulong val); #endif /** @@ -13202,28 +13202,28 @@ unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long v * pointed by p. The function returns old. 
*/ int __ovld atomic_or(volatile __global int *p, int val); -unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_or(volatile __global uint *p, uint val); int __ovld atomic_or(volatile __local int *p, int val); -unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_or(volatile __local uint *p, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_or(volatile int *p, int val); -unsigned int __ovld atomic_or(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_or(volatile uint *p, uint val); #endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_or(volatile __global int *p, int val); -unsigned int __ovld atom_or(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_or(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_extended_atomics) int __ovld atom_or(volatile __local int *p, int val); -unsigned int __ovld atom_or(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_or(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_extended_atomics) long __ovld atom_or(volatile __global long *p, long val); -unsigned long __ovld atom_or(volatile __global unsigned long *p, unsigned long val); +ulong __ovld atom_or(volatile __global ulong *p, ulong val); long __ovld atom_or(volatile __local long *p, long val); -unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_or(volatile __local ulong *p, ulong val); #endif /** @@ -13233,28 +13233,28 @@ unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long va * pointed by p. The function returns old. 
*/ int __ovld atomic_xor(volatile __global int *p, int val); -unsigned int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val); +uint __ovld atomic_xor(volatile __global uint *p, uint val); int __ovld atomic_xor(volatile __local int *p, int val); -unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val); +uint __ovld atomic_xor(volatile __local uint *p, uint val); #ifdef __OPENCL_CPP_VERSION__ int __ovld atomic_xor(volatile int *p, int val); -unsigned int __ovld atomic_xor(volatile unsigned int *p, unsigned int val); +uint __ovld atomic_xor(volatile uint *p, uint val); #endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_xor(volatile __global int *p, int val); -unsigned int __ovld atom_xor(volatile __global unsigned int *p, unsigned int val); +uint __ovld atom_xor(volatile __global uint *p, uint val); #endif #if defined(cl_khr_local_int32_extended_atomics) int __ovld atom_xor(volatile __local int *p, int val); -unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val); +uint __ovld atom_xor(volatile __local uint *p, uint val); #endif #if defined(cl_khr_int64_extended_atomics) long __ovld atom_xor(volatile __global long *p, long val); -unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val); +ulong __ovld atom_xor(volatile __global ulong *p, ulong val); long __ovld atom_xor(volatile __local long *p, long val); -unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val); +ulong __ovld atom_xor(volatile __local ulong *p, ulong val); #endif #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) From ad3b1fe47273f15c721e7d0b125a2c6f0a8283cb Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 22 Feb 2022 16:54:27 +0700 Subject: [PATCH 483/748] [SCEV] Do not erase LoopUsers. PR53969 This patch fixes a logical error in how we work with `LoopUsers` map. 
It maps a loop onto a set of AddRecs that depend on it. The AddRecs are added to this map only once, when they are created and put into the `UniqueSCEVs` map. The only purpose of this map is to make sure that, whenever we forget a loop, all (directly or indirectly) dependent SCEVs get forgotten too. The current code erases SCEVs from the dependent set of a given loop whenever we forget this loop. This is not correct behavior, due to the following scenario: 1. We have a loop `L` and an AddRec `AR` that depends on it; 2. We modify something in the loop, but don't destroy it. We still call `forgetLoop` on it; 3. `AR` is no longer dependent on `L` according to `LoopUsers`. It is erased from the `ValueExprMap` and `ExprValue` maps, but still exists in `UniqueSCEVs`; 4. We can later request the very same AddRec for the very same loop again, and get the existing SCEV `AR`. 5. Now, `AR` exists and is used again, but its notion that it depends on `L` is lost; 6. Then we decide to delete `L`. `AR` will not be forgotten because we have lost track of it; 7. Sooner or later we run into a dangling pointer problem, or some other kind of problem, because an active SCEV is now referencing a non-existent loop. The solution is to stop erasing values from `LoopUsers`. Yes, we may keep around something that is already unused, but that is cheap. This fixes a functional bug and may potentially have a negative compile-time impact on methods with huge or numerous loops.
Differential Revision: https://reviews.llvm.org/D120303 Reviewed By: nikic --- llvm/lib/Analysis/ScalarEvolution.cpp | 1 - llvm/test/Transforms/LoopDeletion/pr53969.ll | 62 +++++++++++++++++++- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 73fa48cee7bcf..08caaaabc4259 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8068,7 +8068,6 @@ void ScalarEvolution::forgetLoop(const Loop *L) { if (LoopUsersItr != LoopUsers.end()) { ToForget.insert(ToForget.end(), LoopUsersItr->second.begin(), LoopUsersItr->second.end()); - LoopUsers.erase(LoopUsersItr); } // Drop information about expressions based on loop-header PHIs. diff --git a/llvm/test/Transforms/LoopDeletion/pr53969.ll b/llvm/test/Transforms/LoopDeletion/pr53969.ll index 2765f01637779..16d761e52b0cb 100644 --- a/llvm/test/Transforms/LoopDeletion/pr53969.ll +++ b/llvm/test/Transforms/LoopDeletion/pr53969.ll @@ -1,13 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes="loop(indvars,loop-deletion)" -S < %s | FileCheck %s -; XFAIL: * -; REQUIRES: asserts target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" target triple = "x86_64-unknown-linux-gnu" ; Make sure we don't crash. 
define void @test() { -; CHECK-LABEL: test +; CHECK-LABEL: @test( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 11, [[BB:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 112, -1 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw i32 [[TMP2]], -6 +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i32 [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = and i32 undef, 1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label [[BB33_LOOPEXIT1:%.*]], label [[BB34_PREHEADER:%.*]] +; CHECK: bb34.preheader: +; CHECK-NEXT: br label [[BB34:%.*]] +; CHECK: bb11: +; CHECK-NEXT: [[TMP2_LCSSA12:%.*]] = phi i32 [ 11, [[BB34]] ] +; CHECK-NEXT: br label [[BB33_LOOPEXIT:%.*]] +; CHECK: bb12: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[TMP40:%.*]], 0 +; CHECK-NEXT: br label [[BB14:%.*]] +; CHECK: bb14: +; CHECK-NEXT: br i1 true, label [[BB32:%.*]], label [[BB22:%.*]] +; CHECK: bb22: +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 4 to i32 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP1]], undef +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], undef +; CHECK-NEXT: br i1 false, label [[BB42:%.*]], label [[BB25:%.*]] +; CHECK: bb25: +; CHECK-NEXT: br label [[BB31:%.*]] +; CHECK: bb31: +; CHECK-NEXT: unreachable +; CHECK: bb32: +; CHECK-NEXT: ret void +; CHECK: bb33.loopexit: +; CHECK-NEXT: [[TMP2_LCSSA9:%.*]] = phi i32 [ [[TMP2_LCSSA12]], [[BB11:%.*]] ] +; CHECK-NEXT: br label [[BB33:%.*]] +; CHECK: bb33.loopexit1: +; CHECK-NEXT: [[TMP2_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[BB1]] ] +; CHECK-NEXT: br label [[BB33]] +; CHECK: bb33: +; CHECK-NEXT: [[TMP210:%.*]] = phi i32 [ [[TMP2_LCSSA]], [[BB33_LOOPEXIT1]] ], [ [[TMP2_LCSSA9]], [[BB33_LOOPEXIT]] ] +; CHECK-NEXT: call void @use(i32 [[TMP210]]) +; CHECK-NEXT: ret void +; CHECK: bb34: +; CHECK-NEXT: 
[[TMP36:%.*]] = xor i32 0, [[TMP8]] +; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP36]], undef +; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP38]], undef +; CHECK-NEXT: [[TMP40]] = sext i32 [[TMP39]] to i64 +; CHECK-NEXT: br i1 false, label [[BB11]], label [[BB12:%.*]] +; CHECK: bb42: +; CHECK-NEXT: [[TMP24_LCSSA:%.*]] = phi i32 [ [[TMP24]], [[BB22]] ] +; CHECK-NEXT: [[TMP18_LCSSA4:%.*]] = phi i64 [ [[TMP0]], [[BB22]] ] +; CHECK-NEXT: store atomic i64 [[TMP18_LCSSA4]], i64 addrspace(1)* undef unordered, align 8 +; CHECK-NEXT: call void @use(i32 [[TMP24_LCSSA]]) +; CHECK-NEXT: ret void +; bb: br label %bb1 From cfd6ba89fd9f0572a9aaf76ff3da3f46e265ad75 Mon Sep 17 00:00:00 2001 From: Arjun P Date: Tue, 22 Feb 2022 08:48:55 +0000 Subject: [PATCH 484/748] [MLIR][Presburger] rename get*LexMin -> find*LexMin This reflects the fact that we are performing some non-trivial computations here. Also, this is more uniform in line with findIntegerSample. --- .../mlir/Analysis/Presburger/IntegerPolyhedron.h | 4 ++-- mlir/include/mlir/Analysis/Presburger/Simplex.h | 4 ++-- mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp | 8 ++++---- mlir/lib/Analysis/Presburger/Simplex.cpp | 4 ++-- .../Analysis/Presburger/IntegerPolyhedronTest.cpp | 10 +++++----- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h index 834e3b0edce18..f9b3f84d5e1cc 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerPolyhedron.h @@ -255,14 +255,14 @@ class IntegerPolyhedron : public IntegerRelation { /// the lexmin is unbounded. Symbols are not supported and will result in /// assert-failure. presburger_utils::MaybeOptimum> - getRationalLexMin() const; + findRationalLexMin() const; /// Same as above, but returns lexicographically minimal integer point. /// Note: this should be used only when the lexmin is really required. 
/// For a generic integer sampling operation, findIntegerSample is more /// robust and should be preferred. presburger_utils::MaybeOptimum> - getIntegerLexMin() const; + findIntegerLexMin() const; /// Swap the posA^th identifier with the posB^th identifier. virtual void swapId(unsigned posA, unsigned posB); diff --git a/mlir/include/mlir/Analysis/Presburger/Simplex.h b/mlir/include/mlir/Analysis/Presburger/Simplex.h index d5e14f717e925..83f4d398b67c9 100644 --- a/mlir/include/mlir/Analysis/Presburger/Simplex.h +++ b/mlir/include/mlir/Analysis/Presburger/Simplex.h @@ -438,13 +438,13 @@ class LexSimplex : public SimplexBase { unsigned getSnapshot() { return SimplexBase::getSnapshotBasis(); } /// Return the lexicographically minimum rational solution to the constraints. - presburger_utils::MaybeOptimum> getRationalLexMin(); + presburger_utils::MaybeOptimum> findRationalLexMin(); /// Return the lexicographically minimum integer solution to the constraints. /// /// Note: this should be used only when the lexmin is really needed. To obtain /// any integer sample, use Simplex::findIntegerSample as that is more robust. 
- presburger_utils::MaybeOptimum> getIntegerLexMin(); + presburger_utils::MaybeOptimum> findIntegerLexMin(); protected: /// Returns the current sample point, which may contain non-integer (rational) diff --git a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp index 5e26149303e6e..2380a214c0f2b 100644 --- a/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerPolyhedron.cpp @@ -73,10 +73,10 @@ bool IntegerPolyhedron::isSubsetOf(const IntegerPolyhedron &other) const { } MaybeOptimum> -IntegerPolyhedron::getRationalLexMin() const { +IntegerPolyhedron::findRationalLexMin() const { assert(getNumSymbolIds() == 0 && "Symbols are not supported!"); MaybeOptimum> maybeLexMin = - LexSimplex(*this).getRationalLexMin(); + LexSimplex(*this).findRationalLexMin(); if (!maybeLexMin.isBounded()) return maybeLexMin; @@ -93,10 +93,10 @@ IntegerPolyhedron::getRationalLexMin() const { } MaybeOptimum> -IntegerPolyhedron::getIntegerLexMin() const { +IntegerPolyhedron::findIntegerLexMin() const { assert(getNumSymbolIds() == 0 && "Symbols are not supported!"); MaybeOptimum> maybeLexMin = - LexSimplex(*this).getIntegerLexMin(); + LexSimplex(*this).findIntegerLexMin(); if (!maybeLexMin.isBounded()) return maybeLexMin.getKind(); diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp index 79ccae57573e5..02eb323c1820a 100644 --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -164,7 +164,7 @@ Direction flippedDirection(Direction direction) { } } // namespace -MaybeOptimum> LexSimplex::getRationalLexMin() { +MaybeOptimum> LexSimplex::findRationalLexMin() { restoreRationalConsistency(); return getRationalSample(); } @@ -194,7 +194,7 @@ Optional LexSimplex::maybeGetNonIntegeralVarRow() const { return {}; } -MaybeOptimum> LexSimplex::getIntegerLexMin() { +MaybeOptimum> LexSimplex::findIntegerLexMin() { while (!empty) { 
restoreRationalConsistency(); if (empty) diff --git a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp index fffbf7527f994..e403ddd013ad1 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp @@ -61,7 +61,7 @@ static void checkSample(bool hasSample, const IntegerPolyhedron &poly, switch (fn) { case TestFunction::Sample: maybeSample = poly.findIntegerSample(); - maybeLexMin = poly.getIntegerLexMin(); + maybeLexMin = poly.findIntegerLexMin(); if (!hasSample) { EXPECT_FALSE(maybeSample.hasValue()); @@ -1081,7 +1081,7 @@ TEST(IntegerPolyhedronTest, negativeDividends) { void expectRationalLexMin(const IntegerPolyhedron &poly, ArrayRef min) { - auto lexMin = poly.getRationalLexMin(); + auto lexMin = poly.findRationalLexMin(); ASSERT_TRUE(lexMin.isBounded()); EXPECT_EQ(ArrayRef(*lexMin), min); } @@ -1089,7 +1089,7 @@ void expectRationalLexMin(const IntegerPolyhedron &poly, void expectNoRationalLexMin(OptimumKind kind, const IntegerPolyhedron &poly) { ASSERT_NE(kind, OptimumKind::Bounded) << "Use expectRationalLexMin for bounded min"; - EXPECT_EQ(poly.getRationalLexMin().getKind(), kind); + EXPECT_EQ(poly.findRationalLexMin().getKind(), kind); } TEST(IntegerPolyhedronTest, getRationalLexMin) { @@ -1164,7 +1164,7 @@ TEST(IntegerPolyhedronTest, getRationalLexMin) { } void expectIntegerLexMin(const IntegerPolyhedron &poly, ArrayRef min) { - auto lexMin = poly.getIntegerLexMin(); + auto lexMin = poly.findIntegerLexMin(); ASSERT_TRUE(lexMin.isBounded()); EXPECT_EQ(ArrayRef(*lexMin), min); } @@ -1172,7 +1172,7 @@ void expectIntegerLexMin(const IntegerPolyhedron &poly, ArrayRef min) { void expectNoIntegerLexMin(OptimumKind kind, const IntegerPolyhedron &poly) { ASSERT_NE(kind, OptimumKind::Bounded) << "Use expectRationalLexMin for bounded min"; - EXPECT_EQ(poly.getRationalLexMin().getKind(), kind); + 
EXPECT_EQ(poly.findRationalLexMin().getKind(), kind); } TEST(IntegerPolyhedronTest, getIntegerLexMin) { From 01c0b4d51c50f2dc51e0dec96174939e52df87f2 Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Tue, 22 Feb 2022 14:04:21 +0300 Subject: [PATCH 485/748] [mlir][spirv] Fix SPIR-V spec parser Header class in SPIR-V HTML spec has changed. Update script to reflect that. Reviewed By: antiagainst Differential Revision: https://reviews.llvm.org/D120179 --- mlir/utils/spirv/gen_spirv_dialect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/utils/spirv/gen_spirv_dialect.py b/mlir/utils/spirv/gen_spirv_dialect.py index 72db3493c126c..0fc6f2f9d910e 100755 --- a/mlir/utils/spirv/gen_spirv_dialect.py +++ b/mlir/utils/spirv/gen_spirv_dialect.py @@ -59,7 +59,7 @@ def get_spirv_doc_from_html_spec(url, settings): # Ignore the first line, which is just the opname. doc[opname] = inst_html.text.split('\n', 1)[1].strip() else: - section_anchor = spirv.find('h3', {'id': '_a_id_instructions_a_instructions'}) + section_anchor = spirv.find('h3', {'id': '_instructions_3'}) for section in section_anchor.parent.find_all('div', {'class': 'sect3'}): for table in section.find_all('table'): inst_html = table.tbody.tr.td.p From 3c0096a1d45689389e11c385fd4ab98fdec80323 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 22 Feb 2022 12:25:30 +0100 Subject: [PATCH 486/748] [MergeICmps] Don't call comesBefore() if in different blocks (PR53959) Only call comesBefore() if the instructions are in the same block. Otherwise make a conservative assumption. Fixes https://github.com/llvm/llvm-project/issues/53959. 
--- llvm/lib/Transforms/Scalar/MergeICmps.cpp | 2 +- .../test/Transforms/MergeICmps/X86/pr53959.ll | 48 +++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/MergeICmps/X86/pr53959.ll diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index aac0deea5be3d..d38e362015ed7 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -244,7 +244,7 @@ bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, auto MayClobber = [&](LoadInst *LI) { // If a potentially clobbering instruction comes before the load, // we can still safely sink the load. - return !Inst->comesBefore(LI) && + return (Inst->getParent() != LI->getParent() || !Inst->comesBefore(LI)) && isModSet(AA.getModRefInfo(Inst, MemoryLocation::get(LI))); }; if (MayClobber(Cmp.Lhs.LoadI) || MayClobber(Cmp.Rhs.LoadI)) diff --git a/llvm/test/Transforms/MergeICmps/X86/pr53959.ll b/llvm/test/Transforms/MergeICmps/X86/pr53959.ll new file mode 100644 index 0000000000000..b7ba8bd39ba53 --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/pr53959.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mergeicmps < %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +@c = external global i32, align 4 + +define i1 @d() { +; CHECK-LABEL: @d( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[G:%.*]] = alloca [8 x i64], align 16 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[G]], i64 0, i64 0 +; CHECK-NEXT: [[V1:%.*]] = load i64, i64* [[IDX1]], align 8 +; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[G]], i64 0, i64 0 +; CHECK-NEXT: [[V2:%.*]] = load i64, i64* [[IDX2]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[V1]], [[V2]] +; CHECK-NEXT: br label [[SPLIT:%.*]] +; CHECK: split: +; CHECK-NEXT: [[X:%.*]] = load volatile i32, i32* @c, align 4 +; 
CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i1 [ false, [[SPLIT]] ], [ [[TOBOOL]], [[IF]] ] +; CHECK-NEXT: ret i1 [[P]] +; +entry: + %g = alloca [8 x i64], align 16 + %idx1 = getelementptr inbounds [8 x i64], [8 x i64]* %g, i64 0, i64 0 + %v1 = load i64, i64* %idx1, align 8 + %idx2 = getelementptr inbounds [8 x i64], [8 x i64]* %g, i64 0, i64 0 + %v2 = load i64, i64* %idx2, align 8 + %cmp = icmp eq i64 %v1, %v2 + br label %split + +split: + %x = load volatile i32, i32* @c, align 4 + br i1 %cmp, label %if, label %exit + +if: + %tobool = icmp ne i32 %x, 0 + br label %exit + +exit: + %p = phi i1 [ false, %split ], [ %tobool, %if ] + ret i1 %p +} From a2c267e0c9d9b9963f4022caf455327a7d96dfbf Mon Sep 17 00:00:00 2001 From: Ilya Nozhkin Date: Tue, 22 Feb 2022 12:48:32 +0100 Subject: [PATCH 487/748] [lldb] Fix race condition between lldb-vscode and stop hooks executor The race is between these two pieces of code that are executed in two separate lldb-vscode threads (the first is in the main thread and another is in the event-handling thread): ``` // lldb-vscode.cpp g_vsc.debugger.SetAsync(false); g_vsc.target.Launch(launch_info, error); g_vsc.debugger.SetAsync(true); ``` ``` // Target.cpp bool old_async = debugger.GetAsyncExecution(); debugger.SetAsyncExecution(true); debugger.GetCommandInterpreter().HandleCommands(GetCommands(), exc_ctx, options, result); debugger.SetAsyncExecution(old_async); ``` The sequence that leads to the bug is this one: 1. Main thread enables synchronous mode and launches the process. 2. When the process is launched, it generates the first stop event. 3. This stop event is catched by the event-handling thread and DoOnRemoval is invoked. 4. Inside DoOnRemoval, this thread runs stop hooks. 
And before running stop hooks, the current synchronization mode is stored into old_async (and right now it is equal to "false"). 5. The main thread finishes the launch and returns to lldb-vscode, the synchronization mode is restored to asynchronous by lldb-vscode. 6. Event-handling thread finishes stop hooks processing and restores the synchronization mode according to old_async (i.e. makes the mode synchronous) 7. And now the mode is synchronous while lldb-vscode expects it to be asynchronous. Synchronous mode forbids the process to broadcast public stop events, so, VS Code just hangs because lldb-vscode doesn't notify it about stops. So, this diff makes the target intercept the first stop event if the process is launched in the synchronous mode, thus preventing stop hooks execution. The bug is only present on Windows because other platforms already intercept this event using their own hijacking listeners. So, this diff also fixes some problems with lldb-vscode tests on Windows to make it possible to run the related test. Other tests still can't be enabled because the debugged program prints something into stdout and LLDB can't intercept this output and redirect it to lldb-vscode properly. 
Reviewed By: jingham Differential Revision: https://reviews.llvm.org/D119548 --- lldb/include/lldb/Target/Process.h | 3 + .../Python/lldbsuite/test/lldbtest.py | 5 + .../tools/lldb-vscode/lldbvscode_testcase.py | 4 +- .../Plugins/Platform/POSIX/PlatformPOSIX.cpp | 24 +-- .../Platform/QemuUser/PlatformQemuUser.cpp | 10 +- .../Platform/Windows/PlatformWindows.cpp | 22 +- .../gdb-server/PlatformRemoteGDBServer.cpp | 2 + lldb/source/Target/Process.cpp | 188 ++++++++++-------- lldb/source/Target/Target.cpp | 46 +++-- .../API/tools/lldb-vscode/stop-hooks/Makefile | 3 + .../stop-hooks/TestVSCode_stop_hooks.py | 35 ++++ .../API/tools/lldb-vscode/stop-hooks/main.c | 1 + 12 files changed, 205 insertions(+), 138 deletions(-) create mode 100644 lldb/test/API/tools/lldb-vscode/stop-hooks/Makefile create mode 100644 lldb/test/API/tools/lldb-vscode/stop-hooks/TestVSCode_stop_hooks.py create mode 100644 lldb/test/API/tools/lldb-vscode/stop-hooks/main.c diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 7911dac40b705..adceec619ff04 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -3073,6 +3073,9 @@ void PruneThreadPlans(); void ControlPrivateStateThread(uint32_t signal); + Status LaunchPrivate(ProcessLaunchInfo &launch_info, lldb::StateType &state, + lldb::EventSP &event_sp); + Process(const Process &) = delete; const Process &operator=(const Process &) = delete; }; diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 950dd41666fd6..98b0922e9cfe2 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -231,6 +231,11 @@ def pointer_size(): def is_exe(fpath): """Returns true if fpath is an executable.""" + if fpath == None: + return False + if sys.platform == 'win32': + if not fpath.endswith(".exe"): + fpath += ".exe" return os.path.isfile(fpath) and os.access(fpath, os.X_OK) 
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py index 255a4805a9737..b0fb17ffa9719 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-vscode/lldbvscode_testcase.py @@ -11,8 +11,8 @@ class VSCodeTestCaseBase(TestBase): def create_debug_adaptor(self, lldbVSCodeEnv=None): '''Create the Visual Studio Code debug adaptor''' - self.assertTrue(os.path.exists(self.lldbVSCodeExec), - 'lldb-vscode must exist') + self.assertTrue(is_exe(self.lldbVSCodeExec), + 'lldb-vscode must exist and be executable') log_file_path = self.getBuildArtifact('vscode.txt') self.vscode = vscode.DebugAdaptor( executable=self.lldbVSCodeExec, init_commands=self.setUpCommands(), diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index 177332b6cbc96..a25fd1f2678eb 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -443,9 +443,8 @@ lldb::ProcessSP PlatformPOSIX::DebugProcess(ProcessLaunchInfo &launch_info, // Now create the gdb-remote process. LLDB_LOG(log, "having target create process with gdb-remote plugin"); - process_sp = - target.CreateProcess(launch_info.GetListener(), "gdb-remote", nullptr, - true); + process_sp = target.CreateProcess(launch_info.GetListener(), "gdb-remote", + nullptr, true); if (!process_sp) { error.SetErrorString("CreateProcess() failed for gdb-remote process"); @@ -454,15 +453,8 @@ lldb::ProcessSP PlatformPOSIX::DebugProcess(ProcessLaunchInfo &launch_info, } LLDB_LOG(log, "successfully created process"); - // Adjust launch for a hijacker. 
- ListenerSP listener_sp; - if (!launch_info.GetHijackListener()) { - LLDB_LOG(log, "setting up hijacker"); - listener_sp = - Listener::MakeListener("lldb.PlatformLinux.DebugProcess.hijack"); - launch_info.SetHijackListener(listener_sp); - process_sp->HijackProcessEvents(listener_sp); - } + + process_sp->HijackProcessEvents(launch_info.GetHijackListener()); // Log file actions. if (log) { @@ -480,14 +472,6 @@ lldb::ProcessSP PlatformPOSIX::DebugProcess(ProcessLaunchInfo &launch_info, // Do the launch. error = process_sp->Launch(launch_info); if (error.Success()) { - // Handle the hijacking of process events. - if (listener_sp) { - const StateType state = process_sp->WaitForProcessToStop( - llvm::None, nullptr, false, listener_sp); - - LLDB_LOG(log, "pid {0} state {0}", process_sp->GetID(), state); - } - // Hook up process PTY if we have one (which we should for local debugging // with llgs). int pty_fd = launch_info.GetPTY().ReleasePrimaryFileDescriptor(); diff --git a/lldb/source/Plugins/Platform/QemuUser/PlatformQemuUser.cpp b/lldb/source/Plugins/Platform/QemuUser/PlatformQemuUser.cpp index 6e49faee4da7a..927daed08d34b 100644 --- a/lldb/source/Plugins/Platform/QemuUser/PlatformQemuUser.cpp +++ b/lldb/source/Plugins/Platform/QemuUser/PlatformQemuUser.cpp @@ -217,11 +217,12 @@ lldb::ProcessSP PlatformQemuUser::DebugProcess(ProcessLaunchInfo &launch_info, launch_info.GetListener(), process_gdb_remote::ProcessGDBRemote::GetPluginNameStatic(), nullptr, true); + if (!process_sp) { + error.SetErrorString("Failed to create GDB process"); + return nullptr; + } - ListenerSP listener_sp = - Listener::MakeListener("lldb.platform_qemu_user.debugprocess"); - launch_info.SetHijackListener(listener_sp); - Process::ProcessEventHijacker hijacker(*process_sp, listener_sp); + process_sp->HijackProcessEvents(launch_info.GetHijackListener()); error = process_sp->ConnectRemote(("unix-connect://" + socket_path).str()); if (error.Fail()) @@ -232,7 +233,6 @@ lldb::ProcessSP 
PlatformQemuUser::DebugProcess(ProcessLaunchInfo &launch_info, process_sp->SetSTDIOFileDescriptor( launch_info.GetPTY().ReleasePrimaryFileDescriptor()); - process_sp->WaitForProcessToStop(llvm::None, nullptr, false, listener_sp); return process_sp; } diff --git a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp index 220a153f5c3b6..708268ff900bc 100644 --- a/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp +++ b/lldb/source/Plugins/Platform/Windows/PlatformWindows.cpp @@ -488,18 +488,20 @@ ProcessSP PlatformWindows::DebugProcess(ProcessLaunchInfo &launch_info, // This is a process attach. Don't need to launch anything. ProcessAttachInfo attach_info(launch_info); return Attach(attach_info, debugger, &target, error); - } else { - ProcessSP process_sp = target.CreateProcess( - launch_info.GetListener(), launch_info.GetProcessPluginName(), nullptr, - false); + } - // We need to launch and attach to the process. - launch_info.GetFlags().Set(eLaunchFlagDebug); - if (process_sp) - error = process_sp->Launch(launch_info); + ProcessSP process_sp = + target.CreateProcess(launch_info.GetListener(), + launch_info.GetProcessPluginName(), nullptr, false); - return process_sp; - } + process_sp->HijackProcessEvents(launch_info.GetHijackListener()); + + // We need to launch and attach to the process. 
+ launch_info.GetFlags().Set(eLaunchFlagDebug); + if (process_sp) + error = process_sp->Launch(launch_info); + + return process_sp; } lldb::ProcessSP PlatformWindows::Attach(ProcessAttachInfo &attach_info, diff --git a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp index 219d7c7e37b27..2438661886291 100644 --- a/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp +++ b/lldb/source/Plugins/Platform/gdb-server/PlatformRemoteGDBServer.cpp @@ -428,6 +428,8 @@ PlatformRemoteGDBServer::DebugProcess(ProcessLaunchInfo &launch_info, "gdb-remote", nullptr, true); if (process_sp) { + process_sp->HijackProcessEvents(launch_info.GetHijackListener()); + error = process_sp->ConnectRemote(connect_url.c_str()); // Retry the connect remote one time... if (error.Fail()) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 8e1857ce4bfa9..c70bfcfc448de 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -2429,6 +2429,39 @@ void Process::LoadOperatingSystemPlugin(bool flush) { } Status Process::Launch(ProcessLaunchInfo &launch_info) { + StateType state_after_launch = eStateInvalid; + EventSP first_stop_event_sp; + Status status = + LaunchPrivate(launch_info, state_after_launch, first_stop_event_sp); + if (status.Fail()) + return status; + + if (state_after_launch != eStateStopped && + state_after_launch != eStateCrashed) + return Status(); + + // Note, the stop event was consumed above, but not handled. This + // was done to give DidLaunch a chance to run. The target is either + // stopped or crashed. Directly set the state. This is done to + // prevent a stop message with a bunch of spurious output on thread + // status, as well as not pop a ProcessIOHandler. 
+ SetPublicState(state_after_launch, false); + + if (PrivateStateThreadIsValid()) + ResumePrivateStateThread(); + else + StartPrivateStateThread(); + + // Target was stopped at entry as was intended. Need to notify the + // listeners about it. + if (launch_info.GetFlags().Test(eLaunchFlagStopAtEntry)) + HandlePrivateEvent(first_stop_event_sp); + + return Status(); +} + +Status Process::LaunchPrivate(ProcessLaunchInfo &launch_info, StateType &state, + EventSP &event_sp) { Status error; m_abi_sp.reset(); m_dyld_up.reset(); @@ -2445,7 +2478,7 @@ Status Process::Launch(ProcessLaunchInfo &launch_info) { // be a way to express this path, without actually having a module. // The way to do that is to set the ExecutableFile in the LaunchInfo. // Figure that out here: - + FileSpec exe_spec_to_use; if (!exe_module) { if (!launch_info.GetExecutableFile()) { @@ -2455,7 +2488,7 @@ Status Process::Launch(ProcessLaunchInfo &launch_info) { exe_spec_to_use = launch_info.GetExecutableFile(); } else exe_spec_to_use = exe_module->GetFileSpec(); - + if (exe_module && FileSystem::Instance().Exists(exe_module->GetFileSpec())) { // Install anything that might need to be installed prior to launching. // For host systems, this will do nothing, but if we are connected to a @@ -2464,6 +2497,7 @@ Status Process::Launch(ProcessLaunchInfo &launch_info) { if (error.Fail()) return error; } + // Listen and queue events that are broadcasted during the process launch. 
ListenerSP listener_sp(Listener::MakeListener("LaunchEventHijack")); HijackProcessEvents(listener_sp); @@ -2473,93 +2507,81 @@ Status Process::Launch(ProcessLaunchInfo &launch_info) { PausePrivateStateThread(); error = WillLaunch(exe_module); - if (error.Success()) { - const bool restarted = false; - SetPublicState(eStateLaunching, restarted); - m_should_detach = false; + if (error.Fail()) { + std::string local_exec_file_path = exe_spec_to_use.GetPath(); + return Status("file doesn't exist: '%s'", local_exec_file_path.c_str()); + } - if (m_public_run_lock.TrySetRunning()) { - // Now launch using these arguments. - error = DoLaunch(exe_module, launch_info); - } else { - // This shouldn't happen - error.SetErrorString("failed to acquire process run lock"); - } + const bool restarted = false; + SetPublicState(eStateLaunching, restarted); + m_should_detach = false; - if (error.Fail()) { - if (GetID() != LLDB_INVALID_PROCESS_ID) { - SetID(LLDB_INVALID_PROCESS_ID); - const char *error_string = error.AsCString(); - if (error_string == nullptr) - error_string = "launch failed"; - SetExitStatus(-1, error_string); - } - } else { - EventSP event_sp; + if (m_public_run_lock.TrySetRunning()) { + // Now launch using these arguments. + error = DoLaunch(exe_module, launch_info); + } else { + // This shouldn't happen + error.SetErrorString("failed to acquire process run lock"); + } - // Now wait for the process to launch and return control to us, and then - // call DidLaunch: - StateType state = WaitForProcessStopPrivate(event_sp, seconds(10)); - - if (state == eStateInvalid || !event_sp) { - // We were able to launch the process, but we failed to catch the - // initial stop. 
- error.SetErrorString("failed to catch stop after launch"); - SetExitStatus(0, "failed to catch stop after launch"); - Destroy(false); - } else if (state == eStateStopped || state == eStateCrashed) { - DidLaunch(); - - DynamicLoader *dyld = GetDynamicLoader(); - if (dyld) - dyld->DidLaunch(); - - GetJITLoaders().DidLaunch(); - - SystemRuntime *system_runtime = GetSystemRuntime(); - if (system_runtime) - system_runtime->DidLaunch(); - - if (!m_os_up) - LoadOperatingSystemPlugin(false); - - // We successfully launched the process and stopped, now it the - // right time to set up signal filters before resuming. - UpdateAutomaticSignalFiltering(); - - // Note, the stop event was consumed above, but not handled. This - // was done to give DidLaunch a chance to run. The target is either - // stopped or crashed. Directly set the state. This is done to - // prevent a stop message with a bunch of spurious output on thread - // status, as well as not pop a ProcessIOHandler. - // We are done with the launch hijack listener, and this stop should - // go to the public state listener: - RestoreProcessEvents(); - SetPublicState(state, false); - - if (PrivateStateThreadIsValid()) - ResumePrivateStateThread(); - else - StartPrivateStateThread(); - - // Target was stopped at entry as was intended. Need to notify the - // listeners about it. - if (state == eStateStopped && - launch_info.GetFlags().Test(eLaunchFlagStopAtEntry)) - HandlePrivateEvent(event_sp); - } else if (state == eStateExited) { - // We exited while trying to launch somehow. Don't call DidLaunch - // as that's not likely to work, and return an invalid pid. 
- HandlePrivateEvent(event_sp); - } + if (error.Fail()) { + if (GetID() != LLDB_INVALID_PROCESS_ID) { + SetID(LLDB_INVALID_PROCESS_ID); + const char *error_string = error.AsCString(); + if (error_string == nullptr) + error_string = "launch failed"; + SetExitStatus(-1, error_string); } - } else { - std::string local_exec_file_path = exe_spec_to_use.GetPath(); - error.SetErrorStringWithFormat("file doesn't exist: '%s'", - local_exec_file_path.c_str()); + return error; } - return error; + // Now wait for the process to launch and return control to us, and then + // call DidLaunch: + state = WaitForProcessStopPrivate(event_sp, seconds(10)); + + if (state == eStateInvalid || !event_sp) { + // We were able to launch the process, but we failed to catch the + // initial stop. + error.SetErrorString("failed to catch stop after launch"); + SetExitStatus(0, error.AsCString()); + Destroy(false); + return error; + } + + if (state == eStateExited) { + // We exited while trying to launch somehow. Don't call DidLaunch + // as that's not likely to work, and return an invalid pid. + HandlePrivateEvent(event_sp); + return Status(); + } + + if (state == eStateStopped || state == eStateCrashed) { + DidLaunch(); + + DynamicLoader *dyld = GetDynamicLoader(); + if (dyld) + dyld->DidLaunch(); + + GetJITLoaders().DidLaunch(); + + SystemRuntime *system_runtime = GetSystemRuntime(); + if (system_runtime) + system_runtime->DidLaunch(); + + if (!m_os_up) + LoadOperatingSystemPlugin(false); + + // We successfully launched the process and stopped, now it the + // right time to set up signal filters before resuming. 
+ UpdateAutomaticSignalFiltering(); + return Status(); + } + + return Status("Unexpected process state after the launch: %s, expected %s, " + "%s, %s or %s", + StateAsCString(state), StateAsCString(eStateInvalid), + StateAsCString(eStateExited), StateAsCString(eStateStopped), + StateAsCString(eStateCrashed)); } Status Process::LoadCore() { diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index d7023a159046c..298db3bca6803 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3026,6 +3026,14 @@ Status Target::Launch(ProcessLaunchInfo &launch_info, Stream *stream) { if (!launch_info.GetArchitecture().IsValid()) launch_info.GetArchitecture() = GetArchitecture(); + // Hijacking events of the process to be created to be sure that all events + // until the first stop are intercepted (in case if platform doesn't define + // its own hijacking listener or if the process is created by the target + // manually, without the platform). + if (!launch_info.GetHijackListener()) + launch_info.SetHijackListener( + Listener::MakeListener("lldb.Target.Launch.hijack")); + // If we're not already connected to the process, and if we have a platform // that can launch a process for debugging, go ahead and do that here. if (state != eStateConnected && platform_sp && @@ -3057,8 +3065,10 @@ Status Target::Launch(ProcessLaunchInfo &launch_info, Stream *stream) { } // Since we didn't have a platform launch the process, launch it here. 
- if (m_process_sp) + if (m_process_sp) { + m_process_sp->HijackProcessEvents(launch_info.GetHijackListener()); error = m_process_sp->Launch(launch_info); + } } if (!m_process_sp && error.Success()) @@ -3067,35 +3077,35 @@ Status Target::Launch(ProcessLaunchInfo &launch_info, Stream *stream) { if (!error.Success()) return error; - auto at_exit = - llvm::make_scope_exit([&]() { m_process_sp->RestoreProcessEvents(); }); + bool rebroadcast_first_stop = + !synchronous_execution && + launch_info.GetFlags().Test(eLaunchFlagStopAtEntry); - if (!synchronous_execution && - launch_info.GetFlags().Test(eLaunchFlagStopAtEntry)) - return error; + assert(launch_info.GetHijackListener()); + + EventSP first_stop_event_sp; + state = m_process_sp->WaitForProcessToStop(llvm::None, &first_stop_event_sp, + rebroadcast_first_stop, + launch_info.GetHijackListener()); + m_process_sp->RestoreProcessEvents(); - ListenerSP hijack_listener_sp(launch_info.GetHijackListener()); - if (!hijack_listener_sp) { - hijack_listener_sp = Listener::MakeListener("lldb.Target.Launch.hijack"); - launch_info.SetHijackListener(hijack_listener_sp); - m_process_sp->HijackProcessEvents(hijack_listener_sp); + if (rebroadcast_first_stop) { + assert(first_stop_event_sp); + m_process_sp->BroadcastEvent(first_stop_event_sp); + return error; } - switch (m_process_sp->WaitForProcessToStop(llvm::None, nullptr, false, - hijack_listener_sp, nullptr)) { + switch (state) { case eStateStopped: { if (launch_info.GetFlags().Test(eLaunchFlagStopAtEntry)) break; - if (synchronous_execution) { + if (synchronous_execution) // Now we have handled the stop-from-attach, and we are just // switching to a synchronous resume. So we should switch to the // SyncResume hijacker. 
- m_process_sp->RestoreProcessEvents(); m_process_sp->ResumeSynchronous(stream); - } else { - m_process_sp->RestoreProcessEvents(); + else error = m_process_sp->PrivateResume(); - } if (!error.Success()) { Status error2; error2.SetErrorStringWithFormat( diff --git a/lldb/test/API/tools/lldb-vscode/stop-hooks/Makefile b/lldb/test/API/tools/lldb-vscode/stop-hooks/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/tools/lldb-vscode/stop-hooks/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-vscode/stop-hooks/TestVSCode_stop_hooks.py b/lldb/test/API/tools/lldb-vscode/stop-hooks/TestVSCode_stop_hooks.py new file mode 100644 index 0000000000000..c6be1a013a80a --- /dev/null +++ b/lldb/test/API/tools/lldb-vscode/stop-hooks/TestVSCode_stop_hooks.py @@ -0,0 +1,35 @@ +""" +Test stop hooks +""" + + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbvscode_testcase + + +class TestVSCode_stop_hooks(lldbvscode_testcase.VSCodeTestCaseBase): + + mydir = TestBase.compute_mydir(__file__) + + @skipIfRemote + def test_stop_hooks_before_run(self): + ''' + Test that there is no race condition between lldb-vscode and + stop hooks executor + ''' + program = self.getBuildArtifact("a.out") + preRunCommands = ['target stop-hook add -o help'] + self.build_and_launch(program, stopOnEntry=True, preRunCommands=preRunCommands) + + # The first stop is on entry. + self.continue_to_next_stop() + + breakpoint_ids = self.set_function_breakpoints(['main']) + # This request hangs if the race happens, because, in that case, the + # command interpreter is in synchronous mode while lldb-vscode expects + # it to be in asynchronous mode, so, the process doesn't send the stop + # event to "lldb.Debugger" listener (which is monitored by lldb-vscode). 
+ self.continue_to_breakpoints(breakpoint_ids) + + self.continue_to_exit() diff --git a/lldb/test/API/tools/lldb-vscode/stop-hooks/main.c b/lldb/test/API/tools/lldb-vscode/stop-hooks/main.c new file mode 100644 index 0000000000000..76e8197013aab --- /dev/null +++ b/lldb/test/API/tools/lldb-vscode/stop-hooks/main.c @@ -0,0 +1 @@ +int main() { return 0; } From 12fb133eba819cac8c14ac1888ea1e460e45d17a Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 22 Feb 2022 10:49:34 +0000 Subject: [PATCH 488/748] [LoopVectorize] Support conditional in-loop vector reductions Extends getReductionOpChain to look through Phis which may be part of the reduction chain. adjustRecipesForReductions will now also create a CondOp for VPReductionRecipe if the block is predicated and not only if foldTailByMasking is true. Changes were required in tryToBlend to ensure that we don't attempt to convert the reduction Phi into a select by returning a VPBlendRecipe. The VPReductionRecipe will create a select between the Phi and the reduction. 
Reviewed By: david-arm Differential Revision: https://reviews.llvm.org/D117580 --- llvm/lib/Analysis/IVDescriptors.cpp | 59 +- .../Transforms/Vectorize/LoopVectorize.cpp | 21 +- .../AArch64/scalable-reduction-inloop-cond.ll | 186 +++++ .../LoopVectorize/AArch64/sve-tail-folding.ll | 46 +- .../LoopVectorize/reduction-inloop-cond.ll | 729 ++++++++++++++++++ .../LoopVectorize/reduction-inloop-uf4.ll | 258 +++++++ 6 files changed, 1257 insertions(+), 42 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll create mode 100644 llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 6399c75082b21..11bbac3f89b6d 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -1058,7 +1058,7 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { // to check for a pair of icmp/select, for which we use getNextInstruction and // isCorrectOpcode functions to step the right number of instruction, and // check the icmp/select pair. - // FIXME: We also do not attempt to look through Phi/Select's yet, which might + // FIXME: We also do not attempt to look through Select's yet, which might // be part of the reduction chain, or attempt to looks through And's to find a // smaller bitwidth. Subs are also currently not allowed (which are usually // treated as part of a add reduction) as they are expected to generally be @@ -1068,16 +1068,21 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) ExpectedUses = 2; - auto getNextInstruction = [&](Instruction *Cur) { - if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) { - // We are expecting a icmp/select pair, which we go to the next select - // instruction if we can. We already know that Cur has 2 uses. 
- if (isa(*Cur->user_begin())) - return cast(*Cur->user_begin()); - else - return cast(*std::next(Cur->user_begin())); + auto getNextInstruction = [&](Instruction *Cur) -> Instruction * { + for (auto User : Cur->users()) { + Instruction *UI = cast(User); + if (isa(UI)) + continue; + if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) { + // We are expecting a icmp/select pair, which we go to the next select + // instruction if we can. We already know that Cur has 2 uses. + if (isa(UI)) + return UI; + continue; + } + return UI; } - return cast(*Cur->user_begin()); + return nullptr; }; auto isCorrectOpcode = [&](Instruction *Cur) { if (RedOp == Instruction::ICmp || RedOp == Instruction::FCmp) { @@ -1092,22 +1097,46 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const { return Cur->getOpcode() == RedOp; }; + // Attempt to look through Phis which are part of the reduction chain + unsigned ExtraPhiUses = 0; + Instruction *RdxInstr = LoopExitInstr; + if (auto ExitPhi = dyn_cast(LoopExitInstr)) { + if (ExitPhi->getNumIncomingValues() != 2) + return {}; + + Instruction *Inc0 = dyn_cast(ExitPhi->getIncomingValue(0)); + Instruction *Inc1 = dyn_cast(ExitPhi->getIncomingValue(1)); + + Instruction *Chain = nullptr; + if (Inc0 == Phi) + Chain = Inc1; + else if (Inc1 == Phi) + Chain = Inc0; + else + return {}; + + RdxInstr = Chain; + ExtraPhiUses = 1; + } + // The loop exit instruction we check first (as a quick test) but add last. We // check the opcode is correct (and dont allow them to be Subs) and that they // have expected to have the expected number of uses. They will have one use // from the phi and one from a LCSSA value, no matter the type. - if (!isCorrectOpcode(LoopExitInstr) || !LoopExitInstr->hasNUses(2)) + if (!isCorrectOpcode(RdxInstr) || !LoopExitInstr->hasNUses(2)) return {}; - // Check that the Phi has one (or two for min/max) uses. 
- if (!Phi->hasNUses(ExpectedUses)) + // Check that the Phi has one (or two for min/max) uses, plus an extra use + // for conditional reductions. + if (!Phi->hasNUses(ExpectedUses + ExtraPhiUses)) return {}; + Instruction *Cur = getNextInstruction(Phi); // Each other instruction in the chain should have the expected number of uses // and be the correct opcode. - while (Cur != LoopExitInstr) { - if (!isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses)) + while (Cur != RdxInstr) { + if (!Cur || !isCorrectOpcode(Cur) || !Cur->hasNUses(ExpectedUses)) return {}; ReductionOperations.push_back(Cur); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index bef39a56d9f25..da14d78647823 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8593,13 +8593,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, return Operands[0]; } + unsigned NumIncoming = Phi->getNumIncomingValues(); + // For in-loop reductions, we do not need to create an additional select. + VPValue *InLoopVal = nullptr; + for (unsigned In = 0; In < NumIncoming; In++) { + PHINode *PhiOp = + dyn_cast_or_null(Operands[In]->getUnderlyingValue()); + if (PhiOp && CM.isInLoopReduction(PhiOp)) { + assert(!InLoopVal && "Found more than one in-loop reduction!"); + InLoopVal = Operands[In]; + } + } + + assert((!InLoopVal || NumIncoming == 2) && + "Found an in-loop reduction for PHI with unexpected number of " + "incoming values"); + if (InLoopVal) + return Operands[Operands[0] == InLoopVal ? 1 : 0]; + // We know that all PHIs in non-header blocks are converted into selects, so // we don't have to worry about the insertion order and we can just use the // builder. At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. 
SmallVector OperandsWithMask; - unsigned NumIncoming = Phi->getNumIncomingValues(); for (unsigned In = 0; In < NumIncoming; In++) { VPValue *EdgeMask = @@ -9423,7 +9440,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); - auto *CondOp = CM.foldTailByMasking() + auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) : nullptr; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll new file mode 100644 index 0000000000000..d6f73c8e9f5ef --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -S | FileCheck %s + +define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){ +; CHECK-LABEL: @cond_fadd( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: 
[[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp une [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 2.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP11]], i32 4, [[TMP8]], poison) +; CHECK-NEXT: [[TMP12:%.*]] = select fast [[TMP8]], [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP12]]) +; CHECK-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: 
for.body: +; CHECK-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[INDVARS]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 2.000000e+00 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS]] +; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP19]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[RES]] = phi float [ [[FADD]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RES_LCSSA]] +; +entry: + br label %for.body + +for.body: + %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.inc ] + %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ] + %arrayidx = getelementptr inbounds float, float* %cond, i64 %indvars + %0 = load float, float* %arrayidx + %tobool = fcmp une float %0, 2.000000e+00 + br i1 %tobool, label %if.then, label %for.inc + +if.then: + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars + %1 = load float, float* %arrayidx2 + %fadd = fadd fast float %rdx, %1 + br label %for.inc + +for.inc: + %res = phi float [ %fadd, %if.then ], [ %rdx, %for.body ] + %indvars.next = add nuw nsw 
i64 %indvars, 1 + %exitcond.not = icmp eq i64 %indvars.next, %N + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %res +} + +define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) { +; CHECK-LABEL: @cond_cmp_sel( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fcmp une [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP11]], i32 4, [[TMP8]], poison) +; CHECK-NEXT: [[TMP12:%.*]] = select fast [[TMP8]], 
[[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float 0xFFF0000000000000, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP12]]) +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP13]], float [[VEC_PHI]] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 3.000000e+00 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[FCMP:%.*]] = fcmp fast olt float [[RDX]], 
[[TMP18]] +; CHECK-NEXT: [[FSEL:%.*]] = select fast i1 [[FCMP]], float [[RDX]], float [[TMP18]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[FSEL]], [[IF_THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RES_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi float [ %res, %for.inc ], [ 1.000000e+00, %entry ] + %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv + %0 = load float, float* %arrayidx + %tobool = fcmp une float %0, 3.000000e+00 + br i1 %tobool, label %if.then, label %for.inc + +if.then: + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv + %1 = load float, float* %arrayidx2 + %fcmp = fcmp fast olt float %rdx, %1 + %fsel = select fast i1 %fcmp, float %rdx, float %1 + br label %for.inc + +for.inc: + %res = phi float [ %rdx, %for.body ], [ %fsel, %if.then ] + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %res +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index f72c6cbf378ce..8ca671854fccb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -590,33 +590,29 @@ define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 { ; CHECK-NEXT: br label 
%vector.body ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 7, i32 0), %vector.ph ], [ [[PREDPHI:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[N]]) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP16]], i32 4, [[TMP14]], poison) -; CHECK-NEXT: [[TMP17:%.*]] = xor [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP18:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], zeroinitializer -; CHECK-NEXT: [[PREDPHI]] = select [[TMP14]], [[TMP17]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, %vector.ph ], [ [[TMP16:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]]) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP8]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 5, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP13]], i32 4, [[TMP11]], poison) +; CHECK-NEXT: [[TMP14:%.*]] = select [[TMP11]], [[WIDE_MASKED_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[TMP16]] = xor i32 [[TMP15]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( 
[[TMP20]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label %scalar.ph -; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll new file mode 100644 index 0000000000000..8be98ebf087d5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll @@ -0,0 +1,729 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s + +define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){ +; CHECK-LABEL: @cond_fadd( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; CHECK-NEXT: br 
label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP11]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP17]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x float> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> 
[[TMP19]], float [[TMP23]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP26:%.*]] = select fast <4 x i1> [[TMP2]], <4 x float> [[TMP25]], <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP27]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP26]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP29]], 5.000000e+00 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[FADD:%.*]] = fadd fast float [[RDX]], [[TMP30]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: 
[[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[FADD]], [[IF_THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RES_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ] + %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv + %0 = load float, float* %arrayidx + %tobool = fcmp une float %0, 5.000000e+00 + br i1 %tobool, label %if.then, label %for.inc + +if.then: + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv + %1 = load float, float* %arrayidx2 + %fadd = fadd fast float %rdx, %1 + br label %for.inc + +for.inc: + %res = phi float [ %rdx, %for.body ], [ %fadd, %if.then ] + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret float %res +} + +define float @cond_cmp_sel(float* noalias %a, float* noalias %cond, i64 %N) { +; CHECK-LABEL: @cond_cmp_sel( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = 
bitcast float* [[TMP0]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP11]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x float> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP17]], i64 2 +; CHECK-NEXT: br 
label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x float> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP23]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP26:%.*]] = select fast <4 x i1> [[TMP2]], <4 x float> [[TMP25]], <4 x float> +; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP26]]) +; CHECK-NEXT: [[TMP28]] = call fast float @llvm.minnum.f32(float [[TMP27]], float [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi float 
[ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP30]], 3.000000e+00 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = call fast float @llvm.minnum.f32(float [[RDX]], float [[TMP31]]) +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[RES]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP32]], [[IF_THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RES_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi float [ 1.000000e+00, %entry ], [ %res, %for.inc ] + %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv + %0 = load float, float* %arrayidx + %tobool = fcmp une float %0, 3.000000e+00 + br i1 %tobool, label %if.then, label %for.inc + +if.then: + %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv + %1 = load float, float* %arrayidx2 + %fcmp = fcmp fast olt float %rdx, %1 + %fsel = select fast i1 %fcmp, float %rdx, float %1 + br label %for.inc + +for.inc: + %res = phi float [ %rdx, %for.body ], [ %fsel, %if.then ] + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret float %res +} + 
+define i32 @conditional_and(i32* noalias %A, i32* noalias %B, i32 %cond, i64 noundef %N) #0 { +; CHECK-LABEL: @conditional_and( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: 
pred.load.if1: +; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP11]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP17]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP23]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP25]], <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = call i32 
@llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP26]]) +; CHECK-NEXT: [[TMP28]] = and i32 [[TMP27]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 7, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP30]], [[COND]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP31]], [[RDX]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[RES]] = phi i32 [ [[AND]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP28]], 
[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi i32 [ 7, %entry ], [ %res, %for.inc ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv + %0 = load i32, i32* %arrayidx + %tobool = icmp eq i32 %0, %cond + br i1 %tobool, label %if.then, label %for.inc + +if.then: + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %iv + %1 = load i32, i32* %arrayidx2 + %and = and i32 %1, %rdx + br label %for.inc + +for.inc: + %res = phi i32 [ %and, %if.then ], [ %rdx, %for.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret i32 %res +} + +define i32 @simple_chained_rdx(i32* noalias %a, i32* noalias %b, i32* noalias %cond, i64 noundef %N) { +; CHECK-LABEL: @simple_chained_rdx( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 5, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[PRED_LOAD_CONTINUE14]] ] +; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: br 
i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1 +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP13]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP18]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3 +; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr 
inbounds i32, i32* [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP23]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP25]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP26]]) +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; CHECK: pred.load.if7: +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP31]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; CHECK: pred.load.continue8: +; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP32]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1 +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK: pred.load.if9: +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; CHECK: pred.load.continue10: +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP37]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2 +; CHECK-NEXT: br i1 [[TMP39]], 
label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; CHECK: pred.load.continue12: +; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP42]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3 +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] +; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP46]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; CHECK: pred.load.continue14: +; CHECK-NEXT: [[TMP48:%.*]] = phi <4 x i32> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP47]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP48]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP49]]) +; CHECK-NEXT: [[TMP51]] = add i32 [[TMP50]], [[TMP28]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP51]], 
[[MIDDLE_BLOCK]] ], [ 5, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]] +; CHECK-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP53]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP54:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP54]], [[RDX]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP55:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[TMP55]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[RES]] = phi i32 [ [[ADD3]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ] + %rdx = phi i32 [ %res, %for.inc ], [ 5, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %iv + %0 = load i32, i32* %arrayidx + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: + %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %iv + %1 = load 
i32, i32* %arrayidx1 + %add = add nsw i32 %1, %rdx + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv + %2 = load i32, i32* %arrayidx2 + %add3 = add nsw i32 %add, %2 + br label %for.inc + +for.inc: + %res = phi i32 [ %add3, %if.then ], [ %rdx, %for.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret i32 %res +} + +; +; Negative Tests +; + +; +; Reduction not performed in loop as the phi has more than two incoming values +; +define i64 @nested_cond_and(i64* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, i64* noalias nocapture readonly %cond, i64 %N){ +; CHECK-LABEL: @nested_cond_and( +; CHECK: vector.body: +; CHECK-NOT: @llvm.vector.reduce.and +; CHECK: middle.block: +; CHECK: @llvm.vector.reduce.and +; CHECK: scalar.ph +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %rdx = phi i64 [ 5, %entry ], [ %res, %for.inc ] + %arrayidx = getelementptr inbounds i64, i64* %cond, i64 %iv + %0 = load i64, i64* %arrayidx + %tobool = icmp eq i64 %0, 0 + br i1 %tobool, label %if.then, label %for.inc + +if.then: + %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %iv + %1 = load i64, i64* %arrayidx2 + %and1 = and i64 %rdx, %1 + %tobool2 = icmp eq i64 %1, 3 + br i1 %tobool2, label %if.then.2, label %for.inc + +if.then.2: + %arrayidx3 = getelementptr inbounds i64, i64* %b, i64 %iv + %2 = load i64, i64* %arrayidx3 + %and2 = and i64 %rdx, %2 + br label %for.inc + +for.inc: + %res = phi i64 [ %and2, %if.then.2 ], [ %and1, %if.then ], [ %rdx, %for.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret i64 %res +} + +; Chain of conditional & unconditional reductions. We currently only support conditional reductions +; if they are the last in the chain, i.e. the loop exit instruction is a Phi node. 
Therefore we reject +; the Phi (%rdx1) as it has more than one use. +; +define i32 @cond-uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %n) #0 { +; CHECK-LABEL: @cond-uncond( +; CHECK: pred.load.continue6: +; CHECK-NOT: @llvm.vector.reduce.add +; CHECK: middle.block: +; CHECK: @llvm.vector.reduce.add +; CHECK: scalar.ph +entry: + br label %for.body + +for.body: + %rdx1 = phi i32 [ %add2, %if.end ], [ 0, %entry ] + %iv = phi i64 [ %iv.next, %if.end ], [ 0, %entry] + %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %iv + %0 = load i32, i32* %arrayidx + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %arrayidx1 = getelementptr inbounds i32, i32* %src2, i64 %iv + %1 = load i32, i32* %arrayidx1 + %add = add nsw i32 %1, %rdx1 + br label %if.end + +if.end: + %res = phi i32 [ %add, %if.then ], [ %rdx1, %for.body ] + %arrayidx2 = getelementptr inbounds i32, i32* %src1, i64 %iv + %2 = load i32, i32* %arrayidx2 + %add2 = add nsw i32 %2, %res + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret i32 %add2 +} + +; +; Chain of two conditional reductions. We do not vectorise this with in-loop reductions as neither +; of the incoming values of the LoopExitInstruction (%res) is the reduction Phi (%rdx1). 
+; +define float @cond_cond(float* noalias %src1, float* noalias %src2, float* noalias %cond, i64 %n) #0 { +; CHECK-LABEL: @cond_cond( +; CHECK: pred.load.continue14: +; CHECK-NOT: @llvm.vector.reduce.fadd +; CHECK: middle.block: +; CHECK: @llvm.vector.reduce.fadd +; CHECK: scalar.ph +entry: + br label %for.body + +for.body: + %rdx1 = phi float [ %res, %for.inc ], [ 2.000000e+00, %entry ] + %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %cond, i64 %iv + %0 = load float, float* %arrayidx + %cmp1 = fcmp fast oeq float %0, 3.000000e+00 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %arrayidx2 = getelementptr inbounds float, float* %src1, i64 %iv + %1 = load float, float* %arrayidx2 + %add = fadd fast float %1, %rdx1 + br label %if.end + +if.end: + %rdx2 = phi float [ %add, %if.then ], [ %rdx1, %for.body ] + %cmp5 = fcmp fast oeq float %0, 7.000000e+00 + br i1 %cmp5, label %if.then6, label %for.inc + +if.then6: + %arrayidx7 = getelementptr inbounds float, float* %src2, i64 %iv + %2 = load float, float* %arrayidx7 + %add2 = fadd fast float %2, %rdx2 + br label %for.inc + +for.inc: + %res = phi float [ %add2, %if.then6 ], [ %rdx2, %if.end ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret float %res +} + +; +; Chain of an unconditional & a conditional reduction. We do not vectorise this in-loop as neither of the +; incoming values of the LoopExitInstruction (%res) is the reduction Phi (%rdx). 
+; +define i32 @uncond_cond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 %N) #0 { +; CHECK-LABEL: @uncond_cond( +; CHECK: pred.load.continue7: +; CHECK-NOT: @llvm.vector.reduce.add +; CHECK: middle.block: +; CHECK: @llvm.vector.reduce.add +; CHECK: scalar.ph +entry: + br label %for.body + +for.body: + %rdx = phi i32 [ %res, %for.inc ], [ 0, %entry ] + %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %iv + %0 = load i32, i32* %arrayidx + %add1 = add nsw i32 %0, %rdx + %arrayidx1 = getelementptr inbounds i32, i32* %cond, i64 %iv + %1 = load i32, i32* %arrayidx1 + %tobool.not = icmp eq i32 %1, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: + %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %iv + %2 = load i32, i32* %arrayidx2 + %add2 = add nsw i32 %2, %add1 + br label %for.inc + +for.inc: + %res = phi i32 [ %add2, %if.then ], [ %add1, %for.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret i32 %res +} + +; +; Chain of multiple unconditional & conditional reductions. Does not vectorise in-loop as when we look back +; through the chain and check the number of uses of %add1, we find more than the expected one use. 
+; +define i32 @uncond_cond_uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %N) { +; CHECK-LABEL: @uncond_cond_uncond( +; CHECK: pred.load.continue7: +; CHECK-NOT: @llvm.vector.reduce.add +; CHECK: middle.block: +; CHECK: @llvm.vector.reduce.add +; CHECK: scalar.ph +entry: + br label %for.body + +for.body: + %rdx = phi i32 [ %add3, %if.end ], [ 0, %entry ] + %iv = phi i64 [ %iv.next, %if.end ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %src1, i64 %iv + %0 = load i32, i32* %arrayidx + %add1 = add nsw i32 %0, %rdx + %arrayidx1 = getelementptr inbounds i32, i32* %cond, i64 %iv + %1 = load i32, i32* %arrayidx1 + %tobool.not = icmp eq i32 %1, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %iv + %2 = load i32, i32* %arrayidx2 + %add2 = add nsw i32 %2, %add1 + br label %if.end + +if.end: + %res = phi i32 [ %add2, %if.then ], [ %add1, %for.body ] + %add3 = add nsw i32 %res, %0 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret i32 %add3 +} diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index f9cc573252902..c68f1ebc624fa 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -297,6 +297,264 @@ entry: ret i32 %sum.0.lcssa } +define i32 @cond_rdx_pred(i32 %cond, i32* noalias %a, i64 %N) { +; CHECK-LABEL: @cond_rdx_pred( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], -16 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> 
poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[COND:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[COND]], i64 0 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE44:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE44]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 4, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[PRED_LOAD_CONTINUE44]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP116:%.*]], [[PRED_LOAD_CONTINUE44]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP119:%.*]], [[PRED_LOAD_CONTINUE44]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP122:%.*]], [[PRED_LOAD_CONTINUE44]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT7]], +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x 
i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT9]], +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT11]], +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLATINSERT13]], +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP3]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP19]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP12]], i64 1 +; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK: pred.load.if15: +; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4 +; 
CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; CHECK: pred.load.continue16: +; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP12]], i64 2 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; CHECK: pred.load.if17: +; CHECK-NEXT: [[TMP28:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP30]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; CHECK: pred.load.continue18: +; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i32> [ [[TMP26]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP31]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP12]], i64 3 +; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; CHECK: pred.load.if19: +; CHECK-NEXT: [[TMP34:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP36]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; CHECK: pred.load.continue20: +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP37]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; CHECK: pred.load.if21: +; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP40]] +; CHECK-NEXT: 
[[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> poison, i32 [[TMP42]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] +; CHECK: pred.load.continue22: +; CHECK-NEXT: [[TMP44:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP43]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP13]], i64 1 +; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; CHECK: pred.load.if23: +; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = load i32, i32* [[TMP47]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> [[TMP44]], i32 [[TMP48]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] +; CHECK: pred.load.continue24: +; CHECK-NEXT: [[TMP50:%.*]] = phi <4 x i32> [ [[TMP44]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP49]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP13]], i64 2 +; CHECK-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK: pred.load.if25: +; CHECK-NEXT: [[TMP52:%.*]] = or i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i32> [[TMP50]], i32 [[TMP54]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; CHECK: pred.load.continue26: +; CHECK-NEXT: [[TMP56:%.*]] = phi <4 x i32> [ [[TMP50]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP55]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <4 x i1> [[TMP13]], i64 3 +; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK: pred.load.if27: +; CHECK-NEXT: [[TMP58:%.*]] = or i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds 
i32, i32* [[A]], i64 [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = load i32, i32* [[TMP59]], align 4 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i32> [[TMP56]], i32 [[TMP60]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; CHECK: pred.load.continue28: +; CHECK-NEXT: [[TMP62:%.*]] = phi <4 x i32> [ [[TMP56]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP61]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: br i1 [[TMP63]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK: pred.load.if29: +; CHECK-NEXT: [[TMP64:%.*]] = or i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* [[TMP65]], align 4 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; CHECK: pred.load.continue30: +; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP67]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP14]], i64 1 +; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK: pred.load.if31: +; CHECK-NEXT: [[TMP70:%.*]] = or i64 [[INDEX]], 9 +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = load i32, i32* [[TMP71]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP72]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] +; CHECK: pred.load.continue32: +; CHECK-NEXT: [[TMP74:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP73]], [[PRED_LOAD_IF31]] ] +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP14]], i64 2 +; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK: pred.load.if33: +; CHECK-NEXT: [[TMP76:%.*]] = or i64 [[INDEX]], 10 +; 
CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = load i32, i32* [[TMP77]], align 4 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP74]], i32 [[TMP78]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] +; CHECK: pred.load.continue34: +; CHECK-NEXT: [[TMP80:%.*]] = phi <4 x i32> [ [[TMP74]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP79]], [[PRED_LOAD_IF33]] ] +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <4 x i1> [[TMP14]], i64 3 +; CHECK-NEXT: br i1 [[TMP81]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] +; CHECK: pred.load.if35: +; CHECK-NEXT: [[TMP82:%.*]] = or i64 [[INDEX]], 11 +; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP82]] +; CHECK-NEXT: [[TMP84:%.*]] = load i32, i32* [[TMP83]], align 4 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP80]], i32 [[TMP84]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] +; CHECK: pred.load.continue36: +; CHECK-NEXT: [[TMP86:%.*]] = phi <4 x i32> [ [[TMP80]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP85]], [[PRED_LOAD_IF35]] ] +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <4 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] +; CHECK: pred.load.if37: +; CHECK-NEXT: [[TMP88:%.*]] = or i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = load i32, i32* [[TMP89]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = insertelement <4 x i32> poison, i32 [[TMP90]], i64 0 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] +; CHECK: pred.load.continue38: +; CHECK-NEXT: [[TMP92:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE36]] ], [ [[TMP91]], [[PRED_LOAD_IF37]] ] +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <4 x i1> [[TMP15]], i64 1 +; CHECK-NEXT: br i1 [[TMP93]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] +; CHECK: pred.load.if39: +; 
CHECK-NEXT: [[TMP94:%.*]] = or i64 [[INDEX]], 13 +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP96]], i64 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE40]] +; CHECK: pred.load.continue40: +; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP92]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP97]], [[PRED_LOAD_IF39]] ] +; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP15]], i64 2 +; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] +; CHECK: pred.load.if41: +; CHECK-NEXT: [[TMP100:%.*]] = or i64 [[INDEX]], 14 +; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP100]] +; CHECK-NEXT: [[TMP102:%.*]] = load i32, i32* [[TMP101]], align 4 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP102]], i64 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE42]] +; CHECK: pred.load.continue42: +; CHECK-NEXT: [[TMP104:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP103]], [[PRED_LOAD_IF41]] ] +; CHECK-NEXT: [[TMP105:%.*]] = extractelement <4 x i1> [[TMP15]], i64 3 +; CHECK-NEXT: br i1 [[TMP105]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44]] +; CHECK: pred.load.if43: +; CHECK-NEXT: [[TMP106:%.*]] = or i64 [[INDEX]], 15 +; CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP106]] +; CHECK-NEXT: [[TMP108:%.*]] = load i32, i32* [[TMP107]], align 4 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP104]], i32 [[TMP108]], i64 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE44]] +; CHECK: pred.load.continue44: +; CHECK-NEXT: [[TMP110:%.*]] = phi <4 x i32> [ [[TMP104]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP109]], [[PRED_LOAD_IF43]] ] +; CHECK-NEXT: [[TMP111:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP38]], <4 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = 
call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP111]]) +; CHECK-NEXT: [[TMP113]] = mul i32 [[TMP112]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP114:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> [[TMP62]], <4 x i32> +; CHECK-NEXT: [[TMP115:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP114]]) +; CHECK-NEXT: [[TMP116]] = mul i32 [[TMP115]], [[VEC_PHI4]] +; CHECK-NEXT: [[TMP117:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> [[TMP86]], <4 x i32> +; CHECK-NEXT: [[TMP118:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP117]]) +; CHECK-NEXT: [[TMP119]] = mul i32 [[TMP118]], [[VEC_PHI5]] +; CHECK-NEXT: [[TMP120:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP110]], <4 x i32> +; CHECK-NEXT: [[TMP121:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP120]]) +; CHECK-NEXT: [[TMP122]] = mul i32 [[TMP121]], [[VEC_PHI6]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP123:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP123]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP116]], [[TMP113]] +; CHECK-NEXT: [[BIN_RDX45:%.*]] = mul i32 [[TMP119]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX46:%.*]] = mul i32 [[TMP122]], [[BIN_RDX45]] +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ undef, [[FOR_INC]] ], [ [[BIN_RDX46]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_LCSSA]] +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %inc, 
%for.inc ], [ 0, %entry ] + %sum = phi i32 [ %res, %for.inc ], [ 4, %entry ] + %cmp1 = icmp sgt i32 %cond, 7 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv + %load = load i32, i32* %arrayidx + %mul = mul nsw i32 %load, %sum + br label %for.inc + +for.inc: + %res = phi i32 [ %mul, %if.then ], [ %sum, %for.body ] + %inc = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !6 + +for.end: + ret i32 %res +} + !6 = distinct !{!6, !7, !8} !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !8 = !{!"llvm.loop.vectorize.enable", i1 true} From 0b900073457497e19d4c55541dc557520c07b9ad Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 22 Feb 2022 11:52:45 +0000 Subject: [PATCH 489/748] [GISel] Silence 'sideeffect in assertion' coverity warnings. NFCI. Use llvm::enumerate to keep track of index. --- llvm/utils/TableGen/RegisterBankEmitter.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp index d97d7acb87a79..2cc8c0f548b2d 100644 --- a/llvm/utils/TableGen/RegisterBankEmitter.cpp +++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp @@ -266,9 +266,8 @@ void RegisterBankEmitter::emitBaseClassImplementation( << "::NumRegisterBanks) {\n" << " // Assert that RegBank indices match their ID's\n" << "#ifndef NDEBUG\n" - << " unsigned Index = 0;\n" - << " for (const auto &RB : RegBanks)\n" - << " assert(Index++ == RB->getID() && \"Index != ID\");\n" + << " for (auto RB : enumerate(RegBanks))\n" + << " assert(RB.index() == RB.value()->getID() && \"Index != ID\");\n" << "#endif // NDEBUG\n" << "}\n" << "} // end namespace llvm\n"; From 380ff31d831610389b9cd5574fdf91431e482171 Mon Sep 17 00:00:00 2001 From: Thomas Symalla <5754458+tsymalla@users.noreply.github.com> Date: Tue, 22 Feb 2022 13:27:26 +0100 Subject: [PATCH 
490/748] [AMDGPU] Fix typo in comment [NFC] This replaces "V_MOB_B32" with "V_MOV_B32" in some comment. --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a1eb80b1b762b..b1c95ca647791 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -893,7 +893,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { return false; // V_NOP will be discarded by SQ. - // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* + // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE* // which is always a VGPR and available. auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); Register Reg = Src0->getReg(); From f57627f544665de2b2e246226b0aa3e4a25491e5 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Tue, 22 Feb 2022 10:12:04 +0000 Subject: [PATCH 491/748] [Flang] Initial patch to lower a Fortran intrinsic This patch brings in some initial changes for lowering Fortran intrinsics. Intrinsics are generally lowered to a mix of FIR and MLIR operations, runtime calls or LLVM intrinsics. This patch particularly brings in the lowering of the Fortran `andi` intrinsic to `arith.andi` in MLIR. The significant changes are in ConvertExpr.cpp and IntrinsicCall.cpp. Intrinsic functions occur as part of expressions. Lowering deals with this in ConvertExpr.cpp in `genval(const Fortran::evaluate::FunctionRef &funcRef)`. The code in the above mentioned function kicks of a sequence of calls that ultimately results in a call to the `genIand ` function in IntrinsicCall.cpp which creates the MLIR `arith.andi` operation. A few tests are also included. 
Note: Generally intrinsics like `iand` can occur in array (elemental) context, but since that part is not fully supported in lowering, tests are only added for the scalar context. This patch is part of upstreaming from the fir-dev branch of https://github.com/flang-compiler/f18-llvm-project. Reviewed By: clementval Differential Revision: https://reviews.llvm.org/D119990 Co-authored-by: Jean Perier Co-authored-by: Eric Schweitz Co-authored-by: zacharyselk Co-authored-by: V Donaldson Co-authored-by: Valentin Clement --- flang/include/flang/Lower/IntrinsicCall.h | 83 ++++++++ flang/lib/Lower/CMakeLists.txt | 1 + flang/lib/Lower/ConvertExpr.cpp | 106 ++++++++++- flang/lib/Lower/IntrinsicCall.cpp | 220 ++++++++++++++++++++++ flang/test/Lower/Intrinsics/iand.f90 | 79 ++++++++ 5 files changed, 488 insertions(+), 1 deletion(-) create mode 100644 flang/include/flang/Lower/IntrinsicCall.h create mode 100644 flang/lib/Lower/IntrinsicCall.cpp create mode 100644 flang/test/Lower/Intrinsics/iand.f90 diff --git a/flang/include/flang/Lower/IntrinsicCall.h b/flang/include/flang/Lower/IntrinsicCall.h new file mode 100644 index 0000000000000..6b5b460786bd3 --- /dev/null +++ b/flang/include/flang/Lower/IntrinsicCall.h @@ -0,0 +1,83 @@ +//===-- Lower/IntrinsicCall.h -- lowering of intrinsics ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_INTRINSICCALL_H +#define FORTRAN_LOWER_INTRINSICCALL_H + +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "llvm/ADT/Optional.h" + +namespace fir { +class ExtendedValue; +} + +namespace Fortran::lower { + +// TODO: Error handling interface ? +// TODO: Implementation is incomplete. Many intrinsics to tbd. 
+ +/// Generate the FIR+MLIR operations for the generic intrinsic \p name +/// with arguments \p args and expected result type \p resultType. +/// Returned mlir::Value is the returned Fortran intrinsic value. +fir::ExtendedValue genIntrinsicCall(fir::FirOpBuilder &, mlir::Location, + llvm::StringRef name, + llvm::Optional resultType, + llvm::ArrayRef args); + +/// Enum specifying how intrinsic argument evaluate::Expr should be +/// lowered to fir::ExtendedValue to be passed to genIntrinsicCall. +enum class LowerIntrinsicArgAs { + /// Lower argument to a value. Mainly intended for scalar arguments. + Value, + /// Lower argument to an address. Only valid when the argument properties are + /// fully defined (e.g. allocatable is allocated...). + Addr, + /// Lower argument to a box. + Box, + /// Lower argument without assuming that the argument is fully defined. + /// It can be used on unallocated allocatable, disassociated pointer, + /// or absent optional. This is meant for inquiry intrinsic arguments. + Inquired +}; + +/// Define how a given intrinsic argument must be lowered. +struct ArgLoweringRule { + LowerIntrinsicArgAs lowerAs; + /// Value: + // - Numerical: 0 + // - Logical : false + // - Derived/character: not possible. Need custom intrinsic lowering. + // Addr: + // - nullptr + // Box: + // - absent box + // AsInquired: + // - no-op + bool handleDynamicOptional; +}; + +/// Opaque class defining the argument lowering rules for all the argument of +/// an intrinsic. +struct IntrinsicArgumentLoweringRules; + +/// Return argument lowering rules for an intrinsic. +/// Returns a nullptr if all the intrinsic arguments should be lowered by value. +const IntrinsicArgumentLoweringRules * +getIntrinsicArgumentLowering(llvm::StringRef intrinsicName); + +/// Return how argument \p argName should be lowered given the rules for the +/// intrinsic function. The argument names are the one defined by the standard. 
+ArgLoweringRule lowerIntrinsicArgumentAs(mlir::Location, + const IntrinsicArgumentLoweringRules &, + llvm::StringRef argName); + +/// Return place-holder for absent intrinsic arguments. +fir::ExtendedValue getAbsentIntrinsicArgument(); +} // namespace Fortran::lower + +#endif // FORTRAN_LOWER_INTRINSICCALL_H diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index 7d72360fce70b..56fdce46f9433 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -7,6 +7,7 @@ add_flang_library(FortranLower ConvertExpr.cpp ConvertType.cpp ConvertVariable.cpp + IntrinsicCall.cpp Mangler.cpp OpenACC.cpp OpenMP.cpp diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 07e5fb8fa1a57..76bee213c96b6 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -15,6 +15,7 @@ #include "flang/Evaluate/real.h" #include "flang/Evaluate/traverse.h" #include "flang/Lower/AbstractConverter.h" +#include "flang/Lower/IntrinsicCall.h" #include "flang/Lower/SymbolMap.h" #include "flang/Lower/Todo.h" #include "flang/Semantics/expression.h" @@ -90,6 +91,16 @@ static fir::ExtendedValue genLoad(fir::FirOpBuilder &builder, }); } +/// Is this a call to an elemental procedure with at least one array argument? 
+static bool +isElementalProcWithArrayArgs(const Fortran::evaluate::ProcedureRef &procRef) { + if (procRef.IsElemental()) + for (const std::optional &arg : + procRef.arguments()) + if (arg && arg->Rank() != 0) + return true; + return false; +} namespace { /// Lowering of Fortran::evaluate::Expr expressions @@ -444,6 +455,23 @@ class ScalarExprLowering { return std::visit([&](const auto &x) { return genval(x); }, des.u); } + mlir::Type genType(const Fortran::evaluate::DynamicType &dt) { + if (dt.category() != Fortran::common::TypeCategory::Derived) + return converter.genType(dt.category(), dt.kind()); + TODO(getLoc(), "genType Derived Type"); + } + + /// Lower a function reference + template + ExtValue genFunctionRef(const Fortran::evaluate::FunctionRef &funcRef) { + if (!funcRef.GetType().has_value()) + fir::emitFatalError(getLoc(), "internal: a function must have a type"); + mlir::Type resTy = genType(*funcRef.GetType()); + return genProcedureRef(funcRef, {resTy}); + } + + /// Lower function call `funcRef` and return a reference to the resultant + /// value. This is required for lowering expressions such as `f1(f2(v))`. template ExtValue gen(const Fortran::evaluate::FunctionRef &funcRef) { TODO(getLoc(), "gen FunctionRef"); @@ -451,13 +479,67 @@ class ScalarExprLowering { template ExtValue genval(const Fortran::evaluate::FunctionRef &funcRef) { - TODO(getLoc(), "genval FunctionRef"); + ExtValue result = genFunctionRef(funcRef); + if (result.rank() == 0 && fir::isa_ref_type(fir::getBase(result).getType())) + return genLoad(result); + return result; } ExtValue genval(const Fortran::evaluate::ProcedureRef &procRef) { TODO(getLoc(), "genval ProcedureRef"); } + /// Generate a call to an intrinsic function. 
+ ExtValue + genIntrinsicRef(const Fortran::evaluate::ProcedureRef &procRef, + const Fortran::evaluate::SpecificIntrinsic &intrinsic, + llvm::Optional resultType) { + llvm::SmallVector operands; + + llvm::StringRef name = intrinsic.name; + mlir::Location loc = getLoc(); + + const Fortran::lower::IntrinsicArgumentLoweringRules *argLowering = + Fortran::lower::getIntrinsicArgumentLowering(name); + for (const auto &[arg, dummy] : + llvm::zip(procRef.arguments(), + intrinsic.characteristics.value().dummyArguments)) { + auto *expr = Fortran::evaluate::UnwrapExpr(arg); + if (!expr) { + // Absent optional. + operands.emplace_back(Fortran::lower::getAbsentIntrinsicArgument()); + continue; + } + if (!argLowering) { + // No argument lowering instruction, lower by value. + operands.emplace_back(genval(*expr)); + continue; + } + // Ad-hoc argument lowering handling. + Fortran::lower::ArgLoweringRule argRules = + Fortran::lower::lowerIntrinsicArgumentAs(loc, *argLowering, + dummy.name); + switch (argRules.lowerAs) { + case Fortran::lower::LowerIntrinsicArgAs::Value: + operands.emplace_back(genval(*expr)); + continue; + case Fortran::lower::LowerIntrinsicArgAs::Addr: + TODO(getLoc(), "argument lowering for Addr"); + continue; + case Fortran::lower::LowerIntrinsicArgAs::Box: + TODO(getLoc(), "argument lowering for Box"); + continue; + case Fortran::lower::LowerIntrinsicArgAs::Inquired: + TODO(getLoc(), "argument lowering for Inquired"); + continue; + } + llvm_unreachable("bad switch"); + } + // Let the intrinsic library lower the intrinsic procedure call + return Fortran::lower::genIntrinsicCall(builder, getLoc(), name, resultType, + operands); + } + template ExtValue genval(const Fortran::evaluate::Expr &x) { if (isScalar(x)) @@ -465,6 +547,28 @@ class ScalarExprLowering { TODO(getLoc(), "genval Expr arrays"); } + /// Lower a non-elemental procedure reference. + // TODO: Handle read allocatable and pointer results. 
+ ExtValue genProcedureRef(const Fortran::evaluate::ProcedureRef &procRef, + llvm::Optional resultType) { + ExtValue res = genRawProcedureRef(procRef, resultType); + return res; + } + + /// Lower a non-elemental procedure reference. + ExtValue genRawProcedureRef(const Fortran::evaluate::ProcedureRef &procRef, + llvm::Optional resultType) { + mlir::Location loc = getLoc(); + if (isElementalProcWithArrayArgs(procRef)) + fir::emitFatalError(loc, "trying to lower elemental procedure with array " + "arguments as normal procedure"); + if (const Fortran::evaluate::SpecificIntrinsic *intrinsic = + procRef.proc().GetSpecificIntrinsic()) + return genIntrinsicRef(procRef, *intrinsic, resultType); + + return {}; + } + /// Helper to detect Transformational function reference. template bool isTransformationalRef(const T &) { diff --git a/flang/lib/Lower/IntrinsicCall.cpp b/flang/lib/Lower/IntrinsicCall.cpp new file mode 100644 index 0000000000000..08c46c26c47fa --- /dev/null +++ b/flang/lib/Lower/IntrinsicCall.cpp @@ -0,0 +1,220 @@ +//===-- IntrinsicCall.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper routines for constructing the FIR dialect of MLIR. As FIR is a +// dialect of MLIR, it makes extensive use of MLIR interfaces and MLIR's coding +// style (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this +// module. 
+// +//===----------------------------------------------------------------------===// + +#include "flang/Lower/IntrinsicCall.h" +#include "flang/Lower/SymbolMap.h" +#include "flang/Lower/Todo.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Support/FatalError.h" + +#define DEBUG_TYPE "flang-lower-intrinsic" + +/// This file implements lowering of Fortran intrinsic procedures. +/// Intrinsics are lowered to a mix of FIR and MLIR operations as +/// well as call to runtime functions or LLVM intrinsics. + +/// Lowering of intrinsic procedure calls is based on a map that associates +/// Fortran intrinsic generic names to FIR generator functions. +/// All generator functions are member functions of the IntrinsicLibrary class +/// and have the same interface. +/// If no generator is given for an intrinsic name, a math runtime library +/// is searched for an implementation and, if a runtime function is found, +/// a call is generated for it. LLVM intrinsics are handled as a math +/// runtime library here. + +fir::ExtendedValue Fortran::lower::getAbsentIntrinsicArgument() { + return fir::UnboxedValue{}; +} + +// TODO error handling -> return a code or directly emit messages ? +struct IntrinsicLibrary { + + // Constructors. + explicit IntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc) + : builder{builder}, loc{loc} {} + IntrinsicLibrary() = delete; + IntrinsicLibrary(const IntrinsicLibrary &) = delete; + + /// Generate FIR for call to Fortran intrinsic \p name with arguments \p arg + /// and expected result type \p resultType. + fir::ExtendedValue genIntrinsicCall(llvm::StringRef name, + llvm::Optional resultType, + llvm::ArrayRef arg); + + mlir::Value genIand(mlir::Type, llvm::ArrayRef); + + /// Define the different FIR generators that can be mapped to intrinsic to + /// generate the related code. 
+ using ElementalGenerator = decltype(&IntrinsicLibrary::genIand); + using Generator = std::variant; + + /// Generate calls to ElementalGenerator, handling the elemental aspects + template + fir::ExtendedValue + genElementalCall(GeneratorType, llvm::StringRef name, mlir::Type resultType, + llvm::ArrayRef args, bool outline); + + /// Helper to invoke code generator for the intrinsics given arguments. + mlir::Value invokeGenerator(ElementalGenerator generator, + mlir::Type resultType, + llvm::ArrayRef args); + fir::FirOpBuilder &builder; + mlir::Location loc; +}; + +struct IntrinsicDummyArgument { + const char *name = nullptr; + Fortran::lower::LowerIntrinsicArgAs lowerAs = + Fortran::lower::LowerIntrinsicArgAs::Value; + bool handleDynamicOptional = false; +}; + +struct Fortran::lower::IntrinsicArgumentLoweringRules { + /// There is no more than 7 non repeated arguments in Fortran intrinsics. + IntrinsicDummyArgument args[7]; + constexpr bool hasDefaultRules() const { return args[0].name == nullptr; } +}; + +/// Structure describing what needs to be done to lower intrinsic "name". +struct IntrinsicHandler { + const char *name; + IntrinsicLibrary::Generator generator; + Fortran::lower::IntrinsicArgumentLoweringRules argLoweringRules = {}; +}; + +using I = IntrinsicLibrary; + +/// Table that drives the fir generation depending on the intrinsic. +/// one to one mapping with Fortran arguments. If no mapping is +/// defined here for a generic intrinsic, genRuntimeCall will be called +/// to look for a match in the runtime a emit a call. Note that the argument +/// lowering rules for an intrinsic need to be provided only if at least one +/// argument must not be lowered by value. In which case, the lowering rules +/// should be provided for all the intrinsic arguments for completeness. 
+static constexpr IntrinsicHandler handlers[]{ + {"iand", &I::genIand}, +}; + +static const IntrinsicHandler *findIntrinsicHandler(llvm::StringRef name) { + auto compare = [](const IntrinsicHandler &handler, llvm::StringRef name) { + return name.compare(handler.name) > 0; + }; + auto result = + std::lower_bound(std::begin(handlers), std::end(handlers), name, compare); + return result != std::end(handlers) && result->name == name ? result + : nullptr; +} + +//===----------------------------------------------------------------------===// +// IntrinsicLibrary +//===----------------------------------------------------------------------===// + +template +fir::ExtendedValue IntrinsicLibrary::genElementalCall( + GeneratorType generator, llvm::StringRef name, mlir::Type resultType, + llvm::ArrayRef args, bool outline) { + llvm::SmallVector scalarArgs; + for (const fir::ExtendedValue &arg : args) + if (arg.getUnboxed() || arg.getCharBox()) + scalarArgs.emplace_back(fir::getBase(arg)); + else + fir::emitFatalError(loc, "nonscalar intrinsic argument"); + return invokeGenerator(generator, resultType, scalarArgs); +} + +static fir::ExtendedValue +invokeHandler(IntrinsicLibrary::ElementalGenerator generator, + const IntrinsicHandler &handler, + llvm::Optional resultType, + llvm::ArrayRef args, bool outline, + IntrinsicLibrary &lib) { + assert(resultType && "expect elemental intrinsic to be functions"); + return lib.genElementalCall(generator, handler.name, *resultType, args, + outline); +} + +fir::ExtendedValue +IntrinsicLibrary::genIntrinsicCall(llvm::StringRef name, + llvm::Optional resultType, + llvm::ArrayRef args) { + if (const IntrinsicHandler *handler = findIntrinsicHandler(name)) { + bool outline = false; + return std::visit( + [&](auto &generator) -> fir::ExtendedValue { + return invokeHandler(generator, *handler, resultType, args, outline, + *this); + }, + handler->generator); + } + + TODO(loc, "genIntrinsicCall runtime"); + return {}; +} + +mlir::Value 
+IntrinsicLibrary::invokeGenerator(ElementalGenerator generator, + mlir::Type resultType, + llvm::ArrayRef args) { + return std::invoke(generator, *this, resultType, args); +} +//===----------------------------------------------------------------------===// +// Code generators for the intrinsic +//===----------------------------------------------------------------------===// + +// IAND +mlir::Value IntrinsicLibrary::genIand(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 2); + return builder.create(loc, args[0], args[1]); +} + +//===----------------------------------------------------------------------===// +// Argument lowering rules interface +//===----------------------------------------------------------------------===// + +const Fortran::lower::IntrinsicArgumentLoweringRules * +Fortran::lower::getIntrinsicArgumentLowering(llvm::StringRef intrinsicName) { + if (const IntrinsicHandler *handler = findIntrinsicHandler(intrinsicName)) + if (!handler->argLoweringRules.hasDefaultRules()) + return &handler->argLoweringRules; + return nullptr; +} + +/// Return how argument \p argName should be lowered given the rules for the +/// intrinsic function. 
+Fortran::lower::ArgLoweringRule Fortran::lower::lowerIntrinsicArgumentAs( + mlir::Location loc, const IntrinsicArgumentLoweringRules &rules, + llvm::StringRef argName) { + for (const IntrinsicDummyArgument &arg : rules.args) { + if (arg.name && arg.name == argName) + return {arg.lowerAs, arg.handleDynamicOptional}; + } + fir::emitFatalError( + loc, "internal: unknown intrinsic argument name in lowering '" + argName + + "'"); +} + +//===----------------------------------------------------------------------===// +// Public intrinsic call helpers +//===----------------------------------------------------------------------===// + +fir::ExtendedValue +Fortran::lower::genIntrinsicCall(fir::FirOpBuilder &builder, mlir::Location loc, + llvm::StringRef name, + llvm::Optional resultType, + llvm::ArrayRef args) { + return IntrinsicLibrary{builder, loc}.genIntrinsicCall(name, resultType, + args); +} diff --git a/flang/test/Lower/Intrinsics/iand.f90 b/flang/test/Lower/Intrinsics/iand.f90 new file mode 100644 index 0000000000000..1e38cd1b07c3f --- /dev/null +++ b/flang/test/Lower/Intrinsics/iand.f90 @@ -0,0 +1,79 @@ +! RUN: bbc -emit-fir %s -o - | FileCheck %s + +! CHECK-LABEL: iand_test +! CHECK-SAME: %[[A:.*]]: !fir.ref{{.*}}, %[[B:.*]]: !fir.ref{{.*}}, %[[C:.*]]: !fir.ref{{.*}} +subroutine iand_test(a, b, c) + integer :: a, b, c +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref + c = iand(a, b) +! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i32 +! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref +end subroutine iand_test + +! CHECK-LABEL: iand_test1 +! CHECK-SAME: %[[A:.*]]: !fir.ref{{.*}}, %[[B:.*]]: !fir.ref{{.*}}, %[[C:.*]]: !fir.ref{{.*}} +subroutine iand_test1(a, b, c) + integer(kind=1) :: a, b, c +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref + c = iand(a, b) +! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i8 +! 
CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref +end subroutine iand_test1 + +! CHECK-LABEL: iand_test2 +! CHECK-SAME: %[[A:.*]]: !fir.ref{{.*}}, %[[B:.*]]: !fir.ref{{.*}}, %[[C:.*]]: !fir.ref{{.*}} +subroutine iand_test2(a, b, c) + integer(kind=2) :: a, b, c +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref + c = iand(a, b) +! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i16 +! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref +end subroutine iand_test2 + +! CHECK-LABEL: iand_test3 +! CHECK-SAME: %[[A:.*]]: !fir.ref{{.*}}, %[[B:.*]]: !fir.ref{{.*}}, %[[C:.*]]: !fir.ref{{.*}} +subroutine iand_test3(a, b, c) + integer(kind=4) :: a, b, c +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref + c = iand(a, b) +! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i32 +! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref +end subroutine iand_test3 + +! CHECK-LABEL: iand_test4 +! CHECK-SAME: %[[A:.*]]: !fir.ref{{.*}}, %[[B:.*]]: !fir.ref{{.*}}, %[[C:.*]]: !fir.ref{{.*}} +subroutine iand_test4(a, b, c) + integer(kind=8) :: a, b, c +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref + c = iand(a, b) +! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i64 +! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref +end subroutine iand_test4 + +! CHECK-LABEL: iand_test5 +! CHECK-SAME: %[[A:.*]]: !fir.ref{{.*}}, %[[B:.*]]: !fir.ref{{.*}}, %[[C:.*]]: !fir.ref{{.*}} +subroutine iand_test5(a, b, c) + integer(kind=16) :: a, b, c +! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref +! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref + c = iand(a, b) +! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i128 +! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref +end subroutine iand_test5 + +! CHECK-LABEL: iand_test6 +! 
CHECK-SAME: %[[S1:.*]]: !fir.ref{{.*}}, %[[S2:.*]]: !fir.ref{{.*}} +subroutine iand_test6(s1, s2) + integer :: s1, s2 +! CHECK-DAG: %[[S1_VAL:.*]] = fir.load %[[S1]] : !fir.ref +! CHECK-DAG: %[[S2_VAL:.*]] = fir.load %[[S2]] : !fir.ref + stop iand(s1,s2) +! CHECK-DAG: %[[ANDI:.*]] = arith.andi %[[S1_VAL]], %[[S2_VAL]] : i32 +! CHECK: fir.call @_FortranAStopStatement(%[[ANDI]], {{.*}}, {{.*}}) : (i32, i1, i1) -> none +! CHECK-NEXT: fir.unreachable +end subroutine iand_test6 From ad7214f23dc3a088d73eb3974b146a0bb09d6ffd Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 22 Feb 2022 08:01:09 -0500 Subject: [PATCH 492/748] [x86] add load folding restriction to pushAddIntoCmovOfConsts() With only a load-fold the diffs look neutral. If there's a load and store (rmw) fold opportunity as shown in the test based on #53862, then we end up with an extra instruction. Fixes #53862 Differential Revision: https://reviews.llvm.org/D120281 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +++++++-- llvm/test/CodeGen/X86/add-cmov.ll | 27 +++++++++++-------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c372919f44f70..9666d71288a34 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52606,7 +52606,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, /// Try to fold those constants into an 'add' instruction to reduce instruction /// count. We do this with CMOV rather the generic 'select' because there are /// earlier folds that may be used to turn select-of-constants into logic hacks. -static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) { +static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // If an operand is zero, add-of-0 gets simplified away, so that's clearly // better because we eliminate 1-2 instructions. 
This transform is still // an improvement without zero operands because we trade 2 move constants and @@ -52631,6 +52632,11 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) { if (!isSuitableCmov(Cmov)) return SDValue(); + // Don't remove a load folding opportunity for the add. That would neutralize + // any improvements from removing constant materializations. + if (X86::mayFoldLoad(OtherOp, Subtarget)) + return SDValue(); + EVT VT = N->getValueType(0); SDLoc DL(N); SDValue FalseOp = Cmov.getOperand(0); @@ -52673,7 +52679,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); SDLoc DL(N); - if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG)) + if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget)) return Select; if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget)) diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll index a47cad269da96..492feff344152 100644 --- a/llvm/test/CodeGen/X86/add-cmov.ll +++ b/llvm/test/CodeGen/X86/add-cmov.ll @@ -477,12 +477,11 @@ define void @complex_lea_alt8(i1 %b, i16* readnone %ptr, i64 %idx) { define i32 @loadfold_select_const_arms(i32* %x, i1 %y) { ; CHECK-LABEL: loadfold_select_const_arms: ; CHECK: # %bb.0: -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: leal -10(%rax), %ecx -; CHECK-NEXT: addl $10, %eax ; CHECK-NEXT: testb $1, %sil -; CHECK-NEXT: cmovel %ecx, %eax -; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: movl $10, %ecx +; CHECK-NEXT: movl $-10, %eax +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: addl (%rdi), %eax ; CHECK-NEXT: retq %cond = select i1 %y, i32 10, i32 -10 %t0 = load i32, i32* %x, align 4 @@ -522,12 +521,11 @@ define void @rmw_add_select_const_arm(i32* %x, i1 %y, i32 %z) { define void @rmw_select_const_arms(i32* %x, i1 %y) { ; CHECK-LABEL: rmw_select_const_arms: ; CHECK: # %bb.0: -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: leal -10(%rax), %ecx -; CHECK-NEXT: addl $10, 
%eax ; CHECK-NEXT: testb $1, %sil -; CHECK-NEXT: cmovel %ecx, %eax -; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: movl $10, %eax +; CHECK-NEXT: movl $-10, %ecx +; CHECK-NEXT: cmovnel %eax, %ecx +; CHECK-NEXT: addl %ecx, (%rdi) ; CHECK-NEXT: retq %cond = select i1 %y, i32 10, i32 -10 %t0 = load i32, i32* %x, align 4 @@ -557,13 +555,12 @@ define i32 @rmw_select_const_arms_extra_load_use(i32* %x, i1 %y) { define i32 @rmw_select_const_arms_extra_add_use(i32* %x, i1 %y) { ; CHECK-LABEL: rmw_select_const_arms_extra_add_use: ; CHECK: # %bb.0: -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: leal -10(%rax), %ecx -; CHECK-NEXT: addl $10, %eax ; CHECK-NEXT: testb $1, %sil -; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: movl $10, %ecx +; CHECK-NEXT: movl $-10, %eax +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: addl (%rdi), %eax ; CHECK-NEXT: movl %eax, (%rdi) -; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq %cond = select i1 %y, i32 10, i32 -10 %t0 = load i32, i32* %x, align 4 From 25ed2ab3418b09f253e3f3d16b2bfc8b90121f65 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Mon, 14 Feb 2022 20:01:29 +0000 Subject: [PATCH 493/748] [SVE] Add isel patterns for SABA/UABA. 
Differential Revision: https://reviews.llvm.org/D119830 --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 12 +- llvm/test/CodeGen/AArch64/sve-aba.ll | 277 ++++++++++++++++++ 2 files changed, 287 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-aba.ll diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 2901527a07d36..0bd75e29e7ba5 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -244,6 +244,14 @@ def AArch64fmul_m1 : EitherVSelectOrPassthruPatFrags; def AArch64fsub_m1 : EitherVSelectOrPassthruPatFrags; +def AArch64saba : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_saba node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64sabd_p (SVEAllActive), node:$op2, node:$op3))]>; + +def AArch64uaba : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_uaba node:$op1, node:$op2, node:$op3), + (add node:$op1, (AArch64uabd_p (SVEAllActive), node:$op2, node:$op3))]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1> @@ -2970,8 +2978,8 @@ let Predicates = [HasSVE2orStreamingSVE] in { defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>; // SVE2 integer absolute difference and accumulate - defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>; - defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>; + defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", AArch64saba>; + defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", AArch64uaba>; // SVE2 integer absolute difference and accumulate long defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>; diff --git a/llvm/test/CodeGen/AArch64/sve-aba.ll b/llvm/test/CodeGen/AArch64/sve-aba.ll new file mode 100644 index 0000000000000..cf7da62bbcf00 --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/sve-aba.ll @@ -0,0 +1,277 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; SABA +; + +define @saba_b( %a, %b, %c) #0 { +; CHECK-LABEL: saba_b: +; CHECK: // %bb.0: +; CHECK-NEXT: saba z0.b, z1.b, z2.b +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv16i16( %sub, i1 true) + %trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @saba_b_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: saba_b_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: saba z0.b, z1.b, z2.b +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv16i8( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +define @saba_h( %a, %b, %c) #0 { +; CHECK-LABEL: saba_h: +; CHECK: // %bb.0: +; CHECK-NEXT: saba z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv8i32( %sub, i1 true) + %trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @saba_h_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: saba_h_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEXT: saba z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv8i16( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +define @saba_s( %a, %b, %c) #0 { +; CHECK-LABEL: saba_s: +; CHECK: // %bb.0: +; CHECK-NEXT: saba z0.s, z1.s, z2.s +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv4i64( %sub, i1 true) + 
%trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @saba_s_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: saba_s_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sxth z1.s, p0/m, z1.s +; CHECK-NEXT: sxth z2.s, p0/m, z2.s +; CHECK-NEXT: saba z0.s, z1.s, z2.s +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv4i32( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +define @saba_d( %a, %b, %c) #0 { +; CHECK-LABEL: saba_d: +; CHECK: // %bb.0: +; CHECK-NEXT: saba z0.d, z1.d, z2.d +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv2i128( %sub, i1 true) + %trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @saba_d_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: saba_d_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z1.d, p0/m, z1.d +; CHECK-NEXT: sxtw z2.d, p0/m, z2.d +; CHECK-NEXT: saba z0.d, z1.d, z2.d +; CHECK-NEXT: ret + %b.sext = sext %b to + %c.sext = sext %c to + %sub = sub %b.sext, %c.sext + %abs = call @llvm.abs.nxv2i64( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +; +; UABA +; + +define @uaba_b( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_b: +; CHECK: // %bb.0: +; CHECK-NEXT: uaba z0.b, z1.b, z2.b +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv16i16( %sub, i1 true) + %trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @uaba_b_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_b_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 +; CHECK-NEXT: mov z2.b, p1/z, #1 // =0x1 +; CHECK-NEXT: uaba z0.b, z1.b, z2.b +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv16i8( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +define @uaba_h( %a, 
%b, %c) #0 { +; CHECK-LABEL: uaba_h: +; CHECK: // %bb.0: +; CHECK-NEXT: uaba z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv8i32( %sub, i1 true) + %trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @uaba_h_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_h_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEXT: uaba z0.h, z1.h, z2.h +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv8i16( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +define @uaba_s( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_s: +; CHECK: // %bb.0: +; CHECK-NEXT: uaba z0.s, z1.s, z2.s +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv4i64( %sub, i1 true) + %trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @uaba_s_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_s_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: and z2.s, z2.s, #0xffff +; CHECK-NEXT: uaba z0.s, z1.s, z2.s +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv4i32( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +define @uaba_d( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_d: +; CHECK: // %bb.0: +; CHECK-NEXT: uaba z0.d, z1.d, z2.d +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv2i128( %sub, i1 true) + %trunc = trunc %abs to + %add = add %a, %trunc + ret %add +} + +define @uaba_d_promoted_ops( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_d_promoted_ops: +; CHECK: // %bb.0: +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: and z2.d, z2.d, #0xffffffff +; CHECK-NEXT: uaba z0.d, z1.d, z2.d +; CHECK-NEXT: ret + 
%b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv2i64( %sub, i1 true) + %add = add %a, %abs + ret %add +} + +; A variant of uaba_s but with the add operands switched. +define @uaba_s_commutative( %a, %b, %c) #0 { +; CHECK-LABEL: uaba_s_commutative: +; CHECK: // %bb.0: +; CHECK-NEXT: uaba z0.s, z1.s, z2.s +; CHECK-NEXT: ret + %b.zext = zext %b to + %c.zext = zext %c to + %sub = sub %b.zext, %c.zext + %abs = call @llvm.abs.nxv4i64( %sub, i1 true) + %trunc = trunc %abs to + %add = add %trunc, %a + ret %add +} + +declare @llvm.abs.nxv16i8(, i1) + +declare @llvm.abs.nxv8i16(, i1) +declare @llvm.abs.nxv16i16(, i1) + +declare @llvm.abs.nxv4i32(, i1) +declare @llvm.abs.nxv8i32(, i1) + +declare @llvm.abs.nxv2i64(, i1) +declare @llvm.abs.nxv4i64(, i1) + +declare @llvm.abs.nxv2i128(, i1) + +attributes #0 = { "target-features"="+neon,+sve,+sve2" } From 4fd77129f2deb4f84e39b3c3e11095522cae542f Mon Sep 17 00:00:00 2001 From: Simon Moll Date: Tue, 22 Feb 2022 14:29:29 +0100 Subject: [PATCH 494/748] [VE] Split unsupported v512.32 ops Split v512.32 binary ops into two v256.32 ops using packing support opcodes (vec_unpack_lo|hi, vec_pack). Depends on D120053 for packing opcodes. 
Reviewed By: kaz7 Differential Revision: https://reviews.llvm.org/D120146 --- llvm/lib/Target/VE/VECustomDAG.cpp | 86 +++++++++++++++++++++++- llvm/lib/Target/VE/VECustomDAG.h | 24 ++++++- llvm/lib/Target/VE/VEISelLowering.cpp | 9 +++ llvm/lib/Target/VE/VEISelLowering.h | 1 + llvm/lib/Target/VE/VEInstrPatternsVec.td | 23 +++++++ llvm/lib/Target/VE/VVPISelLowering.cpp | 58 ++++++++++++++++ llvm/lib/Target/VE/VVPNodes.def | 31 +++++---- llvm/test/CodeGen/VE/Packed/vp_fdiv.ll | 82 ++++++++++++++++++++++ llvm/test/CodeGen/VE/Packed/vp_mul.ll | 25 +++++++ llvm/test/CodeGen/VE/Packed/vp_sdiv.ll | 85 +++++++++++++++++++++++ llvm/test/CodeGen/VE/Packed/vp_udiv.ll | 85 +++++++++++++++++++++++ 11 files changed, 493 insertions(+), 16 deletions(-) create mode 100644 llvm/test/CodeGen/VE/Packed/vp_fdiv.ll create mode 100644 llvm/test/CodeGen/VE/Packed/vp_mul.ll create mode 100644 llvm/test/CodeGen/VE/Packed/vp_sdiv.ll create mode 100644 llvm/test/CodeGen/VE/Packed/vp_udiv.ll diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp index d605cdcc7ee15..ed463fe624ad0 100644 --- a/llvm/lib/Target/VE/VECustomDAG.cpp +++ b/llvm/lib/Target/VE/VECustomDAG.cpp @@ -25,6 +25,12 @@ bool isPackedVectorType(EVT SomeVT) { return SomeVT.getVectorNumElements() > StandardVectorWidth; } +MVT splitVectorType(MVT VT) { + if (!VT.isVector()) + return VT; + return MVT::getVectorVT(VT.getVectorElementType(), StandardVectorWidth); +} + MVT getLegalVectorType(Packing P, MVT ElemVT) { return MVT::getVectorVT(ElemVT, P == Packing::Normal ? 
StandardVectorWidth : PackedVectorWidth); @@ -83,6 +89,31 @@ bool maySafelyIgnoreMask(SDValue Op) { } } +bool supportsPackedMode(unsigned Opcode, EVT IdiomVT) { + bool IsPackedOp = isPackedVectorType(IdiomVT); + bool IsMaskOp = isMaskType(IdiomVT); + switch (Opcode) { + default: + return false; + + case VEISD::VEC_BROADCAST: + return true; +#define REGISTER_PACKED(VVP_NAME) case VEISD::VVP_NAME: +#include "VVPNodes.def" + return IsPackedOp && !IsMaskOp; + } +} + +bool isPackingSupportOpcode(unsigned Opc) { + switch (Opc) { + case VEISD::VEC_PACK: + case VEISD::VEC_UNPACK_LO: + case VEISD::VEC_UNPACK_HI: + return true; + } + return false; +} + bool isVVPOrVEC(unsigned Opcode) { switch (Opcode) { case VEISD::VEC_BROADCAST: @@ -125,6 +156,25 @@ Optional getAVLPos(unsigned Opc) { return None; } +Optional getMaskPos(unsigned Opc) { + // This is only available for VP SDNodes + auto PosOpt = ISD::getVPMaskIdx(Opc); + if (PosOpt) + return *PosOpt; + + // VVP Opcodes. + if (isVVPBinaryOp(Opc)) + return 2; + + // VM Opcodes. + switch (Opc) { + case VEISD::VVP_SELECT: + return 2; + } + + return None; +} + bool isLegalAVL(SDValue AVL) { return AVL->getOpcode() == VEISD::LEGALAVL; } SDValue getNodeAVL(SDValue Op) { @@ -132,6 +182,11 @@ SDValue getNodeAVL(SDValue Op) { return PosOpt ? Op->getOperand(*PosOpt) : SDValue(); } +SDValue getNodeMask(SDValue Op) { + auto PosOpt = getMaskPos(Op->getOpcode()); + return PosOpt ? Op->getOperand(*PosOpt) : SDValue(); +} + std::pair getAnnotatedNodeAVL(SDValue Op) { SDValue AVL = getNodeAVL(Op); if (!AVL) @@ -218,7 +273,9 @@ SDValue VECustomDAG::annotateLegalAVL(SDValue AVL) const { } SDValue VECustomDAG::getUnpack(EVT DestVT, SDValue Vec, PackElem Part, - SDValue AVL) { + SDValue AVL) const { + assert(getAnnotatedNodeAVL(AVL).second && "Expected a pack-legalized AVL"); + // TODO: Peek through VEC_PACK and VEC_BROADCAST(REPL_ ..) operands. unsigned OC = (Part == PackElem::Lo) ? 
VEISD::VEC_UNPACK_LO : VEISD::VEC_UNPACK_HI; @@ -226,9 +283,34 @@ SDValue VECustomDAG::getUnpack(EVT DestVT, SDValue Vec, PackElem Part, } SDValue VECustomDAG::getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, - SDValue AVL) { + SDValue AVL) const { + assert(getAnnotatedNodeAVL(AVL).second && "Expected a pack-legalized AVL"); + // TODO: Peek through VEC_UNPACK_LO|HI operands. return DAG.getNode(VEISD::VEC_PACK, DL, DestVT, LoVec, HiVec, AVL); } +VETargetMasks VECustomDAG::getTargetSplitMask(SDValue RawMask, SDValue RawAVL, + PackElem Part) const { + // Adjust AVL for this part + SDValue NewAVL; + SDValue OneV = getConstant(1, MVT::i32); + if (Part == PackElem::Hi) + NewAVL = getNode(ISD::ADD, MVT::i32, {RawAVL, OneV}); + else + NewAVL = RawAVL; + NewAVL = getNode(ISD::SRL, MVT::i32, {NewAVL, OneV}); + + NewAVL = annotateLegalAVL(NewAVL); + + // Legalize Mask (unpack or all-true) + SDValue NewMask; + if (!RawMask) + NewMask = getConstantMask(Packing::Normal, true); + else + NewMask = getUnpack(MVT::v256i1, RawMask, Part, NewAVL); + + return VETargetMasks(NewMask, NewAVL); +} + } // namespace llvm diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h index 4adceef341f48..6553b90a2b69b 100644 --- a/llvm/lib/Target/VE/VECustomDAG.h +++ b/llvm/lib/Target/VE/VECustomDAG.h @@ -25,6 +25,8 @@ Optional getVVPOpcode(unsigned Opcode); bool isVVPBinaryOp(unsigned Opcode); +MVT splitVectorType(MVT VT); + bool isPackedVectorType(EVT SomeVT); bool isMaskType(EVT SomeVT); @@ -33,6 +35,10 @@ bool isMaskArithmetic(SDValue Op); bool isVVPOrVEC(unsigned); +bool supportsPackedMode(unsigned Opcode, EVT IdiomVT); + +bool isPackingSupportOpcode(unsigned Opc); + bool maySafelyIgnoreMask(SDValue Op); /// The VE backend uses a two-staged process to lower and legalize vector @@ -71,6 +77,11 @@ bool isLegalAVL(SDValue AVL); // The AVL operand of this node. SDValue getNodeAVL(SDValue); +// Mask position of this node. 
+Optional getMaskPos(unsigned); + +SDValue getNodeMask(SDValue); + // Return the AVL operand of this node. If it is a LEGALAVL node, unwrap it. // Return with the boolean whether unwrapping happened. std::pair getAnnotatedNodeAVL(SDValue); @@ -93,6 +104,13 @@ enum class PackElem : int8_t { Hi = 1 // Float (32, 0] }; +struct VETargetMasks { + SDValue Mask; + SDValue AVL; + VETargetMasks(SDValue Mask = SDValue(), SDValue AVL = SDValue()) + : Mask(Mask), AVL(AVL) {} +}; + class VECustomDAG { SelectionDAG &DAG; SDLoc DL; @@ -135,8 +153,8 @@ class VECustomDAG { /// } getNode /// Packing { - SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL); - SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL); + SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL) const; + SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL) const; /// } Packing SDValue getConstant(uint64_t Val, EVT VT, bool IsTarget = false, @@ -148,6 +166,8 @@ class VECustomDAG { // Wrap AVL in a LEGALAVL node (unless it is one already). SDValue annotateLegalAVL(SDValue AVL) const; + VETargetMasks getTargetSplitMask(SDValue RawMask, SDValue RawAVL, + PackElem Part) const; }; } // namespace llvm diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 38182dca7ba76..0e3f2eb522829 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1681,6 +1681,15 @@ SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op, TargetLowering::LegalizeAction VETargetLowering::getCustomOperationAction(SDNode &Op) const { + // Custom legalization on VVP_* and VEC_* opcodes is required to pack-legalize + // these operations (transform nodes such that their AVL parameter refers to + // packs of 64bit, instead of number of elements. + + // Packing opcodes are created with a pack-legal AVL (LEGALAVL). No need to + // re-visit them. 
+ if (isPackingSupportOpcode(Op.getOpcode())) + return Legal; + // Custom lower to legalize AVL for packed mode. if (isVVPOrVEC(Op.getOpcode())) return Custom; diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h index 604f34fa2086a..cc7a156d5b937 100644 --- a/llvm/lib/Target/VE/VEISelLowering.h +++ b/llvm/lib/Target/VE/VEISelLowering.h @@ -187,6 +187,7 @@ class VETargetLowering : public TargetLowering { /// VVP Lowering { SDValue lowerToVVP(SDValue Op, SelectionDAG &DAG) const; SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const; SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const; SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const; /// } VVPLowering diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td index e17b418201c65..71199717a3a2b 100644 --- a/llvm/lib/Target/VE/VEInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td @@ -125,3 +125,26 @@ def : Pat<(v512i1 (vec_pack v256i1:$vlo, v256i1:$vhi, (i32 srcvalue))), (v512i1 (IMPLICIT_DEF)), $vlo, sub_vm_odd), $vhi, sub_vm_even)>; + +// v256.32 <> v512.32 +multiclass Packing { + // no-op unpacks + def : Pat<(v256i32 (vec_unpack_lo PackVT:$vp, (i32 srcvalue))), + (COPY_TO_REGCLASS $vp, V64)>; + def : Pat<(v256f32 (vec_unpack_hi PackVT:$vp, (i32 srcvalue))), + (COPY_TO_REGCLASS $vp, V64)>; + + // shuffle unpacks + def : Pat<(v256f32 (vec_unpack_lo PackVT:$vp, i32:$avl)), + (VSHFvvil $vp, $vp, 4, $avl)>; // always pick lo + def : Pat<(v256i32 (vec_unpack_hi PackVT:$vp, i32:$avl)), + (VSHFvvil $vp, $vp, 0, $avl)>; // always pick hi +} + +defm : Packing; +defm : Packing; + +def : Pat<(v512i32 (vec_pack v256i32:$vlo, v256i32:$vhi, i32:$avl)), + (VSHFvvil $vlo, $vhi, 13, $avl)>; +def : Pat<(v512f32 (vec_pack v256f32:$vlo, v256f32:$vhi, i32:$avl)), + (VSHFvvil $vlo, $vhi, 8, $avl)>; diff --git 
a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp index 735f65bf4c9a3..54fdd9f3ac543 100644 --- a/llvm/lib/Target/VE/VVPISelLowering.cpp +++ b/llvm/lib/Target/VE/VVPISelLowering.cpp @@ -21,10 +21,68 @@ using namespace llvm; SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const { VECustomDAG CDAG(DAG, Op); + + EVT IdiomVT = Op.getValueType(); + if (isPackedVectorType(IdiomVT) && + !supportsPackedMode(Op.getOpcode(), IdiomVT)) + return splitVectorOp(Op, CDAG); + // TODO: Implement odd/even splitting. return legalizePackedAVL(Op, CDAG); } +SDValue VETargetLowering::splitVectorOp(SDValue Op, VECustomDAG &CDAG) const { + MVT ResVT = splitVectorType(Op.getValue(0).getSimpleValueType()); + + auto AVLPos = getAVLPos(Op->getOpcode()); + auto MaskPos = getMaskPos(Op->getOpcode()); + + SDValue PackedMask = getNodeMask(Op); + auto AVLPair = getAnnotatedNodeAVL(Op); + SDValue PackedAVL = AVLPair.first; + assert(!AVLPair.second && "Expecting non pack-legalized oepration"); + + // request the parts + SDValue PartOps[2]; + + SDValue UpperPartAVL; // we will use this for packing things back together + for (PackElem Part : {PackElem::Hi, PackElem::Lo}) { + // VP ops already have an explicit mask and AVL. When expanding from non-VP + // attach those additional inputs here. + auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part); + + if (Part == PackElem::Hi) + UpperPartAVL = SplitTM.AVL; + + // Attach non-predicating value operands + SmallVector OpVec; + for (unsigned i = 0; i < Op.getNumOperands(); ++i) { + if (AVLPos && ((int)i) == *AVLPos) + continue; + if (MaskPos && ((int)i) == *MaskPos) + continue; + + // Value operand + auto PackedOperand = Op.getOperand(i); + auto UnpackedOpVT = splitVectorType(PackedOperand.getSimpleValueType()); + SDValue PartV = + CDAG.getUnpack(UnpackedOpVT, PackedOperand, Part, SplitTM.AVL); + OpVec.push_back(PartV); + } + + // Add predicating args and generate part node. 
+ OpVec.push_back(SplitTM.Mask); + OpVec.push_back(SplitTM.AVL); + // Emit legal VVP nodes. + PartOps[(int)Part] = + CDAG.getNode(Op.getOpcode(), ResVT, OpVec, Op->getFlags()); + } + + // Re-package vectors. + return CDAG.getPack(Op.getValueType(), PartOps[(int)PackElem::Lo], + PartOps[(int)PackElem::Hi], UpperPartAVL); +} + SDValue VETargetLowering::legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const { LLVM_DEBUG(dbgs() << "::legalizePackedAVL\n";); diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def index edb0cbe69efec..1ba602f4f2d36 100644 --- a/llvm/lib/Target/VE/VVPNodes.def +++ b/llvm/lib/Target/VE/VVPNodes.def @@ -38,31 +38,37 @@ ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME) #endif +/// REGISTER_PACKED(OPC) +/// \p OPC The VVP opcode of the operation. +#ifndef REGISTER_PACKED +#define REGISTER_PACKED(OPC) +#endif + // Integer arithmetic. -ADD_BINARY_VVP_OP_COMPACT(ADD) -ADD_BINARY_VVP_OP_COMPACT(SUB) +ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD) +ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB) ADD_BINARY_VVP_OP_COMPACT(MUL) ADD_BINARY_VVP_OP_COMPACT(UDIV) ADD_BINARY_VVP_OP_COMPACT(SDIV) -ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) -ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) -ADD_BINARY_VVP_OP_COMPACT(SHL) +ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) REGISTER_PACKED(VVP_SRA) +ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) REGISTER_PACKED(VVP_SRL) +ADD_BINARY_VVP_OP_COMPACT(SHL) REGISTER_PACKED(VVP_SHL) -ADD_BINARY_VVP_OP_COMPACT(AND) -ADD_BINARY_VVP_OP_COMPACT(OR) -ADD_BINARY_VVP_OP_COMPACT(XOR) +ADD_BINARY_VVP_OP_COMPACT(AND) REGISTER_PACKED(VVP_AND) +ADD_BINARY_VVP_OP_COMPACT(OR) REGISTER_PACKED(VVP_OR) +ADD_BINARY_VVP_OP_COMPACT(XOR) REGISTER_PACKED(VVP_XOR) // FP arithmetic. 
-ADD_BINARY_VVP_OP_COMPACT(FADD) -ADD_BINARY_VVP_OP_COMPACT(FSUB) -ADD_BINARY_VVP_OP_COMPACT(FMUL) +ADD_BINARY_VVP_OP_COMPACT(FADD) REGISTER_PACKED(VVP_FADD) +ADD_BINARY_VVP_OP_COMPACT(FSUB) REGISTER_PACKED(VVP_FSUB) +ADD_BINARY_VVP_OP_COMPACT(FMUL) REGISTER_PACKED(VVP_FMUL) ADD_BINARY_VVP_OP_COMPACT(FDIV) ADD_VVP_OP(VVP_SETCC, SETCC) // Shuffles. -ADD_VVP_OP(VVP_SELECT,VSELECT) +ADD_VVP_OP(VVP_SELECT,VSELECT) REGISTER_PACKED(VVP_SELECT) HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT) HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT) @@ -70,3 +76,4 @@ HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT) #undef ADD_BINARY_VVP_OP_COMPACT #undef ADD_VVP_OP #undef HANDLE_VP_TO_VVP +#undef REGISTER_PACKED diff --git a/llvm/test/CodeGen/VE/Packed/vp_fdiv.ll b/llvm/test/CodeGen/VE/Packed/vp_fdiv.ll new file mode 100644 index 0000000000000..50c3fa189ea85 --- /dev/null +++ b/llvm/test/CodeGen/VE/Packed/vp_fdiv.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s + +declare <512 x float> @llvm.vp.fdiv.v512f32(<512 x float>, <512 x float>, <512 x i1>, i32) + +define fastcc <512 x float> @test_vp_fdiv_v512f32_vv(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_fdiv_v512f32_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s1, %s0, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v2, %v1, %v1, 4 +; CHECK-NEXT: vshf %v3, %v0, %v0, 4 +; CHECK-NEXT: vfdiv.s %v2, %v3, %v2, %vm3 +; CHECK-NEXT: adds.w.sx %s0, 1, %s0 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vfdiv.s %v0, %v0, %v1, %vm2 +; CHECK-NEXT: vshf %v0, %v2, %v0, 8 +; CHECK-NEXT: b.l.t (, %s10) + %r0 = call <512 x float> @llvm.vp.fdiv.v512f32(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n) + ret <512 x float> %r0 +} + +define fastcc <512 x float> @test_vp_fdiv_v512f32_rv(float %s0, <512 x float> %i1, <512 x i1> %m, i32 %n) { 
+; CHECK-LABEL: test_vp_fdiv_v512f32_rv: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s2, %s0, (32)1 +; CHECK-NEXT: srl %s0, %s0, 32 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lea %s2, 256 +; CHECK-NEXT: lvl %s2 +; CHECK-NEXT: vbrd %v1, %s0 +; CHECK-NEXT: adds.w.sx %s0, 1, %s1 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vfdiv.s %v2, %v1, %v0, %vm2 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v1, %v1, %v1, 4 +; CHECK-NEXT: vshf %v0, %v0, %v0, 4 +; CHECK-NEXT: vfdiv.s %v0, %v1, %v0, %vm3 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v0, %v0, %v2, 8 +; CHECK-NEXT: b.l.t (, %s10) + %xins = insertelement <512 x float> undef, float %s0, i32 0 + %i0 = shufflevector <512 x float> %xins, <512 x float> undef, <512 x i32> zeroinitializer + %r0 = call <512 x float> @llvm.vp.fdiv.v512f32(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n) + ret <512 x float> %r0 +} + +define fastcc <512 x float> @test_vp_fdiv_v512f32_vr(<512 x float> %i0, float %s1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_fdiv_v512f32_vr: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s2, %s0, (32)1 +; CHECK-NEXT: srl %s0, %s0, 32 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lea %s2, 256 +; CHECK-NEXT: lvl %s2 +; CHECK-NEXT: vbrd %v1, %s0 +; CHECK-NEXT: adds.w.sx %s0, 1, %s1 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vfdiv.s %v2, %v0, %v1, %vm2 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v1, %v1, %v1, 4 +; CHECK-NEXT: vshf %v0, %v0, %v0, 4 +; CHECK-NEXT: vfdiv.s %v0, %v0, %v1, %vm3 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v0, %v0, %v2, 8 +; CHECK-NEXT: b.l.t (, %s10) + %yins = insertelement <512 x float> undef, float %s1, i32 0 + %i1 = shufflevector <512 x float> %yins, <512 x float> undef, <512 x i32> zeroinitializer + %r0 = call <512 x float> 
@llvm.vp.fdiv.v512f32(<512 x float> %i0, <512 x float> %i1, <512 x i1> %m, i32 %n) + ret <512 x float> %r0 +} diff --git a/llvm/test/CodeGen/VE/Packed/vp_mul.ll b/llvm/test/CodeGen/VE/Packed/vp_mul.ll new file mode 100644 index 0000000000000..26271eef99454 --- /dev/null +++ b/llvm/test/CodeGen/VE/Packed/vp_mul.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s + +declare <512 x i32> @llvm.vp.mul.v512i32(<512 x i32>, <512 x i32>, <512 x i1>, i32) + +define fastcc <512 x i32> @test_vp_v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_v512i32: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s1, 1, %s0 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v2, %v1, %v1, 0 +; CHECK-NEXT: vshf %v3, %v0, %v0, 0 +; CHECK-NEXT: vmuls.w.sx %v2, %v3, %v2, %vm2 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vmuls.w.sx %v0, %v0, %v1, %vm3 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v0, %v0, %v2, 13 +; CHECK-NEXT: b.l.t (, %s10) + %r0 = call <512 x i32> @llvm.vp.mul.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) + ret <512 x i32> %r0 +} diff --git a/llvm/test/CodeGen/VE/Packed/vp_sdiv.ll b/llvm/test/CodeGen/VE/Packed/vp_sdiv.ll new file mode 100644 index 0000000000000..24202eaecedb7 --- /dev/null +++ b/llvm/test/CodeGen/VE/Packed/vp_sdiv.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s + +declare <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32>, <512 x i32>, <512 x i1>, i32) + +define fastcc <512 x i32> @test_vp_sdiv_v512i32_vv(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_sdiv_v512i32_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s1, 1, %s0 +; CHECK-NEXT: and %s1, %s1, (32)0 
+; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v2, %v1, %v1, 0 +; CHECK-NEXT: vshf %v3, %v0, %v0, 0 +; CHECK-NEXT: vdivs.w.sx %v2, %v3, %v2, %vm2 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vdivs.w.sx %v0, %v0, %v1, %vm3 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v0, %v0, %v2, 13 +; CHECK-NEXT: b.l.t (, %s10) + %r0 = call <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) + ret <512 x i32> %r0 +} + +define fastcc <512 x i32> @test_vp_sdiv_v512i32_rv(i32 %s0, <512 x i32> %i1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_sdiv_v512i32_rv: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: sll %s2, %s0, 32 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lea %s2, 256 +; CHECK-NEXT: lvl %s2 +; CHECK-NEXT: vbrd %v1, %s0 +; CHECK-NEXT: adds.w.sx %s0, 1, %s1 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v2, %v1, %v1, 0 +; CHECK-NEXT: vshf %v3, %v0, %v0, 0 +; CHECK-NEXT: vdivs.w.sx %v2, %v2, %v3, %vm2 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vdivs.w.sx %v0, %v1, %v0, %vm3 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v0, %v0, %v2, 13 +; CHECK-NEXT: b.l.t (, %s10) + %xins = insertelement <512 x i32> undef, i32 %s0, i32 0 + %i0 = shufflevector <512 x i32> %xins, <512 x i32> undef, <512 x i32> zeroinitializer + %r0 = call <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) + ret <512 x i32> %r0 +} + +define fastcc <512 x i32> @test_vp_sdiv_v512i32_vr(<512 x i32> %i0, i32 %s1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_sdiv_v512i32_vr: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: sll %s2, %s0, 32 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lea %s2, 256 +; CHECK-NEXT: 
lvl %s2 +; CHECK-NEXT: vbrd %v1, %s0 +; CHECK-NEXT: adds.w.sx %s0, 1, %s1 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v2, %v1, %v1, 0 +; CHECK-NEXT: vshf %v3, %v0, %v0, 0 +; CHECK-NEXT: vdivs.w.sx %v2, %v3, %v2, %vm2 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vdivs.w.sx %v0, %v0, %v1, %vm3 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v0, %v0, %v2, 13 +; CHECK-NEXT: b.l.t (, %s10) + %yins = insertelement <512 x i32> undef, i32 %s1, i32 0 + %i1 = shufflevector <512 x i32> %yins, <512 x i32> undef, <512 x i32> zeroinitializer + %r0 = call <512 x i32> @llvm.vp.sdiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) + ret <512 x i32> %r0 +} diff --git a/llvm/test/CodeGen/VE/Packed/vp_udiv.ll b/llvm/test/CodeGen/VE/Packed/vp_udiv.ll new file mode 100644 index 0000000000000..80e1729bf64a0 --- /dev/null +++ b/llvm/test/CodeGen/VE/Packed/vp_udiv.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s + +declare <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32>, <512 x i32>, <512 x i1>, i32) + +define fastcc <512 x i32> @test_vp_udiv_v512i32_vv(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_udiv_v512i32_vv: +; CHECK: # %bb.0: +; CHECK-NEXT: adds.w.sx %s1, 1, %s0 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v2, %v1, %v1, 0 +; CHECK-NEXT: vshf %v3, %v0, %v0, 0 +; CHECK-NEXT: vdivu.w %v2, %v3, %v2, %vm2 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vdivu.w %v0, %v0, %v1, %vm3 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vshf %v0, %v0, %v2, 13 +; CHECK-NEXT: b.l.t (, %s10) + %r0 = call <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) + ret <512 x i32> %r0 +} + 
+define fastcc <512 x i32> @test_vp_udiv_v512i32_rv(i32 %s0, <512 x i32> %i1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_udiv_v512i32_rv: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: sll %s2, %s0, 32 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lea %s2, 256 +; CHECK-NEXT: lvl %s2 +; CHECK-NEXT: vbrd %v1, %s0 +; CHECK-NEXT: adds.w.sx %s0, 1, %s1 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v2, %v1, %v1, 0 +; CHECK-NEXT: vshf %v3, %v0, %v0, 0 +; CHECK-NEXT: vdivu.w %v2, %v2, %v3, %vm2 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vdivu.w %v0, %v1, %v0, %vm3 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v0, %v0, %v2, 13 +; CHECK-NEXT: b.l.t (, %s10) + %xins = insertelement <512 x i32> undef, i32 %s0, i32 0 + %i0 = shufflevector <512 x i32> %xins, <512 x i32> undef, <512 x i32> zeroinitializer + %r0 = call <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) + ret <512 x i32> %r0 +} + +define fastcc <512 x i32> @test_vp_udiv_v512i32_vr(<512 x i32> %i0, i32 %s1, <512 x i1> %m, i32 %n) { +; CHECK-LABEL: test_vp_udiv_v512i32_vr: +; CHECK: # %bb.0: +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: sll %s2, %s0, 32 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: or %s0, %s0, %s2 +; CHECK-NEXT: lea %s2, 256 +; CHECK-NEXT: lvl %s2 +; CHECK-NEXT: vbrd %v1, %s0 +; CHECK-NEXT: adds.w.sx %s0, 1, %s1 +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: srl %s0, %s0, 1 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v2, %v1, %v1, 0 +; CHECK-NEXT: vshf %v3, %v0, %v0, 0 +; CHECK-NEXT: vdivu.w %v2, %v3, %v2, %vm2 +; CHECK-NEXT: and %s1, %s1, (32)0 +; CHECK-NEXT: srl %s1, %s1, 1 +; CHECK-NEXT: lvl %s1 +; CHECK-NEXT: vdivu.w %v0, %v0, %v1, %vm3 +; CHECK-NEXT: lvl %s0 +; CHECK-NEXT: vshf %v0, %v0, %v2, 13 +; CHECK-NEXT: b.l.t (, %s10) + %yins = insertelement <512 x i32> 
undef, i32 %s1, i32 0 + %i1 = shufflevector <512 x i32> %yins, <512 x i32> undef, <512 x i32> zeroinitializer + %r0 = call <512 x i32> @llvm.vp.udiv.v512i32(<512 x i32> %i0, <512 x i32> %i1, <512 x i1> %m, i32 %n) + ret <512 x i32> %r0 +} From 3a1cb362370d223e09899d234726e15b52327b0e Mon Sep 17 00:00:00 2001 From: Egor Zhdan Date: Mon, 24 Jan 2022 14:18:14 +0000 Subject: [PATCH 495/748] Add DriverKit support This patch is the first in a series of patches to upstream the support for Apple's DriverKit. Once complete, it will allow targeting DriverKit platform with Clang similarly to AppleClang. This code was originally authored by JF Bastien. Differential Revision: https://reviews.llvm.org/D118046 --- llvm/include/llvm/ADT/Triple.h | 9 +++++++- llvm/include/llvm/Support/VersionTuple.h | 6 +++++ llvm/lib/MC/MCParser/DarwinAsmParser.cpp | 4 +++- llvm/lib/MC/MCStreamer.cpp | 8 +++++++ llvm/lib/Support/Triple.cpp | 23 +++++++++++++++++++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 2 ++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +- llvm/lib/Target/ARM/ARMSubtarget.h | 1 + .../Instrumentation/AddressSanitizer.cpp | 5 +++- .../arm-darwin-version-min-load-command.s | 18 +++++++++++++++ .../MachO/darwin-version-min-load-command.s | 8 +++++++ llvm/test/MC/MachO/driverkit-sdk-version.ll | 18 +++++++++++++++ llvm/unittests/ADT/TripleTest.cpp | 17 ++++++++++++++ 13 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 llvm/test/MC/MachO/driverkit-sdk-version.ll diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index fc8673e54b6c8..6cf90fd6eaeee 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -200,6 +200,7 @@ class Triple { ELFIAMCU, TvOS, // Apple tvOS WatchOS, // Apple watchOS + DriverKit, // Apple DriverKit Mesa3D, Contiki, AMDPAL, // AMD PAL Runtime @@ -362,6 +363,9 @@ class Triple { /// with WatchOS or generic triples. 
VersionTuple getWatchOSVersion() const; + /// Parse the version number as with getOSVersion. + VersionTuple getDriverKitVersion() const; + /// @} /// @name Direct Component Access /// @{ @@ -464,11 +468,14 @@ class Triple { return getSubArch() == Triple::ARMSubArch_v7k; } + /// Is this an Apple DriverKit triple. + bool isDriverKit() const { return getOS() == Triple::DriverKit; } + bool isOSzOS() const { return getOS() == Triple::ZOS; } /// Is this a "Darwin" OS (macOS, iOS, tvOS or watchOS). bool isOSDarwin() const { - return isMacOSX() || isiOS() || isWatchOS(); + return isMacOSX() || isiOS() || isWatchOS() || isDriverKit(); } bool isSimulatorEnvironment() const { diff --git a/llvm/include/llvm/Support/VersionTuple.h b/llvm/include/llvm/Support/VersionTuple.h index 1a1072d228f11..aa323fafca65a 100644 --- a/llvm/include/llvm/Support/VersionTuple.h +++ b/llvm/include/llvm/Support/VersionTuple.h @@ -97,6 +97,12 @@ class VersionTuple { return *this; } + /// Return a version tuple that contains a different major version but + /// everything else is the same. + VersionTuple withMajorReplaced(unsigned NewMajor) const { + return VersionTuple(NewMajor, Minor, Subminor, Build); + } + /// Return a version tuple that contains only components that are non-zero. 
VersionTuple normalize() const { VersionTuple Result = *this; diff --git a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp index 0d28a08a0e667..5274f30be7e8d 100644 --- a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp @@ -1149,11 +1149,12 @@ static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) { case MachO::PLATFORM_TVOS: return Triple::TvOS; case MachO::PLATFORM_WATCHOS: return Triple::WatchOS; case MachO::PLATFORM_BRIDGEOS: /* silence warning */ break; + case MachO::PLATFORM_DRIVERKIT: + return Triple::DriverKit; case MachO::PLATFORM_MACCATALYST: return Triple::IOS; case MachO::PLATFORM_IOSSIMULATOR: /* silence warning */ break; case MachO::PLATFORM_TVOSSIMULATOR: /* silence warning */ break; case MachO::PLATFORM_WATCHOSSIMULATOR: /* silence warning */ break; - case MachO::PLATFORM_DRIVERKIT: /* silence warning */ break; } llvm_unreachable("Invalid mach-o platform type"); } @@ -1172,6 +1173,7 @@ bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) { .Case("tvos", MachO::PLATFORM_TVOS) .Case("watchos", MachO::PLATFORM_WATCHOS) .Case("macCatalyst", MachO::PLATFORM_MACCATALYST) + .Case("driverkit", MachO::PLATFORM_DRIVERKIT) .Default(0); if (Platform == 0) return Error(PlatformLoc, "unknown platform name"); diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 93001a47af786..a42b4da8a19da 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1286,6 +1286,9 @@ static VersionTuple getMachoBuildVersionSupportedOS(const Triple &Target) { return VersionTuple(12); case Triple::WatchOS: return VersionTuple(5); + case Triple::DriverKit: + // DriverKit always uses the build version load command. + return VersionTuple(); default: break; } @@ -1310,6 +1313,8 @@ getMachoBuildVersionPlatformType(const Triple &Target) { case Triple::WatchOS: return Target.isSimulatorEnvironment() ? 
MachO::PLATFORM_WATCHOSSIMULATOR : MachO::PLATFORM_WATCHOS; + case Triple::DriverKit: + return MachO::PLATFORM_DRIVERKIT; default: break; } @@ -1339,6 +1344,9 @@ void MCStreamer::emitVersionForTarget( case Triple::WatchOS: Version = Target.getWatchOSVersion(); break; + case Triple::DriverKit: + Version = Target.getDriverKitVersion(); + break; default: llvm_unreachable("unexpected OS type"); } diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index a80310b245ad7..978545a1ba144 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -208,6 +208,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { case Contiki: return "contiki"; case Darwin: return "darwin"; case DragonFly: return "dragonfly"; + case DriverKit: return "driverkit"; case ELFIAMCU: return "elfiamcu"; case Emscripten: return "emscripten"; case FreeBSD: return "freebsd"; @@ -550,6 +551,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("elfiamcu", Triple::ELFIAMCU) .StartsWith("tvos", Triple::TvOS) .StartsWith("watchos", Triple::WatchOS) + .StartsWith("driverkit", Triple::DriverKit) .StartsWith("mesa3d", Triple::Mesa3D) .StartsWith("contiki", Triple::Contiki) .StartsWith("amdpal", Triple::AMDPAL) @@ -1169,6 +1171,8 @@ bool Triple::getMacOSXVersion(VersionTuple &Version) const { // IOS. 
Version = VersionTuple(10, 4); break; + case DriverKit: + llvm_unreachable("OSX version isn't relevant for DriverKit"); } return true; } @@ -1193,6 +1197,8 @@ VersionTuple Triple::getiOSVersion() const { } case WatchOS: llvm_unreachable("conflicting triple info"); + case DriverKit: + llvm_unreachable("DriverKit doesn't have an iOS version"); } } @@ -1214,6 +1220,20 @@ VersionTuple Triple::getWatchOSVersion() const { } case IOS: llvm_unreachable("conflicting triple info"); + case DriverKit: + llvm_unreachable("DriverKit doesn't have a WatchOS version"); + } +} + +VersionTuple Triple::getDriverKitVersion() const { + switch (getOS()) { + default: + llvm_unreachable("unexpected OS for Darwin triple"); + case DriverKit: + VersionTuple Version = getOSVersion(); + if (Version.getMajor() == 0) + return Version.withMajorReplaced(19); + return Version; } } @@ -1746,6 +1766,8 @@ VersionTuple Triple::getMinimumSupportedOSVersion() const { if (isSimulatorEnvironment()) return VersionTuple(7, 0, 0); break; + case Triple::DriverKit: + return VersionTuple(20, 0, 0); default: break; } @@ -1776,6 +1798,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const { case llvm::Triple::MacOSX: case llvm::Triple::TvOS: case llvm::Triple::WatchOS: + case llvm::Triple::DriverKit: if (MArch == "v7k") return "cortex-a7"; break; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 8a7e20237271a..0c9253f202c6b 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -352,6 +352,8 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { if (!UseAddressTopByteIgnored) return false; + if (TargetTriple.isDriverKit()) + return true; if (TargetTriple.isiOS()) { return TargetTriple.getiOSVersion() >= VersionTuple(8); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 93193e97820d8..c678901bb3280 100644 --- 
a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -481,7 +481,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && - !Subtarget->isTargetWatchOS()) { + !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) { bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) setLibcallCallingConv(static_cast(LCID), diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index c498d273429dc..bf4b213eac8d9 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -773,6 +773,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); } + bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index a8d67c755799d..e111805895505 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -465,7 +465,8 @@ struct ShadowMapping { static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize, bool IsKasan) { bool IsAndroid = TargetTriple.isAndroid(); - bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS(); + bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS() || + TargetTriple.isDriverKit(); bool IsMacOS = TargetTriple.isMacOSX(); bool IsFreeBSD = 
TargetTriple.isOSFreeBSD(); bool IsNetBSD = TargetTriple.isOSNetBSD(); @@ -2125,6 +2126,8 @@ bool ModuleAddressSanitizer::ShouldUseMachOGlobalsSection() const { return true; if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2)) return true; + if (TargetTriple.isDriverKit()) + return true; return false; } diff --git a/llvm/test/MC/MachO/AArch64/arm-darwin-version-min-load-command.s b/llvm/test/MC/MachO/AArch64/arm-darwin-version-min-load-command.s index 37b95b2bd6817..2d008a84570f9 100644 --- a/llvm/test/MC/MachO/AArch64/arm-darwin-version-min-load-command.s +++ b/llvm/test/MC/MachO/AArch64/arm-darwin-version-min-load-command.s @@ -18,6 +18,10 @@ // RUN: llvm-mc -triple arm64-apple-tvos10-simulator %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-BUILD-TVOSSIM2 // RUN: llvm-mc -triple arm64-apple-watchos3-simulator %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-BUILD-WATCHOSSIM2 +// RUN: llvm-mc -triple arm64-apple-driverkit19.0 %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-DRIVERKIT-ARM64 +// RUN: llvm-mc -triple arm64e-apple-driverkit19.0 %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-DRIVERKIT-ARM64 +// RUN: llvm-mc -triple arm64-apple-driverkit20.1 %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-DRIVERKIT-ARM64_1 + // CHECK-BUILD-IOS-ARM64E: cmd LC_BUILD_VERSION // CHECK-BUILD-IOS-ARM64E-NEXT: cmdsize 24 // CHECK-BUILD-IOS-ARM64E-NEXT: platform ios @@ -94,3 +98,17 @@ // CHECK-MACCATALYST-ARM64_1-NEXT: sdk n/a // CHECK-MACCATALYST-ARM64_1-NEXT: minos 14.1 // CHECK-MACCATALYST-ARM64_1-NEXT: ntools 0 + +// CHECK-DRIVERKIT-ARM64: cmd LC_BUILD_VERSION +// CHECK-DRIVERKIT-ARM64-NEXT: cmdsize 24 +// CHECK-DRIVERKIT-ARM64-NEXT: platform driverkit +// CHECK-DRIVERKIT-ARM64-NEXT: sdk n/a +// 
CHECK-DRIVERKIT-ARM64-NEXT: minos 20.0 +// CHECK-DRIVERKIT-ARM64-NEXT: ntools 0 + +// CHECK-DRIVERKIT-ARM64_1: cmd LC_BUILD_VERSION +// CHECK-DRIVERKIT-ARM64_1-NEXT: cmdsize 24 +// CHECK-DRIVERKIT-ARM64_1-NEXT: platform driverkit +// CHECK-DRIVERKIT-ARM64_1-NEXT: sdk n/a +// CHECK-DRIVERKIT-ARM64_1-NEXT: minos 20.1 +// CHECK-DRIVERKIT-ARM64_1-NEXT: ntools 0 diff --git a/llvm/test/MC/MachO/darwin-version-min-load-command.s b/llvm/test/MC/MachO/darwin-version-min-load-command.s index 6e1b4c20a6700..1a57dd6f7e2dd 100644 --- a/llvm/test/MC/MachO/darwin-version-min-load-command.s +++ b/llvm/test/MC/MachO/darwin-version-min-load-command.s @@ -27,6 +27,7 @@ // RUN: llvm-mc -triple x86_64-apple-watchos1.0.0 %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-WATCHOS // RUN: llvm-mc -triple x86_64-apple-tvos8.0.0 %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-TVOS +// RUN: llvm-mc -triple x86_64-apple-driverkit19.0 %s -filetype=obj -o - | llvm-objdump --macho --private-headers - | FileCheck %s --check-prefix=CHECK-DRIVERKIT // CHECK-WATCHOS: Load command // CHECK-WATCHOS: cmd LC_VERSION_MIN_WATCHOS // CHECK-WATCHOS-NEXT: cmdsize 16 @@ -50,6 +51,13 @@ // CHECK-MACCATALYST-NEXT: minos 13.0 // CHECK-MACCATALYST-NEXT: ntools 0 +// CHECK-DRIVERKIT: cmd LC_BUILD_VERSION +// CHECK-DRIVERKIT-NEXT: cmdsize 24 +// CHECK-DRIVERKIT-NEXT: platform driverkit +// CHECK-DRIVERKIT-NEXT: sdk n/a +// CHECK-DRIVERKIT-NEXT: minos 19.0 +// CHECK-DRIVERKIT-NEXT: ntools 0 + // CHECK-BUILD-MACOS: cmd LC_BUILD_VERSION // CHECK-BUILD-MACOS-NEXT: cmdsize 24 // CHECK-BUILD-MACOS-NEXT: platform macos diff --git a/llvm/test/MC/MachO/driverkit-sdk-version.ll b/llvm/test/MC/MachO/driverkit-sdk-version.ll new file mode 100644 index 0000000000000..ea36ae32ffec3 --- /dev/null +++ b/llvm/test/MC/MachO/driverkit-sdk-version.ll @@ -0,0 +1,18 @@ +; RUN: llc %s -filetype=obj -o - | llvm-objdump --macho 
--private-headers - | FileCheck %s +; RUN: llc %s -filetype=asm -o - | FileCheck --check-prefix=ASM %s + +target triple = "x86_64-apple-driverkit19.0.0" + +define void @foo() { +entry: + ret void +} + +; CHECK: cmd LC_BUILD_VERSION +; CHECK-NEXT: cmdsize 24 +; CHECK-NEXT: platform driverkit +; CHECK-NEXT: sdk n/a +; CHECK-NEXT: minos 19.0 +; CHECK-NEXT: ntools 0 + +; ASM: .build_version driverkit, 19, 0 diff --git a/llvm/unittests/ADT/TripleTest.cpp b/llvm/unittests/ADT/TripleTest.cpp index 18a9641492ad0..f4715e8558983 100644 --- a/llvm/unittests/ADT/TripleTest.cpp +++ b/llvm/unittests/ADT/TripleTest.cpp @@ -1434,6 +1434,23 @@ TEST(TripleTest, getOSVersion) { EXPECT_TRUE(T.getEnvironment() == Triple::MacABI); EXPECT_TRUE(T.isMacCatalystEnvironment()); EXPECT_FALSE(T.isSimulatorEnvironment()); + + T = Triple("x86_64-apple-driverkit20.1.0"); + EXPECT_TRUE(T.isDriverKit()); + EXPECT_TRUE(T.isOSDarwin()); + EXPECT_FALSE(T.isMacOSX()); + EXPECT_FALSE(T.isiOS()); + Version = T.getDriverKitVersion(); + EXPECT_EQ(VersionTuple(20, 1), Version); + + T = Triple("x86_64-apple-driverkit20"); + Version = T.getDriverKitVersion(); + EXPECT_EQ(VersionTuple(20, 0), Version); + + // DriverKit version should default to 19.0. + T = Triple("x86_64-apple-driverkit"); + Version = T.getDriverKitVersion(); + EXPECT_EQ(VersionTuple(19, 0), Version); } TEST(TripleTest, getEnvironmentVersion) { From ffa4dfc8de526ba28d12d62a14f984a7e1e9224b Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 22 Feb 2022 13:48:21 +0000 Subject: [PATCH 496/748] [AArch64][SME] Remove term 'streaming-sve' from assembler diagnostics. 'streaming-sve' is not a feature that users should be able to set, hence why it shouldn't show up in user-diagnostics. The only flag that end-users should be able to set is '+sme'. 
Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D120256 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 8 +- llvm/test/MC/AArch64/SME/revd.s | 2 +- llvm/test/MC/AArch64/SME/sclamp.s | 8 +- .../test/MC/AArch64/SME/streaming-mode-neon.s | 24 +- llvm/test/MC/AArch64/SME/uclamp.s | 8 +- llvm/test/MC/AArch64/SVE/abs.s | 24 +- llvm/test/MC/AArch64/SVE/add.s | 104 ++-- llvm/test/MC/AArch64/SVE/addpl.s | 8 +- llvm/test/MC/AArch64/SVE/addvl.s | 8 +- llvm/test/MC/AArch64/SVE/and.s | 52 +- llvm/test/MC/AArch64/SVE/ands.s | 6 +- llvm/test/MC/AArch64/SVE/andv.s | 8 +- llvm/test/MC/AArch64/SVE/asr.s | 68 +-- llvm/test/MC/AArch64/SVE/asrd.s | 24 +- llvm/test/MC/AArch64/SVE/asrr.s | 16 +- llvm/test/MC/AArch64/SVE/bfcvt.s | 10 +- llvm/test/MC/AArch64/SVE/bfcvtnt.s | 10 +- llvm/test/MC/AArch64/SVE/bfdot.s | 18 +- llvm/test/MC/AArch64/SVE/bfmlal.s | 60 +- llvm/test/MC/AArch64/SVE/bfmmla.s | 2 +- llvm/test/MC/AArch64/SVE/bic.s | 50 +- llvm/test/MC/AArch64/SVE/bics.s | 4 +- llvm/test/MC/AArch64/SVE/brka.s | 4 +- llvm/test/MC/AArch64/SVE/brkas.s | 2 +- llvm/test/MC/AArch64/SVE/brkb.s | 4 +- llvm/test/MC/AArch64/SVE/brkbs.s | 2 +- llvm/test/MC/AArch64/SVE/brkn.s | 4 +- llvm/test/MC/AArch64/SVE/brkns.s | 4 +- llvm/test/MC/AArch64/SVE/brkpa.s | 4 +- llvm/test/MC/AArch64/SVE/brkpas.s | 4 +- llvm/test/MC/AArch64/SVE/brkpb.s | 4 +- llvm/test/MC/AArch64/SVE/brkpbs.s | 4 +- llvm/test/MC/AArch64/SVE/clasta.s | 28 +- llvm/test/MC/AArch64/SVE/clastb.s | 28 +- llvm/test/MC/AArch64/SVE/cls.s | 16 +- llvm/test/MC/AArch64/SVE/clz.s | 16 +- llvm/test/MC/AArch64/SVE/cmpeq.s | 30 +- llvm/test/MC/AArch64/SVE/cmpge.s | 30 +- llvm/test/MC/AArch64/SVE/cmpgt.s | 30 +- llvm/test/MC/AArch64/SVE/cmphi.s | 30 +- llvm/test/MC/AArch64/SVE/cmphs.s | 30 +- llvm/test/MC/AArch64/SVE/cmple.s | 30 +- llvm/test/MC/AArch64/SVE/cmplo.s | 30 +- llvm/test/MC/AArch64/SVE/cmpls.s | 30 +- llvm/test/MC/AArch64/SVE/cmplt.s | 30 +- llvm/test/MC/AArch64/SVE/cmpne.s | 30 +- 
llvm/test/MC/AArch64/SVE/cnot.s | 16 +- llvm/test/MC/AArch64/SVE/cnt.s | 16 +- llvm/test/MC/AArch64/SVE/cntb.s | 12 +- llvm/test/MC/AArch64/SVE/cntd.s | 12 +- llvm/test/MC/AArch64/SVE/cnth.s | 12 +- llvm/test/MC/AArch64/SVE/cntp.s | 8 +- llvm/test/MC/AArch64/SVE/cntw.s | 12 +- llvm/test/MC/AArch64/SVE/cpy.s | 118 ++-- llvm/test/MC/AArch64/SVE/ctermeq.s | 8 +- llvm/test/MC/AArch64/SVE/ctermne.s | 8 +- llvm/test/MC/AArch64/SVE/decb.s | 40 +- llvm/test/MC/AArch64/SVE/decd.s | 40 +- llvm/test/MC/AArch64/SVE/dech.s | 40 +- llvm/test/MC/AArch64/SVE/decp.s | 32 +- llvm/test/MC/AArch64/SVE/decw.s | 40 +- .../SVE/directive-arch_extension-negative.s | 2 +- llvm/test/MC/AArch64/SVE/dup.s | 84 +-- llvm/test/MC/AArch64/SVE/dupm.s | 20 +- llvm/test/MC/AArch64/SVE/eon.s | 20 +- llvm/test/MC/AArch64/SVE/eor.s | 52 +- llvm/test/MC/AArch64/SVE/eors.s | 6 +- llvm/test/MC/AArch64/SVE/eorv.s | 8 +- llvm/test/MC/AArch64/SVE/ext.s | 8 +- llvm/test/MC/AArch64/SVE/fabd.s | 14 +- llvm/test/MC/AArch64/SVE/fabs.s | 14 +- llvm/test/MC/AArch64/SVE/facge.s | 6 +- llvm/test/MC/AArch64/SVE/facgt.s | 6 +- llvm/test/MC/AArch64/SVE/facle.s | 6 +- llvm/test/MC/AArch64/SVE/faclt.s | 6 +- llvm/test/MC/AArch64/SVE/fadd.s | 44 +- llvm/test/MC/AArch64/SVE/faddv.s | 6 +- llvm/test/MC/AArch64/SVE/fcadd.s | 20 +- llvm/test/MC/AArch64/SVE/fcmeq.s | 12 +- llvm/test/MC/AArch64/SVE/fcmge.s | 12 +- llvm/test/MC/AArch64/SVE/fcmgt.s | 12 +- llvm/test/MC/AArch64/SVE/fcmla.s | 44 +- llvm/test/MC/AArch64/SVE/fcmle.s | 12 +- llvm/test/MC/AArch64/SVE/fcmlt.s | 12 +- llvm/test/MC/AArch64/SVE/fcmne.s | 12 +- llvm/test/MC/AArch64/SVE/fcmuo.s | 6 +- llvm/test/MC/AArch64/SVE/fcpy.s | 524 ++++++++--------- llvm/test/MC/AArch64/SVE/fcvt.s | 20 +- llvm/test/MC/AArch64/SVE/fcvtzs.s | 22 +- llvm/test/MC/AArch64/SVE/fcvtzu.s | 22 +- llvm/test/MC/AArch64/SVE/fdiv.s | 14 +- llvm/test/MC/AArch64/SVE/fdivr.s | 14 +- llvm/test/MC/AArch64/SVE/fdup.s | 516 ++++++++--------- llvm/test/MC/AArch64/SVE/fmad.s | 14 +- 
llvm/test/MC/AArch64/SVE/fmax.s | 36 +- llvm/test/MC/AArch64/SVE/fmaxnm.s | 38 +- llvm/test/MC/AArch64/SVE/fmaxnmv.s | 6 +- llvm/test/MC/AArch64/SVE/fmaxv.s | 6 +- llvm/test/MC/AArch64/SVE/fmin.s | 38 +- llvm/test/MC/AArch64/SVE/fminnm.s | 38 +- llvm/test/MC/AArch64/SVE/fminnmv.s | 6 +- llvm/test/MC/AArch64/SVE/fminv.s | 6 +- llvm/test/MC/AArch64/SVE/fmla.s | 24 +- llvm/test/MC/AArch64/SVE/fmls.s | 24 +- llvm/test/MC/AArch64/SVE/fmov.s | 538 +++++++++--------- llvm/test/MC/AArch64/SVE/fmsb.s | 14 +- llvm/test/MC/AArch64/SVE/fmul.s | 54 +- llvm/test/MC/AArch64/SVE/fmulx.s | 14 +- llvm/test/MC/AArch64/SVE/fneg.s | 14 +- llvm/test/MC/AArch64/SVE/fnmad.s | 14 +- llvm/test/MC/AArch64/SVE/fnmla.s | 14 +- llvm/test/MC/AArch64/SVE/fnmls.s | 14 +- llvm/test/MC/AArch64/SVE/fnmsb.s | 14 +- llvm/test/MC/AArch64/SVE/frecpe.s | 6 +- llvm/test/MC/AArch64/SVE/frecps.s | 6 +- llvm/test/MC/AArch64/SVE/frecpx.s | 14 +- llvm/test/MC/AArch64/SVE/frinta.s | 14 +- llvm/test/MC/AArch64/SVE/frinti.s | 14 +- llvm/test/MC/AArch64/SVE/frintm.s | 14 +- llvm/test/MC/AArch64/SVE/frintn.s | 14 +- llvm/test/MC/AArch64/SVE/frintp.s | 14 +- llvm/test/MC/AArch64/SVE/frintx.s | 14 +- llvm/test/MC/AArch64/SVE/frintz.s | 14 +- llvm/test/MC/AArch64/SVE/frsqrte.s | 6 +- llvm/test/MC/AArch64/SVE/frsqrts.s | 6 +- llvm/test/MC/AArch64/SVE/fscale.s | 14 +- llvm/test/MC/AArch64/SVE/fsqrt.s | 14 +- llvm/test/MC/AArch64/SVE/fsub.s | 44 +- llvm/test/MC/AArch64/SVE/fsubr.s | 38 +- llvm/test/MC/AArch64/SVE/ftmad.s | 2 +- llvm/test/MC/AArch64/SVE/incb.s | 66 +-- llvm/test/MC/AArch64/SVE/incd.s | 60 +- llvm/test/MC/AArch64/SVE/inch.s | 60 +- llvm/test/MC/AArch64/SVE/incp.s | 32 +- llvm/test/MC/AArch64/SVE/incw.s | 60 +- llvm/test/MC/AArch64/SVE/index.s | 64 +-- llvm/test/MC/AArch64/SVE/insr.s | 32 +- llvm/test/MC/AArch64/SVE/lasta.s | 16 +- llvm/test/MC/AArch64/SVE/lastb.s | 16 +- llvm/test/MC/AArch64/SVE/ld1b.s | 44 +- llvm/test/MC/AArch64/SVE/ld1d.s | 12 +- llvm/test/MC/AArch64/SVE/ld1h.s | 32 +- 
llvm/test/MC/AArch64/SVE/ld1rb.s | 16 +- llvm/test/MC/AArch64/SVE/ld1rd.s | 4 +- llvm/test/MC/AArch64/SVE/ld1rh.s | 12 +- llvm/test/MC/AArch64/SVE/ld1rqb.s | 10 +- llvm/test/MC/AArch64/SVE/ld1rqd.s | 10 +- llvm/test/MC/AArch64/SVE/ld1rqh.s | 10 +- llvm/test/MC/AArch64/SVE/ld1rqw.s | 10 +- llvm/test/MC/AArch64/SVE/ld1rsb.s | 12 +- llvm/test/MC/AArch64/SVE/ld1rsh.s | 8 +- llvm/test/MC/AArch64/SVE/ld1rsw.s | 4 +- llvm/test/MC/AArch64/SVE/ld1rw.s | 8 +- llvm/test/MC/AArch64/SVE/ld1sb.s | 34 +- llvm/test/MC/AArch64/SVE/ld1sh.s | 22 +- llvm/test/MC/AArch64/SVE/ld1sw.s | 12 +- llvm/test/MC/AArch64/SVE/ld1w.s | 22 +- llvm/test/MC/AArch64/SVE/ld2b.s | 10 +- llvm/test/MC/AArch64/SVE/ld2d.s | 10 +- llvm/test/MC/AArch64/SVE/ld2h.s | 10 +- llvm/test/MC/AArch64/SVE/ld2w.s | 10 +- llvm/test/MC/AArch64/SVE/ld3b.s | 10 +- llvm/test/MC/AArch64/SVE/ld3d.s | 10 +- llvm/test/MC/AArch64/SVE/ld3h.s | 10 +- llvm/test/MC/AArch64/SVE/ld3w.s | 10 +- llvm/test/MC/AArch64/SVE/ld4b.s | 10 +- llvm/test/MC/AArch64/SVE/ld4d.s | 10 +- llvm/test/MC/AArch64/SVE/ld4h.s | 10 +- llvm/test/MC/AArch64/SVE/ld4w.s | 10 +- llvm/test/MC/AArch64/SVE/ldnt1b.s | 10 +- llvm/test/MC/AArch64/SVE/ldnt1d.s | 10 +- llvm/test/MC/AArch64/SVE/ldnt1h.s | 10 +- llvm/test/MC/AArch64/SVE/ldnt1w.s | 10 +- llvm/test/MC/AArch64/SVE/ldr.s | 12 +- llvm/test/MC/AArch64/SVE/lsl.s | 68 +-- llvm/test/MC/AArch64/SVE/lslr.s | 16 +- llvm/test/MC/AArch64/SVE/lsr.s | 68 +-- llvm/test/MC/AArch64/SVE/lsrr.s | 16 +- llvm/test/MC/AArch64/SVE/mad.s | 16 +- .../MC/AArch64/SVE/matrix-multiply-fp64.s | 12 +- .../MC/AArch64/SVE/matrix-multiply-int8.s | 12 +- llvm/test/MC/AArch64/SVE/mla.s | 16 +- llvm/test/MC/AArch64/SVE/mls.s | 16 +- llvm/test/MC/AArch64/SVE/mov.s | 258 ++++----- llvm/test/MC/AArch64/SVE/movprfx.s | 14 +- llvm/test/MC/AArch64/SVE/movs.s | 8 +- llvm/test/MC/AArch64/SVE/msb.s | 16 +- llvm/test/MC/AArch64/SVE/mul.s | 36 +- llvm/test/MC/AArch64/SVE/nand.s | 4 +- llvm/test/MC/AArch64/SVE/nands.s | 4 +- llvm/test/MC/AArch64/SVE/neg.s | 
24 +- llvm/test/MC/AArch64/SVE/nor.s | 4 +- llvm/test/MC/AArch64/SVE/nors.s | 4 +- llvm/test/MC/AArch64/SVE/not.s | 20 +- llvm/test/MC/AArch64/SVE/nots.s | 4 +- llvm/test/MC/AArch64/SVE/orn.s | 24 +- llvm/test/MC/AArch64/SVE/orns.s | 4 +- llvm/test/MC/AArch64/SVE/orr.s | 58 +- llvm/test/MC/AArch64/SVE/orrs.s | 6 +- llvm/test/MC/AArch64/SVE/orv.s | 8 +- llvm/test/MC/AArch64/SVE/pfalse.s | 2 +- llvm/test/MC/AArch64/SVE/pfirst.s | 4 +- llvm/test/MC/AArch64/SVE/pnext.s | 10 +- llvm/test/MC/AArch64/SVE/prfb.s | 60 +- llvm/test/MC/AArch64/SVE/prfd.s | 60 +- llvm/test/MC/AArch64/SVE/prfh.s | 60 +- llvm/test/MC/AArch64/SVE/prfw.s | 60 +- llvm/test/MC/AArch64/SVE/ptest.s | 4 +- llvm/test/MC/AArch64/SVE/ptrue.s | 80 +-- llvm/test/MC/AArch64/SVE/ptrues.s | 80 +-- llvm/test/MC/AArch64/SVE/punpkhi.s | 4 +- llvm/test/MC/AArch64/SVE/punpklo.s | 4 +- llvm/test/MC/AArch64/SVE/rbit.s | 16 +- llvm/test/MC/AArch64/SVE/rdvl.s | 8 +- llvm/test/MC/AArch64/SVE/rev.s | 8 +- llvm/test/MC/AArch64/SVE/revb.s | 14 +- llvm/test/MC/AArch64/SVE/revh.s | 12 +- llvm/test/MC/AArch64/SVE/revw.s | 10 +- llvm/test/MC/AArch64/SVE/sabd.s | 16 +- llvm/test/MC/AArch64/SVE/saddv.s | 6 +- llvm/test/MC/AArch64/SVE/scvtf.s | 22 +- llvm/test/MC/AArch64/SVE/sdiv.s | 12 +- llvm/test/MC/AArch64/SVE/sdivr.s | 12 +- llvm/test/MC/AArch64/SVE/sdot.s | 16 +- llvm/test/MC/AArch64/SVE/sel.s | 20 +- llvm/test/MC/AArch64/SVE/smax.s | 36 +- llvm/test/MC/AArch64/SVE/smaxv.s | 8 +- llvm/test/MC/AArch64/SVE/smin.s | 36 +- llvm/test/MC/AArch64/SVE/sminv.s | 8 +- llvm/test/MC/AArch64/SVE/smulh.s | 16 +- llvm/test/MC/AArch64/SVE/splice.s | 12 +- llvm/test/MC/AArch64/SVE/sqadd.s | 40 +- llvm/test/MC/AArch64/SVE/sqdecb.s | 78 +-- llvm/test/MC/AArch64/SVE/sqdecd.s | 102 ++-- llvm/test/MC/AArch64/SVE/sqdech.s | 102 ++-- llvm/test/MC/AArch64/SVE/sqdecp.s | 32 +- llvm/test/MC/AArch64/SVE/sqdecw.s | 102 ++-- llvm/test/MC/AArch64/SVE/sqincb.s | 78 +-- llvm/test/MC/AArch64/SVE/sqincd.s | 102 ++-- llvm/test/MC/AArch64/SVE/sqinch.s | 102 
++-- llvm/test/MC/AArch64/SVE/sqincp.s | 32 +- llvm/test/MC/AArch64/SVE/sqincw.s | 102 ++-- llvm/test/MC/AArch64/SVE/sqsub.s | 40 +- llvm/test/MC/AArch64/SVE/st1b.s | 40 +- llvm/test/MC/AArch64/SVE/st1d.s | 10 +- llvm/test/MC/AArch64/SVE/st1h.s | 30 +- llvm/test/MC/AArch64/SVE/st1w.s | 20 +- llvm/test/MC/AArch64/SVE/st2b.s | 10 +- llvm/test/MC/AArch64/SVE/st2d.s | 10 +- llvm/test/MC/AArch64/SVE/st2h.s | 10 +- llvm/test/MC/AArch64/SVE/st2w.s | 10 +- llvm/test/MC/AArch64/SVE/st3b.s | 10 +- llvm/test/MC/AArch64/SVE/st3d.s | 10 +- llvm/test/MC/AArch64/SVE/st3h.s | 10 +- llvm/test/MC/AArch64/SVE/st3w.s | 10 +- llvm/test/MC/AArch64/SVE/st4b.s | 10 +- llvm/test/MC/AArch64/SVE/st4d.s | 10 +- llvm/test/MC/AArch64/SVE/st4h.s | 10 +- llvm/test/MC/AArch64/SVE/st4w.s | 10 +- llvm/test/MC/AArch64/SVE/stnt1b.s | 10 +- llvm/test/MC/AArch64/SVE/stnt1d.s | 10 +- llvm/test/MC/AArch64/SVE/stnt1h.s | 10 +- llvm/test/MC/AArch64/SVE/stnt1w.s | 10 +- llvm/test/MC/AArch64/SVE/str.s | 12 +- llvm/test/MC/AArch64/SVE/sub.s | 104 ++-- llvm/test/MC/AArch64/SVE/subr.s | 48 +- llvm/test/MC/AArch64/SVE/sunpkhi.s | 6 +- llvm/test/MC/AArch64/SVE/sunpklo.s | 6 +- llvm/test/MC/AArch64/SVE/sxtb.s | 20 +- llvm/test/MC/AArch64/SVE/sxth.s | 16 +- llvm/test/MC/AArch64/SVE/sxtw.s | 12 +- llvm/test/MC/AArch64/SVE/tbl.s | 16 +- llvm/test/MC/AArch64/SVE/trn1.s | 16 +- llvm/test/MC/AArch64/SVE/trn2.s | 16 +- llvm/test/MC/AArch64/SVE/uabd.s | 16 +- llvm/test/MC/AArch64/SVE/uaddv.s | 8 +- llvm/test/MC/AArch64/SVE/ucvtf.s | 22 +- llvm/test/MC/AArch64/SVE/udiv.s | 12 +- llvm/test/MC/AArch64/SVE/udivr.s | 12 +- llvm/test/MC/AArch64/SVE/udot.s | 16 +- llvm/test/MC/AArch64/SVE/umax.s | 36 +- llvm/test/MC/AArch64/SVE/umaxv.s | 8 +- llvm/test/MC/AArch64/SVE/umin.s | 36 +- llvm/test/MC/AArch64/SVE/uminv.s | 8 +- llvm/test/MC/AArch64/SVE/umulh.s | 16 +- llvm/test/MC/AArch64/SVE/uqadd.s | 40 +- llvm/test/MC/AArch64/SVE/uqdecb.s | 78 +-- llvm/test/MC/AArch64/SVE/uqdecd.s | 102 ++-- llvm/test/MC/AArch64/SVE/uqdech.s | 102 
++-- llvm/test/MC/AArch64/SVE/uqdecp.s | 32 +- llvm/test/MC/AArch64/SVE/uqdecw.s | 102 ++-- llvm/test/MC/AArch64/SVE/uqincb.s | 78 +-- llvm/test/MC/AArch64/SVE/uqincd.s | 102 ++-- llvm/test/MC/AArch64/SVE/uqinch.s | 102 ++-- llvm/test/MC/AArch64/SVE/uqincp.s | 32 +- llvm/test/MC/AArch64/SVE/uqincw.s | 102 ++-- llvm/test/MC/AArch64/SVE/uqsub.s | 40 +- llvm/test/MC/AArch64/SVE/uunpkhi.s | 6 +- llvm/test/MC/AArch64/SVE/uunpklo.s | 6 +- llvm/test/MC/AArch64/SVE/uxtb.s | 20 +- llvm/test/MC/AArch64/SVE/uxth.s | 16 +- llvm/test/MC/AArch64/SVE/uxtw.s | 12 +- llvm/test/MC/AArch64/SVE/uzp1.s | 16 +- llvm/test/MC/AArch64/SVE/uzp2.s | 16 +- llvm/test/MC/AArch64/SVE/whilele.s | 20 +- llvm/test/MC/AArch64/SVE/whilelo.s | 20 +- llvm/test/MC/AArch64/SVE/whilels.s | 20 +- llvm/test/MC/AArch64/SVE/whilelt.s | 20 +- llvm/test/MC/AArch64/SVE/zip1.s | 32 +- llvm/test/MC/AArch64/SVE/zip2.s | 32 +- llvm/test/MC/AArch64/SVE2/adclb.s | 8 +- llvm/test/MC/AArch64/SVE2/adclt.s | 8 +- llvm/test/MC/AArch64/SVE2/addhnb.s | 6 +- llvm/test/MC/AArch64/SVE2/addhnt.s | 6 +- llvm/test/MC/AArch64/SVE2/addp.s | 16 +- llvm/test/MC/AArch64/SVE2/bcax.s | 12 +- llvm/test/MC/AArch64/SVE2/bsl.s | 6 +- llvm/test/MC/AArch64/SVE2/bsl1n.s | 6 +- llvm/test/MC/AArch64/SVE2/bsl2n.s | 6 +- llvm/test/MC/AArch64/SVE2/cadd.s | 20 +- llvm/test/MC/AArch64/SVE2/cdot.s | 28 +- llvm/test/MC/AArch64/SVE2/cmla.s | 48 +- .../MC/AArch64/SVE2/directive-arch-negative.s | 2 +- .../SVE2/directive-arch_extension-negative.s | 2 +- .../MC/AArch64/SVE2/directive-cpu-negative.s | 2 +- llvm/test/MC/AArch64/SVE2/eor3.s | 12 +- llvm/test/MC/AArch64/SVE2/eorbt.s | 12 +- llvm/test/MC/AArch64/SVE2/eortb.s | 12 +- llvm/test/MC/AArch64/SVE2/ext.s | 4 +- llvm/test/MC/AArch64/SVE2/faddp.s | 14 +- llvm/test/MC/AArch64/SVE2/fcvtlt.s | 4 +- llvm/test/MC/AArch64/SVE2/fcvtnt.s | 4 +- llvm/test/MC/AArch64/SVE2/fcvtx.s | 12 +- llvm/test/MC/AArch64/SVE2/fcvtxnt.s | 4 +- llvm/test/MC/AArch64/SVE2/flogb.s | 14 +- llvm/test/MC/AArch64/SVE2/fmaxnmp.s | 14 +- 
llvm/test/MC/AArch64/SVE2/fmaxp.s | 14 +- llvm/test/MC/AArch64/SVE2/fminnmp.s | 14 +- llvm/test/MC/AArch64/SVE2/fminp.s | 14 +- llvm/test/MC/AArch64/SVE2/fmlalb.s | 14 +- llvm/test/MC/AArch64/SVE2/fmlalt.s | 14 +- llvm/test/MC/AArch64/SVE2/fmlslb.s | 14 +- llvm/test/MC/AArch64/SVE2/fmlslt.s | 14 +- llvm/test/MC/AArch64/SVE2/mla.s | 10 +- llvm/test/MC/AArch64/SVE2/mls.s | 10 +- llvm/test/MC/AArch64/SVE2/mul.s | 14 +- llvm/test/MC/AArch64/SVE2/nbsl.s | 6 +- llvm/test/MC/AArch64/SVE2/pmul.s | 4 +- llvm/test/MC/AArch64/SVE2/pmullb.s | 4 +- llvm/test/MC/AArch64/SVE2/pmullt.s | 4 +- llvm/test/MC/AArch64/SVE2/raddhnb.s | 6 +- llvm/test/MC/AArch64/SVE2/raddhnt.s | 6 +- llvm/test/MC/AArch64/SVE2/rshrnb.s | 12 +- llvm/test/MC/AArch64/SVE2/rshrnt.s | 12 +- llvm/test/MC/AArch64/SVE2/rsubhnb.s | 6 +- llvm/test/MC/AArch64/SVE2/rsubhnt.s | 6 +- llvm/test/MC/AArch64/SVE2/saba.s | 12 +- llvm/test/MC/AArch64/SVE2/sabalb.s | 10 +- llvm/test/MC/AArch64/SVE2/sabalt.s | 10 +- llvm/test/MC/AArch64/SVE2/sabdlb.s | 6 +- llvm/test/MC/AArch64/SVE2/sabdlt.s | 6 +- llvm/test/MC/AArch64/SVE2/sadalp.s | 14 +- llvm/test/MC/AArch64/SVE2/saddlb.s | 6 +- llvm/test/MC/AArch64/SVE2/saddlbt.s | 6 +- llvm/test/MC/AArch64/SVE2/saddlt.s | 6 +- llvm/test/MC/AArch64/SVE2/saddwb.s | 6 +- llvm/test/MC/AArch64/SVE2/saddwt.s | 6 +- llvm/test/MC/AArch64/SVE2/sbclb.s | 8 +- llvm/test/MC/AArch64/SVE2/sbclt.s | 8 +- llvm/test/MC/AArch64/SVE2/shadd.s | 16 +- llvm/test/MC/AArch64/SVE2/shrnb.s | 12 +- llvm/test/MC/AArch64/SVE2/shrnt.s | 12 +- llvm/test/MC/AArch64/SVE2/shsub.s | 16 +- llvm/test/MC/AArch64/SVE2/shsubr.s | 16 +- llvm/test/MC/AArch64/SVE2/sli.s | 16 +- llvm/test/MC/AArch64/SVE2/smaxp.s | 16 +- llvm/test/MC/AArch64/SVE2/sminp.s | 16 +- llvm/test/MC/AArch64/SVE2/smlalb.s | 18 +- llvm/test/MC/AArch64/SVE2/smlalt.s | 18 +- llvm/test/MC/AArch64/SVE2/smlslb.s | 18 +- llvm/test/MC/AArch64/SVE2/smlslt.s | 18 +- llvm/test/MC/AArch64/SVE2/smulh.s | 8 +- llvm/test/MC/AArch64/SVE2/smullb.s | 10 +- 
llvm/test/MC/AArch64/SVE2/smullt.s | 10 +- llvm/test/MC/AArch64/SVE2/splice.s | 8 +- llvm/test/MC/AArch64/SVE2/sqabs.s | 16 +- llvm/test/MC/AArch64/SVE2/sqadd.s | 16 +- llvm/test/MC/AArch64/SVE2/sqcadd.s | 20 +- llvm/test/MC/AArch64/SVE2/sqdmlalb.s | 18 +- llvm/test/MC/AArch64/SVE2/sqdmlalbt.s | 10 +- llvm/test/MC/AArch64/SVE2/sqdmlalt.s | 18 +- llvm/test/MC/AArch64/SVE2/sqdmlslb.s | 18 +- llvm/test/MC/AArch64/SVE2/sqdmlslbt.s | 10 +- llvm/test/MC/AArch64/SVE2/sqdmlslt.s | 18 +- llvm/test/MC/AArch64/SVE2/sqdmulh.s | 14 +- llvm/test/MC/AArch64/SVE2/sqdmullb.s | 10 +- llvm/test/MC/AArch64/SVE2/sqdmullt.s | 10 +- llvm/test/MC/AArch64/SVE2/sqneg.s | 16 +- llvm/test/MC/AArch64/SVE2/sqrdcmlah.s | 48 +- llvm/test/MC/AArch64/SVE2/sqrdmlah.s | 22 +- llvm/test/MC/AArch64/SVE2/sqrdmlsh.s | 22 +- llvm/test/MC/AArch64/SVE2/sqrdmulh.s | 14 +- llvm/test/MC/AArch64/SVE2/sqrshl.s | 16 +- llvm/test/MC/AArch64/SVE2/sqrshlr.s | 16 +- llvm/test/MC/AArch64/SVE2/sqrshrnb.s | 12 +- llvm/test/MC/AArch64/SVE2/sqrshrnt.s | 12 +- llvm/test/MC/AArch64/SVE2/sqrshrunb.s | 12 +- llvm/test/MC/AArch64/SVE2/sqrshrunt.s | 12 +- llvm/test/MC/AArch64/SVE2/sqshl.s | 40 +- llvm/test/MC/AArch64/SVE2/sqshlr.s | 16 +- llvm/test/MC/AArch64/SVE2/sqshlu.s | 24 +- llvm/test/MC/AArch64/SVE2/sqshrnb.s | 12 +- llvm/test/MC/AArch64/SVE2/sqshrnt.s | 12 +- llvm/test/MC/AArch64/SVE2/sqshrunb.s | 12 +- llvm/test/MC/AArch64/SVE2/sqshrunt.s | 12 +- llvm/test/MC/AArch64/SVE2/sqsub.s | 16 +- llvm/test/MC/AArch64/SVE2/sqsubr.s | 16 +- llvm/test/MC/AArch64/SVE2/sqxtnb.s | 6 +- llvm/test/MC/AArch64/SVE2/sqxtnt.s | 6 +- llvm/test/MC/AArch64/SVE2/sqxtunb.s | 6 +- llvm/test/MC/AArch64/SVE2/sqxtunt.s | 6 +- llvm/test/MC/AArch64/SVE2/srhadd.s | 16 +- llvm/test/MC/AArch64/SVE2/sri.s | 16 +- llvm/test/MC/AArch64/SVE2/srshl.s | 16 +- llvm/test/MC/AArch64/SVE2/srshlr.s | 16 +- llvm/test/MC/AArch64/SVE2/srshr.s | 24 +- llvm/test/MC/AArch64/SVE2/srsra.s | 20 +- llvm/test/MC/AArch64/SVE2/sshllb.s | 12 +- 
llvm/test/MC/AArch64/SVE2/sshllt.s | 12 +- llvm/test/MC/AArch64/SVE2/ssra.s | 20 +- llvm/test/MC/AArch64/SVE2/ssublb.s | 6 +- llvm/test/MC/AArch64/SVE2/ssublbt.s | 6 +- llvm/test/MC/AArch64/SVE2/ssublt.s | 6 +- llvm/test/MC/AArch64/SVE2/ssubltb.s | 6 +- llvm/test/MC/AArch64/SVE2/ssubwb.s | 6 +- llvm/test/MC/AArch64/SVE2/ssubwt.s | 6 +- llvm/test/MC/AArch64/SVE2/subhnb.s | 6 +- llvm/test/MC/AArch64/SVE2/subhnt.s | 6 +- llvm/test/MC/AArch64/SVE2/suqadd.s | 16 +- llvm/test/MC/AArch64/SVE2/tbl.s | 8 +- llvm/test/MC/AArch64/SVE2/tbx.s | 8 +- llvm/test/MC/AArch64/SVE2/uaba.s | 12 +- llvm/test/MC/AArch64/SVE2/uabalb.s | 10 +- llvm/test/MC/AArch64/SVE2/uabalt.s | 10 +- llvm/test/MC/AArch64/SVE2/uabdlb.s | 6 +- llvm/test/MC/AArch64/SVE2/uabdlt.s | 6 +- llvm/test/MC/AArch64/SVE2/uadalp.s | 14 +- llvm/test/MC/AArch64/SVE2/uaddlb.s | 6 +- llvm/test/MC/AArch64/SVE2/uaddlt.s | 6 +- llvm/test/MC/AArch64/SVE2/uaddwb.s | 6 +- llvm/test/MC/AArch64/SVE2/uaddwt.s | 6 +- llvm/test/MC/AArch64/SVE2/uhadd.s | 16 +- llvm/test/MC/AArch64/SVE2/uhsub.s | 16 +- llvm/test/MC/AArch64/SVE2/uhsubr.s | 16 +- llvm/test/MC/AArch64/SVE2/umaxp.s | 16 +- llvm/test/MC/AArch64/SVE2/uminp.s | 16 +- llvm/test/MC/AArch64/SVE2/umlalb.s | 18 +- llvm/test/MC/AArch64/SVE2/umlalt.s | 18 +- llvm/test/MC/AArch64/SVE2/umlslb.s | 18 +- llvm/test/MC/AArch64/SVE2/umlslt.s | 18 +- llvm/test/MC/AArch64/SVE2/umulh.s | 8 +- llvm/test/MC/AArch64/SVE2/umullb.s | 10 +- llvm/test/MC/AArch64/SVE2/umullt.s | 10 +- llvm/test/MC/AArch64/SVE2/uqadd.s | 16 +- llvm/test/MC/AArch64/SVE2/uqrshl.s | 16 +- llvm/test/MC/AArch64/SVE2/uqrshlr.s | 16 +- llvm/test/MC/AArch64/SVE2/uqrshrnb.s | 12 +- llvm/test/MC/AArch64/SVE2/uqrshrnt.s | 12 +- llvm/test/MC/AArch64/SVE2/uqshl.s | 40 +- llvm/test/MC/AArch64/SVE2/uqshlr.s | 16 +- llvm/test/MC/AArch64/SVE2/uqshrnb.s | 12 +- llvm/test/MC/AArch64/SVE2/uqshrnt.s | 12 +- llvm/test/MC/AArch64/SVE2/uqsub.s | 16 +- llvm/test/MC/AArch64/SVE2/uqsubr.s | 16 +- llvm/test/MC/AArch64/SVE2/uqxtnb.s | 6 +- 
llvm/test/MC/AArch64/SVE2/uqxtnt.s | 6 +- llvm/test/MC/AArch64/SVE2/urecpe.s | 10 +- llvm/test/MC/AArch64/SVE2/urhadd.s | 16 +- llvm/test/MC/AArch64/SVE2/urshl.s | 16 +- llvm/test/MC/AArch64/SVE2/urshlr.s | 16 +- llvm/test/MC/AArch64/SVE2/urshr.s | 24 +- llvm/test/MC/AArch64/SVE2/ursqrte.s | 10 +- llvm/test/MC/AArch64/SVE2/ursra.s | 20 +- llvm/test/MC/AArch64/SVE2/ushllb.s | 12 +- llvm/test/MC/AArch64/SVE2/ushllt.s | 12 +- llvm/test/MC/AArch64/SVE2/usqadd.s | 16 +- llvm/test/MC/AArch64/SVE2/usra.s | 20 +- llvm/test/MC/AArch64/SVE2/usublb.s | 6 +- llvm/test/MC/AArch64/SVE2/usublt.s | 6 +- llvm/test/MC/AArch64/SVE2/usubwb.s | 6 +- llvm/test/MC/AArch64/SVE2/usubwt.s | 6 +- llvm/test/MC/AArch64/SVE2/whilege.s | 20 +- llvm/test/MC/AArch64/SVE2/whilegt.s | 20 +- llvm/test/MC/AArch64/SVE2/whilehi.s | 20 +- llvm/test/MC/AArch64/SVE2/whilehs.s | 20 +- llvm/test/MC/AArch64/SVE2/whilerw.s | 8 +- llvm/test/MC/AArch64/SVE2/whilewr.s | 8 +- llvm/test/MC/AArch64/SVE2/xar.s | 20 +- 498 files changed, 5791 insertions(+), 5791 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 509fd05806211..f3aff92d4bac5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -143,23 +143,23 @@ def HasSMEF64 : Predicate<"Subtarget->hasSMEF64()">, def HasSMEI64 : Predicate<"Subtarget->hasSMEI64()">, AssemblerPredicate<(all_of FeatureSMEI64), "sme-i64">; def HasStreamingSVE : Predicate<"Subtarget->hasStreamingSVE()">, - AssemblerPredicate<(all_of FeatureStreamingSVE), "streaming-sve">; + AssemblerPredicate<(all_of FeatureStreamingSVE), "sme">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. 
def HasSVEorStreamingSVE : Predicate<"Subtarget->hasSVE() || Subtarget->hasStreamingSVE()">, AssemblerPredicate<(any_of FeatureSVE, FeatureStreamingSVE), - "streaming-sve or sve">; + "sve or sme">; def HasSVE2orStreamingSVE : Predicate<"Subtarget->hasSVE2() || Subtarget->hasStreamingSVE()">, AssemblerPredicate<(any_of FeatureSVE2, FeatureStreamingSVE), - "streaming-sve or sve2">; + "sve2 or sme">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. def HasNEONorStreamingSVE : Predicate<"Subtarget->hasNEON() || Subtarget->hasStreamingSVE()">, AssemblerPredicate<(any_of FeatureNEON, FeatureStreamingSVE), - "streaming-sve or neon">; + "neon or sme">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, AssemblerPredicate<(all_of FeatureRCPC), "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, diff --git a/llvm/test/MC/AArch64/SME/revd.s b/llvm/test/MC/AArch64/SME/revd.s index d34b851d04993..ed45fb08120bc 100644 --- a/llvm/test/MC/AArch64/SME/revd.s +++ b/llvm/test/MC/AArch64/SME/revd.s @@ -42,7 +42,7 @@ revd z31.q, p7/m, z31.q movprfx z21, z25 // CHECK-INST: movprfx z21, z25 // CHECK-ENCODING: [0x35,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 35 bf 20 04 revd z21.q, p5/m, z10.q diff --git a/llvm/test/MC/AArch64/SME/sclamp.s b/llvm/test/MC/AArch64/SME/sclamp.s index 3e232711bca8d..0f1e6a2a4bd9b 100644 --- a/llvm/test/MC/AArch64/SME/sclamp.s +++ b/llvm/test/MC/AArch64/SME/sclamp.s @@ -126,7 +126,7 @@ sclamp z31.d, z31.d, z31.d movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 77 bf 20 04 sclamp z23.b, z13.b, z8.b @@ -138,7 +138,7 @@ sclamp z23.b, z13.b, z8.b movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // 
CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 77 bf 20 04 sclamp z23.h, z13.h, z8.h @@ -150,7 +150,7 @@ sclamp z23.h, z13.h, z8.h movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 77 bf 20 04 sclamp z23.s, z13.s, z8.s @@ -162,7 +162,7 @@ sclamp z23.s, z13.s, z8.s movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 77 bf 20 04 sclamp z23.d, z13.d, z8.d diff --git a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s b/llvm/test/MC/AArch64/SME/streaming-mode-neon.s index 157fcf966e569..88f5cedbb06d4 100644 --- a/llvm/test/MC/AArch64/SME/streaming-mode-neon.s +++ b/llvm/test/MC/AArch64/SME/streaming-mode-neon.s @@ -15,62 +15,62 @@ fmulx s0, s1, s2 // CHECK-INST: fmulx s0, s1, s2 // CHECK-ENCODING: [0x20,0xdc,0x22,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme fmulx d0, d1, d2 // CHECK-INST: fmulx d0, d1, d2 // CHECK-ENCODING: [0x20,0xdc,0x62,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frecps s0, s1, s2 // CHECK-INST: frecps s0, s1, s2 // CHECK-ENCODING: [0x20,0xfc,0x22,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frecps d0, d1, d2 // CHECK-INST: frecps d0, d1, d2 // CHECK-ENCODING: [0x20,0xfc,0x62,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frsqrts s0, s1, s2 // CHECK-INST: frsqrts s0, s1, s2 // CHECK-ENCODING: [0x20,0xfc,0xa2,0x5e] -// 
CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frsqrts d0, d1, d2 // CHECK-INST: frsqrts d0, d1, d2 // CHECK-ENCODING: [0x20,0xfc,0xe2,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frecpe s0, s1 // CHECK-INST: frecpe s0, s1 // CHECK-ENCODING: [0x20,0xd8,0xa1,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frecpe d0, d1 // CHECK-INST: frecpe d0, d1 // CHECK-ENCODING: [0x20,0xd8,0xe1,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frecpx s0, s1 // CHECK-INST: frecpx s0, s1 // CHECK-ENCODING: [0x20,0xf8,0xa1,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frecpx d0, d1 // CHECK-INST: frecpx d0, d1 // CHECK-ENCODING: [0x20,0xf8,0xe1,0x5e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frsqrte s0, s1 // CHECK-INST: frsqrte s0, s1 // CHECK-ENCODING: [0x20,0xd8,0xa1,0x7e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme frsqrte d0, d1 // CHECK-INST: frsqrte d0, d1 // CHECK-ENCODING: [0x20,0xd8,0xe1,0x7e] -// CHECK-ERROR: instruction requires: streaming-sve or neon +// CHECK-ERROR: instruction requires: neon or sme // Vector to GPR integer move instructions diff --git a/llvm/test/MC/AArch64/SME/uclamp.s b/llvm/test/MC/AArch64/SME/uclamp.s index 2db0f8af0158f..94aa31c0f4c54 100644 --- a/llvm/test/MC/AArch64/SME/uclamp.s +++ b/llvm/test/MC/AArch64/SME/uclamp.s @@ -126,7 +126,7 @@ uclamp z31.d, z31.d, z31.d movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 77 bf 20 04 uclamp z23.b, z13.b, z8.b @@ -138,7 +138,7 @@ uclamp z23.b, z13.b, z8.b movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 77 bf 20 04 uclamp z23.h, z13.h, z8.h @@ -150,7 +150,7 @@ uclamp z23.h, z13.h, z8.h movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 77 bf 20 04 uclamp z23.s, z13.s, z8.s @@ -162,7 +162,7 @@ uclamp z23.s, z13.s, z8.s movprfx z23, z27 // CHECK-INST: movprfx z23, z27 // CHECK-ENCODING: [0x77,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 77 bf 20 04 uclamp z23.d, z13.d, z8.d diff --git a/llvm/test/MC/AArch64/SVE/abs.s b/llvm/test/MC/AArch64/SVE/abs.s index 1a7182da55340..58143bc128f0d 100644 --- a/llvm/test/MC/AArch64/SVE/abs.s +++ b/llvm/test/MC/AArch64/SVE/abs.s @@ -12,49 +12,49 @@ abs z0.b, p0/m, z0.b // CHECK-INST: abs z0.b, p0/m, z0.b // CHECK-ENCODING: [0x00,0xa0,0x16,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 16 04 abs z0.h, p0/m, z0.h // CHECK-INST: abs z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x56,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 56 04 abs z0.s, p0/m, z0.s // CHECK-INST: abs z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x96,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 96 04 abs z0.d, p0/m, z0.d // CHECK-INST: abs z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd6,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d6 04 abs z31.b, p7/m, z31.b // CHECK-INST: abs z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x16,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 16 04 abs z31.h, p7/m, z31.h // CHECK-INST: abs z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x56,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 56 04 abs z31.s, p7/m, z31.s // CHECK-INST: abs z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x96,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 96 04 abs z31.d, p7/m, z31.d // CHECK-INST: abs z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d6 04 @@ -64,23 +64,23 @@ abs z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 abs z4.d, p7/m, z31.d // CHECK-INST: abs z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d6 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 abs z4.d, p7/m, z31.d // CHECK-INST: abs z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd6,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d6 04 diff --git a/llvm/test/MC/AArch64/SVE/add.s b/llvm/test/MC/AArch64/SVE/add.s index 7b641735e3d61..1a57e499c1a78 100644 --- a/llvm/test/MC/AArch64/SVE/add.s +++ b/llvm/test/MC/AArch64/SVE/add.s @@ -12,277 +12,277 @@ add z31.s, z31.s, z31.s // CHECK-INST: add z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x03,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 03 bf 04 add z23.d, z13.d, z8.d // CHECK-INST: add z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x01,0xe8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 01 e8 04 add z23.b, p3/m, z23.b, z13.b // CHECK-INST: add z23.b, p3/m, z23.b, z13.b // CHECK-ENCODING: [0xb7,0x0d,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 00 04 add z0.s, z0.s, z0.s // CHECK-INST: add z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x00,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 a0 04 add z31.d, z31.d, z31.d // CHECK-INST: add z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x03,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 03 ff 04 add z21.b, z10.b, z21.b // CHECK-INST: add z21.b, z10.b, z21.b // CHECK-ENCODING: [0x55,0x01,0x35,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 01 35 04 add z31.b, z31.b, z31.b // CHECK-INST: add z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x03,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme 
// CHECK-UNKNOWN: ff 03 3f 04 add z0.h, p0/m, z0.h, z0.h // CHECK-INST: add z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x00,0x40,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 40 04 add z0.h, z0.h, z0.h // CHECK-INST: add z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x00,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 60 04 add z0.b, p0/m, z0.b, z0.b // CHECK-INST: add z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x00,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 00 04 add z0.s, p0/m, z0.s, z0.s // CHECK-INST: add z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x00,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 80 04 add z23.b, z13.b, z8.b // CHECK-INST: add z23.b, z13.b, z8.b // CHECK-ENCODING: [0xb7,0x01,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 01 28 04 add z0.d, z0.d, z0.d // CHECK-INST: add z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x00,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 e0 04 add z0.d, p0/m, z0.d, z0.d // CHECK-INST: add z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x00,0xc0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 c0 04 add z31.h, z31.h, z31.h // CHECK-INST: add z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x03,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 03 7f 04 add 
z0.b, z0.b, z0.b // CHECK-INST: add z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x00,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 20 04 add z21.d, z10.d, z21.d // CHECK-INST: add z21.d, z10.d, z21.d // CHECK-ENCODING: [0x55,0x01,0xf5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 01 f5 04 add z23.h, p3/m, z23.h, z13.h // CHECK-INST: add z23.h, p3/m, z23.h, z13.h // CHECK-ENCODING: [0xb7,0x0d,0x40,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 40 04 add z23.s, p3/m, z23.s, z13.s // CHECK-INST: add z23.s, p3/m, z23.s, z13.s // CHECK-ENCODING: [0xb7,0x0d,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 80 04 add z31.s, p7/m, z31.s, z31.s // CHECK-INST: add z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 80 04 add z21.h, z10.h, z21.h // CHECK-INST: add z21.h, z10.h, z21.h // CHECK-ENCODING: [0x55,0x01,0x75,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 01 75 04 add z23.d, p3/m, z23.d, z13.d // CHECK-INST: add z23.d, p3/m, z23.d, z13.d // CHECK-ENCODING: [0xb7,0x0d,0xc0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d c0 04 add z21.d, p5/m, z21.d, z10.d // CHECK-INST: add z21.d, p5/m, z21.d, z10.d // CHECK-ENCODING: [0x55,0x15,0xc0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 c0 
04 add z21.b, p5/m, z21.b, z10.b // CHECK-INST: add z21.b, p5/m, z21.b, z10.b // CHECK-ENCODING: [0x55,0x15,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 00 04 add z21.s, z10.s, z21.s // CHECK-INST: add z21.s, z10.s, z21.s // CHECK-ENCODING: [0x55,0x01,0xb5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 01 b5 04 add z21.h, p5/m, z21.h, z10.h // CHECK-INST: add z21.h, p5/m, z21.h, z10.h // CHECK-ENCODING: [0x55,0x15,0x40,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 40 04 add z31.h, p7/m, z31.h, z31.h // CHECK-INST: add z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x40,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 40 04 add z23.h, z13.h, z8.h // CHECK-INST: add z23.h, z13.h, z8.h // CHECK-ENCODING: [0xb7,0x01,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 01 68 04 add z31.d, p7/m, z31.d, z31.d // CHECK-INST: add z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xc0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f c0 04 add z21.s, p5/m, z21.s, z10.s // CHECK-INST: add z21.s, p5/m, z21.s, z10.s // CHECK-ENCODING: [0x55,0x15,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 80 04 add z31.b, p7/m, z31.b, z31.b // CHECK-INST: add z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: ff 1f 00 04 add z23.s, z13.s, z8.s // CHECK-INST: add z23.s, z13.s, z8.s // CHECK-ENCODING: [0xb7,0x01,0xa8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 01 a8 04 add z0.b, z0.b, #0 // CHECK-INST: add z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 20 25 add z31.b, z31.b, #255 // CHECK-INST: add z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 20 25 add z0.h, z0.h, #0 // CHECK-INST: add z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0xc0,0x60,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 60 25 add z0.h, z0.h, #0, lsl #8 // CHECK-INST: add z0.h, z0.h, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0x60,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 25 add z31.h, z31.h, #255, lsl #8 // CHECK-INST: add z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x60,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 60 25 add z31.h, z31.h, #65280 // CHECK-INST: add z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x60,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 60 25 add z0.s, z0.s, #0 // CHECK-INST: add z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xa0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a0 25 add z0.s, z0.s, #0, lsl #8 // 
CHECK-INST: add z0.s, z0.s, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xa0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a0 25 add z31.s, z31.s, #255, lsl #8 // CHECK-INST: add z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a0 25 add z31.s, z31.s, #65280 // CHECK-INST: add z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a0 25 add z0.d, z0.d, #0 // CHECK-INST: add z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0xc0,0xe0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e0 25 add z0.d, z0.d, #0, lsl #8 // CHECK-INST: add z0.d, z0.d, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xe0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e0 25 add z31.d, z31.d, #255, lsl #8 // CHECK-INST: add z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e0 25 add z31.d, z31.d, #65280 // CHECK-INST: add z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e0 25 @@ -293,35 +293,35 @@ add z31.d, z31.d, #65280 movprfx z4.b, p7/z, z6.b // CHECK-INST: movprfx z4.b, p7/z, z6.b // CHECK-ENCODING: [0xc4,0x3c,0x10,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c 10 04 add 
z4.b, p7/m, z4.b, z31.b // CHECK-INST: add z4.b, p7/m, z4.b, z31.b // CHECK-ENCODING: [0xe4,0x1f,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f 00 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 add z4.b, p7/m, z4.b, z31.b // CHECK-INST: add z4.b, p7/m, z4.b, z31.b // CHECK-ENCODING: [0xe4,0x1f,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f 00 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 add z31.d, z31.d, #65280 // CHECK-INST: add z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e0 25 diff --git a/llvm/test/MC/AArch64/SVE/addpl.s b/llvm/test/MC/AArch64/SVE/addpl.s index 3cbf92551f1bd..7a11345065990 100644 --- a/llvm/test/MC/AArch64/SVE/addpl.s +++ b/llvm/test/MC/AArch64/SVE/addpl.s @@ -12,23 +12,23 @@ addpl x21, x21, #0 // CHECK-INST: addpl x21, x21, #0 // CHECK-ENCODING: [0x15,0x50,0x75,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 75 04 addpl x23, x8, #-1 // CHECK-INST: addpl x23, x8, #-1 // CHECK-ENCODING: [0xf7,0x57,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f7 57 68 04 addpl sp, sp, #31 // CHECK-INST: addpl sp, sp, #31 // CHECK-ENCODING: [0xff,0x53,0x7f,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 53 7f 04 addpl x0, x0, #-32 // CHECK-INST: addpl x0, x0, #-32 // CHECK-ENCODING: [0x00,0x54,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 54 60 04 diff --git a/llvm/test/MC/AArch64/SVE/addvl.s b/llvm/test/MC/AArch64/SVE/addvl.s index d8c7cbe75db08..8ac14fae10a22 100644 --- a/llvm/test/MC/AArch64/SVE/addvl.s +++ b/llvm/test/MC/AArch64/SVE/addvl.s @@ -12,23 +12,23 @@ addvl x21, x21, #0 // CHECK-INST: addvl x21, x21, #0 // CHECK-ENCODING: [0x15,0x50,0x35,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 35 04 addvl x23, x8, #-1 // CHECK-INST: addvl x23, x8, #-1 // CHECK-ENCODING: [0xf7,0x57,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f7 57 28 04 addvl sp, sp, #31 // CHECK-INST: addvl sp, sp, #31 // CHECK-ENCODING: [0xff,0x53,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 53 3f 04 addvl x0, x0, #-32 // CHECK-INST: addvl x0, x0, #-32 // CHECK-ENCODING: [0x00,0x54,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 54 20 04 diff --git a/llvm/test/MC/AArch64/SVE/and.s b/llvm/test/MC/AArch64/SVE/and.s index 725dc9410dc79..ccdc8ebfb0d7a 100644 --- a/llvm/test/MC/AArch64/SVE/and.s +++ b/llvm/test/MC/AArch64/SVE/and.s @@ -12,103 +12,103 @@ and z5.b, z5.b, #0xf9 // CHECK-INST: and z5.b, z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e 80 05 and z23.h, z23.h, #0xfff9 // 
CHECK-INST: and z23.h, z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d 80 05 and z0.s, z0.s, #0xfffffff9 // CHECK-INST: and z0.s, z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb 80 05 and z0.d, z0.d, #0xfffffffffffffff9 // CHECK-INST: and z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x83,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 83 05 and z5.b, z5.b, #0x6 // CHECK-INST: and z5.b, z5.b, #0x6 // CHECK-ENCODING: [0x25,0x3e,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 3e 80 05 and z23.h, z23.h, #0x6 // CHECK-INST: and z23.h, z23.h, #0x6 // CHECK-ENCODING: [0x37,0x7c,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 37 7c 80 05 and z0.s, z0.s, #0x6 // CHECK-INST: and z0.s, z0.s, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 80 05 and z0.d, z0.d, #0x6 // CHECK-INST: and z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x83,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 83 05 and z0.d, z0.d, z0.d // CHECK-INST: and z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 20 04 and z23.d, z13.d, z8.d // CHECK-INST: and z23.d, z13.d, z8.d // 
CHECK-ENCODING: [0xb7,0x31,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 31 28 04 and z31.b, p7/m, z31.b, z31.b // CHECK-INST: and z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x1a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 1a 04 and z31.h, p7/m, z31.h, z31.h // CHECK-INST: and z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x5a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 5a 04 and z31.s, p7/m, z31.s, z31.s // CHECK-INST: and z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x9a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 9a 04 and z31.d, p7/m, z31.d, z31.d // CHECK-INST: and z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xda,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f da 04 and p0.b, p0/z, p0.b, p1.b // CHECK-INST: and p0.b, p0/z, p0.b, p1.b // CHECK-ENCODING: [0x00,0x40,0x01,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 01 25 and p0.b, p0/z, p0.b, p0.b // CHECK-INST: mov p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x40,0x00,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 00 25 and p15.b, p15/z, p15.b, p15.b // CHECK-INST: mov p15.b, p15/z, p15.b // CHECK-ENCODING: [0xef,0x7d,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 0f 25 @@ -118,19 +118,19 @@ and p15.b, p15/z, p15.b, p15.b 
and z0.s, z0.s, z0.s // CHECK-INST: and z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 20 04 and z0.h, z0.h, z0.h // CHECK-INST: and z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 20 04 and z0.b, z0.b, z0.b // CHECK-INST: and z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 20 04 @@ -140,35 +140,35 @@ and z0.b, z0.b, z0.b movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 and z4.d, p7/m, z4.d, z31.d // CHECK-INST: and z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xda,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f da 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 and z4.d, p7/m, z4.d, z31.d // CHECK-INST: and z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xda,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f da 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 and z0.d, z0.d, #0x6 // CHECK-INST: and 
z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x83,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 83 05 diff --git a/llvm/test/MC/AArch64/SVE/ands.s b/llvm/test/MC/AArch64/SVE/ands.s index de15bacc66293..c5b12f8079814 100644 --- a/llvm/test/MC/AArch64/SVE/ands.s +++ b/llvm/test/MC/AArch64/SVE/ands.s @@ -12,18 +12,18 @@ ands p0.b, p0/z, p0.b, p1.b // CHECK-INST: ands p0.b, p0/z, p0.b, p1.b // CHECK-ENCODING: [0x00,0x40,0x41,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 41 25 ands p0.b, p0/z, p0.b, p0.b // CHECK-INST: movs p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x40,0x40,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 40 25 ands p15.b, p15/z, p15.b, p15.b // CHECK-INST: movs p15.b, p15/z, p15.b // CHECK-ENCODING: [0xef,0x7d,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 4f 25 diff --git a/llvm/test/MC/AArch64/SVE/andv.s b/llvm/test/MC/AArch64/SVE/andv.s index bd892c8668381..ed4f57e500850 100644 --- a/llvm/test/MC/AArch64/SVE/andv.s +++ b/llvm/test/MC/AArch64/SVE/andv.s @@ -12,23 +12,23 @@ andv b0, p7, z31.b // CHECK-INST: andv b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x1a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 1a 04 andv h0, p7, z31.h // CHECK-INST: andv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x5a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 5a 04 andv s0, p7, z31.s // CHECK-INST: andv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x9a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 9a 04 andv d0, p7, z31.d // CHECK-INST: andv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xda,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f da 04 diff --git a/llvm/test/MC/AArch64/SVE/asr.s b/llvm/test/MC/AArch64/SVE/asr.s index 62df01406fe91..3b9f4f354c504 100644 --- a/llvm/test/MC/AArch64/SVE/asr.s +++ b/llvm/test/MC/AArch64/SVE/asr.s @@ -12,157 +12,157 @@ asr z0.b, z0.b, #1 // CHECK-INST: asr z0.b, z0.b, #1 // CHECK-ENCODING: [0x00,0x90,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 90 2f 04 asr z31.b, z31.b, #8 // CHECK-INST: asr z31.b, z31.b, #8 // CHECK-ENCODING: [0xff,0x93,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 93 28 04 asr z0.h, z0.h, #1 // CHECK-INST: asr z0.h, z0.h, #1 // CHECK-ENCODING: [0x00,0x90,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 90 3f 04 asr z31.h, z31.h, #16 // CHECK-INST: asr z31.h, z31.h, #16 // CHECK-ENCODING: [0xff,0x93,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 93 30 04 asr z0.s, z0.s, #1 // CHECK-INST: asr z0.s, z0.s, #1 // CHECK-ENCODING: [0x00,0x90,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 90 7f 04 asr z31.s, z31.s, #32 // CHECK-INST: asr z31.s, z31.s, #32 // CHECK-ENCODING: [0xff,0x93,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 93 60 04 asr z0.d, z0.d, #1 // CHECK-INST: asr z0.d, 
z0.d, #1 // CHECK-ENCODING: [0x00,0x90,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 90 ff 04 asr z31.d, z31.d, #64 // CHECK-INST: asr z31.d, z31.d, #64 // CHECK-ENCODING: [0xff,0x93,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 93 a0 04 asr z0.b, p0/m, z0.b, #1 // CHECK-INST: asr z0.b, p0/m, z0.b, #1 // CHECK-ENCODING: [0xe0,0x81,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 81 00 04 asr z31.b, p0/m, z31.b, #8 // CHECK-INST: asr z31.b, p0/m, z31.b, #8 // CHECK-ENCODING: [0x1f,0x81,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 81 00 04 asr z0.h, p0/m, z0.h, #1 // CHECK-INST: asr z0.h, p0/m, z0.h, #1 // CHECK-ENCODING: [0xe0,0x83,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 00 04 asr z31.h, p0/m, z31.h, #16 // CHECK-INST: asr z31.h, p0/m, z31.h, #16 // CHECK-ENCODING: [0x1f,0x82,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 82 00 04 asr z0.s, p0/m, z0.s, #1 // CHECK-INST: asr z0.s, p0/m, z0.s, #1 // CHECK-ENCODING: [0xe0,0x83,0x40,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 40 04 asr z31.s, p0/m, z31.s, #32 // CHECK-INST: asr z31.s, p0/m, z31.s, #32 // CHECK-ENCODING: [0x1f,0x80,0x40,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 40 04 asr z0.d, p0/m, z0.d, #1 // CHECK-INST: asr z0.d, p0/m, z0.d, #1 // 
CHECK-ENCODING: [0xe0,0x83,0xc0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 c0 04 asr z31.d, p0/m, z31.d, #64 // CHECK-INST: asr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 80 04 asr z0.b, p0/m, z0.b, z0.b // CHECK-INST: asr z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x80,0x10,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 10 04 asr z0.h, p0/m, z0.h, z0.h // CHECK-INST: asr z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x80,0x50,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 50 04 asr z0.s, p0/m, z0.s, z0.s // CHECK-INST: asr z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x80,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 90 04 asr z0.d, p0/m, z0.d, z0.d // CHECK-INST: asr z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x80,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d0 04 asr z0.b, p0/m, z0.b, z1.d // CHECK-INST: asr z0.b, p0/m, z0.b, z1.d // CHECK-ENCODING: [0x20,0x80,0x18,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 18 04 asr z0.h, p0/m, z0.h, z1.d // CHECK-INST: asr z0.h, p0/m, z0.h, z1.d // CHECK-ENCODING: [0x20,0x80,0x58,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 58 04 asr z0.s, p0/m, z0.s, z1.d // CHECK-INST: asr z0.s, p0/m, z0.s, z1.d 
// CHECK-ENCODING: [0x20,0x80,0x98,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 98 04 asr z0.b, z1.b, z2.d // CHECK-INST: asr z0.b, z1.b, z2.d // CHECK-ENCODING: [0x20,0x80,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 22 04 asr z0.h, z1.h, z2.d // CHECK-INST: asr z0.h, z1.h, z2.d // CHECK-ENCODING: [0x20,0x80,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 62 04 asr z0.s, z1.s, z2.d // CHECK-INST: asr z0.s, z1.s, z2.d // CHECK-ENCODING: [0x20,0x80,0xa2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 a2 04 @@ -172,47 +172,47 @@ asr z0.s, z1.s, z2.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 asr z31.d, p0/m, z31.d, #64 // CHECK-INST: asr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 80 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 asr z31.d, p0/m, z31.d, #64 // CHECK-INST: asr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 80 04 movprfx z0.s, p0/z, z7.s // CHECK-INST: movprfx z0.s, p0/z, z7.s // 
CHECK-ENCODING: [0xe0,0x20,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 20 90 04 asr z0.s, p0/m, z0.s, z1.d // CHECK-INST: asr z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x98,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 98 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 asr z0.s, p0/m, z0.s, z1.d // CHECK-INST: asr z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x98,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 98 04 diff --git a/llvm/test/MC/AArch64/SVE/asrd.s b/llvm/test/MC/AArch64/SVE/asrd.s index c025a87acdbfd..e5f10f21eb3db 100644 --- a/llvm/test/MC/AArch64/SVE/asrd.s +++ b/llvm/test/MC/AArch64/SVE/asrd.s @@ -12,49 +12,49 @@ asrd z0.b, p0/m, z0.b, #1 // CHECK-INST: asrd z0.b, p0/m, z0.b, #1 // CHECK-ENCODING: [0xe0,0x81,0x04,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 81 04 04 asrd z31.b, p0/m, z31.b, #8 // CHECK-INST: asrd z31.b, p0/m, z31.b, #8 // CHECK-ENCODING: [0x1f,0x81,0x04,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 81 04 04 asrd z0.h, p0/m, z0.h, #1 // CHECK-INST: asrd z0.h, p0/m, z0.h, #1 // CHECK-ENCODING: [0xe0,0x83,0x04,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 04 04 asrd z31.h, p0/m, z31.h, #16 // CHECK-INST: asrd z31.h, p0/m, z31.h, #16 // CHECK-ENCODING: [0x1f,0x82,0x04,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 82 04 04 asrd z0.s, p0/m, z0.s, #1 // CHECK-INST: asrd z0.s, p0/m, z0.s, #1 // CHECK-ENCODING: [0xe0,0x83,0x44,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 44 04 asrd z31.s, p0/m, z31.s, #32 // CHECK-INST: asrd z31.s, p0/m, z31.s, #32 // CHECK-ENCODING: [0x1f,0x80,0x44,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 44 04 asrd z0.d, p0/m, z0.d, #1 // CHECK-INST: asrd z0.d, p0/m, z0.d, #1 // CHECK-ENCODING: [0xe0,0x83,0xc4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 c4 04 asrd z31.d, p0/m, z31.d, #64 // CHECK-INST: asrd z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x84,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 84 04 @@ -64,23 +64,23 @@ asrd z31.d, p0/m, z31.d, #64 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 asrd z31.d, p0/m, z31.d, #64 // CHECK-INST: asrd z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x84,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 84 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 asrd z31.d, p0/m, z31.d, #64 // CHECK-INST: asrd z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: 
[0x1f,0x80,0x84,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 84 04 diff --git a/llvm/test/MC/AArch64/SVE/asrr.s b/llvm/test/MC/AArch64/SVE/asrr.s index c95f04a9fea13..5a7b6333bfa52 100644 --- a/llvm/test/MC/AArch64/SVE/asrr.s +++ b/llvm/test/MC/AArch64/SVE/asrr.s @@ -12,25 +12,25 @@ asrr z0.b, p0/m, z0.b, z0.b // CHECK-INST: asrr z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x80,0x14,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 14 04 asrr z0.h, p0/m, z0.h, z0.h // CHECK-INST: asrr z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x80,0x54,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 54 04 asrr z0.s, p0/m, z0.s, z0.s // CHECK-INST: asrr z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x80,0x94,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 94 04 asrr z0.d, p0/m, z0.d, z0.d // CHECK-INST: asrr z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x80,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d4 04 @@ -40,23 +40,23 @@ asrr z0.d, p0/m, z0.d, z0.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 asrr z5.d, p0/m, z5.d, z0.d // CHECK-INST: asrr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x80,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 80 d4 04 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: 
[0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 asrr z5.d, p0/m, z5.d, z0.d // CHECK-INST: asrr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x80,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 80 d4 04 diff --git a/llvm/test/MC/AArch64/SVE/bfcvt.s b/llvm/test/MC/AArch64/SVE/bfcvt.s index 96619e1e6cb42..d370d2b7cccda 100644 --- a/llvm/test/MC/AArch64/SVE/bfcvt.s +++ b/llvm/test/MC/AArch64/SVE/bfcvt.s @@ -8,24 +8,24 @@ bfcvt z0.H, p0/m, z1.S // CHECK-INST: bfcvt z0.h, p0/m, z1.s // CHECK-ENCODING: [0x20,0xa0,0x8a,0x65] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0.S, p0/m, z2.S // CHECK-INST: movprfx z0.s, p0/m, z2.s // CHECK-ENCODING: [0x40,0x20,0x91,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfcvt z0.H, p0/m, z1.S // CHECK-INST: bfcvt z0.h, p0/m, z1.s // CHECK-ENCODING: [0x20,0xa0,0x8a,0x65] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z2 // CHECK-INST: movprfx z0, z2 // CHECK-ENCODING: [0x40,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfcvt z0.H, p0/m, z1.S // CHECK-INST: bfcvt z0.h, p0/m, z1.s // CHECK-ENCODING: [0x20,0xa0,0x8a,0x65] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme diff --git a/llvm/test/MC/AArch64/SVE/bfcvtnt.s b/llvm/test/MC/AArch64/SVE/bfcvtnt.s index 2973ffe88c7d3..8751b6f4ed2d8 100644 --- a/llvm/test/MC/AArch64/SVE/bfcvtnt.s +++ b/llvm/test/MC/AArch64/SVE/bfcvtnt.s @@ -8,24 +8,24 @@ bfcvtnt z0.H, p0/m, z1.S // CHECK-INST: bfcvtnt z0.h, p0/m, 
z1.s // CHECK-ENCODING: [0x20,0xa0,0x8a,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0.S, p0/m, z2.S // CHECK-INST: movprfx z0.s, p0/m, z2.s // CHECK-ENCODING: [0x40,0x20,0x91,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfcvtnt z0.H, p0/m, z1.S // CHECK-INST: bfcvtnt z0.h, p0/m, z1.s // CHECK-ENCODING: [0x20,0xa0,0x8a,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z2 // CHECK-INST: movprfx z0, z2 // CHECK-ENCODING: [0x40,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfcvtnt z0.H, p0/m, z1.S // CHECK-INST: bfcvtnt z0.h, p0/m, z1.s // CHECK-ENCODING: [0x20,0xa0,0x8a,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme diff --git a/llvm/test/MC/AArch64/SVE/bfdot.s b/llvm/test/MC/AArch64/SVE/bfdot.s index 08794f21daa9a..c2b48e6a9a48c 100644 --- a/llvm/test/MC/AArch64/SVE/bfdot.s +++ b/llvm/test/MC/AArch64/SVE/bfdot.s @@ -8,17 +8,17 @@ bfdot z0.S, z1.H, z2.H // CHECK-INST: bfdot z0.s, z1.h, z2.h // CHECK-ENCODING: [0x20,0x80,0x62,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfdot z0.S, z1.H, z2.H[0] // CHECK-INST: bfdot z0.s, z1.h, z2.h[0] // CHECK-ENCODING: [0x20,0x40,0x62,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfdot z0.S, z1.H, z2.H[3] // CHECK-INST: bfdot z0.s, z1.h, z2.h[3] // CHECK-ENCODING: [0x20,0x40,0x7a,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme // 
--------------------------------------------------------------------------// // Test compatibility with MOVPRFX instruction. @@ -26,29 +26,29 @@ bfdot z0.S, z1.H, z2.H[3] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfdot z0.S, z1.H, z2.H // CHECK-INST: bfdot z0.s, z1.h, z2.h // CHECK-ENCODING: [0x20,0x80,0x62,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfdot z0.S, z1.H, z2.H[0] // CHECK-INST: bfdot z0.s, z1.h, z2.h[0] // CHECK-ENCODING: [0x20,0x40,0x62,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfdot z0.S, z1.H, z2.H[3] // CHECK-INST: bfdot z0.s, z1.h, z2.h[3] // CHECK-ENCODING: [0x20,0x40,0x7a,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme diff --git a/llvm/test/MC/AArch64/SVE/bfmlal.s b/llvm/test/MC/AArch64/SVE/bfmlal.s index 6bb7b5876d8e3..4395fe8928247 100644 --- a/llvm/test/MC/AArch64/SVE/bfmlal.s +++ b/llvm/test/MC/AArch64/SVE/bfmlal.s @@ -8,52 +8,52 @@ bfmlalb z0.S, z1.H, z2.H // CHECK-INST: bfmlalb z0.s, z1.h, z2.h // CHECK-ENCODING: [0x20,0x80,0xe2,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalt z0.S, z1.H, z2.H // CHECK-INST: bfmlalt z0.s, z1.h, z2.h // CHECK-ENCODING: [0x20,0x84,0xe2,0x64] -// 
CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalb z0.S, z1.H, z2.H[0] // CHECK-INST: bfmlalb z0.s, z1.h, z2.h[0] // CHECK-ENCODING: [0x20,0x40,0xe2,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalt z0.S, z1.H, z2.H[0] // CHECK-INST: bfmlalt z0.s, z1.h, z2.h[0] // CHECK-ENCODING: [0x20,0x44,0xe2,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalb z0.S, z1.H, z2.H[7] // CHECK-INST: bfmlalb z0.s, z1.h, z2.h[7] // CHECK-ENCODING: [0x20,0x48,0xfa,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalt z0.S, z1.H, z2.H[7] // CHECK-INST: bfmlalt z0.s, z1.h, z2.h[7] // CHECK-ENCODING: [0x20,0x4c,0xfa,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalt z0.S, z1.H, z7.H[7] // CHECK-INST: bfmlalt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x4c,0xff,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalb z10.S, z21.H, z14.H // CHECK-INST: bfmlalb z10.s, z21.h, z14.h // CHECK-ENCODING: [0xaa,0x82,0xee,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalt z14.S, z10.H, z21.H // CHECK-INST: bfmlalt z14.s, z10.h, z21.h // CHECK-ENCODING: [0x4e,0x85,0xf5,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme bfmlalb z21.s, z14.h, z3.h[2] // CHECK-INST: bfmlalb z21.s, z14.h, z3.h[2] // CHECK-ENCODING: [0xd5,0x41,0xeb,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme // 
--------------------------------------------------------------------------// // Test compatibility with MOVPRFX instruction. @@ -61,99 +61,99 @@ bfmlalb z21.s, z14.h, z3.h[2] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalb z0.S, z1.H, z2.H // CHECK-INST: bfmlalb z0.s, z1.h, z2.h // CHECK-ENCODING: [0x20,0x80,0xe2,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalt z0.S, z1.H, z2.H // CHECK-INST: bfmlalt z0.s, z1.h, z2.h // CHECK-ENCODING: [0x20,0x84,0xe2,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalb z0.S, z1.H, z2.H[0] // CHECK-INST: bfmlalb z0.s, z1.h, z2.h[0] // CHECK-ENCODING: [0x20,0x40,0xe2,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalt z0.S, z1.H, z2.H[0] // CHECK-INST: bfmlalt z0.s, z1.h, z2.h[0] // CHECK-ENCODING: [0x20,0x44,0xe2,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalb z0.S, z1.H, z2.H[7] // CHECK-INST: bfmlalb z0.s, z1.h, z2.h[7] // CHECK-ENCODING: [0x20,0x48,0xfa,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalt z0.S, z1.H, z2.H[7] // CHECK-INST: bfmlalt z0.s, z1.h, z2.h[7] // CHECK-ENCODING: [0x20,0x4c,0xfa,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalt z0.S, z1.H, z7.H[7] // CHECK-INST: bfmlalt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x4c,0xff,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z10, z7 // CHECK-INST: movprfx z10, z7 // CHECK-ENCODING: [0xea,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalb z10.S, z21.H, z14.H // CHECK-INST: bfmlalb z10.s, z21.h, z14.h // CHECK-ENCODING: [0xaa,0x82,0xee,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme movprfx z14, z7 // CHECK-INST: movprfx z14, z7 // CHECK-ENCODING: [0xee,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalt z14.S, z10.H, z21.H // CHECK-INST: bfmlalt z14.s, z10.h, z21.h // CHECK-ENCODING: [0x4e,0x85,0xf5,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction 
requires: bf16 sve or sme movprfx z21, z7 // CHECK-INST: movprfx z21, z7 // CHECK-ENCODING: [0xf5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmlalb z21.s, z14.h, z3.h[2] // CHECK-INST: bfmlalb z21.s, z14.h, z3.h[2] // CHECK-ENCODING: [0xd5,0x41,0xeb,0x64] -// CHECK-ERROR: instruction requires: bf16 streaming-sve or sve +// CHECK-ERROR: instruction requires: bf16 sve or sme diff --git a/llvm/test/MC/AArch64/SVE/bfmmla.s b/llvm/test/MC/AArch64/SVE/bfmmla.s index 6d4d809341efa..660f3034837a9 100644 --- a/llvm/test/MC/AArch64/SVE/bfmmla.s +++ b/llvm/test/MC/AArch64/SVE/bfmmla.s @@ -14,7 +14,7 @@ bfmmla z0.S, z1.H, z2.H movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme bfmmla z0.S, z1.H, z2.H // CHECK-INST: bfmmla z0.s, z1.h, z2.h diff --git a/llvm/test/MC/AArch64/SVE/bic.s b/llvm/test/MC/AArch64/SVE/bic.s index 992da415df0ea..af7db10944cea 100644 --- a/llvm/test/MC/AArch64/SVE/bic.s +++ b/llvm/test/MC/AArch64/SVE/bic.s @@ -12,97 +12,97 @@ bic z5.b, z5.b, #0xf9 // CHECK-INST: and z5.b, z5.b, #0x6 // CHECK-ENCODING: [0x25,0x3e,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 3e 80 05 bic z23.h, z23.h, #0xfff9 // CHECK-INST: and z23.h, z23.h, #0x6 // CHECK-ENCODING: [0x37,0x7c,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 37 7c 80 05 bic z0.s, z0.s, #0xfffffff9 // CHECK-INST: and z0.s, z0.s, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 80 05 bic z0.d, z0.d, #0xfffffffffffffff9 // CHECK-INST: and z0.d, z0.d, #0x6 // CHECK-ENCODING: 
[0x20,0xf8,0x83,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 83 05 bic z5.b, z5.b, #0x6 // CHECK-INST: and z5.b, z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e 80 05 bic z23.h, z23.h, #0x6 // CHECK-INST: and z23.h, z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d 80 05 bic z0.s, z0.s, #0x6 // CHECK-INST: and z0.s, z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0x80,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb 80 05 bic z0.d, z0.d, #0x6 // CHECK-INST: and z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x83,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 83 05 bic z0.d, z0.d, z0.d // CHECK-INST: bic z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 e0 04 bic z23.d, z13.d, z8.d // CHECK-INST: bic z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x31,0xe8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 31 e8 04 bic z31.b, p7/m, z31.b, z31.b // CHECK-INST: bic z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x1b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 1b 04 bic z31.h, p7/m, z31.h, z31.h // CHECK-INST: bic z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x5b,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 5b 04 bic z31.s, p7/m, z31.s, z31.s // CHECK-INST: bic z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x9b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 9b 04 bic z31.d, p7/m, z31.d, z31.d // CHECK-INST: bic z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xdb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f db 04 bic p15.b, p15/z, p15.b, p15.b // CHECK-INST: bic p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xff,0x7d,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7d 0f 25 bic p0.b, p0/z, p0.b, p0.b // CHECK-INST: bic p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: [0x10,0x40,0x00,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 00 25 @@ -112,19 +112,19 @@ bic p0.b, p0/z, p0.b, p0.b bic z0.s, z0.s, z0.s // CHECK-INST: bic z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 e0 04 bic z0.h, z0.h, z0.h // CHECK-INST: bic z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 e0 04 bic z0.b, z0.b, z0.b // CHECK-INST: bic z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 e0 04 @@ -134,35 +134,35 @@ bic z0.b, z0.b, z0.b movprfx z4.d, p7/z, z6.d // 
CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 bic z4.d, p7/m, z4.d, z31.d // CHECK-INST: bic z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xdb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f db 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 bic z4.d, p7/m, z4.d, z31.d // CHECK-INST: bic z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xdb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f db 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 bic z0.d, z0.d, #0x6 // CHECK-INST: and z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x83,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 83 05 diff --git a/llvm/test/MC/AArch64/SVE/bics.s b/llvm/test/MC/AArch64/SVE/bics.s index eb83d3b0c9ba0..6605480cc473c 100644 --- a/llvm/test/MC/AArch64/SVE/bics.s +++ b/llvm/test/MC/AArch64/SVE/bics.s @@ -12,11 +12,11 @@ bics p0.b, p0/z, p0.b, p0.b // CHECK-INST: bics p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: [0x10,0x40,0x40,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 40 25 bics p15.b, p15/z, p15.b, p15.b // CHECK-INST: bics p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: 
[0xff,0x7d,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7d 4f 25 diff --git a/llvm/test/MC/AArch64/SVE/brka.s b/llvm/test/MC/AArch64/SVE/brka.s index debe686e3c366..753299a860c44 100644 --- a/llvm/test/MC/AArch64/SVE/brka.s +++ b/llvm/test/MC/AArch64/SVE/brka.s @@ -12,11 +12,11 @@ brka p0.b, p15/m, p15.b // CHECK-INST: brka p0.b, p15/m, p15.b // CHECK-ENCODING: [0xf0,0x7d,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f0 7d 10 25 brka p0.b, p15/z, p15.b // CHECK-INST: brka p0.b, p15/z, p15.b // CHECK-ENCODING: [0xe0,0x7d,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 7d 10 25 diff --git a/llvm/test/MC/AArch64/SVE/brkas.s b/llvm/test/MC/AArch64/SVE/brkas.s index 3d37a4e113a92..7de38c524c458 100644 --- a/llvm/test/MC/AArch64/SVE/brkas.s +++ b/llvm/test/MC/AArch64/SVE/brkas.s @@ -12,5 +12,5 @@ brkas p0.b, p15/z, p15.b // CHECK-INST: brkas p0.b, p15/z, p15.b // CHECK-ENCODING: [0xe0,0x7d,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 7d 50 25 diff --git a/llvm/test/MC/AArch64/SVE/brkb.s b/llvm/test/MC/AArch64/SVE/brkb.s index ae59b6a8f3b11..c28727dc609c0 100644 --- a/llvm/test/MC/AArch64/SVE/brkb.s +++ b/llvm/test/MC/AArch64/SVE/brkb.s @@ -12,11 +12,11 @@ brkb p0.b, p15/m, p15.b // CHECK-INST: brkb p0.b, p15/m, p15.b // CHECK-ENCODING: [0xf0,0x7d,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f0 7d 90 25 brkb p0.b, p15/z, p15.b // CHECK-INST: brkb p0.b, p15/z, p15.b // CHECK-ENCODING: [0xe0,0x7d,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: e0 7d 90 25 diff --git a/llvm/test/MC/AArch64/SVE/brkbs.s b/llvm/test/MC/AArch64/SVE/brkbs.s index 2865ab5127b73..d72c596076517 100644 --- a/llvm/test/MC/AArch64/SVE/brkbs.s +++ b/llvm/test/MC/AArch64/SVE/brkbs.s @@ -12,5 +12,5 @@ brkbs p0.b, p15/z, p15.b // CHECK-INST: brkbs p0.b, p15/z, p15.b // CHECK-ENCODING: [0xe0,0x7d,0xd0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 7d d0 25 diff --git a/llvm/test/MC/AArch64/SVE/brkn.s b/llvm/test/MC/AArch64/SVE/brkn.s index 43f189d6c93e0..8f8095fc61619 100644 --- a/llvm/test/MC/AArch64/SVE/brkn.s +++ b/llvm/test/MC/AArch64/SVE/brkn.s @@ -12,11 +12,11 @@ brkn p0.b, p15/z, p1.b, p0.b // CHECK-INST: brkn p0.b, p15/z, p1.b, p0.b // CHECK-ENCODING: [0x20,0x7c,0x18,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c 18 25 brkn p15.b, p15/z, p15.b, p15.b // CHECK-INST: brkn p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xef,0x7d,0x18,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 18 25 diff --git a/llvm/test/MC/AArch64/SVE/brkns.s b/llvm/test/MC/AArch64/SVE/brkns.s index 9aaad2e820ef0..c380e1c2ba39d 100644 --- a/llvm/test/MC/AArch64/SVE/brkns.s +++ b/llvm/test/MC/AArch64/SVE/brkns.s @@ -12,11 +12,11 @@ brkns p0.b, p15/z, p1.b, p0.b // CHECK-INST: brkns p0.b, p15/z, p1.b, p0.b // CHECK-ENCODING: [0x20,0x7c,0x58,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c 58 25 brkns p15.b, p15/z, p15.b, p15.b // CHECK-INST: brkns p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xef,0x7d,0x58,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 58 25 diff --git 
a/llvm/test/MC/AArch64/SVE/brkpa.s b/llvm/test/MC/AArch64/SVE/brkpa.s index b05fc96a41d44..e384b805e0f4c 100644 --- a/llvm/test/MC/AArch64/SVE/brkpa.s +++ b/llvm/test/MC/AArch64/SVE/brkpa.s @@ -12,11 +12,11 @@ brkpa p0.b, p15/z, p1.b, p2.b // CHECK-INST: brkpa p0.b, p15/z, p1.b, p2.b // CHECK-ENCODING: [0x20,0xfc,0x02,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc 02 25 brkpa p15.b, p15/z, p15.b, p15.b // CHECK-INST: brkpa p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xef,0xfd,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef fd 0f 25 diff --git a/llvm/test/MC/AArch64/SVE/brkpas.s b/llvm/test/MC/AArch64/SVE/brkpas.s index 6f816e19d5345..5cd7863fe36d0 100644 --- a/llvm/test/MC/AArch64/SVE/brkpas.s +++ b/llvm/test/MC/AArch64/SVE/brkpas.s @@ -12,11 +12,11 @@ brkpas p0.b, p15/z, p1.b, p2.b // CHECK-INST: brkpas p0.b, p15/z, p1.b, p2.b // CHECK-ENCODING: [0x20,0xfc,0x42,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc 42 25 brkpas p15.b, p15/z, p15.b, p15.b // CHECK-INST: brkpas p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xef,0xfd,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef fd 4f 25 diff --git a/llvm/test/MC/AArch64/SVE/brkpb.s b/llvm/test/MC/AArch64/SVE/brkpb.s index 9db00f8862bfe..5b0d3ec5acce5 100644 --- a/llvm/test/MC/AArch64/SVE/brkpb.s +++ b/llvm/test/MC/AArch64/SVE/brkpb.s @@ -12,11 +12,11 @@ brkpb p0.b, p15/z, p1.b, p2.b // CHECK-INST: brkpb p0.b, p15/z, p1.b, p2.b // CHECK-ENCODING: [0x30,0xfc,0x02,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 fc 02 25 brkpb p15.b, p15/z, p15.b, p15.b // CHECK-INST: 
brkpb p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xff,0xfd,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff fd 0f 25 diff --git a/llvm/test/MC/AArch64/SVE/brkpbs.s b/llvm/test/MC/AArch64/SVE/brkpbs.s index fd6318a964dba..f7c9d408709fc 100644 --- a/llvm/test/MC/AArch64/SVE/brkpbs.s +++ b/llvm/test/MC/AArch64/SVE/brkpbs.s @@ -12,11 +12,11 @@ brkpbs p0.b, p15/z, p1.b, p2.b // CHECK-INST: brkpbs p0.b, p15/z, p1.b, p2.b // CHECK-ENCODING: [0x30,0xfc,0x42,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 fc 42 25 brkpbs p15.b, p15/z, p15.b, p15.b // CHECK-INST: brkpbs p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xff,0xfd,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff fd 4f 25 diff --git a/llvm/test/MC/AArch64/SVE/clasta.s b/llvm/test/MC/AArch64/SVE/clasta.s index 7620f7b2187fa..bed54441dc0c3 100644 --- a/llvm/test/MC/AArch64/SVE/clasta.s +++ b/llvm/test/MC/AArch64/SVE/clasta.s @@ -12,73 +12,73 @@ clasta w0, p7, w0, z31.b // CHECK-INST: clasta w0, p7, w0, z31.b // CHECK-ENCODING: [0xe0,0xbf,0x30,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 30 05 clasta w0, p7, w0, z31.h // CHECK-INST: clasta w0, p7, w0, z31.h // CHECK-ENCODING: [0xe0,0xbf,0x70,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 70 05 clasta w0, p7, w0, z31.s // CHECK-INST: clasta w0, p7, w0, z31.s // CHECK-ENCODING: [0xe0,0xbf,0xb0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf b0 05 clasta x0, p7, x0, z31.d // CHECK-INST: clasta x0, p7, x0, z31.d // 
CHECK-ENCODING: [0xe0,0xbf,0xf0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf f0 05 clasta b0, p7, b0, z31.b // CHECK-INST: clasta b0, p7, b0, z31.b // CHECK-ENCODING: [0xe0,0x9f,0x2a,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 2a 05 clasta h0, p7, h0, z31.h // CHECK-INST: clasta h0, p7, h0, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x6a,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 6a 05 clasta s0, p7, s0, z31.s // CHECK-INST: clasta s0, p7, s0, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xaa,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f aa 05 clasta d0, p7, d0, z31.d // CHECK-INST: clasta d0, p7, d0, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xea,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f ea 05 clasta z0.b, p7, z0.b, z31.b // CHECK-INST: clasta z0.b, p7, z0.b, z31.b // CHECK-ENCODING: [0xe0,0x9f,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 28 05 clasta z0.h, p7, z0.h, z31.h // CHECK-INST: clasta z0.h, p7, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x68,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 68 05 clasta z0.s, p7, z0.s, z31.s // CHECK-INST: clasta z0.s, p7, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xa8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f a8 05 clasta z0.d, p7, z0.d, z31.d // CHECK-INST: clasta z0.d, p7, z0.d, z31.d // 
CHECK-ENCODING: [0xe0,0x9f,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e8 05 @@ -88,11 +88,11 @@ clasta z0.d, p7, z0.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 clasta z0.d, p7, z0.d, z31.d // CHECK-INST: clasta z0.d, p7, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e8 05 diff --git a/llvm/test/MC/AArch64/SVE/clastb.s b/llvm/test/MC/AArch64/SVE/clastb.s index 5426ef04d0afc..c564d1d100a4d 100644 --- a/llvm/test/MC/AArch64/SVE/clastb.s +++ b/llvm/test/MC/AArch64/SVE/clastb.s @@ -12,73 +12,73 @@ clastb w0, p7, w0, z31.b // CHECK-INST: clastb w0, p7, w0, z31.b // CHECK-ENCODING: [0xe0,0xbf,0x31,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 31 05 clastb w0, p7, w0, z31.h // CHECK-INST: clastb w0, p7, w0, z31.h // CHECK-ENCODING: [0xe0,0xbf,0x71,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 71 05 clastb w0, p7, w0, z31.s // CHECK-INST: clastb w0, p7, w0, z31.s // CHECK-ENCODING: [0xe0,0xbf,0xb1,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf b1 05 clastb x0, p7, x0, z31.d // CHECK-INST: clastb x0, p7, x0, z31.d // CHECK-ENCODING: [0xe0,0xbf,0xf1,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf f1 05 clastb b0, p7, b0, z31.b // CHECK-INST: clastb b0, p7, b0, z31.b // CHECK-ENCODING: 
[0xe0,0x9f,0x2b,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 2b 05 clastb h0, p7, h0, z31.h // CHECK-INST: clastb h0, p7, h0, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x6b,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 6b 05 clastb s0, p7, s0, z31.s // CHECK-INST: clastb s0, p7, s0, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xab,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f ab 05 clastb d0, p7, d0, z31.d // CHECK-INST: clastb d0, p7, d0, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xeb,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f eb 05 clastb z0.b, p7, z0.b, z31.b // CHECK-INST: clastb z0.b, p7, z0.b, z31.b // CHECK-ENCODING: [0xe0,0x9f,0x29,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 29 05 clastb z0.h, p7, z0.h, z31.h // CHECK-INST: clastb z0.h, p7, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x69,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 69 05 clastb z0.s, p7, z0.s, z31.s // CHECK-INST: clastb z0.s, p7, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xa9,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f a9 05 clastb z0.d, p7, z0.d, z31.d // CHECK-INST: clastb z0.d, p7, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe9,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e9 05 @@ -88,11 +88,11 @@ clastb z0.d, p7, z0.d, z31.d movprfx z0, z7 // CHECK-INST: 
movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 clastb z0.d, p7, z0.d, z31.d // CHECK-INST: clastb z0.d, p7, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe9,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e9 05 diff --git a/llvm/test/MC/AArch64/SVE/cls.s b/llvm/test/MC/AArch64/SVE/cls.s index 02e035eab89c4..127fb2a867848 100644 --- a/llvm/test/MC/AArch64/SVE/cls.s +++ b/llvm/test/MC/AArch64/SVE/cls.s @@ -12,25 +12,25 @@ cls z31.b, p7/m, z31.b // CHECK-INST: cls z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x18,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 18 04 cls z31.h, p7/m, z31.h // CHECK-INST: cls z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x58,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 58 04 cls z31.s, p7/m, z31.s // CHECK-INST: cls z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x98,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 98 04 cls z31.d, p7/m, z31.d // CHECK-INST: cls z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d8 04 @@ -40,23 +40,23 @@ cls z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 cls z4.d, p7/m, z31.d // CHECK-INST: cls z4.d, p7/m, z31.d // CHECK-ENCODING: 
[0xe4,0xbf,0xd8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d8 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 cls z4.d, p7/m, z31.d // CHECK-INST: cls z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d8 04 diff --git a/llvm/test/MC/AArch64/SVE/clz.s b/llvm/test/MC/AArch64/SVE/clz.s index 73563b7a26e71..07d1f9f4b3304 100644 --- a/llvm/test/MC/AArch64/SVE/clz.s +++ b/llvm/test/MC/AArch64/SVE/clz.s @@ -12,25 +12,25 @@ clz z31.b, p7/m, z31.b // CHECK-INST: clz z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x19,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 19 04 clz z31.h, p7/m, z31.h // CHECK-INST: clz z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x59,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 59 04 clz z31.s, p7/m, z31.s // CHECK-INST: clz z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x99,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 99 04 clz z31.d, p7/m, z31.d // CHECK-INST: clz z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d9 04 @@ -40,23 +40,23 @@ clz z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 clz z4.d, p7/m, z31.d // CHECK-INST: clz z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d9 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 clz z4.d, p7/m, z31.d // CHECK-INST: clz z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d9 04 diff --git a/llvm/test/MC/AArch64/SVE/cmpeq.s b/llvm/test/MC/AArch64/SVE/cmpeq.s index 3c2ac7ef7bd10..d5c79f8248a56 100644 --- a/llvm/test/MC/AArch64/SVE/cmpeq.s +++ b/llvm/test/MC/AArch64/SVE/cmpeq.s @@ -13,89 +13,89 @@ cmpeq p0.b, p0/z, z0.b, z0.b // CHECK-INST: cmpeq p0.b, p0/z, z0.b, z0.b // CHECK-ENCODING: [0x00,0xa0,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 00 24 cmpeq p0.h, p0/z, z0.h, z0.h // CHECK-INST: cmpeq p0.h, p0/z, z0.h, z0.h // CHECK-ENCODING: [0x00,0xa0,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 40 24 cmpeq p0.s, p0/z, z0.s, z0.s // CHECK-INST: cmpeq p0.s, p0/z, z0.s, z0.s // CHECK-ENCODING: [0x00,0xa0,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 80 24 cmpeq p0.d, p0/z, z0.d, z0.d // CHECK-INST: cmpeq p0.d, p0/z, z0.d, z0.d // CHECK-ENCODING: [0x00,0xa0,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 00 a0 c0 24 cmpeq p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmpeq p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x00,0x20,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 00 24 cmpeq p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmpeq p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x00,0x20,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 40 24 cmpeq p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmpeq p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x00,0x20,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 80 24 cmpeq p0.b, p0/z, z0.b, #-16 // CHECK-INST: cmpeq p0.b, p0/z, z0.b, #-16 // CHECK-ENCODING: [0x00,0x80,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 10 25 cmpeq p0.h, p0/z, z0.h, #-16 // CHECK-INST: cmpeq p0.h, p0/z, z0.h, #-16 // CHECK-ENCODING: [0x00,0x80,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 50 25 cmpeq p0.s, p0/z, z0.s, #-16 // CHECK-INST: cmpeq p0.s, p0/z, z0.s, #-16 // CHECK-ENCODING: [0x00,0x80,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 90 25 cmpeq p0.d, p0/z, z0.d, #-16 // CHECK-INST: cmpeq p0.d, p0/z, z0.d, #-16 // CHECK-ENCODING: [0x00,0x80,0xd0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d0 25 cmpeq p0.b, p0/z, z0.b, #15 // CHECK-INST: cmpeq p0.b, p0/z, z0.b, #15 // CHECK-ENCODING: [0x00,0x80,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 0f 25 cmpeq p0.h, p0/z, z0.h, #15 // CHECK-INST: cmpeq p0.h, p0/z, z0.h, #15 // CHECK-ENCODING: [0x00,0x80,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 4f 25 cmpeq p0.s, p0/z, z0.s, #15 // CHECK-INST: cmpeq p0.s, p0/z, z0.s, #15 // CHECK-ENCODING: [0x00,0x80,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 8f 25 cmpeq p0.d, p0/z, z0.d, #15 // CHECK-INST: cmpeq p0.d, p0/z, z0.d, #15 // CHECK-ENCODING: [0x00,0x80,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 cf 25 diff --git a/llvm/test/MC/AArch64/SVE/cmpge.s b/llvm/test/MC/AArch64/SVE/cmpge.s index 1953087c28da9..14fcfc40f2ac0 100644 --- a/llvm/test/MC/AArch64/SVE/cmpge.s +++ b/llvm/test/MC/AArch64/SVE/cmpge.s @@ -12,89 +12,89 @@ cmpge p0.b, p0/z, z0.b, z0.b // CHECK-INST: cmpge p0.b, p0/z, z0.b, z0.b // CHECK-ENCODING: [0x00,0x80,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 00 24 cmpge p0.h, p0/z, z0.h, z0.h // CHECK-INST: cmpge p0.h, p0/z, z0.h, z0.h // CHECK-ENCODING: [0x00,0x80,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 40 24 cmpge p0.s, p0/z, z0.s, z0.s // CHECK-INST: cmpge p0.s, p0/z, z0.s, z0.s // CHECK-ENCODING: [0x00,0x80,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 80 24 cmpge p0.d, p0/z, z0.d, z0.d // CHECK-INST: cmpge p0.d, p0/z, z0.d, z0.d // CHECK-ENCODING: [0x00,0x80,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 c0 24 cmpge p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmpge p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x00,0x40,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 00 24 cmpge p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmpge p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x00,0x40,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 40 24 cmpge p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmpge p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x00,0x40,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 80 24 cmpge p0.b, p0/z, z0.b, #-16 // CHECK-INST: cmpge p0.b, p0/z, z0.b, #-16 // CHECK-ENCODING: [0x00,0x00,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 10 25 cmpge p0.h, p0/z, z0.h, #-16 // CHECK-INST: cmpge p0.h, p0/z, z0.h, #-16 // CHECK-ENCODING: [0x00,0x00,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 50 25 cmpge p0.s, p0/z, z0.s, #-16 // CHECK-INST: cmpge p0.s, p0/z, z0.s, #-16 // CHECK-ENCODING: [0x00,0x00,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 90 25 cmpge p0.d, p0/z, z0.d, #-16 // CHECK-INST: cmpge p0.d, p0/z, z0.d, #-16 // CHECK-ENCODING: [0x00,0x00,0xd0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 d0 25 cmpge p0.b, p0/z, z0.b, #15 // CHECK-INST: cmpge p0.b, p0/z, z0.b, #15 // CHECK-ENCODING: [0x00,0x00,0x0f,0x25] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 0f 25 cmpge p0.h, p0/z, z0.h, #15 // CHECK-INST: cmpge p0.h, p0/z, z0.h, #15 // CHECK-ENCODING: [0x00,0x00,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 4f 25 cmpge p0.s, p0/z, z0.s, #15 // CHECK-INST: cmpge p0.s, p0/z, z0.s, #15 // CHECK-ENCODING: [0x00,0x00,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 8f 25 cmpge p0.d, p0/z, z0.d, #15 // CHECK-INST: cmpge p0.d, p0/z, z0.d, #15 // CHECK-ENCODING: [0x00,0x00,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 cf 25 diff --git a/llvm/test/MC/AArch64/SVE/cmpgt.s b/llvm/test/MC/AArch64/SVE/cmpgt.s index 7b60eeacc23ff..34366e2110e3c 100644 --- a/llvm/test/MC/AArch64/SVE/cmpgt.s +++ b/llvm/test/MC/AArch64/SVE/cmpgt.s @@ -13,89 +13,89 @@ cmpgt p0.b, p0/z, z0.b, z0.b // CHECK-INST: cmpgt p0.b, p0/z, z0.b, z0.b // CHECK-ENCODING: [0x10,0x80,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 00 24 cmpgt p0.h, p0/z, z0.h, z0.h // CHECK-INST: cmpgt p0.h, p0/z, z0.h, z0.h // CHECK-ENCODING: [0x10,0x80,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 40 24 cmpgt p0.s, p0/z, z0.s, z0.s // CHECK-INST: cmpgt p0.s, p0/z, z0.s, z0.s // CHECK-ENCODING: [0x10,0x80,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 80 24 cmpgt p0.d, p0/z, z0.d, z0.d // CHECK-INST: cmpgt p0.d, p0/z, z0.d, z0.d // CHECK-ENCODING: [0x10,0x80,0xc0,0x24] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 c0 24 cmpgt p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmpgt p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x10,0x40,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 00 24 cmpgt p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmpgt p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x10,0x40,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 40 24 cmpgt p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmpgt p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x10,0x40,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 80 24 cmpgt p0.b, p0/z, z0.b, #-16 // CHECK-INST: cmpgt p0.b, p0/z, z0.b, #-16 // CHECK-ENCODING: [0x10,0x00,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 10 25 cmpgt p0.h, p0/z, z0.h, #-16 // CHECK-INST: cmpgt p0.h, p0/z, z0.h, #-16 // CHECK-ENCODING: [0x10,0x00,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 50 25 cmpgt p0.s, p0/z, z0.s, #-16 // CHECK-INST: cmpgt p0.s, p0/z, z0.s, #-16 // CHECK-ENCODING: [0x10,0x00,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 90 25 cmpgt p0.d, p0/z, z0.d, #-16 // CHECK-INST: cmpgt p0.d, p0/z, z0.d, #-16 // CHECK-ENCODING: [0x10,0x00,0xd0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 d0 25 cmpgt p0.b, p0/z, z0.b, #15 // CHECK-INST: cmpgt p0.b, p0/z, z0.b, #15 // CHECK-ENCODING: 
[0x10,0x00,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 0f 25 cmpgt p0.h, p0/z, z0.h, #15 // CHECK-INST: cmpgt p0.h, p0/z, z0.h, #15 // CHECK-ENCODING: [0x10,0x00,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 4f 25 cmpgt p0.s, p0/z, z0.s, #15 // CHECK-INST: cmpgt p0.s, p0/z, z0.s, #15 // CHECK-ENCODING: [0x10,0x00,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 8f 25 cmpgt p0.d, p0/z, z0.d, #15 // CHECK-INST: cmpgt p0.d, p0/z, z0.d, #15 // CHECK-ENCODING: [0x10,0x00,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 cf 25 diff --git a/llvm/test/MC/AArch64/SVE/cmphi.s b/llvm/test/MC/AArch64/SVE/cmphi.s index e4c9237a8e333..e2682d3b89ab3 100644 --- a/llvm/test/MC/AArch64/SVE/cmphi.s +++ b/llvm/test/MC/AArch64/SVE/cmphi.s @@ -13,89 +13,89 @@ cmphi p0.b, p0/z, z0.b, z0.b // CHECK-INST: cmphi p0.b, p0/z, z0.b, z0.b // CHECK-ENCODING: [0x10,0x00,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 00 24 cmphi p0.h, p0/z, z0.h, z0.h // CHECK-INST: cmphi p0.h, p0/z, z0.h, z0.h // CHECK-ENCODING: [0x10,0x00,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 40 24 cmphi p0.s, p0/z, z0.s, z0.s // CHECK-INST: cmphi p0.s, p0/z, z0.s, z0.s // CHECK-ENCODING: [0x10,0x00,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 80 24 cmphi p0.d, p0/z, z0.d, z0.d // CHECK-INST: cmphi p0.d, p0/z, z0.d, z0.d // CHECK-ENCODING: 
[0x10,0x00,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 c0 24 cmphi p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmphi p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x10,0xc0,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 00 24 cmphi p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmphi p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x10,0xc0,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 40 24 cmphi p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmphi p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x10,0xc0,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 80 24 cmphi p0.b, p0/z, z0.b, #0 // CHECK-INST: cmphi p0.b, p0/z, z0.b, #0 // CHECK-ENCODING: [0x10,0x00,0x20,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 20 24 cmphi p0.h, p0/z, z0.h, #0 // CHECK-INST: cmphi p0.h, p0/z, z0.h, #0 // CHECK-ENCODING: [0x10,0x00,0x60,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 60 24 cmphi p0.s, p0/z, z0.s, #0 // CHECK-INST: cmphi p0.s, p0/z, z0.s, #0 // CHECK-ENCODING: [0x10,0x00,0xa0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 a0 24 cmphi p0.d, p0/z, z0.d, #0 // CHECK-INST: cmphi p0.d, p0/z, z0.d, #0 // CHECK-ENCODING: [0x10,0x00,0xe0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 00 e0 24 cmphi p0.b, p0/z, z0.b, #127 // CHECK-INST: cmphi p0.b, p0/z, z0.b, #127 // 
CHECK-ENCODING: [0x10,0xc0,0x3f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 3f 24 cmphi p0.h, p0/z, z0.h, #127 // CHECK-INST: cmphi p0.h, p0/z, z0.h, #127 // CHECK-ENCODING: [0x10,0xc0,0x7f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 7f 24 cmphi p0.s, p0/z, z0.s, #127 // CHECK-INST: cmphi p0.s, p0/z, z0.s, #127 // CHECK-ENCODING: [0x10,0xc0,0xbf,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 bf 24 cmphi p0.d, p0/z, z0.d, #127 // CHECK-INST: cmphi p0.d, p0/z, z0.d, #127 // CHECK-ENCODING: [0x10,0xc0,0xff,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 ff 24 diff --git a/llvm/test/MC/AArch64/SVE/cmphs.s b/llvm/test/MC/AArch64/SVE/cmphs.s index b80a864b0a12e..bbe3ec323c820 100644 --- a/llvm/test/MC/AArch64/SVE/cmphs.s +++ b/llvm/test/MC/AArch64/SVE/cmphs.s @@ -13,89 +13,89 @@ cmphs p0.b, p0/z, z0.b, z0.b // CHECK-INST: cmphs p0.b, p0/z, z0.b, z0.b // CHECK-ENCODING: [0x00,0x00,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 00 24 cmphs p0.h, p0/z, z0.h, z0.h // CHECK-INST: cmphs p0.h, p0/z, z0.h, z0.h // CHECK-ENCODING: [0x00,0x00,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 40 24 cmphs p0.s, p0/z, z0.s, z0.s // CHECK-INST: cmphs p0.s, p0/z, z0.s, z0.s // CHECK-ENCODING: [0x00,0x00,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 80 24 cmphs p0.d, p0/z, z0.d, z0.d // CHECK-INST: cmphs p0.d, p0/z, z0.d, z0.d // 
CHECK-ENCODING: [0x00,0x00,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 c0 24 cmphs p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmphs p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x00,0xc0,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 00 24 cmphs p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmphs p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x00,0xc0,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 40 24 cmphs p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmphs p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x00,0xc0,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 80 24 cmphs p0.b, p0/z, z0.b, #0 // CHECK-INST: cmphs p0.b, p0/z, z0.b, #0 // CHECK-ENCODING: [0x00,0x00,0x20,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 20 24 cmphs p0.h, p0/z, z0.h, #0 // CHECK-INST: cmphs p0.h, p0/z, z0.h, #0 // CHECK-ENCODING: [0x00,0x00,0x60,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 60 24 cmphs p0.s, p0/z, z0.s, #0 // CHECK-INST: cmphs p0.s, p0/z, z0.s, #0 // CHECK-ENCODING: [0x00,0x00,0xa0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 a0 24 cmphs p0.d, p0/z, z0.d, #0 // CHECK-INST: cmphs p0.d, p0/z, z0.d, #0 // CHECK-ENCODING: [0x00,0x00,0xe0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 e0 24 cmphs p0.b, p0/z, z0.b, #127 // CHECK-INST: cmphs p0.b, 
p0/z, z0.b, #127 // CHECK-ENCODING: [0x00,0xc0,0x3f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 3f 24 cmphs p0.h, p0/z, z0.h, #127 // CHECK-INST: cmphs p0.h, p0/z, z0.h, #127 // CHECK-ENCODING: [0x00,0xc0,0x7f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 7f 24 cmphs p0.s, p0/z, z0.s, #127 // CHECK-INST: cmphs p0.s, p0/z, z0.s, #127 // CHECK-ENCODING: [0x00,0xc0,0xbf,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 bf 24 cmphs p0.d, p0/z, z0.d, #127 // CHECK-INST: cmphs p0.d, p0/z, z0.d, #127 // CHECK-ENCODING: [0x00,0xc0,0xff,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 ff 24 diff --git a/llvm/test/MC/AArch64/SVE/cmple.s b/llvm/test/MC/AArch64/SVE/cmple.s index ecb8a9ac4fd55..8de87478f04b2 100644 --- a/llvm/test/MC/AArch64/SVE/cmple.s +++ b/llvm/test/MC/AArch64/SVE/cmple.s @@ -12,89 +12,89 @@ cmple p0.b, p0/z, z0.b, z1.b // CHECK-INST: cmpge p0.b, p0/z, z1.b, z0.b // CHECK-ENCODING: [0x20,0x80,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 00 24 cmple p0.h, p0/z, z0.h, z1.h // CHECK-INST: cmpge p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x20,0x80,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 40 24 cmple p0.s, p0/z, z0.s, z1.s // CHECK-INST: cmpge p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x20,0x80,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 80 24 cmple p0.d, p0/z, z0.d, z1.d // CHECK-INST: cmpge p0.d, 
p0/z, z1.d, z0.d // CHECK-ENCODING: [0x20,0x80,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 c0 24 cmple p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmple p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x10,0x60,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 60 00 24 cmple p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmple p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x10,0x60,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 60 40 24 cmple p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmple p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x10,0x60,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 60 80 24 cmple p0.b, p0/z, z0.b, #-16 // CHECK-INST: cmple p0.b, p0/z, z0.b, #-16 // CHECK-ENCODING: [0x10,0x20,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 10 25 cmple p0.h, p0/z, z0.h, #-16 // CHECK-INST: cmple p0.h, p0/z, z0.h, #-16 // CHECK-ENCODING: [0x10,0x20,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 50 25 cmple p0.s, p0/z, z0.s, #-16 // CHECK-INST: cmple p0.s, p0/z, z0.s, #-16 // CHECK-ENCODING: [0x10,0x20,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 90 25 cmple p0.d, p0/z, z0.d, #-16 // CHECK-INST: cmple p0.d, p0/z, z0.d, #-16 // CHECK-ENCODING: [0x10,0x20,0xd0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 d0 25 cmple p0.b, p0/z, z0.b, 
#15 // CHECK-INST: cmple p0.b, p0/z, z0.b, #15 // CHECK-ENCODING: [0x10,0x20,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 0f 25 cmple p0.h, p0/z, z0.h, #15 // CHECK-INST: cmple p0.h, p0/z, z0.h, #15 // CHECK-ENCODING: [0x10,0x20,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 4f 25 cmple p0.s, p0/z, z0.s, #15 // CHECK-INST: cmple p0.s, p0/z, z0.s, #15 // CHECK-ENCODING: [0x10,0x20,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 8f 25 cmple p0.d, p0/z, z0.d, #15 // CHECK-INST: cmple p0.d, p0/z, z0.d, #15 // CHECK-ENCODING: [0x10,0x20,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 cf 25 diff --git a/llvm/test/MC/AArch64/SVE/cmplo.s b/llvm/test/MC/AArch64/SVE/cmplo.s index 139c86fd40999..30a6120c02144 100644 --- a/llvm/test/MC/AArch64/SVE/cmplo.s +++ b/llvm/test/MC/AArch64/SVE/cmplo.s @@ -12,89 +12,89 @@ cmplo p0.b, p0/z, z0.b, z1.b // CHECK-INST: cmphi p0.b, p0/z, z1.b, z0.b // CHECK-ENCODING: [0x30,0x00,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 00 00 24 cmplo p0.h, p0/z, z0.h, z1.h // CHECK-INST: cmphi p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x30,0x00,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 00 40 24 cmplo p0.s, p0/z, z0.s, z1.s // CHECK-INST: cmphi p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x30,0x00,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 00 80 24 cmplo p0.d, p0/z, z0.d, z1.d // 
CHECK-INST: cmphi p0.d, p0/z, z1.d, z0.d // CHECK-ENCODING: [0x30,0x00,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 00 c0 24 cmplo p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmplo p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x00,0xe0,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 00 24 cmplo p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmplo p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x00,0xe0,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 24 cmplo p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmplo p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x00,0xe0,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 80 24 cmplo p0.b, p0/z, z0.b, #0 // CHECK-INST: cmplo p0.b, p0/z, z0.b, #0 // CHECK-ENCODING: [0x00,0x20,0x20,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 20 24 cmplo p0.h, p0/z, z0.h, #0 // CHECK-INST: cmplo p0.h, p0/z, z0.h, #0 // CHECK-ENCODING: [0x00,0x20,0x60,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 60 24 cmplo p0.s, p0/z, z0.s, #0 // CHECK-INST: cmplo p0.s, p0/z, z0.s, #0 // CHECK-ENCODING: [0x00,0x20,0xa0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 a0 24 cmplo p0.d, p0/z, z0.d, #0 // CHECK-INST: cmplo p0.d, p0/z, z0.d, #0 // CHECK-ENCODING: [0x00,0x20,0xe0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 e0 24 cmplo p0.b, 
p0/z, z0.b, #127 // CHECK-INST: cmplo p0.b, p0/z, z0.b, #127 // CHECK-ENCODING: [0x00,0xe0,0x3f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 3f 24 cmplo p0.h, p0/z, z0.h, #127 // CHECK-INST: cmplo p0.h, p0/z, z0.h, #127 // CHECK-ENCODING: [0x00,0xe0,0x7f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 7f 24 cmplo p0.s, p0/z, z0.s, #127 // CHECK-INST: cmplo p0.s, p0/z, z0.s, #127 // CHECK-ENCODING: [0x00,0xe0,0xbf,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 bf 24 cmplo p0.d, p0/z, z0.d, #127 // CHECK-INST: cmplo p0.d, p0/z, z0.d, #127 // CHECK-ENCODING: [0x00,0xe0,0xff,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 ff 24 diff --git a/llvm/test/MC/AArch64/SVE/cmpls.s b/llvm/test/MC/AArch64/SVE/cmpls.s index c7aea83f9b373..195a4099d2d26 100644 --- a/llvm/test/MC/AArch64/SVE/cmpls.s +++ b/llvm/test/MC/AArch64/SVE/cmpls.s @@ -12,89 +12,89 @@ cmpls p0.b, p0/z, z0.b, z1.b // CHECK-INST: cmphs p0.b, p0/z, z1.b, z0.b // CHECK-ENCODING: [0x20,0x00,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 00 24 cmpls p0.h, p0/z, z0.h, z1.h // CHECK-INST: cmphs p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x20,0x00,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 40 24 cmpls p0.s, p0/z, z0.s, z1.s // CHECK-INST: cmphs p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x20,0x00,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 80 24 cmpls p0.d, 
p0/z, z0.d, z1.d // CHECK-INST: cmphs p0.d, p0/z, z1.d, z0.d // CHECK-ENCODING: [0x20,0x00,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 c0 24 cmpls p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmpls p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x10,0xe0,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 00 24 cmpls p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmpls p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x10,0xe0,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 40 24 cmpls p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmpls p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x10,0xe0,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 80 24 cmpls p0.b, p0/z, z0.b, #0 // CHECK-INST: cmpls p0.b, p0/z, z0.b, #0 // CHECK-ENCODING: [0x10,0x20,0x20,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 20 24 cmpls p0.h, p0/z, z0.h, #0 // CHECK-INST: cmpls p0.h, p0/z, z0.h, #0 // CHECK-ENCODING: [0x10,0x20,0x60,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 60 24 cmpls p0.s, p0/z, z0.s, #0 // CHECK-INST: cmpls p0.s, p0/z, z0.s, #0 // CHECK-ENCODING: [0x10,0x20,0xa0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 a0 24 cmpls p0.d, p0/z, z0.d, #0 // CHECK-INST: cmpls p0.d, p0/z, z0.d, #0 // CHECK-ENCODING: [0x10,0x20,0xe0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 
e0 24 cmpls p0.b, p0/z, z0.b, #127 // CHECK-INST: cmpls p0.b, p0/z, z0.b, #127 // CHECK-ENCODING: [0x10,0xe0,0x3f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 3f 24 cmpls p0.h, p0/z, z0.h, #127 // CHECK-INST: cmpls p0.h, p0/z, z0.h, #127 // CHECK-ENCODING: [0x10,0xe0,0x7f,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 7f 24 cmpls p0.s, p0/z, z0.s, #127 // CHECK-INST: cmpls p0.s, p0/z, z0.s, #127 // CHECK-ENCODING: [0x10,0xe0,0xbf,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 bf 24 cmpls p0.d, p0/z, z0.d, #127 // CHECK-INST: cmpls p0.d, p0/z, z0.d, #127 // CHECK-ENCODING: [0x10,0xe0,0xff,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 ff 24 diff --git a/llvm/test/MC/AArch64/SVE/cmplt.s b/llvm/test/MC/AArch64/SVE/cmplt.s index 77353120b199e..93af97cf23f62 100644 --- a/llvm/test/MC/AArch64/SVE/cmplt.s +++ b/llvm/test/MC/AArch64/SVE/cmplt.s @@ -12,89 +12,89 @@ cmplt p0.b, p0/z, z0.b, z1.b // CHECK-INST: cmpgt p0.b, p0/z, z1.b, z0.b // CHECK-ENCODING: [0x30,0x80,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 80 00 24 cmplt p0.h, p0/z, z0.h, z1.h // CHECK-INST: cmpgt p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x30,0x80,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 80 40 24 cmplt p0.s, p0/z, z0.s, z1.s // CHECK-INST: cmpgt p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x30,0x80,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 80 
80 24 cmplt p0.d, p0/z, z0.d, z1.d // CHECK-INST: cmpgt p0.d, p0/z, z1.d, z0.d // CHECK-ENCODING: [0x30,0x80,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 80 c0 24 cmplt p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmplt p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x00,0x60,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 00 24 cmplt p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmplt p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x00,0x60,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 40 24 cmplt p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmplt p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x00,0x60,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 80 24 cmplt p0.b, p0/z, z0.b, #-16 // CHECK-INST: cmplt p0.b, p0/z, z0.b, #-16 // CHECK-ENCODING: [0x00,0x20,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 10 25 cmplt p0.h, p0/z, z0.h, #-16 // CHECK-INST: cmplt p0.h, p0/z, z0.h, #-16 // CHECK-ENCODING: [0x00,0x20,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 50 25 cmplt p0.s, p0/z, z0.s, #-16 // CHECK-INST: cmplt p0.s, p0/z, z0.s, #-16 // CHECK-ENCODING: [0x00,0x20,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 90 25 cmplt p0.d, p0/z, z0.d, #-16 // CHECK-INST: cmplt p0.d, p0/z, z0.d, #-16 // CHECK-ENCODING: [0x00,0x20,0xd0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: 
sve or sme // CHECK-UNKNOWN: 00 20 d0 25 cmplt p0.b, p0/z, z0.b, #15 // CHECK-INST: cmplt p0.b, p0/z, z0.b, #15 // CHECK-ENCODING: [0x00,0x20,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 0f 25 cmplt p0.h, p0/z, z0.h, #15 // CHECK-INST: cmplt p0.h, p0/z, z0.h, #15 // CHECK-ENCODING: [0x00,0x20,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 4f 25 cmplt p0.s, p0/z, z0.s, #15 // CHECK-INST: cmplt p0.s, p0/z, z0.s, #15 // CHECK-ENCODING: [0x00,0x20,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 8f 25 cmplt p0.d, p0/z, z0.d, #15 // CHECK-INST: cmplt p0.d, p0/z, z0.d, #15 // CHECK-ENCODING: [0x00,0x20,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 cf 25 diff --git a/llvm/test/MC/AArch64/SVE/cmpne.s b/llvm/test/MC/AArch64/SVE/cmpne.s index 59a844a27dc29..ff7982e41df71 100644 --- a/llvm/test/MC/AArch64/SVE/cmpne.s +++ b/llvm/test/MC/AArch64/SVE/cmpne.s @@ -13,89 +13,89 @@ cmpne p0.b, p0/z, z0.b, z0.b // CHECK-INST: cmpne p0.b, p0/z, z0.b, z0.b // CHECK-ENCODING: [0x10,0xa0,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 a0 00 24 cmpne p0.h, p0/z, z0.h, z0.h // CHECK-INST: cmpne p0.h, p0/z, z0.h, z0.h // CHECK-ENCODING: [0x10,0xa0,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 a0 40 24 cmpne p0.s, p0/z, z0.s, z0.s // CHECK-INST: cmpne p0.s, p0/z, z0.s, z0.s // CHECK-ENCODING: [0x10,0xa0,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: 10 a0 80 24 cmpne p0.d, p0/z, z0.d, z0.d // CHECK-INST: cmpne p0.d, p0/z, z0.d, z0.d // CHECK-ENCODING: [0x10,0xa0,0xc0,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 a0 c0 24 cmpne p0.b, p0/z, z0.b, z0.d // CHECK-INST: cmpne p0.b, p0/z, z0.b, z0.d // CHECK-ENCODING: [0x10,0x20,0x00,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 00 24 cmpne p0.h, p0/z, z0.h, z0.d // CHECK-INST: cmpne p0.h, p0/z, z0.h, z0.d // CHECK-ENCODING: [0x10,0x20,0x40,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 40 24 cmpne p0.s, p0/z, z0.s, z0.d // CHECK-INST: cmpne p0.s, p0/z, z0.s, z0.d // CHECK-ENCODING: [0x10,0x20,0x80,0x24] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 80 24 cmpne p0.b, p0/z, z0.b, #-16 // CHECK-INST: cmpne p0.b, p0/z, z0.b, #-16 // CHECK-ENCODING: [0x10,0x80,0x10,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 10 25 cmpne p0.h, p0/z, z0.h, #-16 // CHECK-INST: cmpne p0.h, p0/z, z0.h, #-16 // CHECK-ENCODING: [0x10,0x80,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 50 25 cmpne p0.s, p0/z, z0.s, #-16 // CHECK-INST: cmpne p0.s, p0/z, z0.s, #-16 // CHECK-ENCODING: [0x10,0x80,0x90,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 90 25 cmpne p0.d, p0/z, z0.d, #-16 // CHECK-INST: cmpne p0.d, p0/z, z0.d, #-16 // CHECK-ENCODING: [0x10,0x80,0xd0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 d0 25 cmpne p0.b, p0/z, z0.b, #15 // CHECK-INST: cmpne p0.b, p0/z, z0.b, #15 // CHECK-ENCODING: [0x10,0x80,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 0f 25 cmpne p0.h, p0/z, z0.h, #15 // CHECK-INST: cmpne p0.h, p0/z, z0.h, #15 // CHECK-ENCODING: [0x10,0x80,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 4f 25 cmpne p0.s, p0/z, z0.s, #15 // CHECK-INST: cmpne p0.s, p0/z, z0.s, #15 // CHECK-ENCODING: [0x10,0x80,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 8f 25 cmpne p0.d, p0/z, z0.d, #15 // CHECK-INST: cmpne p0.d, p0/z, z0.d, #15 // CHECK-ENCODING: [0x10,0x80,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 80 cf 25 diff --git a/llvm/test/MC/AArch64/SVE/cnot.s b/llvm/test/MC/AArch64/SVE/cnot.s index fbc1f12b30964..29d1c4021be1b 100644 --- a/llvm/test/MC/AArch64/SVE/cnot.s +++ b/llvm/test/MC/AArch64/SVE/cnot.s @@ -12,25 +12,25 @@ cnot z31.b, p7/m, z31.b // CHECK-INST: cnot z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x1b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 1b 04 cnot z31.h, p7/m, z31.h // CHECK-INST: cnot z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x5b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 5b 04 cnot z31.s, p7/m, z31.s // CHECK-INST: cnot z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x9b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: ff bf 9b 04 cnot z31.d, p7/m, z31.d // CHECK-INST: cnot z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xdb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf db 04 @@ -40,23 +40,23 @@ cnot z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 cnot z4.d, p7/m, z31.d // CHECK-INST: cnot z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xdb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf db 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 cnot z4.d, p7/m, z31.d // CHECK-INST: cnot z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xdb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf db 04 diff --git a/llvm/test/MC/AArch64/SVE/cnt.s b/llvm/test/MC/AArch64/SVE/cnt.s index f3b8b4b9b9bd3..c63436a1547e2 100644 --- a/llvm/test/MC/AArch64/SVE/cnt.s +++ b/llvm/test/MC/AArch64/SVE/cnt.s @@ -12,25 +12,25 @@ cnt z31.b, p7/m, z31.b // CHECK-INST: cnt z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x1a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 1a 04 cnt z31.h, p7/m, z31.h // CHECK-INST: cnt z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x5a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 5a 04 cnt z31.s, p7/m, z31.s 
// CHECK-INST: cnt z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x9a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 9a 04 cnt z31.d, p7/m, z31.d // CHECK-INST: cnt z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xda,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf da 04 @@ -40,23 +40,23 @@ cnt z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 cnt z4.d, p7/m, z31.d // CHECK-INST: cnt z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xda,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf da 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 cnt z4.d, p7/m, z31.d // CHECK-INST: cnt z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xda,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf da 04 diff --git a/llvm/test/MC/AArch64/SVE/cntb.s b/llvm/test/MC/AArch64/SVE/cntb.s index ea3b47618c048..f1575f09099f6 100644 --- a/llvm/test/MC/AArch64/SVE/cntb.s +++ b/llvm/test/MC/AArch64/SVE/cntb.s @@ -12,35 +12,35 @@ cntb x0 // CHECK-INST: cntb x0 // CHECK-ENCODING: [0xe0,0xe3,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 20 04 cntb x0, all // CHECK-INST: cntb x0 // CHECK-ENCODING: [0xe0,0xe3,0x20,0x04] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 20 04 cntb x0, all, mul #1 // CHECK-INST: cntb x0 // CHECK-ENCODING: [0xe0,0xe3,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 20 04 cntb x0, all, mul #16 // CHECK-INST: cntb x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 2f 04 cntb x0, pow2 // CHECK-INST: cntb x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 20 04 cntb x0, #28 // CHECK-INST: cntb x0, #28 // CHECK-ENCODING: [0x80,0xe3,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 20 04 diff --git a/llvm/test/MC/AArch64/SVE/cntd.s b/llvm/test/MC/AArch64/SVE/cntd.s index 2e6004a98b33e..f06544ba9ffa6 100644 --- a/llvm/test/MC/AArch64/SVE/cntd.s +++ b/llvm/test/MC/AArch64/SVE/cntd.s @@ -12,35 +12,35 @@ cntd x0 // CHECK-INST: cntd x0 // CHECK-ENCODING: [0xe0,0xe3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 e0 04 cntd x0, all // CHECK-INST: cntd x0 // CHECK-ENCODING: [0xe0,0xe3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 e0 04 cntd x0, all, mul #1 // CHECK-INST: cntd x0 // CHECK-ENCODING: [0xe0,0xe3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 e0 04 cntd x0, all, mul #16 // CHECK-INST: cntd x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0xef,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 ef 04 cntd x0, pow2 // CHECK-INST: cntd x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e0 04 cntd x0, #28 // CHECK-INST: cntd x0, #28 // CHECK-ENCODING: [0x80,0xe3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 e0 04 diff --git a/llvm/test/MC/AArch64/SVE/cnth.s b/llvm/test/MC/AArch64/SVE/cnth.s index a6e058b340773..e2ed0e237720f 100644 --- a/llvm/test/MC/AArch64/SVE/cnth.s +++ b/llvm/test/MC/AArch64/SVE/cnth.s @@ -12,35 +12,35 @@ cnth x0 // CHECK-INST: cnth x0 // CHECK-ENCODING: [0xe0,0xe3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 60 04 cnth x0, all // CHECK-INST: cnth x0 // CHECK-ENCODING: [0xe0,0xe3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 60 04 cnth x0, all, mul #1 // CHECK-INST: cnth x0 // CHECK-ENCODING: [0xe0,0xe3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 60 04 cnth x0, all, mul #16 // CHECK-INST: cnth x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 6f 04 cnth x0, pow2 // CHECK-INST: cnth x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 04 cnth x0, #28 // CHECK-INST: cnth x0, #28 // CHECK-ENCODING: [0x80,0xe3,0x60,0x04] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 60 04 diff --git a/llvm/test/MC/AArch64/SVE/cntp.s b/llvm/test/MC/AArch64/SVE/cntp.s index ac472838fc3f1..07abd09c3128c 100644 --- a/llvm/test/MC/AArch64/SVE/cntp.s +++ b/llvm/test/MC/AArch64/SVE/cntp.s @@ -12,23 +12,23 @@ cntp x0, p15, p0.b // CHECK-INST: cntp x0, p15, p0.b // CHECK-ENCODING: [0x00,0xbc,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 bc 20 25 cntp x0, p15, p0.h // CHECK-INST: cntp x0, p15, p0.h // CHECK-ENCODING: [0x00,0xbc,0x60,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 bc 60 25 cntp x0, p15, p0.s // CHECK-INST: cntp x0, p15, p0.s // CHECK-ENCODING: [0x00,0xbc,0xa0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 bc a0 25 cntp x0, p15, p0.d // CHECK-INST: cntp x0, p15, p0.d // CHECK-ENCODING: [0x00,0xbc,0xe0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 bc e0 25 diff --git a/llvm/test/MC/AArch64/SVE/cntw.s b/llvm/test/MC/AArch64/SVE/cntw.s index 851b2e1643da9..6a3d84e84b89e 100644 --- a/llvm/test/MC/AArch64/SVE/cntw.s +++ b/llvm/test/MC/AArch64/SVE/cntw.s @@ -12,35 +12,35 @@ cntw x0 // CHECK-INST: cntw x0 // CHECK-ENCODING: [0xe0,0xe3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 a0 04 cntw x0, all // CHECK-INST: cntw x0 // CHECK-ENCODING: [0xe0,0xe3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 a0 04 cntw x0, all, mul #1 // CHECK-INST: cntw x0 // 
CHECK-ENCODING: [0xe0,0xe3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 a0 04 cntw x0, all, mul #16 // CHECK-INST: cntw x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 af 04 cntw x0, pow2 // CHECK-INST: cntw x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a0 04 cntw x0, #28 // CHECK-INST: cntw x0, #28 // CHECK-ENCODING: [0x80,0xe3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 a0 04 diff --git a/llvm/test/MC/AArch64/SVE/cpy.s b/llvm/test/MC/AArch64/SVE/cpy.s index 441e9466df964..a60ece8e188a3 100644 --- a/llvm/test/MC/AArch64/SVE/cpy.s +++ b/llvm/test/MC/AArch64/SVE/cpy.s @@ -12,223 +12,223 @@ cpy z0.b, p0/m, w0 // CHECK-INST: mov z0.b, p0/m, w0 // CHECK-ENCODING: [0x00,0xa0,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 28 05 cpy z0.h, p0/m, w0 // CHECK-INST: mov z0.h, p0/m, w0 // CHECK-ENCODING: [0x00,0xa0,0x68,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 68 05 cpy z0.s, p0/m, w0 // CHECK-INST: mov z0.s, p0/m, w0 // CHECK-ENCODING: [0x00,0xa0,0xa8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 a8 05 cpy z0.d, p0/m, x0 // CHECK-INST: mov z0.d, p0/m, x0 // CHECK-ENCODING: [0x00,0xa0,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 a0 e8 05 cpy z31.b, p7/m, wsp // CHECK-INST: mov z31.b, p7/m, wsp // CHECK-ENCODING: [0xff,0xbf,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 28 05 cpy z31.h, p7/m, wsp // CHECK-INST: mov z31.h, p7/m, wsp // CHECK-ENCODING: [0xff,0xbf,0x68,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 68 05 cpy z31.s, p7/m, wsp // CHECK-INST: mov z31.s, p7/m, wsp // CHECK-ENCODING: [0xff,0xbf,0xa8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf a8 05 cpy z31.d, p7/m, sp // CHECK-INST: mov z31.d, p7/m, sp // CHECK-ENCODING: [0xff,0xbf,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf e8 05 cpy z0.b, p0/m, b0 // CHECK-INST: mov z0.b, p0/m, b0 // CHECK-ENCODING: [0x00,0x80,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 20 05 cpy z31.b, p7/m, b31 // CHECK-INST: mov z31.b, p7/m, b31 // CHECK-ENCODING: [0xff,0x9f,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 20 05 cpy z0.h, p0/m, h0 // CHECK-INST: mov z0.h, p0/m, h0 // CHECK-ENCODING: [0x00,0x80,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 60 05 cpy z31.h, p7/m, h31 // CHECK-INST: mov z31.h, p7/m, h31 // CHECK-ENCODING: [0xff,0x9f,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 60 05 cpy z0.s, p0/m, s0 // CHECK-INST: mov z0.s, p0/m, s0 // CHECK-ENCODING: 
[0x00,0x80,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 a0 05 cpy z31.s, p7/m, s31 // CHECK-INST: mov z31.s, p7/m, s31 // CHECK-ENCODING: [0xff,0x9f,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f a0 05 cpy z0.d, p0/m, d0 // CHECK-INST: mov z0.d, p0/m, d0 // CHECK-ENCODING: [0x00,0x80,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e0 05 cpy z31.d, p7/m, d31 // CHECK-INST: mov z31.d, p7/m, d31 // CHECK-ENCODING: [0xff,0x9f,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f e0 05 cpy z5.b, p0/z, #-128 // CHECK-INST: mov z5.b, p0/z, #-128 // CHECK-ENCODING: [0x05,0x10,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 10 10 05 cpy z5.b, p0/z, #127 // CHECK-INST: mov z5.b, p0/z, #127 // CHECK-ENCODING: [0xe5,0x0f,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 0f 10 05 cpy z5.b, p0/z, #255 // CHECK-INST: mov z5.b, p0/z, #-1 // CHECK-ENCODING: [0xe5,0x1f,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 1f 10 05 cpy z21.h, p0/z, #-128 // CHECK-INST: mov z21.h, p0/z, #-128 // CHECK-ENCODING: [0x15,0x10,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 10 50 05 cpy z21.h, p0/z, #-128, lsl #8 // CHECK-INST: mov z21.h, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 50 05 cpy z21.h, p0/z, #-32768 // CHECK-INST: mov z21.h, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 50 05 cpy z21.h, p0/z, #127 // CHECK-INST: mov z21.h, p0/z, #127 // CHECK-ENCODING: [0xf5,0x0f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 0f 50 05 cpy z21.h, p0/z, #127, lsl #8 // CHECK-INST: mov z21.h, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 50 05 cpy z21.h, p0/z, #32512 // CHECK-INST: mov z21.h, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 50 05 cpy z21.s, p0/z, #-128 // CHECK-INST: mov z21.s, p0/z, #-128 // CHECK-ENCODING: [0x15,0x10,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 10 90 05 cpy z21.s, p0/z, #-128, lsl #8 // CHECK-INST: mov z21.s, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 90 05 cpy z21.s, p0/z, #-32768 // CHECK-INST: mov z21.s, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 90 05 cpy z21.s, p0/z, #127 // CHECK-INST: mov z21.s, p0/z, #127 // CHECK-ENCODING: [0xf5,0x0f,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: f5 0f 90 05 cpy z21.s, p0/z, #127, lsl #8 // CHECK-INST: mov z21.s, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 90 05 cpy z21.s, p0/z, #32512 // CHECK-INST: mov z21.s, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 90 05 cpy z21.d, p0/z, #-128 // CHECK-INST: mov z21.d, p0/z, #-128 // CHECK-ENCODING: [0x15,0x10,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 10 d0 05 cpy z21.d, p0/z, #-128, lsl #8 // CHECK-INST: mov z21.d, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 d0 05 cpy z21.d, p0/z, #-32768 // CHECK-INST: mov z21.d, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 d0 05 cpy z21.d, p0/z, #127 // CHECK-INST: mov z21.d, p0/z, #127 // CHECK-ENCODING: [0xf5,0x0f,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 0f d0 05 cpy z21.d, p0/z, #127, lsl #8 // CHECK-INST: mov z21.d, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f d0 05 cpy z21.d, p0/z, #32512 // CHECK-INST: mov z21.d, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: f5 2f d0 05 // --------------------------------------------------------------------------// @@ -238,19 +238,19 @@ cpy z21.d, p0/z, #32512 cpy z0.b, p0/z, #-129 // CHECK-INST: mov z0.b, p0/z, #127 // CHECK-ENCODING: [0xe0,0x0f,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 0f 10 05 cpy z0.h, p0/z, #-33024 // CHECK-INST: mov z0.h, p0/z, #32512 // CHECK-ENCODING: [0xe0,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 2f 50 05 cpy z0.h, p0/z, #-129, lsl #8 // CHECK-INST: mov z0.h, p0/z, #32512 // CHECK-ENCODING: [0xe0,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 2f 50 05 @@ -261,43 +261,43 @@ cpy z0.h, p0/z, #-129, lsl #8 cpy z5.b, p15/m, #-128 // CHECK-INST: mov z5.b, p15/m, #-128 // CHECK-ENCODING: [0x05,0x50,0x1f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 50 1f 05 cpy z21.h, p15/m, #-128 // CHECK-INST: mov z21.h, p15/m, #-128 // CHECK-ENCODING: [0x15,0x50,0x5f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 5f 05 cpy z21.h, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.h, p15/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0x5f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 5f 05 cpy z21.s, p15/m, #-128 // CHECK-INST: mov z21.s, p15/m, #-128 // CHECK-ENCODING: [0x15,0x50,0x9f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 9f 05 cpy z21.s, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.s, p15/m, #-32768 // 
CHECK-ENCODING: [0x15,0x70,0x9f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 9f 05 cpy z21.d, p15/m, #-128 // CHECK-INST: mov z21.d, p15/m, #-128 // CHECK-ENCODING: [0x15,0x50,0xdf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 df 05 cpy z21.d, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.d, p15/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0xdf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 df 05 @@ -307,71 +307,71 @@ cpy z21.d, p15/m, #-128, lsl #8 movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 cpy z31.d, p7/m, sp // CHECK-INST: mov z31.d, p7/m, sp // CHECK-ENCODING: [0xff,0xbf,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf e8 05 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 cpy z31.d, p7/m, sp // CHECK-INST: mov z31.d, p7/m, sp // CHECK-ENCODING: [0xff,0xbf,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf e8 05 movprfx z21.d, p7/z, z28.d // CHECK-INST: movprfx z21.d, p7/z, z28.d // CHECK-ENCODING: [0x95,0x3f,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 3f d0 04 cpy z21.d, p7/m, #-128, lsl #8 // CHECK-INST: mov z21.d, p7/m, #-32768 
// CHECK-ENCODING: [0x15,0x70,0xd7,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 d7 05 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 cpy z21.d, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.d, p15/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0xdf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 df 05 movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 cpy z4.d, p7/m, d31 // CHECK-INST: mov z4.d, p7/m, d31 // CHECK-ENCODING: [0xe4,0x9f,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 9f e0 05 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 cpy z4.d, p7/m, d31 // CHECK-INST: mov z4.d, p7/m, d31 // CHECK-ENCODING: [0xe4,0x9f,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 9f e0 05 diff --git a/llvm/test/MC/AArch64/SVE/ctermeq.s b/llvm/test/MC/AArch64/SVE/ctermeq.s index f6a8650aa8836..a9da41cd58b77 100644 --- a/llvm/test/MC/AArch64/SVE/ctermeq.s +++ b/llvm/test/MC/AArch64/SVE/ctermeq.s @@ -12,23 +12,23 @@ ctermeq w30, wzr // CHECK-INST: ctermeq w30, wzr // CHECK-ENCODING: [0xc0,0x23,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 23 bf 25 ctermeq wzr, w30 // CHECK-INST: ctermeq wzr, w30 // CHECK-ENCODING: [0xe0,0x23,0xbe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 23 be 25 ctermeq x30, xzr // CHECK-INST: ctermeq x30, xzr // CHECK-ENCODING: [0xc0,0x23,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 23 ff 25 ctermeq xzr, x30 // CHECK-INST: ctermeq xzr, x30 // CHECK-ENCODING: [0xe0,0x23,0xfe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 23 fe 25 diff --git a/llvm/test/MC/AArch64/SVE/ctermne.s b/llvm/test/MC/AArch64/SVE/ctermne.s index 2f302b50cfa4d..35092ead6a8ec 100644 --- a/llvm/test/MC/AArch64/SVE/ctermne.s +++ b/llvm/test/MC/AArch64/SVE/ctermne.s @@ -12,23 +12,23 @@ ctermne w30, wzr // CHECK-INST: ctermne w30, wzr // CHECK-ENCODING: [0xd0,0x23,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: d0 23 bf 25 ctermne wzr, w30 // CHECK-INST: ctermne wzr, w30 // CHECK-ENCODING: [0xf0,0x23,0xbe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f0 23 be 25 ctermne x30, xzr // CHECK-INST: ctermne x30, xzr // CHECK-ENCODING: [0xd0,0x23,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: d0 23 ff 25 ctermne xzr, x30 // CHECK-INST: ctermne xzr, x30 // CHECK-ENCODING: [0xf0,0x23,0xfe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f0 23 fe 25 diff --git a/llvm/test/MC/AArch64/SVE/decb.s b/llvm/test/MC/AArch64/SVE/decb.s index 
71f9f6a39db18..02d12bb4f8b8e 100644 --- a/llvm/test/MC/AArch64/SVE/decb.s +++ b/llvm/test/MC/AArch64/SVE/decb.s @@ -12,119 +12,119 @@ decb x0 // CHECK-INST: decb x0 // CHECK-ENCODING: [0xe0,0xe7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 30 04 decb x0, all // CHECK-INST: decb x0 // CHECK-ENCODING: [0xe0,0xe7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 30 04 decb x0, all, mul #1 // CHECK-INST: decb x0 // CHECK-ENCODING: [0xe0,0xe7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 30 04 decb x0, all, mul #16 // CHECK-INST: decb x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe7,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 3f 04 decb x0, pow2 // CHECK-INST: decb x0, pow2 // CHECK-ENCODING: [0x00,0xe4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e4 30 04 decb x0, vl1 // CHECK-INST: decb x0, vl1 // CHECK-ENCODING: [0x20,0xe4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e4 30 04 decb x0, vl2 // CHECK-INST: decb x0, vl2 // CHECK-ENCODING: [0x40,0xe4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e4 30 04 decb x0, vl3 // CHECK-INST: decb x0, vl3 // CHECK-ENCODING: [0x60,0xe4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e4 30 04 decb x0, vl4 // CHECK-INST: decb x0, vl4 // CHECK-ENCODING: [0x80,0xe4,0x30,0x04] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e4 30 04 decb x0, vl5 // CHECK-INST: decb x0, vl5 // CHECK-ENCODING: [0xa0,0xe4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e4 30 04 decb x0, vl6 // CHECK-INST: decb x0, vl6 // CHECK-ENCODING: [0xc0,0xe4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e4 30 04 decb x0, vl7 // CHECK-INST: decb x0, vl7 // CHECK-ENCODING: [0xe0,0xe4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e4 30 04 decb x0, vl8 // CHECK-INST: decb x0, vl8 // CHECK-ENCODING: [0x00,0xe5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e5 30 04 decb x0, vl16 // CHECK-INST: decb x0, vl16 // CHECK-ENCODING: [0x20,0xe5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e5 30 04 decb x0, vl32 // CHECK-INST: decb x0, vl32 // CHECK-ENCODING: [0x40,0xe5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e5 30 04 decb x0, vl64 // CHECK-INST: decb x0, vl64 // CHECK-ENCODING: [0x60,0xe5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e5 30 04 decb x0, vl128 // CHECK-INST: decb x0, vl128 // CHECK-ENCODING: [0x80,0xe5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e5 30 04 decb x0, vl256 // CHECK-INST: decb x0, vl256 // CHECK-ENCODING: 
[0xa0,0xe5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e5 30 04 decb x0, #14 // CHECK-INST: decb x0, #14 // CHECK-ENCODING: [0xc0,0xe5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e5 30 04 decb x0, #28 // CHECK-INST: decb x0, #28 // CHECK-ENCODING: [0x80,0xe7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e7 30 04 diff --git a/llvm/test/MC/AArch64/SVE/decd.s b/llvm/test/MC/AArch64/SVE/decd.s index 38b53f92d56c5..6a1a2cd2c89e6 100644 --- a/llvm/test/MC/AArch64/SVE/decd.s +++ b/llvm/test/MC/AArch64/SVE/decd.s @@ -12,119 +12,119 @@ decd x0 // CHECK-INST: decd x0 // CHECK-ENCODING: [0xe0,0xe7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 f0 04 decd x0, all // CHECK-INST: decd x0 // CHECK-ENCODING: [0xe0,0xe7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 f0 04 decd x0, all, mul #1 // CHECK-INST: decd x0 // CHECK-ENCODING: [0xe0,0xe7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 f0 04 decd x0, all, mul #16 // CHECK-INST: decd x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe7,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 ff 04 decd x0, pow2 // CHECK-INST: decd x0, pow2 // CHECK-ENCODING: [0x00,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e4 f0 04 decd x0, vl1 // CHECK-INST: decd x0, vl1 // 
CHECK-ENCODING: [0x20,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e4 f0 04 decd x0, vl2 // CHECK-INST: decd x0, vl2 // CHECK-ENCODING: [0x40,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e4 f0 04 decd x0, vl3 // CHECK-INST: decd x0, vl3 // CHECK-ENCODING: [0x60,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e4 f0 04 decd x0, vl4 // CHECK-INST: decd x0, vl4 // CHECK-ENCODING: [0x80,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e4 f0 04 decd x0, vl5 // CHECK-INST: decd x0, vl5 // CHECK-ENCODING: [0xa0,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e4 f0 04 decd x0, vl6 // CHECK-INST: decd x0, vl6 // CHECK-ENCODING: [0xc0,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e4 f0 04 decd x0, vl7 // CHECK-INST: decd x0, vl7 // CHECK-ENCODING: [0xe0,0xe4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e4 f0 04 decd x0, vl8 // CHECK-INST: decd x0, vl8 // CHECK-ENCODING: [0x00,0xe5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e5 f0 04 decd x0, vl16 // CHECK-INST: decd x0, vl16 // CHECK-ENCODING: [0x20,0xe5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e5 f0 04 decd x0, vl32 // CHECK-INST: decd x0, vl32 
// CHECK-ENCODING: [0x40,0xe5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e5 f0 04 decd x0, vl64 // CHECK-INST: decd x0, vl64 // CHECK-ENCODING: [0x60,0xe5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e5 f0 04 decd x0, vl128 // CHECK-INST: decd x0, vl128 // CHECK-ENCODING: [0x80,0xe5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e5 f0 04 decd x0, vl256 // CHECK-INST: decd x0, vl256 // CHECK-ENCODING: [0xa0,0xe5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e5 f0 04 decd x0, #14 // CHECK-INST: decd x0, #14 // CHECK-ENCODING: [0xc0,0xe5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e5 f0 04 decd x0, #28 // CHECK-INST: decd x0, #28 // CHECK-ENCODING: [0x80,0xe7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e7 f0 04 diff --git a/llvm/test/MC/AArch64/SVE/dech.s b/llvm/test/MC/AArch64/SVE/dech.s index 89608699e4d71..f89ad118f525b 100644 --- a/llvm/test/MC/AArch64/SVE/dech.s +++ b/llvm/test/MC/AArch64/SVE/dech.s @@ -12,119 +12,119 @@ dech x0 // CHECK-INST: dech x0 // CHECK-ENCODING: [0xe0,0xe7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 70 04 dech x0, all // CHECK-INST: dech x0 // CHECK-ENCODING: [0xe0,0xe7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 70 04 dech x0, all, mul #1 // CHECK-INST: dech x0 // 
CHECK-ENCODING: [0xe0,0xe7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 70 04 dech x0, all, mul #16 // CHECK-INST: dech x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe7,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 7f 04 dech x0, pow2 // CHECK-INST: dech x0, pow2 // CHECK-ENCODING: [0x00,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e4 70 04 dech x0, vl1 // CHECK-INST: dech x0, vl1 // CHECK-ENCODING: [0x20,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e4 70 04 dech x0, vl2 // CHECK-INST: dech x0, vl2 // CHECK-ENCODING: [0x40,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e4 70 04 dech x0, vl3 // CHECK-INST: dech x0, vl3 // CHECK-ENCODING: [0x60,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e4 70 04 dech x0, vl4 // CHECK-INST: dech x0, vl4 // CHECK-ENCODING: [0x80,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e4 70 04 dech x0, vl5 // CHECK-INST: dech x0, vl5 // CHECK-ENCODING: [0xa0,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e4 70 04 dech x0, vl6 // CHECK-INST: dech x0, vl6 // CHECK-ENCODING: [0xc0,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e4 70 04 dech x0, vl7 // 
CHECK-INST: dech x0, vl7 // CHECK-ENCODING: [0xe0,0xe4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e4 70 04 dech x0, vl8 // CHECK-INST: dech x0, vl8 // CHECK-ENCODING: [0x00,0xe5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e5 70 04 dech x0, vl16 // CHECK-INST: dech x0, vl16 // CHECK-ENCODING: [0x20,0xe5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e5 70 04 dech x0, vl32 // CHECK-INST: dech x0, vl32 // CHECK-ENCODING: [0x40,0xe5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e5 70 04 dech x0, vl64 // CHECK-INST: dech x0, vl64 // CHECK-ENCODING: [0x60,0xe5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e5 70 04 dech x0, vl128 // CHECK-INST: dech x0, vl128 // CHECK-ENCODING: [0x80,0xe5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e5 70 04 dech x0, vl256 // CHECK-INST: dech x0, vl256 // CHECK-ENCODING: [0xa0,0xe5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e5 70 04 dech x0, #14 // CHECK-INST: dech x0, #14 // CHECK-ENCODING: [0xc0,0xe5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e5 70 04 dech x0, #28 // CHECK-INST: dech x0, #28 // CHECK-ENCODING: [0x80,0xe7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e7 70 04 
diff --git a/llvm/test/MC/AArch64/SVE/decp.s b/llvm/test/MC/AArch64/SVE/decp.s index e9466cfc789b0..bae5fabbd351f 100644 --- a/llvm/test/MC/AArch64/SVE/decp.s +++ b/llvm/test/MC/AArch64/SVE/decp.s @@ -12,85 +12,85 @@ decp x0, p0.b // CHECK-INST: decp x0, p0.b // CHECK-ENCODING: [0x00,0x88,0x2d,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 2d 25 decp x0, p0.h // CHECK-INST: decp x0, p0.h // CHECK-ENCODING: [0x00,0x88,0x6d,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 6d 25 decp x0, p0.s // CHECK-INST: decp x0, p0.s // CHECK-ENCODING: [0x00,0x88,0xad,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 ad 25 decp x0, p0.d // CHECK-INST: decp x0, p0.d // CHECK-ENCODING: [0x00,0x88,0xed,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 ed 25 decp xzr, p15.b // CHECK-INST: decp xzr, p15.b // CHECK-ENCODING: [0xff,0x89,0x2d,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 2d 25 decp xzr, p15.h // CHECK-INST: decp xzr, p15.h // CHECK-ENCODING: [0xff,0x89,0x6d,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 6d 25 decp xzr, p15.s // CHECK-INST: decp xzr, p15.s // CHECK-ENCODING: [0xff,0x89,0xad,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 ad 25 decp xzr, p15.d // CHECK-INST: decp xzr, p15.d // CHECK-ENCODING: [0xff,0x89,0xed,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: ff 89 ed 25 decp z31.h, p15 // CHECK-INST: decp z31.h, p15.h // CHECK-ENCODING: [0xff,0x81,0x6d,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 6d 25 decp z31.h, p15.h // CHECK-INST: decp z31.h, p15.h // CHECK-ENCODING: [0xff,0x81,0x6d,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 6d 25 decp z31.s, p15 // CHECK-INST: decp z31.s, p15.s // CHECK-ENCODING: [0xff,0x81,0xad,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ad 25 decp z31.s, p15.s // CHECK-INST: decp z31.s, p15.s // CHECK-ENCODING: [0xff,0x81,0xad,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ad 25 decp z31.d, p15 // CHECK-INST: decp z31.d, p15.d // CHECK-ENCODING: [0xff,0x81,0xed,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ed 25 decp z31.d, p15.d // CHECK-INST: decp z31.d, p15.d // CHECK-ENCODING: [0xff,0x81,0xed,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ed 25 @@ -100,11 +100,11 @@ decp z31.d, p15.d movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 decp z31.d, p15.d // CHECK-INST: decp z31.d, p15 // CHECK-ENCODING: [0xff,0x81,0xed,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ed 25 diff --git a/llvm/test/MC/AArch64/SVE/decw.s b/llvm/test/MC/AArch64/SVE/decw.s index 
2ad72fc6dd102..d064c39193b59 100644 --- a/llvm/test/MC/AArch64/SVE/decw.s +++ b/llvm/test/MC/AArch64/SVE/decw.s @@ -12,119 +12,119 @@ decw x0 // CHECK-INST: decw x0 // CHECK-ENCODING: [0xe0,0xe7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 b0 04 decw x0, all // CHECK-INST: decw x0 // CHECK-ENCODING: [0xe0,0xe7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 b0 04 decw x0, all, mul #1 // CHECK-INST: decw x0 // CHECK-ENCODING: [0xe0,0xe7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 b0 04 decw x0, all, mul #16 // CHECK-INST: decw x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe7,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e7 bf 04 decw x0, pow2 // CHECK-INST: decw x0, pow2 // CHECK-ENCODING: [0x00,0xe4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e4 b0 04 decw x0, vl1 // CHECK-INST: decw x0, vl1 // CHECK-ENCODING: [0x20,0xe4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e4 b0 04 decw x0, vl2 // CHECK-INST: decw x0, vl2 // CHECK-ENCODING: [0x40,0xe4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e4 b0 04 decw x0, vl3 // CHECK-INST: decw x0, vl3 // CHECK-ENCODING: [0x60,0xe4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e4 b0 04 decw x0, vl4 // CHECK-INST: decw x0, vl4 // CHECK-ENCODING: [0x80,0xe4,0xb0,0x04] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e4 b0 04 decw x0, vl5 // CHECK-INST: decw x0, vl5 // CHECK-ENCODING: [0xa0,0xe4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e4 b0 04 decw x0, vl6 // CHECK-INST: decw x0, vl6 // CHECK-ENCODING: [0xc0,0xe4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e4 b0 04 decw x0, vl7 // CHECK-INST: decw x0, vl7 // CHECK-ENCODING: [0xe0,0xe4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e4 b0 04 decw x0, vl8 // CHECK-INST: decw x0, vl8 // CHECK-ENCODING: [0x00,0xe5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e5 b0 04 decw x0, vl16 // CHECK-INST: decw x0, vl16 // CHECK-ENCODING: [0x20,0xe5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e5 b0 04 decw x0, vl32 // CHECK-INST: decw x0, vl32 // CHECK-ENCODING: [0x40,0xe5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e5 b0 04 decw x0, vl64 // CHECK-INST: decw x0, vl64 // CHECK-ENCODING: [0x60,0xe5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e5 b0 04 decw x0, vl128 // CHECK-INST: decw x0, vl128 // CHECK-ENCODING: [0x80,0xe5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e5 b0 04 decw x0, vl256 // CHECK-INST: decw x0, vl256 // CHECK-ENCODING: 
[0xa0,0xe5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e5 b0 04 decw x0, #14 // CHECK-INST: decw x0, #14 // CHECK-ENCODING: [0xc0,0xe5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e5 b0 04 decw x0, #28 // CHECK-INST: decw x0, #28 // CHECK-ENCODING: [0x80,0xe7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e7 b0 04 diff --git a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s index 1aa8f91c1374d..661f13974d0bc 100644 --- a/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE/directive-arch_extension-negative.s @@ -3,5 +3,5 @@ .arch_extension nosve ptrue p0.b, pow2 -// CHECK: error: instruction requires: streaming-sve or sve +// CHECK: error: instruction requires: sve or sme // CHECK-NEXT: ptrue p0.b, pow2 diff --git a/llvm/test/MC/AArch64/SVE/dup.s b/llvm/test/MC/AArch64/SVE/dup.s index 48bdda3e27e03..dd7077e9d7907 100644 --- a/llvm/test/MC/AArch64/SVE/dup.s +++ b/llvm/test/MC/AArch64/SVE/dup.s @@ -12,235 +12,235 @@ dup z0.b, w0 // CHECK-INST: mov z0.b, w0 // CHECK-ENCODING: [0x00,0x38,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 20 05 dup z0.h, w0 // CHECK-INST: mov z0.h, w0 // CHECK-ENCODING: [0x00,0x38,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 60 05 dup z0.s, w0 // CHECK-INST: mov z0.s, w0 // CHECK-ENCODING: [0x00,0x38,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 a0 05 
dup z0.d, x0 // CHECK-INST: mov z0.d, x0 // CHECK-ENCODING: [0x00,0x38,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 e0 05 dup z31.h, wsp // CHECK-INST: mov z31.h, wsp // CHECK-ENCODING: [0xff,0x3b,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 60 05 dup z31.s, wsp // CHECK-INST: mov z31.s, wsp // CHECK-ENCODING: [0xff,0x3b,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b a0 05 dup z31.d, sp // CHECK-INST: mov z31.d, sp // CHECK-ENCODING: [0xff,0x3b,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b e0 05 dup z31.b, wsp // CHECK-INST: mov z31.b, wsp // CHECK-ENCODING: [0xff,0x3b,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 20 05 dup z5.b, #-128 // CHECK-INST: mov z5.b, #-128 // CHECK-ENCODING: [0x05,0xd0,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 d0 38 25 dup z5.b, #127 // CHECK-INST: mov z5.b, #127 // CHECK-ENCODING: [0xe5,0xcf,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 cf 38 25 dup z5.b, #255 // CHECK-INST: mov z5.b, #-1 // CHECK-ENCODING: [0xe5,0xdf,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 df 38 25 dup z21.h, #-128 // CHECK-INST: mov z21.h, #-128 // CHECK-ENCODING: [0x15,0xd0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: 15 d0 78 25 dup z21.h, #-128, lsl #8 // CHECK-INST: mov z21.h, #-32768 // CHECK-ENCODING: [0x15,0xf0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 78 25 dup z21.h, #-32768 // CHECK-INST: mov z21.h, #-32768 // CHECK-ENCODING: [0x15,0xf0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 78 25 dup z21.h, #127 // CHECK-INST: mov z21.h, #127 // CHECK-ENCODING: [0xf5,0xcf,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 cf 78 25 dup z21.h, #127, lsl #8 // CHECK-INST: mov z21.h, #32512 // CHECK-ENCODING: [0xf5,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef 78 25 dup z21.h, #32512 // CHECK-INST: mov z21.h, #32512 // CHECK-ENCODING: [0xf5,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef 78 25 dup z21.s, #-128 // CHECK-INST: mov z21.s, #-128 // CHECK-ENCODING: [0x15,0xd0,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 d0 b8 25 dup z21.s, #-128, lsl #8 // CHECK-INST: mov z21.s, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 b8 25 dup z21.s, #-32768 // CHECK-INST: mov z21.s, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 b8 25 dup z21.s, #127 // CHECK-INST: mov z21.s, #127 // CHECK-ENCODING: [0xf5,0xcf,0xb8,0x25] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 cf b8 25 dup z21.s, #127, lsl #8 // CHECK-INST: mov z21.s, #32512 // CHECK-ENCODING: [0xf5,0xef,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef b8 25 dup z21.s, #32512 // CHECK-INST: mov z21.s, #32512 // CHECK-ENCODING: [0xf5,0xef,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef b8 25 dup z21.d, #-128 // CHECK-INST: mov z21.d, #-128 // CHECK-ENCODING: [0x15,0xd0,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 d0 f8 25 dup z21.d, #-128, lsl #8 // CHECK-INST: mov z21.d, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 f8 25 dup z21.d, #-32768 // CHECK-INST: mov z21.d, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 f8 25 dup z21.d, #127 // CHECK-INST: mov z21.d, #127 // CHECK-ENCODING: [0xf5,0xcf,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 cf f8 25 dup z21.d, #127, lsl #8 // CHECK-INST: mov z21.d, #32512 // CHECK-ENCODING: [0xf5,0xef,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef f8 25 dup z21.d, #32512 // CHECK-INST: mov z21.d, #32512 // CHECK-ENCODING: [0xf5,0xef,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 
f5 ef f8 25 dup z0.b, z0.b[0] // CHECK-INST: mov z0.b, b0 // CHECK-ENCODING: [0x00,0x20,0x21,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 21 05 dup z0.h, z0.h[0] // CHECK-INST: mov z0.h, h0 // CHECK-ENCODING: [0x00,0x20,0x22,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 22 05 dup z0.s, z0.s[0] // CHECK-INST: mov z0.s, s0 // CHECK-ENCODING: [0x00,0x20,0x24,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 24 05 dup z0.d, z0.d[0] // CHECK-INST: mov z0.d, d0 // CHECK-ENCODING: [0x00,0x20,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 28 05 dup z0.q, z0.q[0] // CHECK-INST: mov z0.q, q0 // CHECK-ENCODING: [0x00,0x20,0x30,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 30 05 dup z31.b, z31.b[63] // CHECK-INST: mov z31.b, z31.b[63] // CHECK-ENCODING: [0xff,0x23,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 ff 05 dup z31.h, z31.h[31] // CHECK-INST: mov z31.h, z31.h[31] // CHECK-ENCODING: [0xff,0x23,0xfe,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 fe 05 dup z31.s, z31.s[15] // CHECK-INST: mov z31.s, z31.s[15] // CHECK-ENCODING: [0xff,0x23,0xfc,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 fc 05 dup z31.d, z31.d[7] // CHECK-INST: mov z31.d, z31.d[7] // CHECK-ENCODING: [0xff,0x23,0xf8,0x05] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 f8 05 dup z5.q, z17.q[3] // CHECK-INST: mov z5.q, z17.q[3] // CHECK-ENCODING: [0x25,0x22,0xf0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 22 f0 05 // --------------------------------------------------------------------------// @@ -250,17 +250,17 @@ dup z5.q, z17.q[3] dup z0.b, #-129 // CHECK-INST: mov z0.b, #127 // CHECK-ENCODING: [0xe0,0xcf,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf 38 25 dup z0.h, #-33024 // CHECK-INST: mov z0.h, #32512 // CHECK-ENCODING: [0xe0,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ef 78 25 dup z0.h, #-129, lsl #8 // CHECK-INST: mov z0.h, #32512 // CHECK-ENCODING: [0xe0,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ef 78 25 diff --git a/llvm/test/MC/AArch64/SVE/dupm.s b/llvm/test/MC/AArch64/SVE/dupm.s index 9f0b863075cad..ad4678ff656f0 100644 --- a/llvm/test/MC/AArch64/SVE/dupm.s +++ b/llvm/test/MC/AArch64/SVE/dupm.s @@ -12,59 +12,59 @@ dupm z5.b, #0xf9 // CHECK-INST: dupm z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e c0 05 dupm z5.h, #0xf9f9 // CHECK-INST: dupm z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e c0 05 dupm z5.s, #0xf9f9f9f9 // CHECK-INST: dupm z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e c0 05 dupm z5.d, #0xf9f9f9f9f9f9f9f9 // CHECK-INST: dupm z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e c0 05 dupm z23.h, #0xfff9 // CHECK-INST: dupm z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d c0 05 dupm z23.s, #0xfff9fff9 // CHECK-INST: dupm z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d c0 05 dupm z23.d, #0xfff9fff9fff9fff9 // CHECK-INST: dupm z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d c0 05 dupm z0.s, #0xfffffff9 // CHECK-INST: dupm z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb c0 05 dupm z0.d, #0xfffffff9fffffff9 // CHECK-INST: dupm z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb c0 05 dupm z0.d, #0xfffffffffffffff9 // CHECK-INST: dupm z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0xc3,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef c3 05 diff --git a/llvm/test/MC/AArch64/SVE/eon.s b/llvm/test/MC/AArch64/SVE/eon.s index d2583b1a13388..cf66bb6deacd9 100644 --- a/llvm/test/MC/AArch64/SVE/eon.s +++ b/llvm/test/MC/AArch64/SVE/eon.s @@ -12,49 
+12,49 @@ eon z5.b, z5.b, #0xf9 // CHECK-INST: eor z5.b, z5.b, #0x6 // CHECK-ENCODING: [0x25,0x3e,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 3e 40 05 eon z23.h, z23.h, #0xfff9 // CHECK-INST: eor z23.h, z23.h, #0x6 // CHECK-ENCODING: [0x37,0x7c,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 37 7c 40 05 eon z0.s, z0.s, #0xfffffff9 // CHECK-INST: eor z0.s, z0.s, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 40 05 eon z0.d, z0.d, #0xfffffffffffffff9 // CHECK-INST: eor z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x43,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 43 05 eon z5.b, z5.b, #0x6 // CHECK-INST: eor z5.b, z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e 40 05 eon z23.h, z23.h, #0x6 // CHECK-INST: eor z23.h, z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d 40 05 eon z0.s, z0.s, #0x6 // CHECK-INST: eor z0.s, z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb 40 05 eon z0.d, z0.d, #0x6 // CHECK-INST: eor z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x43,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 43 05 @@ -64,11 +64,11 @@ eon 
z0.d, z0.d, #0x6 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 eon z0.d, z0.d, #0x6 // CHECK-INST: eor z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x43,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 43 05 diff --git a/llvm/test/MC/AArch64/SVE/eor.s b/llvm/test/MC/AArch64/SVE/eor.s index 861799660d4ee..5b38312b84fdc 100644 --- a/llvm/test/MC/AArch64/SVE/eor.s +++ b/llvm/test/MC/AArch64/SVE/eor.s @@ -12,103 +12,103 @@ eor z5.b, z5.b, #0xf9 // CHECK-INST: eor z5.b, z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e 40 05 eor z23.h, z23.h, #0xfff9 // CHECK-INST: eor z23.h, z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d 40 05 eor z0.s, z0.s, #0xfffffff9 // CHECK-INST: eor z0.s, z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb 40 05 eor z0.d, z0.d, #0xfffffffffffffff9 // CHECK-INST: eor z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x43,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 43 05 eor z5.b, z5.b, #0x6 // CHECK-INST: eor z5.b, z5.b, #0x6 // CHECK-ENCODING: [0x25,0x3e,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 3e 40 05 eor z23.h, z23.h, #0x6 // CHECK-INST: eor 
z23.h, z23.h, #0x6 // CHECK-ENCODING: [0x37,0x7c,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 37 7c 40 05 eor z0.s, z0.s, #0x6 // CHECK-INST: eor z0.s, z0.s, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x40,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 40 05 eor z0.d, z0.d, #0x6 // CHECK-INST: eor z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x43,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 43 05 eor z23.d, z13.d, z8.d // CHECK-INST: eor z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x31,0xa8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 31 a8 04 eor z0.d, z0.d, z0.d // CHECK-INST: eor z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 a0 04 eor z31.s, p7/m, z31.s, z31.s // CHECK-INST: eor z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x99,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 99 04 eor z31.h, p7/m, z31.h, z31.h // CHECK-INST: eor z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x59,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 59 04 eor z31.d, p7/m, z31.d, z31.d // CHECK-INST: eor z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xd9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f d9 04 eor z31.b, p7/m, z31.b, z31.b // CHECK-INST: eor z31.b, p7/m, z31.b, z31.b 
// CHECK-ENCODING: [0xff,0x1f,0x19,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 19 04 eor p0.b, p0/z, p0.b, p1.b // CHECK-INST: eor p0.b, p0/z, p0.b, p1.b // CHECK-ENCODING: [0x00,0x42,0x01,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 01 25 eor p0.b, p0/z, p0.b, p0.b // CHECK-INST: not p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x42,0x00,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 00 25 eor p15.b, p15/z, p15.b, p15.b // CHECK-INST: not p15.b, p15/z, p15.b // CHECK-ENCODING: [0xef,0x7f,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7f 0f 25 @@ -118,19 +118,19 @@ eor p15.b, p15/z, p15.b, p15.b eor z0.s, z0.s, z0.s // CHECK-INST: eor z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 a0 04 eor z0.h, z0.h, z0.h // CHECK-INST: eor z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 a0 04 eor z0.b, z0.b, z0.b // CHECK-INST: eor z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 a0 04 @@ -140,35 +140,35 @@ eor z0.b, z0.b, z0.b movprfx z4.b, p7/z, z6.b // CHECK-INST: movprfx z4.b, p7/z, z6.b // CHECK-ENCODING: [0xc4,0x3c,0x10,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c 10 04 eor z4.b, p7/m, 
z4.b, z31.b // CHECK-INST: eor z4.b, p7/m, z4.b, z31.b // CHECK-ENCODING: [0xe4,0x1f,0x19,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f 19 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 eor z4.b, p7/m, z4.b, z31.b // CHECK-INST: eor z4.b, p7/m, z4.b, z31.b // CHECK-ENCODING: [0xe4,0x1f,0x19,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f 19 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 eor z0.d, z0.d, #0x6 // CHECK-INST: eor z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x43,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 43 05 diff --git a/llvm/test/MC/AArch64/SVE/eors.s b/llvm/test/MC/AArch64/SVE/eors.s index b48369c67450a..4ec189c867bc8 100644 --- a/llvm/test/MC/AArch64/SVE/eors.s +++ b/llvm/test/MC/AArch64/SVE/eors.s @@ -12,17 +12,17 @@ eors p0.b, p0/z, p0.b, p1.b // CHECK-INST: eors p0.b, p0/z, p0.b, p1.b // CHECK-ENCODING: [0x00,0x42,0x41,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 41 25 eors p0.b, p0/z, p0.b, p0.b // CHECK-INST: nots p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x42,0x40,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 40 25 eors p15.b, p15/z, p15.b, p15.b // CHECK-INST: nots p15.b, p15/z, p15.b // CHECK-ENCODING: 
[0xef,0x7f,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7f 4f 25 diff --git a/llvm/test/MC/AArch64/SVE/eorv.s b/llvm/test/MC/AArch64/SVE/eorv.s index eddd66b25b41e..3a85be1bdf67b 100644 --- a/llvm/test/MC/AArch64/SVE/eorv.s +++ b/llvm/test/MC/AArch64/SVE/eorv.s @@ -12,23 +12,23 @@ eorv b0, p7, z31.b // CHECK-INST: eorv b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x19,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 19 04 eorv h0, p7, z31.h // CHECK-INST: eorv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x59,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 59 04 eorv s0, p7, z31.s // CHECK-INST: eorv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x99,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 99 04 eorv d0, p7, z31.d // CHECK-INST: eorv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xd9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f d9 04 diff --git a/llvm/test/MC/AArch64/SVE/ext.s b/llvm/test/MC/AArch64/SVE/ext.s index 301c2db24f84e..733e5ee8ca7bd 100644 --- a/llvm/test/MC/AArch64/SVE/ext.s +++ b/llvm/test/MC/AArch64/SVE/ext.s @@ -12,13 +12,13 @@ ext z31.b, z31.b, z0.b, #0 // CHECK-INST: ext z31.b, z31.b, z0.b, #0 // CHECK-ENCODING: [0x1f,0x00,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 00 20 05 ext z31.b, z31.b, z0.b, #255 // CHECK-INST: ext z31.b, z31.b, z0.b, #255 // CHECK-ENCODING: [0x1f,0x1c,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: 1f 1c 3f 05 @@ -28,11 +28,11 @@ ext z31.b, z31.b, z0.b, #255 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 ext z31.b, z31.b, z0.b, #255 // CHECK-INST: ext z31.b, z31.b, z0.b, #255 // CHECK-ENCODING: [0x1f,0x1c,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 1c 3f 05 diff --git a/llvm/test/MC/AArch64/SVE/fabd.s b/llvm/test/MC/AArch64/SVE/fabd.s index 5656354e19c22..d215cc4542e5c 100644 --- a/llvm/test/MC/AArch64/SVE/fabd.s +++ b/llvm/test/MC/AArch64/SVE/fabd.s @@ -12,19 +12,19 @@ fabd z0.h, p7/m, z0.h, z31.h // CHECK-INST: fabd z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x48,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 48 65 fabd z0.s, p7/m, z0.s, z31.s // CHECK-INST: fabd z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x88,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 88 65 fabd z0.d, p7/m, z0.d, z31.d // CHECK-INST: fabd z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c8 65 @@ -34,23 +34,23 @@ fabd z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fabd z0.d, p7/m, z0.d, z31.d // CHECK-INST: fabd z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc8,0x65] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c8 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fabd z0.d, p7/m, z0.d, z31.d // CHECK-INST: fabd z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c8 65 diff --git a/llvm/test/MC/AArch64/SVE/fabs.s b/llvm/test/MC/AArch64/SVE/fabs.s index 7c3e9595049aa..c83db482d5377 100644 --- a/llvm/test/MC/AArch64/SVE/fabs.s +++ b/llvm/test/MC/AArch64/SVE/fabs.s @@ -12,19 +12,19 @@ fabs z31.h, p7/m, z31.h // CHECK-INST: fabs z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x5c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 5c 04 fabs z31.s, p7/m, z31.s // CHECK-INST: fabs z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x9c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 9c 04 fabs z31.d, p7/m, z31.d // CHECK-INST: fabs z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xdc,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf dc 04 @@ -34,23 +34,23 @@ fabs z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 fabs z4.d, p7/m, z31.d // CHECK-INST: fabs z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xdc,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf dc 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 fabs z4.d, p7/m, z31.d // CHECK-INST: fabs z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xdc,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf dc 04 diff --git a/llvm/test/MC/AArch64/SVE/facge.s b/llvm/test/MC/AArch64/SVE/facge.s index 79cc24b353b10..e08deb567b3e5 100644 --- a/llvm/test/MC/AArch64/SVE/facge.s +++ b/llvm/test/MC/AArch64/SVE/facge.s @@ -12,17 +12,17 @@ facge p0.h, p0/z, z0.h, z1.h // CHECK-INST: facge p0.h, p0/z, z0.h, z1.h // CHECK-ENCODING: [0x10,0xc0,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 41 65 facge p0.s, p0/z, z0.s, z1.s // CHECK-INST: facge p0.s, p0/z, z0.s, z1.s // CHECK-ENCODING: [0x10,0xc0,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 81 65 facge p0.d, p0/z, z0.d, z1.d // CHECK-INST: facge p0.d, p0/z, z0.d, z1.d // CHECK-ENCODING: [0x10,0xc0,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 c0 c1 65 diff --git a/llvm/test/MC/AArch64/SVE/facgt.s b/llvm/test/MC/AArch64/SVE/facgt.s index 4021d3c968f26..4c2a01bbf6140 100644 --- a/llvm/test/MC/AArch64/SVE/facgt.s +++ b/llvm/test/MC/AArch64/SVE/facgt.s @@ -12,17 +12,17 @@ facgt p0.h, p0/z, z0.h, z1.h // CHECK-INST: facgt p0.h, p0/z, z0.h, z1.h // CHECK-ENCODING: [0x10,0xe0,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 41 65 facgt 
p0.s, p0/z, z0.s, z1.s // CHECK-INST: facgt p0.s, p0/z, z0.s, z1.s // CHECK-ENCODING: [0x10,0xe0,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 81 65 facgt p0.d, p0/z, z0.d, z1.d // CHECK-INST: facgt p0.d, p0/z, z0.d, z1.d // CHECK-ENCODING: [0x10,0xe0,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 e0 c1 65 diff --git a/llvm/test/MC/AArch64/SVE/facle.s b/llvm/test/MC/AArch64/SVE/facle.s index 0a4a2eb34b6bb..a1b1340caf9a3 100644 --- a/llvm/test/MC/AArch64/SVE/facle.s +++ b/llvm/test/MC/AArch64/SVE/facle.s @@ -12,17 +12,17 @@ facle p0.h, p0/z, z0.h, z1.h // CHECK-INST: facge p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x30,0xc0,0x40,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 c0 40 65 facle p0.s, p0/z, z0.s, z1.s // CHECK-INST: facge p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x30,0xc0,0x80,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 c0 80 65 facle p0.d, p0/z, z0.d, z1.d // CHECK-INST: facge p0.d, p0/z, z1.d, z0.d // CHECK-ENCODING: [0x30,0xc0,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 c0 c0 65 diff --git a/llvm/test/MC/AArch64/SVE/faclt.s b/llvm/test/MC/AArch64/SVE/faclt.s index de620adfa9bef..7ba7e2cea9237 100644 --- a/llvm/test/MC/AArch64/SVE/faclt.s +++ b/llvm/test/MC/AArch64/SVE/faclt.s @@ -12,17 +12,17 @@ faclt p0.h, p0/z, z0.h, z1.h // CHECK-INST: facgt p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x30,0xe0,0x40,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 e0 40 65 faclt p0.s, p0/z, z0.s, z1.s // 
CHECK-INST: facgt p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x30,0xe0,0x80,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 e0 80 65 faclt p0.d, p0/z, z0.d, z1.d // CHECK-INST: facgt p0.d, p0/z, z1.d, z0.d // CHECK-ENCODING: [0x30,0xe0,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 e0 c0 65 diff --git a/llvm/test/MC/AArch64/SVE/fadd.s b/llvm/test/MC/AArch64/SVE/fadd.s index fecf763f0c24f..e44e4e9bf25fb 100644 --- a/llvm/test/MC/AArch64/SVE/fadd.s +++ b/llvm/test/MC/AArch64/SVE/fadd.s @@ -12,85 +12,85 @@ fadd z0.h, p0/m, z0.h, #0.500000000000000 // CHECK-INST: fadd z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x58,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 58 65 fadd z0.h, p0/m, z0.h, #0.5 // CHECK-INST: fadd z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x58,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 58 65 fadd z0.s, p0/m, z0.s, #0.5 // CHECK-INST: fadd z0.s, p0/m, z0.s, #0.5 // CHECK-ENCODING: [0x00,0x80,0x98,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 98 65 fadd z0.d, p0/m, z0.d, #0.5 // CHECK-INST: fadd z0.d, p0/m, z0.d, #0.5 // CHECK-ENCODING: [0x00,0x80,0xd8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d8 65 fadd z31.h, p7/m, z31.h, #1.000000000000000 // CHECK-INST: fadd z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x58,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 58 65 fadd z31.h, 
p7/m, z31.h, #1.0 // CHECK-INST: fadd z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x58,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 58 65 fadd z31.s, p7/m, z31.s, #1.0 // CHECK-INST: fadd z31.s, p7/m, z31.s, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x98,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 98 65 fadd z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fadd z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xd8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c d8 65 fadd z0.h, p7/m, z0.h, z31.h // CHECK-INST: fadd z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x40,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 40 65 fadd z0.s, p7/m, z0.s, z31.s // CHECK-INST: fadd z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x80,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 80 65 fadd z0.d, p7/m, z0.d, z31.d // CHECK-INST: fadd z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c0 65 fadd z0.h, z1.h, z31.h // CHECK-INST: fadd z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x00,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 5f 65 fadd z0.s, z1.s, z31.s // CHECK-INST: fadd z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0x00,0x9f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 9f 
65 fadd z0.d, z1.d, z31.d // CHECK-INST: fadd z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x00,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 df 65 @@ -100,47 +100,47 @@ fadd z0.d, z1.d, z31.d movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 fadd z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fadd z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xd8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c d8 65 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fadd z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fadd z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xd8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c d8 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fadd z0.d, p7/m, z0.d, z31.d // CHECK-INST: fadd z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c0 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 
20 04 fadd z0.d, p7/m, z0.d, z31.d // CHECK-INST: fadd z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c0 65 diff --git a/llvm/test/MC/AArch64/SVE/faddv.s b/llvm/test/MC/AArch64/SVE/faddv.s index 98c47ba5a87a4..9a6056d91889e 100644 --- a/llvm/test/MC/AArch64/SVE/faddv.s +++ b/llvm/test/MC/AArch64/SVE/faddv.s @@ -12,17 +12,17 @@ faddv h0, p7, z31.h // CHECK-INST: faddv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x40,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 40 65 faddv s0, p7, z31.s // CHECK-INST: faddv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x80,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 80 65 faddv d0, p7, z31.d // CHECK-INST: faddv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c0 65 diff --git a/llvm/test/MC/AArch64/SVE/fcadd.s b/llvm/test/MC/AArch64/SVE/fcadd.s index e904453ef8c48..d10888de89956 100644 --- a/llvm/test/MC/AArch64/SVE/fcadd.s +++ b/llvm/test/MC/AArch64/SVE/fcadd.s @@ -12,37 +12,37 @@ fcadd z0.h, p0/m, z0.h, z0.h, #90 // CHECK-INST: fcadd z0.h, p0/m, z0.h, z0.h, #90 // CHECK-ENCODING: [0x00,0x80,0x40,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 40 64 fcadd z0.s, p0/m, z0.s, z0.s, #90 // CHECK-INST: fcadd z0.s, p0/m, z0.s, z0.s, #90 // CHECK-ENCODING: [0x00,0x80,0x80,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 80 64 fcadd z0.d, p0/m, z0.d, z0.d, #90 // CHECK-INST: fcadd z0.d, 
p0/m, z0.d, z0.d, #90 // CHECK-ENCODING: [0x00,0x80,0xc0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 c0 64 fcadd z31.h, p7/m, z31.h, z31.h, #270 // CHECK-INST: fcadd z31.h, p7/m, z31.h, z31.h, #270 // CHECK-ENCODING: [0xff,0x9f,0x41,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 41 64 fcadd z31.s, p7/m, z31.s, z31.s, #270 // CHECK-INST: fcadd z31.s, p7/m, z31.s, z31.s, #270 // CHECK-ENCODING: [0xff,0x9f,0x81,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 81 64 fcadd z31.d, p7/m, z31.d, z31.d, #270 // CHECK-INST: fcadd z31.d, p7/m, z31.d, z31.d, #270 // CHECK-ENCODING: [0xff,0x9f,0xc1,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f c1 64 @@ -52,23 +52,23 @@ fcadd z31.d, p7/m, z31.d, z31.d, #270 movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 fcadd z4.d, p7/m, z4.d, z31.d, #270 // CHECK-INST: fcadd z4.d, p7/m, z4.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0x9f,0xc1,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 9f c1 64 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 fcadd z4.d, p7/m, z4.d, z31.d, #270 // CHECK-INST: fcadd z4.d, p7/m, z4.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0x9f,0xc1,0x64] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 9f c1 64 diff --git a/llvm/test/MC/AArch64/SVE/fcmeq.s b/llvm/test/MC/AArch64/SVE/fcmeq.s index 8f75330f12dbf..a523546141eb9 100644 --- a/llvm/test/MC/AArch64/SVE/fcmeq.s +++ b/llvm/test/MC/AArch64/SVE/fcmeq.s @@ -12,35 +12,35 @@ fcmeq p0.h, p0/z, z0.h, #0.0 // CHECK-INST: fcmeq p0.h, p0/z, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x20,0x52,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 52 65 fcmeq p0.s, p0/z, z0.s, #0.0 // CHECK-INST: fcmeq p0.s, p0/z, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x20,0x92,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 92 65 fcmeq p0.d, p0/z, z0.d, #0.0 // CHECK-INST: fcmeq p0.d, p0/z, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x20,0xd2,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 d2 65 fcmeq p0.h, p0/z, z0.h, z1.h // CHECK-INST: fcmeq p0.h, p0/z, z0.h, z1.h // CHECK-ENCODING: [0x00,0x60,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 41 65 fcmeq p0.s, p0/z, z0.s, z1.s // CHECK-INST: fcmeq p0.s, p0/z, z0.s, z1.s // CHECK-ENCODING: [0x00,0x60,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 81 65 fcmeq p0.d, p0/z, z0.d, z1.d // CHECK-INST: fcmeq p0.d, p0/z, z0.d, z1.d // CHECK-ENCODING: [0x00,0x60,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 c1 65 diff --git a/llvm/test/MC/AArch64/SVE/fcmge.s b/llvm/test/MC/AArch64/SVE/fcmge.s index 60d6325966d72..9500c79affbc0 100644 --- a/llvm/test/MC/AArch64/SVE/fcmge.s +++ 
b/llvm/test/MC/AArch64/SVE/fcmge.s @@ -12,35 +12,35 @@ fcmge p0.h, p0/z, z0.h, #0.0 // CHECK-INST: fcmge p0.h, p0/z, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x20,0x50,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 50 65 fcmge p0.s, p0/z, z0.s, #0.0 // CHECK-INST: fcmge p0.s, p0/z, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x20,0x90,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 90 65 fcmge p0.d, p0/z, z0.d, #0.0 // CHECK-INST: fcmge p0.d, p0/z, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x20,0xd0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 d0 65 fcmge p0.h, p0/z, z0.h, z1.h // CHECK-INST: fcmge p0.h, p0/z, z0.h, z1.h // CHECK-ENCODING: [0x00,0x40,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 41 65 fcmge p0.s, p0/z, z0.s, z1.s // CHECK-INST: fcmge p0.s, p0/z, z0.s, z1.s // CHECK-ENCODING: [0x00,0x40,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 81 65 fcmge p0.d, p0/z, z0.d, z1.d // CHECK-INST: fcmge p0.d, p0/z, z0.d, z1.d // CHECK-ENCODING: [0x00,0x40,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c1 65 diff --git a/llvm/test/MC/AArch64/SVE/fcmgt.s b/llvm/test/MC/AArch64/SVE/fcmgt.s index d69adf168dcea..e352a5e17f076 100644 --- a/llvm/test/MC/AArch64/SVE/fcmgt.s +++ b/llvm/test/MC/AArch64/SVE/fcmgt.s @@ -12,35 +12,35 @@ fcmgt p0.h, p0/z, z0.h, #0.0 // CHECK-INST: fcmgt p0.h, p0/z, z0.h, #0.0 // CHECK-ENCODING: [0x10,0x20,0x50,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 50 65 fcmgt p0.s, p0/z, z0.s, #0.0 // CHECK-INST: fcmgt p0.s, p0/z, z0.s, #0.0 // CHECK-ENCODING: [0x10,0x20,0x90,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 90 65 fcmgt p0.d, p0/z, z0.d, #0.0 // CHECK-INST: fcmgt p0.d, p0/z, z0.d, #0.0 // CHECK-ENCODING: [0x10,0x20,0xd0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 d0 65 fcmgt p0.h, p0/z, z0.h, z1.h // CHECK-INST: fcmgt p0.h, p0/z, z0.h, z1.h // CHECK-ENCODING: [0x10,0x40,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 41 65 fcmgt p0.s, p0/z, z0.s, z1.s // CHECK-INST: fcmgt p0.s, p0/z, z0.s, z1.s // CHECK-ENCODING: [0x10,0x40,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 81 65 fcmgt p0.d, p0/z, z0.d, z1.d // CHECK-INST: fcmgt p0.d, p0/z, z0.d, z1.d // CHECK-ENCODING: [0x10,0x40,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 c1 65 diff --git a/llvm/test/MC/AArch64/SVE/fcmla.s b/llvm/test/MC/AArch64/SVE/fcmla.s index 24cf8e840fdab..e3952e1943bef 100644 --- a/llvm/test/MC/AArch64/SVE/fcmla.s +++ b/llvm/test/MC/AArch64/SVE/fcmla.s @@ -12,97 +12,97 @@ fcmla z0.h, p0/m, z0.h, z0.h, #0 // CHECK-INST: fcmla z0.h, p0/m, z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0x00,0x40,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 40 64 fcmla z0.s, p0/m, z0.s, z0.s, #0 // CHECK-INST: fcmla z0.s, p0/m, z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0x00,0x80,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 80 64 fcmla z0.d, p0/m, z0.d, z0.d, #0 // CHECK-INST: fcmla z0.d, p0/m, z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0x00,0xc0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 c0 64 fcmla z0.h, p0/m, z1.h, z2.h, #90 // CHECK-INST: fcmla z0.h, p0/m, z1.h, z2.h, #90 // CHECK-ENCODING: [0x20,0x20,0x42,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 20 42 64 fcmla z0.s, p0/m, z1.s, z2.s, #90 // CHECK-INST: fcmla z0.s, p0/m, z1.s, z2.s, #90 // CHECK-ENCODING: [0x20,0x20,0x82,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 20 82 64 fcmla z0.d, p0/m, z1.d, z2.d, #90 // CHECK-INST: fcmla z0.d, p0/m, z1.d, z2.d, #90 // CHECK-ENCODING: [0x20,0x20,0xc2,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 20 c2 64 fcmla z29.h, p7/m, z30.h, z31.h, #180 // CHECK-INST: fcmla z29.h, p7/m, z30.h, z31.h, #180 // CHECK-ENCODING: [0xdd,0x5f,0x5f,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: dd 5f 5f 64 fcmla z29.s, p7/m, z30.s, z31.s, #180 // CHECK-INST: fcmla z29.s, p7/m, z30.s, z31.s, #180 // CHECK-ENCODING: [0xdd,0x5f,0x9f,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: dd 5f 9f 64 fcmla z29.d, p7/m, z30.d, z31.d, #180 // CHECK-INST: fcmla z29.d, p7/m, z30.d, z31.d, #180 // CHECK-ENCODING: [0xdd,0x5f,0xdf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: dd 5f df 64 fcmla z31.h, p7/m, z31.h, z31.h, #270 // 
CHECK-INST: fcmla z31.h, p7/m, z31.h, z31.h, #270 // CHECK-ENCODING: [0xff,0x7f,0x5f,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7f 5f 64 fcmla z31.s, p7/m, z31.s, z31.s, #270 // CHECK-INST: fcmla z31.s, p7/m, z31.s, z31.s, #270 // CHECK-ENCODING: [0xff,0x7f,0x9f,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7f 9f 64 fcmla z31.d, p7/m, z31.d, z31.d, #270 // CHECK-INST: fcmla z31.d, p7/m, z31.d, z31.d, #270 // CHECK-ENCODING: [0xff,0x7f,0xdf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7f df 64 fcmla z0.h, z0.h, z0.h[0], #0 // CHECK-INST: fcmla z0.h, z0.h, z0.h[0], #0 // CHECK-ENCODING: [0x00,0x10,0xa0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 10 a0 64 fcmla z23.s, z13.s, z8.s[0], #270 // CHECK-INST: fcmla z23.s, z13.s, z8.s[0], #270 // CHECK-ENCODING: [0xb7,0x1d,0xe8,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 1d e8 64 fcmla z31.h, z31.h, z7.h[3], #270 // CHECK-INST: fcmla z31.h, z31.h, z7.h[3], #270 // CHECK-ENCODING: [0xff,0x1f,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f bf 64 fcmla z21.s, z10.s, z5.s[1], #90 // CHECK-INST: fcmla z21.s, z10.s, z5.s[1], #90 // CHECK-ENCODING: [0x55,0x15,0xf5,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 f5 64 @@ -112,35 +112,35 @@ fcmla z21.s, z10.s, z5.s[1], #90 movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 fcmla z4.d, p7/m, z31.d, z31.d, #270 // CHECK-INST: fcmla z4.d, p7/m, z31.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0x7f,0xdf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 7f df 64 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 fcmla z4.d, p7/m, z31.d, z31.d, #270 // CHECK-INST: fcmla z4.d, p7/m, z31.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0x7f,0xdf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 7f df 64 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 fcmla z21.s, z10.s, z5.s[1], #90 // CHECK-INST: fcmla z21.s, z10.s, z5.s[1], #90 // CHECK-ENCODING: [0x55,0x15,0xf5,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 f5 64 diff --git a/llvm/test/MC/AArch64/SVE/fcmle.s b/llvm/test/MC/AArch64/SVE/fcmle.s index 119b873f79812..95ed55a87f885 100644 --- a/llvm/test/MC/AArch64/SVE/fcmle.s +++ b/llvm/test/MC/AArch64/SVE/fcmle.s @@ -12,35 +12,35 @@ fcmle p0.h, p0/z, z0.h, #0.0 // CHECK-INST: fcmle p0.h, p0/z, z0.h, #0.0 // CHECK-ENCODING: [0x10,0x20,0x51,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 51 65 fcmle p0.s, p0/z, z0.s, #0.0 // CHECK-INST: fcmle p0.s, p0/z, z0.s, #0.0 // CHECK-ENCODING: [0x10,0x20,0x91,0x65] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 91 65 fcmle p0.d, p0/z, z0.d, #0.0 // CHECK-INST: fcmle p0.d, p0/z, z0.d, #0.0 // CHECK-ENCODING: [0x10,0x20,0xd1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 20 d1 65 fcmle p0.h, p0/z, z0.h, z1.h // CHECK-INST: fcmge p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x20,0x40,0x40,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 40 40 65 fcmle p0.s, p0/z, z0.s, z1.s // CHECK-INST: fcmge p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x20,0x40,0x80,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 40 80 65 fcmle p0.d, p0/z, z0.d, z1.d // CHECK-INST: fcmge p0.d, p0/z, z1.d, z0.d // CHECK-ENCODING: [0x20,0x40,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 40 c0 65 diff --git a/llvm/test/MC/AArch64/SVE/fcmlt.s b/llvm/test/MC/AArch64/SVE/fcmlt.s index 62399d0e8b9cb..a145b386aef04 100644 --- a/llvm/test/MC/AArch64/SVE/fcmlt.s +++ b/llvm/test/MC/AArch64/SVE/fcmlt.s @@ -12,35 +12,35 @@ fcmlt p0.h, p0/z, z0.h, #0.0 // CHECK-INST: fcmlt p0.h, p0/z, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x20,0x51,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 51 65 fcmlt p0.s, p0/z, z0.s, #0.0 // CHECK-INST: fcmlt p0.s, p0/z, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x20,0x91,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 91 65 fcmlt p0.d, p0/z, z0.d, #0.0 // CHECK-INST: fcmlt p0.d, p0/z, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x20,0xd1,0x65] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 d1 65 fcmlt p0.h, p0/z, z0.h, z1.h // CHECK-INST: fcmgt p0.h, p0/z, z1.h, z0.h // CHECK-ENCODING: [0x30,0x40,0x40,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 40 40 65 fcmlt p0.s, p0/z, z0.s, z1.s // CHECK-INST: fcmgt p0.s, p0/z, z1.s, z0.s // CHECK-ENCODING: [0x30,0x40,0x80,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 40 80 65 fcmlt p0.d, p0/z, z0.d, z1.d // CHECK-INST: fcmgt p0.d, p0/z, z1.d, z0.d // CHECK-ENCODING: [0x30,0x40,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 30 40 c0 65 diff --git a/llvm/test/MC/AArch64/SVE/fcmne.s b/llvm/test/MC/AArch64/SVE/fcmne.s index d1b8f8e29ce00..355044df20fb3 100644 --- a/llvm/test/MC/AArch64/SVE/fcmne.s +++ b/llvm/test/MC/AArch64/SVE/fcmne.s @@ -12,35 +12,35 @@ fcmne p0.h, p0/z, z0.h, #0.0 // CHECK-INST: fcmne p0.h, p0/z, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x20,0x53,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 53 65 fcmne p0.s, p0/z, z0.s, #0.0 // CHECK-INST: fcmne p0.s, p0/z, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x20,0x93,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 93 65 fcmne p0.d, p0/z, z0.d, #0.0 // CHECK-INST: fcmne p0.d, p0/z, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x20,0xd3,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 d3 65 fcmne p0.h, p0/z, z0.h, z1.h // CHECK-INST: fcmne p0.h, p0/z, z0.h, z1.h // CHECK-ENCODING: [0x10,0x60,0x41,0x65] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 60 41 65 fcmne p0.s, p0/z, z0.s, z1.s // CHECK-INST: fcmne p0.s, p0/z, z0.s, z1.s // CHECK-ENCODING: [0x10,0x60,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 60 81 65 fcmne p0.d, p0/z, z0.d, z1.d // CHECK-INST: fcmne p0.d, p0/z, z0.d, z1.d // CHECK-ENCODING: [0x10,0x60,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 60 c1 65 diff --git a/llvm/test/MC/AArch64/SVE/fcmuo.s b/llvm/test/MC/AArch64/SVE/fcmuo.s index dece61eef2086..d75ab77c18ca8 100644 --- a/llvm/test/MC/AArch64/SVE/fcmuo.s +++ b/llvm/test/MC/AArch64/SVE/fcmuo.s @@ -12,18 +12,18 @@ fcmuo p0.h, p0/z, z0.h, z1.h // CHECK-INST: fcmuo p0.h, p0/z, z0.h, z1.h // CHECK-ENCODING: [0x00,0xc0,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 41 65 fcmuo p0.s, p0/z, z0.s, z1.s // CHECK-INST: fcmuo p0.s, p0/z, z0.s, z1.s // CHECK-ENCODING: [0x00,0xc0,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 81 65 fcmuo p0.d, p0/z, z0.d, z1.d // CHECK-INST: fcmuo p0.d, p0/z, z0.d, z1.d // CHECK-ENCODING: [0x00,0xc0,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 c1 65 diff --git a/llvm/test/MC/AArch64/SVE/fcpy.s b/llvm/test/MC/AArch64/SVE/fcpy.s index 6cdd3b5f8b2cf..f2c4776ccf528 100644 --- a/llvm/test/MC/AArch64/SVE/fcpy.s +++ b/llvm/test/MC/AArch64/SVE/fcpy.s @@ -12,1549 +12,1549 @@ fcpy z0.h, p0/m, #-0.12500000 // CHECK-INST: fmov z0.h, p0/m, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 50 05 fcpy z0.s, p0/m, #-0.12500000 // CHECK-INST: fmov z0.s, p0/m, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 90 05 fcpy z0.d, p0/m, #-0.12500000 // CHECK-INST: fmov z0.d, p0/m, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 d0 05 fcpy z0.d, p0/m, #-0.13281250 // CHECK-INST: fmov z0.d, p0/m, #-0.13281250 // CHECK-ENCODING: [0x20,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d8 d0 05 fcpy z0.d, p0/m, #-0.14062500 // CHECK-INST: fmov z0.d, p0/m, #-0.14062500 // CHECK-ENCODING: [0x40,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d8 d0 05 fcpy z0.d, p0/m, #-0.14843750 // CHECK-INST: fmov z0.d, p0/m, #-0.14843750 // CHECK-ENCODING: [0x60,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d8 d0 05 fcpy z0.d, p0/m, #-0.15625000 // CHECK-INST: fmov z0.d, p0/m, #-0.15625000 // CHECK-ENCODING: [0x80,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d8 d0 05 fcpy z0.d, p0/m, #-0.16406250 // CHECK-INST: fmov z0.d, p0/m, #-0.16406250 // CHECK-ENCODING: [0xa0,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d8 d0 05 fcpy z0.d, p0/m, #-0.17187500 // CHECK-INST: fmov z0.d, p0/m, #-0.17187500 // CHECK-ENCODING: [0xc0,0xd8,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d8 d0 05 fcpy z0.d, p0/m, #-0.17968750 // CHECK-INST: fmov z0.d, p0/m, #-0.17968750 // CHECK-ENCODING: [0xe0,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d8 d0 05 fcpy z0.d, p0/m, #-0.18750000 // CHECK-INST: fmov z0.d, p0/m, #-0.18750000 // CHECK-ENCODING: [0x00,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d9 d0 05 fcpy z0.d, p0/m, #-0.19531250 // CHECK-INST: fmov z0.d, p0/m, #-0.19531250 // CHECK-ENCODING: [0x20,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d9 d0 05 fcpy z0.d, p0/m, #-0.20312500 // CHECK-INST: fmov z0.d, p0/m, #-0.20312500 // CHECK-ENCODING: [0x40,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d9 d0 05 fcpy z0.d, p0/m, #-0.21093750 // CHECK-INST: fmov z0.d, p0/m, #-0.21093750 // CHECK-ENCODING: [0x60,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d9 d0 05 fcpy z0.d, p0/m, #-0.21875000 // CHECK-INST: fmov z0.d, p0/m, #-0.21875000 // CHECK-ENCODING: [0x80,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d9 d0 05 fcpy z0.d, p0/m, #-0.22656250 // CHECK-INST: fmov z0.d, p0/m, #-0.22656250 // CHECK-ENCODING: [0xa0,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d9 d0 05 fcpy z0.d, p0/m, #-0.23437500 // CHECK-INST: fmov z0.d, p0/m, 
#-0.23437500 // CHECK-ENCODING: [0xc0,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d9 d0 05 fcpy z0.d, p0/m, #-0.24218750 // CHECK-INST: fmov z0.d, p0/m, #-0.24218750 // CHECK-ENCODING: [0xe0,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d9 d0 05 fcpy z0.d, p0/m, #-0.25000000 // CHECK-INST: fmov z0.d, p0/m, #-0.25000000 // CHECK-ENCODING: [0x00,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 da d0 05 fcpy z0.d, p0/m, #-0.26562500 // CHECK-INST: fmov z0.d, p0/m, #-0.26562500 // CHECK-ENCODING: [0x20,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 da d0 05 fcpy z0.d, p0/m, #-0.28125000 // CHECK-INST: fmov z0.d, p0/m, #-0.28125000 // CHECK-ENCODING: [0x40,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 da d0 05 fcpy z0.d, p0/m, #-0.29687500 // CHECK-INST: fmov z0.d, p0/m, #-0.29687500 // CHECK-ENCODING: [0x60,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 da d0 05 fcpy z0.d, p0/m, #-0.31250000 // CHECK-INST: fmov z0.d, p0/m, #-0.31250000 // CHECK-ENCODING: [0x80,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 da d0 05 fcpy z0.d, p0/m, #-0.32812500 // CHECK-INST: fmov z0.d, p0/m, #-0.32812500 // CHECK-ENCODING: [0xa0,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 da d0 05 fcpy z0.d, 
p0/m, #-0.34375000 // CHECK-INST: fmov z0.d, p0/m, #-0.34375000 // CHECK-ENCODING: [0xc0,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 da d0 05 fcpy z0.d, p0/m, #-0.35937500 // CHECK-INST: fmov z0.d, p0/m, #-0.35937500 // CHECK-ENCODING: [0xe0,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 da d0 05 fcpy z0.d, p0/m, #-0.37500000 // CHECK-INST: fmov z0.d, p0/m, #-0.37500000 // CHECK-ENCODING: [0x00,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 db d0 05 fcpy z0.d, p0/m, #-0.39062500 // CHECK-INST: fmov z0.d, p0/m, #-0.39062500 // CHECK-ENCODING: [0x20,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 db d0 05 fcpy z0.d, p0/m, #-0.40625000 // CHECK-INST: fmov z0.d, p0/m, #-0.40625000 // CHECK-ENCODING: [0x40,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 db d0 05 fcpy z0.d, p0/m, #-0.42187500 // CHECK-INST: fmov z0.d, p0/m, #-0.42187500 // CHECK-ENCODING: [0x60,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 db d0 05 fcpy z0.d, p0/m, #-0.43750000 // CHECK-INST: fmov z0.d, p0/m, #-0.43750000 // CHECK-ENCODING: [0x80,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 db d0 05 fcpy z0.d, p0/m, #-0.45312500 // CHECK-INST: fmov z0.d, p0/m, #-0.45312500 // CHECK-ENCODING: [0xa0,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: a0 db d0 05 fcpy z0.d, p0/m, #-0.46875000 // CHECK-INST: fmov z0.d, p0/m, #-0.46875000 // CHECK-ENCODING: [0xc0,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 db d0 05 fcpy z0.d, p0/m, #-0.48437500 // CHECK-INST: fmov z0.d, p0/m, #-0.48437500 // CHECK-ENCODING: [0xe0,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 db d0 05 fcpy z0.d, p0/m, #-0.50000000 // CHECK-INST: fmov z0.d, p0/m, #-0.50000000 // CHECK-ENCODING: [0x00,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 dc d0 05 fcpy z0.d, p0/m, #-0.53125000 // CHECK-INST: fmov z0.d, p0/m, #-0.53125000 // CHECK-ENCODING: [0x20,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dc d0 05 fcpy z0.d, p0/m, #-0.56250000 // CHECK-INST: fmov z0.d, p0/m, #-0.56250000 // CHECK-ENCODING: [0x40,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 dc d0 05 fcpy z0.d, p0/m, #-0.59375000 // CHECK-INST: fmov z0.d, p0/m, #-0.59375000 // CHECK-ENCODING: [0x60,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 dc d0 05 fcpy z0.d, p0/m, #-0.62500000 // CHECK-INST: fmov z0.d, p0/m, #-0.62500000 // CHECK-ENCODING: [0x80,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 dc d0 05 fcpy z0.d, p0/m, #-0.65625000 // CHECK-INST: fmov z0.d, p0/m, #-0.65625000 // CHECK-ENCODING: [0xa0,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 dc d0 05 fcpy z0.d, p0/m, #-0.68750000 // CHECK-INST: fmov z0.d, p0/m, #-0.68750000 // CHECK-ENCODING: [0xc0,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 dc d0 05 fcpy z0.d, p0/m, #-0.71875000 // CHECK-INST: fmov z0.d, p0/m, #-0.71875000 // CHECK-ENCODING: [0xe0,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 dc d0 05 fcpy z0.d, p0/m, #-0.75000000 // CHECK-INST: fmov z0.d, p0/m, #-0.75000000 // CHECK-ENCODING: [0x00,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 dd d0 05 fcpy z0.d, p0/m, #-0.78125000 // CHECK-INST: fmov z0.d, p0/m, #-0.78125000 // CHECK-ENCODING: [0x20,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dd d0 05 fcpy z0.d, p0/m, #-0.81250000 // CHECK-INST: fmov z0.d, p0/m, #-0.81250000 // CHECK-ENCODING: [0x40,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 dd d0 05 fcpy z0.d, p0/m, #-0.84375000 // CHECK-INST: fmov z0.d, p0/m, #-0.84375000 // CHECK-ENCODING: [0x60,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 dd d0 05 fcpy z0.d, p0/m, #-0.87500000 // CHECK-INST: fmov z0.d, p0/m, #-0.87500000 // CHECK-ENCODING: [0x80,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 dd d0 05 fcpy z0.d, p0/m, #-0.90625000 // CHECK-INST: fmov z0.d, p0/m, #-0.90625000 // CHECK-ENCODING: [0xa0,0xdd,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 dd d0 05 fcpy z0.d, p0/m, #-0.93750000 // CHECK-INST: fmov z0.d, p0/m, #-0.93750000 // CHECK-ENCODING: [0xc0,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 dd d0 05 fcpy z0.d, p0/m, #-0.96875000 // CHECK-INST: fmov z0.d, p0/m, #-0.96875000 // CHECK-ENCODING: [0xe0,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 dd d0 05 fcpy z0.d, p0/m, #-1.00000000 // CHECK-INST: fmov z0.d, p0/m, #-1.00000000 // CHECK-ENCODING: [0x00,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 de d0 05 fcpy z0.d, p0/m, #-1.06250000 // CHECK-INST: fmov z0.d, p0/m, #-1.06250000 // CHECK-ENCODING: [0x20,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 de d0 05 fcpy z0.d, p0/m, #-1.12500000 // CHECK-INST: fmov z0.d, p0/m, #-1.12500000 // CHECK-ENCODING: [0x40,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 de d0 05 fcpy z0.d, p0/m, #-1.18750000 // CHECK-INST: fmov z0.d, p0/m, #-1.18750000 // CHECK-ENCODING: [0x60,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 de d0 05 fcpy z0.d, p0/m, #-1.25000000 // CHECK-INST: fmov z0.d, p0/m, #-1.25000000 // CHECK-ENCODING: [0x80,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 de d0 05 fcpy z0.d, p0/m, #-1.31250000 // CHECK-INST: fmov z0.d, p0/m, 
#-1.31250000 // CHECK-ENCODING: [0xa0,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 de d0 05 fcpy z0.d, p0/m, #-1.37500000 // CHECK-INST: fmov z0.d, p0/m, #-1.37500000 // CHECK-ENCODING: [0xc0,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 de d0 05 fcpy z0.d, p0/m, #-1.43750000 // CHECK-INST: fmov z0.d, p0/m, #-1.43750000 // CHECK-ENCODING: [0xe0,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 de d0 05 fcpy z0.d, p0/m, #-1.50000000 // CHECK-INST: fmov z0.d, p0/m, #-1.50000000 // CHECK-ENCODING: [0x00,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 df d0 05 fcpy z0.d, p0/m, #-1.56250000 // CHECK-INST: fmov z0.d, p0/m, #-1.56250000 // CHECK-ENCODING: [0x20,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 df d0 05 fcpy z0.d, p0/m, #-1.62500000 // CHECK-INST: fmov z0.d, p0/m, #-1.62500000 // CHECK-ENCODING: [0x40,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 df d0 05 fcpy z0.d, p0/m, #-1.68750000 // CHECK-INST: fmov z0.d, p0/m, #-1.68750000 // CHECK-ENCODING: [0x60,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 df d0 05 fcpy z0.d, p0/m, #-1.75000000 // CHECK-INST: fmov z0.d, p0/m, #-1.75000000 // CHECK-ENCODING: [0x80,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 df d0 05 fcpy z0.d, 
p0/m, #-1.81250000 // CHECK-INST: fmov z0.d, p0/m, #-1.81250000 // CHECK-ENCODING: [0xa0,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 df d0 05 fcpy z0.d, p0/m, #-1.87500000 // CHECK-INST: fmov z0.d, p0/m, #-1.87500000 // CHECK-ENCODING: [0xc0,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 df d0 05 fcpy z0.d, p0/m, #-1.93750000 // CHECK-INST: fmov z0.d, p0/m, #-1.93750000 // CHECK-ENCODING: [0xe0,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df d0 05 fcpy z0.d, p0/m, #-2.00000000 // CHECK-INST: fmov z0.d, p0/m, #-2.00000000 // CHECK-ENCODING: [0x00,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 d0 05 fcpy z0.d, p0/m, #-2.12500000 // CHECK-INST: fmov z0.d, p0/m, #-2.12500000 // CHECK-ENCODING: [0x20,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d0 d0 05 fcpy z0.d, p0/m, #-2.25000000 // CHECK-INST: fmov z0.d, p0/m, #-2.25000000 // CHECK-ENCODING: [0x40,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d0 d0 05 fcpy z0.d, p0/m, #-2.37500000 // CHECK-INST: fmov z0.d, p0/m, #-2.37500000 // CHECK-ENCODING: [0x60,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d0 d0 05 fcpy z0.d, p0/m, #-2.50000000 // CHECK-INST: fmov z0.d, p0/m, #-2.50000000 // CHECK-ENCODING: [0x80,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: 80 d0 d0 05 fcpy z0.d, p0/m, #-2.62500000 // CHECK-INST: fmov z0.d, p0/m, #-2.62500000 // CHECK-ENCODING: [0xa0,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d0 d0 05 fcpy z0.d, p0/m, #-2.75000000 // CHECK-INST: fmov z0.d, p0/m, #-2.75000000 // CHECK-ENCODING: [0xc0,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d0 d0 05 fcpy z0.d, p0/m, #-2.87500000 // CHECK-INST: fmov z0.d, p0/m, #-2.87500000 // CHECK-ENCODING: [0xe0,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d0 d0 05 fcpy z0.d, p0/m, #-3.00000000 // CHECK-INST: fmov z0.d, p0/m, #-3.00000000 // CHECK-ENCODING: [0x00,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d1 d0 05 fcpy z0.d, p0/m, #-3.12500000 // CHECK-INST: fmov z0.d, p0/m, #-3.12500000 // CHECK-ENCODING: [0x20,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d1 d0 05 fcpy z0.d, p0/m, #-3.25000000 // CHECK-INST: fmov z0.d, p0/m, #-3.25000000 // CHECK-ENCODING: [0x40,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d1 d0 05 fcpy z0.d, p0/m, #-3.37500000 // CHECK-INST: fmov z0.d, p0/m, #-3.37500000 // CHECK-ENCODING: [0x60,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d1 d0 05 fcpy z0.d, p0/m, #-3.50000000 // CHECK-INST: fmov z0.d, p0/m, #-3.50000000 // CHECK-ENCODING: [0x80,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d1 d0 05 fcpy z0.d, p0/m, #-3.62500000 // CHECK-INST: fmov z0.d, p0/m, #-3.62500000 // CHECK-ENCODING: [0xa0,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d1 d0 05 fcpy z0.d, p0/m, #-3.75000000 // CHECK-INST: fmov z0.d, p0/m, #-3.75000000 // CHECK-ENCODING: [0xc0,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d1 d0 05 fcpy z0.d, p0/m, #-3.87500000 // CHECK-INST: fmov z0.d, p0/m, #-3.87500000 // CHECK-ENCODING: [0xe0,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d1 d0 05 fcpy z0.d, p0/m, #-4.00000000 // CHECK-INST: fmov z0.d, p0/m, #-4.00000000 // CHECK-ENCODING: [0x00,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d2 d0 05 fcpy z0.d, p0/m, #-4.25000000 // CHECK-INST: fmov z0.d, p0/m, #-4.25000000 // CHECK-ENCODING: [0x20,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d2 d0 05 fcpy z0.d, p0/m, #-4.50000000 // CHECK-INST: fmov z0.d, p0/m, #-4.50000000 // CHECK-ENCODING: [0x40,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d2 d0 05 fcpy z0.d, p0/m, #-4.75000000 // CHECK-INST: fmov z0.d, p0/m, #-4.75000000 // CHECK-ENCODING: [0x60,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d2 d0 05 fcpy z0.d, p0/m, #-5.00000000 // CHECK-INST: fmov z0.d, p0/m, #-5.00000000 // CHECK-ENCODING: [0x80,0xd2,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d2 d0 05 fcpy z0.d, p0/m, #-5.25000000 // CHECK-INST: fmov z0.d, p0/m, #-5.25000000 // CHECK-ENCODING: [0xa0,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d2 d0 05 fcpy z0.d, p0/m, #-5.50000000 // CHECK-INST: fmov z0.d, p0/m, #-5.50000000 // CHECK-ENCODING: [0xc0,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d2 d0 05 fcpy z0.d, p0/m, #-5.75000000 // CHECK-INST: fmov z0.d, p0/m, #-5.75000000 // CHECK-ENCODING: [0xe0,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d2 d0 05 fcpy z0.d, p0/m, #-6.00000000 // CHECK-INST: fmov z0.d, p0/m, #-6.00000000 // CHECK-ENCODING: [0x00,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d3 d0 05 fcpy z0.d, p0/m, #-6.25000000 // CHECK-INST: fmov z0.d, p0/m, #-6.25000000 // CHECK-ENCODING: [0x20,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d3 d0 05 fcpy z0.d, p0/m, #-6.50000000 // CHECK-INST: fmov z0.d, p0/m, #-6.50000000 // CHECK-ENCODING: [0x40,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d3 d0 05 fcpy z0.d, p0/m, #-6.75000000 // CHECK-INST: fmov z0.d, p0/m, #-6.75000000 // CHECK-ENCODING: [0x60,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d3 d0 05 fcpy z0.d, p0/m, #-7.00000000 // CHECK-INST: fmov z0.d, p0/m, 
#-7.00000000 // CHECK-ENCODING: [0x80,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d3 d0 05 fcpy z0.d, p0/m, #-7.25000000 // CHECK-INST: fmov z0.d, p0/m, #-7.25000000 // CHECK-ENCODING: [0xa0,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d3 d0 05 fcpy z0.d, p0/m, #-7.50000000 // CHECK-INST: fmov z0.d, p0/m, #-7.50000000 // CHECK-ENCODING: [0xc0,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d3 d0 05 fcpy z0.d, p0/m, #-7.75000000 // CHECK-INST: fmov z0.d, p0/m, #-7.75000000 // CHECK-ENCODING: [0xe0,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d3 d0 05 fcpy z0.d, p0/m, #-8.00000000 // CHECK-INST: fmov z0.d, p0/m, #-8.00000000 // CHECK-ENCODING: [0x00,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d4 d0 05 fcpy z0.d, p0/m, #-8.50000000 // CHECK-INST: fmov z0.d, p0/m, #-8.50000000 // CHECK-ENCODING: [0x20,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d4 d0 05 fcpy z0.d, p0/m, #-9.00000000 // CHECK-INST: fmov z0.d, p0/m, #-9.00000000 // CHECK-ENCODING: [0x40,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d4 d0 05 fcpy z0.d, p0/m, #-9.50000000 // CHECK-INST: fmov z0.d, p0/m, #-9.50000000 // CHECK-ENCODING: [0x60,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d4 d0 05 fcpy z0.d, 
p0/m, #-10.00000000 // CHECK-INST: fmov z0.d, p0/m, #-10.00000000 // CHECK-ENCODING: [0x80,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d4 d0 05 fcpy z0.d, p0/m, #-10.50000000 // CHECK-INST: fmov z0.d, p0/m, #-10.50000000 // CHECK-ENCODING: [0xa0,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d4 d0 05 fcpy z0.d, p0/m, #-11.00000000 // CHECK-INST: fmov z0.d, p0/m, #-11.00000000 // CHECK-ENCODING: [0xc0,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d4 d0 05 fcpy z0.d, p0/m, #-11.50000000 // CHECK-INST: fmov z0.d, p0/m, #-11.50000000 // CHECK-ENCODING: [0xe0,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d4 d0 05 fcpy z0.d, p0/m, #-12.00000000 // CHECK-INST: fmov z0.d, p0/m, #-12.00000000 // CHECK-ENCODING: [0x00,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d5 d0 05 fcpy z0.d, p0/m, #-12.50000000 // CHECK-INST: fmov z0.d, p0/m, #-12.50000000 // CHECK-ENCODING: [0x20,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d5 d0 05 fcpy z0.d, p0/m, #-13.00000000 // CHECK-INST: fmov z0.d, p0/m, #-13.00000000 // CHECK-ENCODING: [0x40,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d5 d0 05 fcpy z0.d, p0/m, #-13.50000000 // CHECK-INST: fmov z0.d, p0/m, #-13.50000000 // CHECK-ENCODING: [0x60,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 60 d5 d0 05 fcpy z0.d, p0/m, #-14.00000000 // CHECK-INST: fmov z0.d, p0/m, #-14.00000000 // CHECK-ENCODING: [0x80,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d5 d0 05 fcpy z0.d, p0/m, #-14.50000000 // CHECK-INST: fmov z0.d, p0/m, #-14.50000000 // CHECK-ENCODING: [0xa0,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d5 d0 05 fcpy z0.d, p0/m, #-15.00000000 // CHECK-INST: fmov z0.d, p0/m, #-15.00000000 // CHECK-ENCODING: [0xc0,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d5 d0 05 fcpy z0.d, p0/m, #-15.50000000 // CHECK-INST: fmov z0.d, p0/m, #-15.50000000 // CHECK-ENCODING: [0xe0,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d5 d0 05 fcpy z0.d, p0/m, #-16.00000000 // CHECK-INST: fmov z0.d, p0/m, #-16.00000000 // CHECK-ENCODING: [0x00,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d6 d0 05 fcpy z0.d, p0/m, #-17.00000000 // CHECK-INST: fmov z0.d, p0/m, #-17.00000000 // CHECK-ENCODING: [0x20,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d6 d0 05 fcpy z0.d, p0/m, #-18.00000000 // CHECK-INST: fmov z0.d, p0/m, #-18.00000000 // CHECK-ENCODING: [0x40,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d6 d0 05 fcpy z0.d, p0/m, #-19.00000000 // CHECK-INST: fmov z0.d, p0/m, #-19.00000000 // CHECK-ENCODING: [0x60,0xd6,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d6 d0 05 fcpy z0.d, p0/m, #-20.00000000 // CHECK-INST: fmov z0.d, p0/m, #-20.00000000 // CHECK-ENCODING: [0x80,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d6 d0 05 fcpy z0.d, p0/m, #-21.00000000 // CHECK-INST: fmov z0.d, p0/m, #-21.00000000 // CHECK-ENCODING: [0xa0,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d6 d0 05 fcpy z0.d, p0/m, #-22.00000000 // CHECK-INST: fmov z0.d, p0/m, #-22.00000000 // CHECK-ENCODING: [0xc0,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d6 d0 05 fcpy z0.d, p0/m, #-23.00000000 // CHECK-INST: fmov z0.d, p0/m, #-23.00000000 // CHECK-ENCODING: [0xe0,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d6 d0 05 fcpy z0.d, p0/m, #-24.00000000 // CHECK-INST: fmov z0.d, p0/m, #-24.00000000 // CHECK-ENCODING: [0x00,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d7 d0 05 fcpy z0.d, p0/m, #-25.00000000 // CHECK-INST: fmov z0.d, p0/m, #-25.00000000 // CHECK-ENCODING: [0x20,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d7 d0 05 fcpy z0.d, p0/m, #-26.00000000 // CHECK-INST: fmov z0.d, p0/m, #-26.00000000 // CHECK-ENCODING: [0x40,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d7 d0 05 fcpy z0.d, p0/m, #-27.00000000 // CHECK-INST: fmov z0.d, 
p0/m, #-27.00000000 // CHECK-ENCODING: [0x60,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d7 d0 05 fcpy z0.d, p0/m, #-28.00000000 // CHECK-INST: fmov z0.d, p0/m, #-28.00000000 // CHECK-ENCODING: [0x80,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d7 d0 05 fcpy z0.d, p0/m, #-29.00000000 // CHECK-INST: fmov z0.d, p0/m, #-29.00000000 // CHECK-ENCODING: [0xa0,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d7 d0 05 fcpy z0.d, p0/m, #-30.00000000 // CHECK-INST: fmov z0.d, p0/m, #-30.00000000 // CHECK-ENCODING: [0xc0,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d7 d0 05 fcpy z0.d, p0/m, #-31.00000000 // CHECK-INST: fmov z0.d, p0/m, #-31.00000000 // CHECK-ENCODING: [0xe0,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d7 d0 05 fcpy z0.d, p0/m, #0.12500000 // CHECK-INST: fmov z0.d, p0/m, #0.12500000 // CHECK-ENCODING: [0x00,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 d0 05 fcpy z0.d, p0/m, #0.13281250 // CHECK-INST: fmov z0.d, p0/m, #0.13281250 // CHECK-ENCODING: [0x20,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c8 d0 05 fcpy z0.d, p0/m, #0.14062500 // CHECK-INST: fmov z0.d, p0/m, #0.14062500 // CHECK-ENCODING: [0x40,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c8 d0 05 fcpy 
z0.d, p0/m, #0.14843750 // CHECK-INST: fmov z0.d, p0/m, #0.14843750 // CHECK-ENCODING: [0x60,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c8 d0 05 fcpy z0.d, p0/m, #0.15625000 // CHECK-INST: fmov z0.d, p0/m, #0.15625000 // CHECK-ENCODING: [0x80,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c8 d0 05 fcpy z0.d, p0/m, #0.16406250 // CHECK-INST: fmov z0.d, p0/m, #0.16406250 // CHECK-ENCODING: [0xa0,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c8 d0 05 fcpy z0.d, p0/m, #0.17187500 // CHECK-INST: fmov z0.d, p0/m, #0.17187500 // CHECK-ENCODING: [0xc0,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c8 d0 05 fcpy z0.d, p0/m, #0.17968750 // CHECK-INST: fmov z0.d, p0/m, #0.17968750 // CHECK-ENCODING: [0xe0,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c8 d0 05 fcpy z0.d, p0/m, #0.18750000 // CHECK-INST: fmov z0.d, p0/m, #0.18750000 // CHECK-ENCODING: [0x00,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c9 d0 05 fcpy z0.d, p0/m, #0.19531250 // CHECK-INST: fmov z0.d, p0/m, #0.19531250 // CHECK-ENCODING: [0x20,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c9 d0 05 fcpy z0.d, p0/m, #0.20312500 // CHECK-INST: fmov z0.d, p0/m, #0.20312500 // CHECK-ENCODING: [0x40,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 40 c9 d0 05 fcpy z0.d, p0/m, #0.21093750 // CHECK-INST: fmov z0.d, p0/m, #0.21093750 // CHECK-ENCODING: [0x60,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c9 d0 05 fcpy z0.d, p0/m, #0.21875000 // CHECK-INST: fmov z0.d, p0/m, #0.21875000 // CHECK-ENCODING: [0x80,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c9 d0 05 fcpy z0.d, p0/m, #0.22656250 // CHECK-INST: fmov z0.d, p0/m, #0.22656250 // CHECK-ENCODING: [0xa0,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c9 d0 05 fcpy z0.d, p0/m, #0.23437500 // CHECK-INST: fmov z0.d, p0/m, #0.23437500 // CHECK-ENCODING: [0xc0,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c9 d0 05 fcpy z0.d, p0/m, #0.24218750 // CHECK-INST: fmov z0.d, p0/m, #0.24218750 // CHECK-ENCODING: [0xe0,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c9 d0 05 fcpy z0.d, p0/m, #0.25000000 // CHECK-INST: fmov z0.d, p0/m, #0.25000000 // CHECK-ENCODING: [0x00,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ca d0 05 fcpy z0.d, p0/m, #0.26562500 // CHECK-INST: fmov z0.d, p0/m, #0.26562500 // CHECK-ENCODING: [0x20,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ca d0 05 fcpy z0.d, p0/m, #0.28125000 // CHECK-INST: fmov z0.d, p0/m, #0.28125000 // CHECK-ENCODING: [0x40,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 40 ca d0 05 fcpy z0.d, p0/m, #0.29687500 // CHECK-INST: fmov z0.d, p0/m, #0.29687500 // CHECK-ENCODING: [0x60,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ca d0 05 fcpy z0.d, p0/m, #0.31250000 // CHECK-INST: fmov z0.d, p0/m, #0.31250000 // CHECK-ENCODING: [0x80,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ca d0 05 fcpy z0.d, p0/m, #0.32812500 // CHECK-INST: fmov z0.d, p0/m, #0.32812500 // CHECK-ENCODING: [0xa0,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ca d0 05 fcpy z0.d, p0/m, #0.34375000 // CHECK-INST: fmov z0.d, p0/m, #0.34375000 // CHECK-ENCODING: [0xc0,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 ca d0 05 fcpy z0.d, p0/m, #0.35937500 // CHECK-INST: fmov z0.d, p0/m, #0.35937500 // CHECK-ENCODING: [0xe0,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ca d0 05 fcpy z0.d, p0/m, #0.37500000 // CHECK-INST: fmov z0.d, p0/m, #0.37500000 // CHECK-ENCODING: [0x00,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cb d0 05 fcpy z0.d, p0/m, #0.39062500 // CHECK-INST: fmov z0.d, p0/m, #0.39062500 // CHECK-ENCODING: [0x20,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cb d0 05 fcpy z0.d, p0/m, #0.40625000 // CHECK-INST: fmov z0.d, p0/m, #0.40625000 // CHECK-ENCODING: [0x40,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cb d0 05 fcpy z0.d, p0/m, #0.42187500 // CHECK-INST: fmov z0.d, p0/m, #0.42187500 // CHECK-ENCODING: [0x60,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cb d0 05 fcpy z0.d, p0/m, #0.43750000 // CHECK-INST: fmov z0.d, p0/m, #0.43750000 // CHECK-ENCODING: [0x80,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cb d0 05 fcpy z0.d, p0/m, #0.45312500 // CHECK-INST: fmov z0.d, p0/m, #0.45312500 // CHECK-ENCODING: [0xa0,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cb d0 05 fcpy z0.d, p0/m, #0.46875000 // CHECK-INST: fmov z0.d, p0/m, #0.46875000 // CHECK-ENCODING: [0xc0,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cb d0 05 fcpy z0.d, p0/m, #0.48437500 // CHECK-INST: fmov z0.d, p0/m, #0.48437500 // CHECK-ENCODING: [0xe0,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb d0 05 fcpy z0.d, p0/m, #0.50000000 // CHECK-INST: fmov z0.d, p0/m, #0.50000000 // CHECK-ENCODING: [0x00,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc d0 05 fcpy z0.d, p0/m, #0.53125000 // CHECK-INST: fmov z0.d, p0/m, #0.53125000 // CHECK-ENCODING: [0x20,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cc d0 05 fcpy z0.d, p0/m, #0.56250000 // CHECK-INST: fmov z0.d, p0/m, #0.56250000 // CHECK-ENCODING: [0x40,0xcc,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cc d0 05 fcpy z0.d, p0/m, #0.59375000 // CHECK-INST: fmov z0.d, p0/m, #0.59375000 // CHECK-ENCODING: [0x60,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cc d0 05 fcpy z0.d, p0/m, #0.62500000 // CHECK-INST: fmov z0.d, p0/m, #0.62500000 // CHECK-ENCODING: [0x80,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cc d0 05 fcpy z0.d, p0/m, #0.65625000 // CHECK-INST: fmov z0.d, p0/m, #0.65625000 // CHECK-ENCODING: [0xa0,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cc d0 05 fcpy z0.d, p0/m, #0.68750000 // CHECK-INST: fmov z0.d, p0/m, #0.68750000 // CHECK-ENCODING: [0xc0,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cc d0 05 fcpy z0.d, p0/m, #0.71875000 // CHECK-INST: fmov z0.d, p0/m, #0.71875000 // CHECK-ENCODING: [0xe0,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cc d0 05 fcpy z0.d, p0/m, #0.75000000 // CHECK-INST: fmov z0.d, p0/m, #0.75000000 // CHECK-ENCODING: [0x00,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cd d0 05 fcpy z0.d, p0/m, #0.78125000 // CHECK-INST: fmov z0.d, p0/m, #0.78125000 // CHECK-ENCODING: [0x20,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cd d0 05 fcpy z0.d, p0/m, #0.81250000 // CHECK-INST: fmov z0.d, p0/m, #0.81250000 // 
CHECK-ENCODING: [0x40,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cd d0 05 fcpy z0.d, p0/m, #0.84375000 // CHECK-INST: fmov z0.d, p0/m, #0.84375000 // CHECK-ENCODING: [0x60,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cd d0 05 fcpy z0.d, p0/m, #0.87500000 // CHECK-INST: fmov z0.d, p0/m, #0.87500000 // CHECK-ENCODING: [0x80,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cd d0 05 fcpy z0.d, p0/m, #0.90625000 // CHECK-INST: fmov z0.d, p0/m, #0.90625000 // CHECK-ENCODING: [0xa0,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cd d0 05 fcpy z0.d, p0/m, #0.93750000 // CHECK-INST: fmov z0.d, p0/m, #0.93750000 // CHECK-ENCODING: [0xc0,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cd d0 05 fcpy z0.d, p0/m, #0.96875000 // CHECK-INST: fmov z0.d, p0/m, #0.96875000 // CHECK-ENCODING: [0xe0,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cd d0 05 fcpy z0.d, p0/m, #1.00000000 // CHECK-INST: fmov z0.d, p0/m, #1.00000000 // CHECK-ENCODING: [0x00,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ce d0 05 fcpy z0.d, p0/m, #1.06250000 // CHECK-INST: fmov z0.d, p0/m, #1.06250000 // CHECK-ENCODING: [0x20,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ce d0 05 fcpy z0.d, p0/m, #1.12500000 // CHECK-INST: 
fmov z0.d, p0/m, #1.12500000 // CHECK-ENCODING: [0x40,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ce d0 05 fcpy z0.d, p0/m, #1.18750000 // CHECK-INST: fmov z0.d, p0/m, #1.18750000 // CHECK-ENCODING: [0x60,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ce d0 05 fcpy z0.d, p0/m, #1.25000000 // CHECK-INST: fmov z0.d, p0/m, #1.25000000 // CHECK-ENCODING: [0x80,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ce d0 05 fcpy z0.d, p0/m, #1.31250000 // CHECK-INST: fmov z0.d, p0/m, #1.31250000 // CHECK-ENCODING: [0xa0,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ce d0 05 fcpy z0.d, p0/m, #1.37500000 // CHECK-INST: fmov z0.d, p0/m, #1.37500000 // CHECK-ENCODING: [0xc0,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 ce d0 05 fcpy z0.d, p0/m, #1.43750000 // CHECK-INST: fmov z0.d, p0/m, #1.43750000 // CHECK-ENCODING: [0xe0,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ce d0 05 fcpy z0.d, p0/m, #1.50000000 // CHECK-INST: fmov z0.d, p0/m, #1.50000000 // CHECK-ENCODING: [0x00,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cf d0 05 fcpy z0.d, p0/m, #1.56250000 // CHECK-INST: fmov z0.d, p0/m, #1.56250000 // CHECK-ENCODING: [0x20,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cf d0 05 fcpy z0.d, 
p0/m, #1.62500000 // CHECK-INST: fmov z0.d, p0/m, #1.62500000 // CHECK-ENCODING: [0x40,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cf d0 05 fcpy z0.d, p0/m, #1.68750000 // CHECK-INST: fmov z0.d, p0/m, #1.68750000 // CHECK-ENCODING: [0x60,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cf d0 05 fcpy z0.d, p0/m, #1.75000000 // CHECK-INST: fmov z0.d, p0/m, #1.75000000 // CHECK-ENCODING: [0x80,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cf d0 05 fcpy z0.d, p0/m, #1.81250000 // CHECK-INST: fmov z0.d, p0/m, #1.81250000 // CHECK-ENCODING: [0xa0,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cf d0 05 fcpy z0.d, p0/m, #1.87500000 // CHECK-INST: fmov z0.d, p0/m, #1.87500000 // CHECK-ENCODING: [0xc0,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cf d0 05 fcpy z0.d, p0/m, #1.93750000 // CHECK-INST: fmov z0.d, p0/m, #1.93750000 // CHECK-ENCODING: [0xe0,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf d0 05 fcpy z0.d, p0/m, #2.00000000 // CHECK-INST: fmov z0.d, p0/m, #2.00000000 // CHECK-ENCODING: [0x00,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 d0 05 fcpy z0.d, p0/m, #2.12500000 // CHECK-INST: fmov z0.d, p0/m, #2.12500000 // CHECK-ENCODING: [0x20,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 20 c0 d0 05 fcpy z0.d, p0/m, #2.25000000 // CHECK-INST: fmov z0.d, p0/m, #2.25000000 // CHECK-ENCODING: [0x40,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c0 d0 05 fcpy z0.d, p0/m, #2.37500000 // CHECK-INST: fmov z0.d, p0/m, #2.37500000 // CHECK-ENCODING: [0x60,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c0 d0 05 fcpy z0.d, p0/m, #2.50000000 // CHECK-INST: fmov z0.d, p0/m, #2.50000000 // CHECK-ENCODING: [0x80,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c0 d0 05 fcpy z0.d, p0/m, #2.62500000 // CHECK-INST: fmov z0.d, p0/m, #2.62500000 // CHECK-ENCODING: [0xa0,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c0 d0 05 fcpy z0.d, p0/m, #2.75000000 // CHECK-INST: fmov z0.d, p0/m, #2.75000000 // CHECK-ENCODING: [0xc0,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c0 d0 05 fcpy z0.d, p0/m, #2.87500000 // CHECK-INST: fmov z0.d, p0/m, #2.87500000 // CHECK-ENCODING: [0xe0,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c0 d0 05 fcpy z0.d, p0/m, #3.00000000 // CHECK-INST: fmov z0.d, p0/m, #3.00000000 // CHECK-ENCODING: [0x00,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c1 d0 05 fcpy z0.d, p0/m, #3.12500000 // CHECK-INST: fmov z0.d, p0/m, #3.12500000 // CHECK-ENCODING: [0x20,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 20 c1 d0 05 fcpy z0.d, p0/m, #3.25000000 // CHECK-INST: fmov z0.d, p0/m, #3.25000000 // CHECK-ENCODING: [0x40,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c1 d0 05 fcpy z0.d, p0/m, #3.37500000 // CHECK-INST: fmov z0.d, p0/m, #3.37500000 // CHECK-ENCODING: [0x60,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c1 d0 05 fcpy z0.d, p0/m, #3.50000000 // CHECK-INST: fmov z0.d, p0/m, #3.50000000 // CHECK-ENCODING: [0x80,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c1 d0 05 fcpy z0.d, p0/m, #3.62500000 // CHECK-INST: fmov z0.d, p0/m, #3.62500000 // CHECK-ENCODING: [0xa0,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c1 d0 05 fcpy z0.d, p0/m, #3.75000000 // CHECK-INST: fmov z0.d, p0/m, #3.75000000 // CHECK-ENCODING: [0xc0,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c1 d0 05 fcpy z0.d, p0/m, #3.87500000 // CHECK-INST: fmov z0.d, p0/m, #3.87500000 // CHECK-ENCODING: [0xe0,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c1 d0 05 fcpy z0.d, p0/m, #4.00000000 // CHECK-INST: fmov z0.d, p0/m, #4.00000000 // CHECK-ENCODING: [0x00,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c2 d0 05 fcpy z0.d, p0/m, #4.25000000 // CHECK-INST: fmov z0.d, p0/m, #4.25000000 // CHECK-ENCODING: [0x20,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c2 d0 05 fcpy z0.d, p0/m, #4.50000000 // CHECK-INST: fmov z0.d, p0/m, #4.50000000 // CHECK-ENCODING: [0x40,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c2 d0 05 fcpy z0.d, p0/m, #4.75000000 // CHECK-INST: fmov z0.d, p0/m, #4.75000000 // CHECK-ENCODING: [0x60,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c2 d0 05 fcpy z0.d, p0/m, #5.00000000 // CHECK-INST: fmov z0.d, p0/m, #5.00000000 // CHECK-ENCODING: [0x80,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c2 d0 05 fcpy z0.d, p0/m, #5.25000000 // CHECK-INST: fmov z0.d, p0/m, #5.25000000 // CHECK-ENCODING: [0xa0,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c2 d0 05 fcpy z0.d, p0/m, #5.50000000 // CHECK-INST: fmov z0.d, p0/m, #5.50000000 // CHECK-ENCODING: [0xc0,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c2 d0 05 fcpy z0.d, p0/m, #5.75000000 // CHECK-INST: fmov z0.d, p0/m, #5.75000000 // CHECK-ENCODING: [0xe0,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c2 d0 05 fcpy z0.d, p0/m, #6.00000000 // CHECK-INST: fmov z0.d, p0/m, #6.00000000 // CHECK-ENCODING: [0x00,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c3 d0 05 fcpy z0.d, p0/m, #6.25000000 // CHECK-INST: fmov z0.d, p0/m, #6.25000000 // CHECK-ENCODING: [0x20,0xc3,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c3 d0 05 fcpy z0.d, p0/m, #6.50000000 // CHECK-INST: fmov z0.d, p0/m, #6.50000000 // CHECK-ENCODING: [0x40,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c3 d0 05 fcpy z0.d, p0/m, #6.75000000 // CHECK-INST: fmov z0.d, p0/m, #6.75000000 // CHECK-ENCODING: [0x60,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c3 d0 05 fcpy z0.d, p0/m, #7.00000000 // CHECK-INST: fmov z0.d, p0/m, #7.00000000 // CHECK-ENCODING: [0x80,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c3 d0 05 fcpy z0.d, p0/m, #7.25000000 // CHECK-INST: fmov z0.d, p0/m, #7.25000000 // CHECK-ENCODING: [0xa0,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c3 d0 05 fcpy z0.d, p0/m, #7.50000000 // CHECK-INST: fmov z0.d, p0/m, #7.50000000 // CHECK-ENCODING: [0xc0,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c3 d0 05 fcpy z0.d, p0/m, #7.75000000 // CHECK-INST: fmov z0.d, p0/m, #7.75000000 // CHECK-ENCODING: [0xe0,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 d0 05 fcpy z0.d, p0/m, #8.00000000 // CHECK-INST: fmov z0.d, p0/m, #8.00000000 // CHECK-ENCODING: [0x00,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 d0 05 fcpy z0.d, p0/m, #8.50000000 // CHECK-INST: fmov z0.d, p0/m, #8.50000000 // 
CHECK-ENCODING: [0x20,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c4 d0 05 fcpy z0.d, p0/m, #9.00000000 // CHECK-INST: fmov z0.d, p0/m, #9.00000000 // CHECK-ENCODING: [0x40,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c4 d0 05 fcpy z0.d, p0/m, #9.50000000 // CHECK-INST: fmov z0.d, p0/m, #9.50000000 // CHECK-ENCODING: [0x60,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c4 d0 05 fcpy z0.d, p0/m, #10.00000000 // CHECK-INST: fmov z0.d, p0/m, #10.00000000 // CHECK-ENCODING: [0x80,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c4 d0 05 fcpy z0.d, p0/m, #10.50000000 // CHECK-INST: fmov z0.d, p0/m, #10.50000000 // CHECK-ENCODING: [0xa0,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c4 d0 05 fcpy z0.d, p0/m, #11.00000000 // CHECK-INST: fmov z0.d, p0/m, #11.00000000 // CHECK-ENCODING: [0xc0,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c4 d0 05 fcpy z0.d, p0/m, #11.50000000 // CHECK-INST: fmov z0.d, p0/m, #11.50000000 // CHECK-ENCODING: [0xe0,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c4 d0 05 fcpy z0.d, p0/m, #12.00000000 // CHECK-INST: fmov z0.d, p0/m, #12.00000000 // CHECK-ENCODING: [0x00,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c5 d0 05 fcpy z0.d, p0/m, #12.50000000 // 
CHECK-INST: fmov z0.d, p0/m, #12.50000000 // CHECK-ENCODING: [0x20,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c5 d0 05 fcpy z0.d, p0/m, #13.00000000 // CHECK-INST: fmov z0.d, p0/m, #13.00000000 // CHECK-ENCODING: [0x40,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c5 d0 05 fcpy z0.d, p0/m, #13.50000000 // CHECK-INST: fmov z0.d, p0/m, #13.50000000 // CHECK-ENCODING: [0x60,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c5 d0 05 fcpy z0.d, p0/m, #14.00000000 // CHECK-INST: fmov z0.d, p0/m, #14.00000000 // CHECK-ENCODING: [0x80,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c5 d0 05 fcpy z0.d, p0/m, #14.50000000 // CHECK-INST: fmov z0.d, p0/m, #14.50000000 // CHECK-ENCODING: [0xa0,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c5 d0 05 fcpy z0.d, p0/m, #15.00000000 // CHECK-INST: fmov z0.d, p0/m, #15.00000000 // CHECK-ENCODING: [0xc0,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c5 d0 05 fcpy z0.d, p0/m, #15.50000000 // CHECK-INST: fmov z0.d, p0/m, #15.50000000 // CHECK-ENCODING: [0xe0,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c5 d0 05 fcpy z0.d, p0/m, #16.00000000 // CHECK-INST: fmov z0.d, p0/m, #16.00000000 // CHECK-ENCODING: [0x00,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 c6 d0 05 fcpy z0.d, p0/m, #17.00000000 // CHECK-INST: fmov z0.d, p0/m, #17.00000000 // CHECK-ENCODING: [0x20,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c6 d0 05 fcpy z0.d, p0/m, #18.00000000 // CHECK-INST: fmov z0.d, p0/m, #18.00000000 // CHECK-ENCODING: [0x40,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c6 d0 05 fcpy z0.d, p0/m, #19.00000000 // CHECK-INST: fmov z0.d, p0/m, #19.00000000 // CHECK-ENCODING: [0x60,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c6 d0 05 fcpy z0.d, p0/m, #20.00000000 // CHECK-INST: fmov z0.d, p0/m, #20.00000000 // CHECK-ENCODING: [0x80,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c6 d0 05 fcpy z0.d, p0/m, #21.00000000 // CHECK-INST: fmov z0.d, p0/m, #21.00000000 // CHECK-ENCODING: [0xa0,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c6 d0 05 fcpy z0.d, p0/m, #22.00000000 // CHECK-INST: fmov z0.d, p0/m, #22.00000000 // CHECK-ENCODING: [0xc0,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c6 d0 05 fcpy z0.d, p0/m, #23.00000000 // CHECK-INST: fmov z0.d, p0/m, #23.00000000 // CHECK-ENCODING: [0xe0,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c6 d0 05 fcpy z0.d, p0/m, #24.00000000 // CHECK-INST: fmov z0.d, p0/m, #24.00000000 // CHECK-ENCODING: [0x00,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c7 d0 05 fcpy z0.d, p0/m, #25.00000000 // CHECK-INST: fmov z0.d, p0/m, #25.00000000 // CHECK-ENCODING: [0x20,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c7 d0 05 fcpy z0.d, p0/m, #26.00000000 // CHECK-INST: fmov z0.d, p0/m, #26.00000000 // CHECK-ENCODING: [0x40,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c7 d0 05 fcpy z0.d, p0/m, #27.00000000 // CHECK-INST: fmov z0.d, p0/m, #27.00000000 // CHECK-ENCODING: [0x60,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c7 d0 05 fcpy z0.d, p0/m, #28.00000000 // CHECK-INST: fmov z0.d, p0/m, #28.00000000 // CHECK-ENCODING: [0x80,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c7 d0 05 fcpy z0.d, p0/m, #29.00000000 // CHECK-INST: fmov z0.d, p0/m, #29.00000000 // CHECK-ENCODING: [0xa0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c7 d0 05 fcpy z0.d, p0/m, #30.00000000 // CHECK-INST: fmov z0.d, p0/m, #30.00000000 // CHECK-ENCODING: [0xc0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c7 d0 05 fcpy z0.d, p0/m, #31.00000000 // CHECK-INST: fmov z0.d, p0/m, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 d0 05 @@ -1564,23 +1564,23 @@ fcpy z0.d, p0/m, #31.00000000 movprfx z0.d, p0/z, z7.d // CHECK-INST: movprfx z0.d, p0/z, z7.d // 
CHECK-ENCODING: [0xe0,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 20 d0 04 fcpy z0.d, p0/m, #31.00000000 // CHECK-INST: fmov z0.d, p0/m, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 d0 05 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fcpy z0.d, p0/m, #31.00000000 // CHECK-INST: fmov z0.d, p0/m, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 d0 05 diff --git a/llvm/test/MC/AArch64/SVE/fcvt.s b/llvm/test/MC/AArch64/SVE/fcvt.s index cc9f7722bee39..d68d4f9494506 100644 --- a/llvm/test/MC/AArch64/SVE/fcvt.s +++ b/llvm/test/MC/AArch64/SVE/fcvt.s @@ -12,37 +12,37 @@ fcvt z0.h, p0/m, z0.s // CHECK-INST: fcvt z0.h, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x88,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 88 65 fcvt z0.h, p0/m, z0.d // CHECK-INST: fcvt z0.h, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xc8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c8 65 fcvt z0.s, p0/m, z0.h // CHECK-INST: fcvt z0.s, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x89,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 89 65 fcvt z0.s, p0/m, z0.d // CHECK-INST: fcvt z0.s, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xca,0x65] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 ca 65 fcvt z0.d, p0/m, z0.h // CHECK-INST: fcvt z0.d, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0xc9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c9 65 fcvt z0.d, p0/m, z0.s // CHECK-INST: fcvt z0.d, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0xcb,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 cb 65 @@ -52,23 +52,23 @@ fcvt z0.d, p0/m, z0.s movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 fcvt z5.d, p0/m, z0.s // CHECK-INST: fcvt z5.d, p0/m, z0.s // CHECK-ENCODING: [0x05,0xa0,0xcb,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 cb 65 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 fcvt z5.d, p0/m, z0.s // CHECK-INST: fcvt z5.d, p0/m, z0.s // CHECK-ENCODING: [0x05,0xa0,0xcb,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 cb 65 diff --git a/llvm/test/MC/AArch64/SVE/fcvtzs.s b/llvm/test/MC/AArch64/SVE/fcvtzs.s index a25f9ec13d03e..84ad36694ce80 100644 --- a/llvm/test/MC/AArch64/SVE/fcvtzs.s +++ b/llvm/test/MC/AArch64/SVE/fcvtzs.s @@ -12,43 +12,43 @@ fcvtzs z0.h, p0/m, z0.h // CHECK-INST: fcvtzs z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x5a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 00 a0 5a 65 fcvtzs z0.s, p0/m, z0.h // CHECK-INST: fcvtzs z0.s, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x5c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 5c 65 fcvtzs z0.s, p0/m, z0.s // CHECK-INST: fcvtzs z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x9c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 9c 65 fcvtzs z0.s, p0/m, z0.d // CHECK-INST: fcvtzs z0.s, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd8,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d8 65 fcvtzs z0.d, p0/m, z0.h // CHECK-INST: fcvtzs z0.d, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x5e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 5e 65 fcvtzs z0.d, p0/m, z0.s // CHECK-INST: fcvtzs z0.d, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0xdc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 dc 65 fcvtzs z0.d, p0/m, z0.d // CHECK-INST: fcvtzs z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xde,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 de 65 @@ -58,23 +58,23 @@ fcvtzs z0.d, p0/m, z0.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 fcvtzs z5.d, p0/m, z0.d // CHECK-INST: fcvtzs z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xde,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 05 a0 de 65 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 fcvtzs z5.d, p0/m, z0.d // CHECK-INST: fcvtzs z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xde,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 de 65 diff --git a/llvm/test/MC/AArch64/SVE/fcvtzu.s b/llvm/test/MC/AArch64/SVE/fcvtzu.s index 9ea5dde0021ad..bcab336eb3f8e 100644 --- a/llvm/test/MC/AArch64/SVE/fcvtzu.s +++ b/llvm/test/MC/AArch64/SVE/fcvtzu.s @@ -12,43 +12,43 @@ fcvtzu z0.h, p0/m, z0.h // CHECK-INST: fcvtzu z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x5b,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 5b 65 fcvtzu z0.s, p0/m, z0.h // CHECK-INST: fcvtzu z0.s, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x5d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 5d 65 fcvtzu z0.s, p0/m, z0.s // CHECK-INST: fcvtzu z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x9d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 9d 65 fcvtzu z0.s, p0/m, z0.d // CHECK-INST: fcvtzu z0.s, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d9 65 fcvtzu z0.d, p0/m, z0.h // CHECK-INST: fcvtzu z0.d, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 5f 65 fcvtzu z0.d, p0/m, z0.s // 
CHECK-INST: fcvtzu z0.d, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0xdd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 dd 65 fcvtzu z0.d, p0/m, z0.d // CHECK-INST: fcvtzu z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 df 65 @@ -58,23 +58,23 @@ fcvtzu z0.d, p0/m, z0.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 fcvtzu z5.d, p0/m, z0.d // CHECK-INST: fcvtzu z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 df 65 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 fcvtzu z5.d, p0/m, z0.d // CHECK-INST: fcvtzu z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 df 65 diff --git a/llvm/test/MC/AArch64/SVE/fdiv.s b/llvm/test/MC/AArch64/SVE/fdiv.s index 32477fbe8162d..8b137bdeb7b6c 100644 --- a/llvm/test/MC/AArch64/SVE/fdiv.s +++ b/llvm/test/MC/AArch64/SVE/fdiv.s @@ -12,19 +12,19 @@ fdiv z0.h, p7/m, z0.h, z31.h // CHECK-INST: fdiv z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x4d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 4d 65 fdiv z0.s, p7/m, z0.s, z31.s // CHECK-INST: fdiv z0.s, p7/m, 
z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x8d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 8d 65 fdiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: fdiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xcd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f cd 65 @@ -34,23 +34,23 @@ fdiv z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fdiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: fdiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xcd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f cd 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fdiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: fdiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xcd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f cd 65 diff --git a/llvm/test/MC/AArch64/SVE/fdivr.s b/llvm/test/MC/AArch64/SVE/fdivr.s index e1b33ff6ec213..864e46271710b 100644 --- a/llvm/test/MC/AArch64/SVE/fdivr.s +++ b/llvm/test/MC/AArch64/SVE/fdivr.s @@ -12,19 +12,19 @@ fdivr z0.h, p7/m, z0.h, z31.h // CHECK-INST: fdivr z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x4c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 4c 65 fdivr z0.s, p7/m, z0.s, z31.s // CHECK-INST: 
fdivr z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x8c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 8c 65 fdivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: fdivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xcc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f cc 65 @@ -34,23 +34,23 @@ fdivr z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fdivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: fdivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xcc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f cc 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fdivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: fdivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xcc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f cc 65 diff --git a/llvm/test/MC/AArch64/SVE/fdup.s b/llvm/test/MC/AArch64/SVE/fdup.s index 39413e7fad6d2..7e68f5aaf72b8 100644 --- a/llvm/test/MC/AArch64/SVE/fdup.s +++ b/llvm/test/MC/AArch64/SVE/fdup.s @@ -12,1547 +12,1547 @@ fdup z0.h, #-0.12500000 // CHECK-INST: fmov z0.h, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0x79,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 79 25 fdup z0.s, #-0.12500000 // 
CHECK-INST: fmov z0.s, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0xb9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 b9 25 fdup z0.d, #-0.12500000 // CHECK-INST: fmov z0.d, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 f9 25 fdup z0.d, #-0.13281250 // CHECK-INST: fmov z0.d, #-0.13281250 // CHECK-ENCODING: [0x20,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d8 f9 25 fdup z0.d, #-0.14062500 // CHECK-INST: fmov z0.d, #-0.14062500 // CHECK-ENCODING: [0x40,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d8 f9 25 fdup z0.d, #-0.14843750 // CHECK-INST: fmov z0.d, #-0.14843750 // CHECK-ENCODING: [0x60,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d8 f9 25 fdup z0.d, #-0.15625000 // CHECK-INST: fmov z0.d, #-0.15625000 // CHECK-ENCODING: [0x80,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d8 f9 25 fdup z0.d, #-0.16406250 // CHECK-INST: fmov z0.d, #-0.16406250 // CHECK-ENCODING: [0xa0,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d8 f9 25 fdup z0.d, #-0.17187500 // CHECK-INST: fmov z0.d, #-0.17187500 // CHECK-ENCODING: [0xc0,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d8 f9 25 fdup z0.d, #-0.17968750 // CHECK-INST: fmov z0.d, #-0.17968750 // 
CHECK-ENCODING: [0xe0,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d8 f9 25 fdup z0.d, #-0.18750000 // CHECK-INST: fmov z0.d, #-0.18750000 // CHECK-ENCODING: [0x00,0xd9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d9 f9 25 fdup z0.d, #-0.19531250 // CHECK-INST: fmov z0.d, #-0.19531250 // CHECK-ENCODING: [0x20,0xd9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d9 f9 25 fdup z0.d, #-0.20312500 // CHECK-INST: fmov z0.d, #-0.20312500 // CHECK-ENCODING: [0x40,0xd9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d9 f9 25 fdup z0.d, #-0.21093750 // CHECK-INST: fmov z0.d, #-0.21093750 // CHECK-ENCODING: [0x60,0xd9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d9 f9 25 fdup z0.d, #-0.21875000 // CHECK-INST: fmov z0.d, #-0.21875000 // CHECK-ENCODING: [0x80,0xd9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d9 f9 25 fdup z0.d, #-0.22656250 // CHECK-INST: fmov z0.d, #-0.22656250 // CHECK-ENCODING: [0xa0,0xd9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d9 f9 25 fdup z0.d, #-0.23437500 // CHECK-INST: fmov z0.d, #-0.23437500 // CHECK-ENCODING: [0xc0,0xd9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d9 f9 25 fdup z0.d, #-0.24218750 // CHECK-INST: fmov z0.d, #-0.24218750 // CHECK-ENCODING: [0xe0,0xd9,0xf9,0x25] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d9 f9 25 fdup z0.d, #-0.25000000 // CHECK-INST: fmov z0.d, #-0.25000000 // CHECK-ENCODING: [0x00,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 da f9 25 fdup z0.d, #-0.26562500 // CHECK-INST: fmov z0.d, #-0.26562500 // CHECK-ENCODING: [0x20,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 da f9 25 fdup z0.d, #-0.28125000 // CHECK-INST: fmov z0.d, #-0.28125000 // CHECK-ENCODING: [0x40,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 da f9 25 fdup z0.d, #-0.29687500 // CHECK-INST: fmov z0.d, #-0.29687500 // CHECK-ENCODING: [0x60,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 da f9 25 fdup z0.d, #-0.31250000 // CHECK-INST: fmov z0.d, #-0.31250000 // CHECK-ENCODING: [0x80,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 da f9 25 fdup z0.d, #-0.32812500 // CHECK-INST: fmov z0.d, #-0.32812500 // CHECK-ENCODING: [0xa0,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 da f9 25 fdup z0.d, #-0.34375000 // CHECK-INST: fmov z0.d, #-0.34375000 // CHECK-ENCODING: [0xc0,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 da f9 25 fdup z0.d, #-0.35937500 // CHECK-INST: fmov z0.d, #-0.35937500 // CHECK-ENCODING: [0xe0,0xda,0xf9,0x25] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 da f9 25 fdup z0.d, #-0.37500000 // CHECK-INST: fmov z0.d, #-0.37500000 // CHECK-ENCODING: [0x00,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 db f9 25 fdup z0.d, #-0.39062500 // CHECK-INST: fmov z0.d, #-0.39062500 // CHECK-ENCODING: [0x20,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 db f9 25 fdup z0.d, #-0.40625000 // CHECK-INST: fmov z0.d, #-0.40625000 // CHECK-ENCODING: [0x40,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 db f9 25 fdup z0.d, #-0.42187500 // CHECK-INST: fmov z0.d, #-0.42187500 // CHECK-ENCODING: [0x60,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 db f9 25 fdup z0.d, #-0.43750000 // CHECK-INST: fmov z0.d, #-0.43750000 // CHECK-ENCODING: [0x80,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 db f9 25 fdup z0.d, #-0.45312500 // CHECK-INST: fmov z0.d, #-0.45312500 // CHECK-ENCODING: [0xa0,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 db f9 25 fdup z0.d, #-0.46875000 // CHECK-INST: fmov z0.d, #-0.46875000 // CHECK-ENCODING: [0xc0,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 db f9 25 fdup z0.d, #-0.48437500 // CHECK-INST: fmov z0.d, #-0.48437500 // CHECK-ENCODING: [0xe0,0xdb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: e0 db f9 25 fdup z0.d, #-0.50000000 // CHECK-INST: fmov z0.d, #-0.50000000 // CHECK-ENCODING: [0x00,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 dc f9 25 fdup z0.d, #-0.53125000 // CHECK-INST: fmov z0.d, #-0.53125000 // CHECK-ENCODING: [0x20,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dc f9 25 fdup z0.d, #-0.56250000 // CHECK-INST: fmov z0.d, #-0.56250000 // CHECK-ENCODING: [0x40,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 dc f9 25 fdup z0.d, #-0.59375000 // CHECK-INST: fmov z0.d, #-0.59375000 // CHECK-ENCODING: [0x60,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 dc f9 25 fdup z0.d, #-0.62500000 // CHECK-INST: fmov z0.d, #-0.62500000 // CHECK-ENCODING: [0x80,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 dc f9 25 fdup z0.d, #-0.65625000 // CHECK-INST: fmov z0.d, #-0.65625000 // CHECK-ENCODING: [0xa0,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 dc f9 25 fdup z0.d, #-0.68750000 // CHECK-INST: fmov z0.d, #-0.68750000 // CHECK-ENCODING: [0xc0,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 dc f9 25 fdup z0.d, #-0.71875000 // CHECK-INST: fmov z0.d, #-0.71875000 // CHECK-ENCODING: [0xe0,0xdc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 dc f9 25 fdup z0.d, #-0.75000000 // CHECK-INST: fmov z0.d, #-0.75000000 // CHECK-ENCODING: [0x00,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 dd f9 25 fdup z0.d, #-0.78125000 // CHECK-INST: fmov z0.d, #-0.78125000 // CHECK-ENCODING: [0x20,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dd f9 25 fdup z0.d, #-0.81250000 // CHECK-INST: fmov z0.d, #-0.81250000 // CHECK-ENCODING: [0x40,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 dd f9 25 fdup z0.d, #-0.84375000 // CHECK-INST: fmov z0.d, #-0.84375000 // CHECK-ENCODING: [0x60,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 dd f9 25 fdup z0.d, #-0.87500000 // CHECK-INST: fmov z0.d, #-0.87500000 // CHECK-ENCODING: [0x80,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 dd f9 25 fdup z0.d, #-0.90625000 // CHECK-INST: fmov z0.d, #-0.90625000 // CHECK-ENCODING: [0xa0,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 dd f9 25 fdup z0.d, #-0.93750000 // CHECK-INST: fmov z0.d, #-0.93750000 // CHECK-ENCODING: [0xc0,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 dd f9 25 fdup z0.d, #-0.96875000 // CHECK-INST: fmov z0.d, #-0.96875000 // CHECK-ENCODING: [0xe0,0xdd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 dd f9 25 fdup z0.d, 
#-1.00000000 // CHECK-INST: fmov z0.d, #-1.00000000 // CHECK-ENCODING: [0x00,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 de f9 25 fdup z0.d, #-1.06250000 // CHECK-INST: fmov z0.d, #-1.06250000 // CHECK-ENCODING: [0x20,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 de f9 25 fdup z0.d, #-1.12500000 // CHECK-INST: fmov z0.d, #-1.12500000 // CHECK-ENCODING: [0x40,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 de f9 25 fdup z0.d, #-1.18750000 // CHECK-INST: fmov z0.d, #-1.18750000 // CHECK-ENCODING: [0x60,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 de f9 25 fdup z0.d, #-1.25000000 // CHECK-INST: fmov z0.d, #-1.25000000 // CHECK-ENCODING: [0x80,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 de f9 25 fdup z0.d, #-1.31250000 // CHECK-INST: fmov z0.d, #-1.31250000 // CHECK-ENCODING: [0xa0,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 de f9 25 fdup z0.d, #-1.37500000 // CHECK-INST: fmov z0.d, #-1.37500000 // CHECK-ENCODING: [0xc0,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 de f9 25 fdup z0.d, #-1.43750000 // CHECK-INST: fmov z0.d, #-1.43750000 // CHECK-ENCODING: [0xe0,0xde,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 de f9 25 fdup z0.d, #-1.50000000 // CHECK-INST: fmov z0.d, 
#-1.50000000 // CHECK-ENCODING: [0x00,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 df f9 25 fdup z0.d, #-1.56250000 // CHECK-INST: fmov z0.d, #-1.56250000 // CHECK-ENCODING: [0x20,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 df f9 25 fdup z0.d, #-1.62500000 // CHECK-INST: fmov z0.d, #-1.62500000 // CHECK-ENCODING: [0x40,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 df f9 25 fdup z0.d, #-1.68750000 // CHECK-INST: fmov z0.d, #-1.68750000 // CHECK-ENCODING: [0x60,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 df f9 25 fdup z0.d, #-1.75000000 // CHECK-INST: fmov z0.d, #-1.75000000 // CHECK-ENCODING: [0x80,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 df f9 25 fdup z0.d, #-1.81250000 // CHECK-INST: fmov z0.d, #-1.81250000 // CHECK-ENCODING: [0xa0,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 df f9 25 fdup z0.d, #-1.87500000 // CHECK-INST: fmov z0.d, #-1.87500000 // CHECK-ENCODING: [0xc0,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 df f9 25 fdup z0.d, #-1.93750000 // CHECK-INST: fmov z0.d, #-1.93750000 // CHECK-ENCODING: [0xe0,0xdf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df f9 25 fdup z0.d, #-2.00000000 // CHECK-INST: fmov z0.d, #-2.00000000 // CHECK-ENCODING: 
[0x00,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 f9 25 fdup z0.d, #-2.12500000 // CHECK-INST: fmov z0.d, #-2.12500000 // CHECK-ENCODING: [0x20,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d0 f9 25 fdup z0.d, #-2.25000000 // CHECK-INST: fmov z0.d, #-2.25000000 // CHECK-ENCODING: [0x40,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d0 f9 25 fdup z0.d, #-2.37500000 // CHECK-INST: fmov z0.d, #-2.37500000 // CHECK-ENCODING: [0x60,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d0 f9 25 fdup z0.d, #-2.50000000 // CHECK-INST: fmov z0.d, #-2.50000000 // CHECK-ENCODING: [0x80,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d0 f9 25 fdup z0.d, #-2.62500000 // CHECK-INST: fmov z0.d, #-2.62500000 // CHECK-ENCODING: [0xa0,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d0 f9 25 fdup z0.d, #-2.75000000 // CHECK-INST: fmov z0.d, #-2.75000000 // CHECK-ENCODING: [0xc0,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d0 f9 25 fdup z0.d, #-2.87500000 // CHECK-INST: fmov z0.d, #-2.87500000 // CHECK-ENCODING: [0xe0,0xd0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d0 f9 25 fdup z0.d, #-3.00000000 // CHECK-INST: fmov z0.d, #-3.00000000 // CHECK-ENCODING: [0x00,0xd1,0xf9,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d1 f9 25 fdup z0.d, #-3.12500000 // CHECK-INST: fmov z0.d, #-3.12500000 // CHECK-ENCODING: [0x20,0xd1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d1 f9 25 fdup z0.d, #-3.25000000 // CHECK-INST: fmov z0.d, #-3.25000000 // CHECK-ENCODING: [0x40,0xd1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d1 f9 25 fdup z0.d, #-3.37500000 // CHECK-INST: fmov z0.d, #-3.37500000 // CHECK-ENCODING: [0x60,0xd1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d1 f9 25 fdup z0.d, #-3.50000000 // CHECK-INST: fmov z0.d, #-3.50000000 // CHECK-ENCODING: [0x80,0xd1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d1 f9 25 fdup z0.d, #-3.62500000 // CHECK-INST: fmov z0.d, #-3.62500000 // CHECK-ENCODING: [0xa0,0xd1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d1 f9 25 fdup z0.d, #-3.75000000 // CHECK-INST: fmov z0.d, #-3.75000000 // CHECK-ENCODING: [0xc0,0xd1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d1 f9 25 fdup z0.d, #-3.87500000 // CHECK-INST: fmov z0.d, #-3.87500000 // CHECK-ENCODING: [0xe0,0xd1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d1 f9 25 fdup z0.d, #-4.00000000 // CHECK-INST: fmov z0.d, #-4.00000000 // CHECK-ENCODING: [0x00,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d2 f9 25 fdup z0.d, #-4.25000000 // CHECK-INST: fmov z0.d, #-4.25000000 // CHECK-ENCODING: [0x20,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d2 f9 25 fdup z0.d, #-4.50000000 // CHECK-INST: fmov z0.d, #-4.50000000 // CHECK-ENCODING: [0x40,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d2 f9 25 fdup z0.d, #-4.75000000 // CHECK-INST: fmov z0.d, #-4.75000000 // CHECK-ENCODING: [0x60,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d2 f9 25 fdup z0.d, #-5.00000000 // CHECK-INST: fmov z0.d, #-5.00000000 // CHECK-ENCODING: [0x80,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d2 f9 25 fdup z0.d, #-5.25000000 // CHECK-INST: fmov z0.d, #-5.25000000 // CHECK-ENCODING: [0xa0,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d2 f9 25 fdup z0.d, #-5.50000000 // CHECK-INST: fmov z0.d, #-5.50000000 // CHECK-ENCODING: [0xc0,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d2 f9 25 fdup z0.d, #-5.75000000 // CHECK-INST: fmov z0.d, #-5.75000000 // CHECK-ENCODING: [0xe0,0xd2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d2 f9 25 fdup z0.d, #-6.00000000 // CHECK-INST: fmov z0.d, #-6.00000000 // CHECK-ENCODING: [0x00,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme 
// CHECK-UNKNOWN: 00 d3 f9 25 fdup z0.d, #-6.25000000 // CHECK-INST: fmov z0.d, #-6.25000000 // CHECK-ENCODING: [0x20,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d3 f9 25 fdup z0.d, #-6.50000000 // CHECK-INST: fmov z0.d, #-6.50000000 // CHECK-ENCODING: [0x40,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d3 f9 25 fdup z0.d, #-6.75000000 // CHECK-INST: fmov z0.d, #-6.75000000 // CHECK-ENCODING: [0x60,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d3 f9 25 fdup z0.d, #-7.00000000 // CHECK-INST: fmov z0.d, #-7.00000000 // CHECK-ENCODING: [0x80,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d3 f9 25 fdup z0.d, #-7.25000000 // CHECK-INST: fmov z0.d, #-7.25000000 // CHECK-ENCODING: [0xa0,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d3 f9 25 fdup z0.d, #-7.50000000 // CHECK-INST: fmov z0.d, #-7.50000000 // CHECK-ENCODING: [0xc0,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d3 f9 25 fdup z0.d, #-7.75000000 // CHECK-INST: fmov z0.d, #-7.75000000 // CHECK-ENCODING: [0xe0,0xd3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d3 f9 25 fdup z0.d, #-8.00000000 // CHECK-INST: fmov z0.d, #-8.00000000 // CHECK-ENCODING: [0x00,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d4 f9 25 fdup z0.d, 
#-8.50000000 // CHECK-INST: fmov z0.d, #-8.50000000 // CHECK-ENCODING: [0x20,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d4 f9 25 fdup z0.d, #-9.00000000 // CHECK-INST: fmov z0.d, #-9.00000000 // CHECK-ENCODING: [0x40,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d4 f9 25 fdup z0.d, #-9.50000000 // CHECK-INST: fmov z0.d, #-9.50000000 // CHECK-ENCODING: [0x60,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d4 f9 25 fdup z0.d, #-10.00000000 // CHECK-INST: fmov z0.d, #-10.00000000 // CHECK-ENCODING: [0x80,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d4 f9 25 fdup z0.d, #-10.50000000 // CHECK-INST: fmov z0.d, #-10.50000000 // CHECK-ENCODING: [0xa0,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d4 f9 25 fdup z0.d, #-11.00000000 // CHECK-INST: fmov z0.d, #-11.00000000 // CHECK-ENCODING: [0xc0,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d4 f9 25 fdup z0.d, #-11.50000000 // CHECK-INST: fmov z0.d, #-11.50000000 // CHECK-ENCODING: [0xe0,0xd4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d4 f9 25 fdup z0.d, #-12.00000000 // CHECK-INST: fmov z0.d, #-12.00000000 // CHECK-ENCODING: [0x00,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d5 f9 25 fdup z0.d, #-12.50000000 // CHECK-INST: fmov 
z0.d, #-12.50000000 // CHECK-ENCODING: [0x20,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d5 f9 25 fdup z0.d, #-13.00000000 // CHECK-INST: fmov z0.d, #-13.00000000 // CHECK-ENCODING: [0x40,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d5 f9 25 fdup z0.d, #-13.50000000 // CHECK-INST: fmov z0.d, #-13.50000000 // CHECK-ENCODING: [0x60,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d5 f9 25 fdup z0.d, #-14.00000000 // CHECK-INST: fmov z0.d, #-14.00000000 // CHECK-ENCODING: [0x80,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d5 f9 25 fdup z0.d, #-14.50000000 // CHECK-INST: fmov z0.d, #-14.50000000 // CHECK-ENCODING: [0xa0,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d5 f9 25 fdup z0.d, #-15.00000000 // CHECK-INST: fmov z0.d, #-15.00000000 // CHECK-ENCODING: [0xc0,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d5 f9 25 fdup z0.d, #-15.50000000 // CHECK-INST: fmov z0.d, #-15.50000000 // CHECK-ENCODING: [0xe0,0xd5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d5 f9 25 fdup z0.d, #-16.00000000 // CHECK-INST: fmov z0.d, #-16.00000000 // CHECK-ENCODING: [0x00,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d6 f9 25 fdup z0.d, #-17.00000000 // CHECK-INST: fmov z0.d, #-17.00000000 // 
CHECK-ENCODING: [0x20,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d6 f9 25 fdup z0.d, #-18.00000000 // CHECK-INST: fmov z0.d, #-18.00000000 // CHECK-ENCODING: [0x40,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d6 f9 25 fdup z0.d, #-19.00000000 // CHECK-INST: fmov z0.d, #-19.00000000 // CHECK-ENCODING: [0x60,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d6 f9 25 fdup z0.d, #-20.00000000 // CHECK-INST: fmov z0.d, #-20.00000000 // CHECK-ENCODING: [0x80,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d6 f9 25 fdup z0.d, #-21.00000000 // CHECK-INST: fmov z0.d, #-21.00000000 // CHECK-ENCODING: [0xa0,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d6 f9 25 fdup z0.d, #-22.00000000 // CHECK-INST: fmov z0.d, #-22.00000000 // CHECK-ENCODING: [0xc0,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d6 f9 25 fdup z0.d, #-23.00000000 // CHECK-INST: fmov z0.d, #-23.00000000 // CHECK-ENCODING: [0xe0,0xd6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d6 f9 25 fdup z0.d, #-24.00000000 // CHECK-INST: fmov z0.d, #-24.00000000 // CHECK-ENCODING: [0x00,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d7 f9 25 fdup z0.d, #-25.00000000 // CHECK-INST: fmov z0.d, #-25.00000000 // CHECK-ENCODING: 
[0x20,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d7 f9 25 fdup z0.d, #-26.00000000 // CHECK-INST: fmov z0.d, #-26.00000000 // CHECK-ENCODING: [0x40,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d7 f9 25 fdup z0.d, #-27.00000000 // CHECK-INST: fmov z0.d, #-27.00000000 // CHECK-ENCODING: [0x60,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d7 f9 25 fdup z0.d, #-28.00000000 // CHECK-INST: fmov z0.d, #-28.00000000 // CHECK-ENCODING: [0x80,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d7 f9 25 fdup z0.d, #-29.00000000 // CHECK-INST: fmov z0.d, #-29.00000000 // CHECK-ENCODING: [0xa0,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d7 f9 25 fdup z0.d, #-30.00000000 // CHECK-INST: fmov z0.d, #-30.00000000 // CHECK-ENCODING: [0xc0,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d7 f9 25 fdup z0.d, #-31.00000000 // CHECK-INST: fmov z0.d, #-31.00000000 // CHECK-ENCODING: [0xe0,0xd7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d7 f9 25 fdup z0.d, #0.12500000 // CHECK-INST: fmov z0.d, #0.12500000 // CHECK-ENCODING: [0x00,0xc8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 f9 25 fdup z0.d, #0.13281250 // CHECK-INST: fmov z0.d, #0.13281250 // CHECK-ENCODING: [0x20,0xc8,0xf9,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c8 f9 25 fdup z0.d, #0.14062500 // CHECK-INST: fmov z0.d, #0.14062500 // CHECK-ENCODING: [0x40,0xc8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c8 f9 25 fdup z0.d, #0.14843750 // CHECK-INST: fmov z0.d, #0.14843750 // CHECK-ENCODING: [0x60,0xc8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c8 f9 25 fdup z0.d, #0.15625000 // CHECK-INST: fmov z0.d, #0.15625000 // CHECK-ENCODING: [0x80,0xc8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c8 f9 25 fdup z0.d, #0.16406250 // CHECK-INST: fmov z0.d, #0.16406250 // CHECK-ENCODING: [0xa0,0xc8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c8 f9 25 fdup z0.d, #0.17187500 // CHECK-INST: fmov z0.d, #0.17187500 // CHECK-ENCODING: [0xc0,0xc8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c8 f9 25 fdup z0.d, #0.17968750 // CHECK-INST: fmov z0.d, #0.17968750 // CHECK-ENCODING: [0xe0,0xc8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c8 f9 25 fdup z0.d, #0.18750000 // CHECK-INST: fmov z0.d, #0.18750000 // CHECK-ENCODING: [0x00,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c9 f9 25 fdup z0.d, #0.19531250 // CHECK-INST: fmov z0.d, #0.19531250 // CHECK-ENCODING: [0x20,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 20 c9 f9 25 fdup z0.d, #0.20312500 // CHECK-INST: fmov z0.d, #0.20312500 // CHECK-ENCODING: [0x40,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c9 f9 25 fdup z0.d, #0.21093750 // CHECK-INST: fmov z0.d, #0.21093750 // CHECK-ENCODING: [0x60,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c9 f9 25 fdup z0.d, #0.21875000 // CHECK-INST: fmov z0.d, #0.21875000 // CHECK-ENCODING: [0x80,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c9 f9 25 fdup z0.d, #0.22656250 // CHECK-INST: fmov z0.d, #0.22656250 // CHECK-ENCODING: [0xa0,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c9 f9 25 fdup z0.d, #0.23437500 // CHECK-INST: fmov z0.d, #0.23437500 // CHECK-ENCODING: [0xc0,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c9 f9 25 fdup z0.d, #0.24218750 // CHECK-INST: fmov z0.d, #0.24218750 // CHECK-ENCODING: [0xe0,0xc9,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c9 f9 25 fdup z0.d, #0.25000000 // CHECK-INST: fmov z0.d, #0.25000000 // CHECK-ENCODING: [0x00,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ca f9 25 fdup z0.d, #0.26562500 // CHECK-INST: fmov z0.d, #0.26562500 // CHECK-ENCODING: [0x20,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ca f9 25 
fdup z0.d, #0.28125000 // CHECK-INST: fmov z0.d, #0.28125000 // CHECK-ENCODING: [0x40,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ca f9 25 fdup z0.d, #0.29687500 // CHECK-INST: fmov z0.d, #0.29687500 // CHECK-ENCODING: [0x60,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ca f9 25 fdup z0.d, #0.31250000 // CHECK-INST: fmov z0.d, #0.31250000 // CHECK-ENCODING: [0x80,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ca f9 25 fdup z0.d, #0.32812500 // CHECK-INST: fmov z0.d, #0.32812500 // CHECK-ENCODING: [0xa0,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ca f9 25 fdup z0.d, #0.34375000 // CHECK-INST: fmov z0.d, #0.34375000 // CHECK-ENCODING: [0xc0,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 ca f9 25 fdup z0.d, #0.35937500 // CHECK-INST: fmov z0.d, #0.35937500 // CHECK-ENCODING: [0xe0,0xca,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ca f9 25 fdup z0.d, #0.37500000 // CHECK-INST: fmov z0.d, #0.37500000 // CHECK-ENCODING: [0x00,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cb f9 25 fdup z0.d, #0.39062500 // CHECK-INST: fmov z0.d, #0.39062500 // CHECK-ENCODING: [0x20,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cb f9 25 fdup z0.d, #0.40625000 // CHECK-INST: fmov z0.d, #0.40625000 // 
CHECK-ENCODING: [0x40,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cb f9 25 fdup z0.d, #0.42187500 // CHECK-INST: fmov z0.d, #0.42187500 // CHECK-ENCODING: [0x60,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cb f9 25 fdup z0.d, #0.43750000 // CHECK-INST: fmov z0.d, #0.43750000 // CHECK-ENCODING: [0x80,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cb f9 25 fdup z0.d, #0.45312500 // CHECK-INST: fmov z0.d, #0.45312500 // CHECK-ENCODING: [0xa0,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cb f9 25 fdup z0.d, #0.46875000 // CHECK-INST: fmov z0.d, #0.46875000 // CHECK-ENCODING: [0xc0,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cb f9 25 fdup z0.d, #0.48437500 // CHECK-INST: fmov z0.d, #0.48437500 // CHECK-ENCODING: [0xe0,0xcb,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb f9 25 fdup z0.d, #0.50000000 // CHECK-INST: fmov z0.d, #0.50000000 // CHECK-ENCODING: [0x00,0xcc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc f9 25 fdup z0.d, #0.53125000 // CHECK-INST: fmov z0.d, #0.53125000 // CHECK-ENCODING: [0x20,0xcc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cc f9 25 fdup z0.d, #0.56250000 // CHECK-INST: fmov z0.d, #0.56250000 // CHECK-ENCODING: [0x40,0xcc,0xf9,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cc f9 25 fdup z0.d, #0.59375000 // CHECK-INST: fmov z0.d, #0.59375000 // CHECK-ENCODING: [0x60,0xcc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cc f9 25 fdup z0.d, #0.62500000 // CHECK-INST: fmov z0.d, #0.62500000 // CHECK-ENCODING: [0x80,0xcc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cc f9 25 fdup z0.d, #0.65625000 // CHECK-INST: fmov z0.d, #0.65625000 // CHECK-ENCODING: [0xa0,0xcc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cc f9 25 fdup z0.d, #0.68750000 // CHECK-INST: fmov z0.d, #0.68750000 // CHECK-ENCODING: [0xc0,0xcc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cc f9 25 fdup z0.d, #0.71875000 // CHECK-INST: fmov z0.d, #0.71875000 // CHECK-ENCODING: [0xe0,0xcc,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cc f9 25 fdup z0.d, #0.75000000 // CHECK-INST: fmov z0.d, #0.75000000 // CHECK-ENCODING: [0x00,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cd f9 25 fdup z0.d, #0.78125000 // CHECK-INST: fmov z0.d, #0.78125000 // CHECK-ENCODING: [0x20,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cd f9 25 fdup z0.d, #0.81250000 // CHECK-INST: fmov z0.d, #0.81250000 // CHECK-ENCODING: [0x40,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 40 cd f9 25 fdup z0.d, #0.84375000 // CHECK-INST: fmov z0.d, #0.84375000 // CHECK-ENCODING: [0x60,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cd f9 25 fdup z0.d, #0.87500000 // CHECK-INST: fmov z0.d, #0.87500000 // CHECK-ENCODING: [0x80,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cd f9 25 fdup z0.d, #0.90625000 // CHECK-INST: fmov z0.d, #0.90625000 // CHECK-ENCODING: [0xa0,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cd f9 25 fdup z0.d, #0.93750000 // CHECK-INST: fmov z0.d, #0.93750000 // CHECK-ENCODING: [0xc0,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cd f9 25 fdup z0.d, #0.96875000 // CHECK-INST: fmov z0.d, #0.96875000 // CHECK-ENCODING: [0xe0,0xcd,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cd f9 25 fdup z0.d, #1.00000000 // CHECK-INST: fmov z0.d, #1.00000000 // CHECK-ENCODING: [0x00,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ce f9 25 fdup z0.d, #1.06250000 // CHECK-INST: fmov z0.d, #1.06250000 // CHECK-ENCODING: [0x20,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ce f9 25 fdup z0.d, #1.12500000 // CHECK-INST: fmov z0.d, #1.12500000 // CHECK-ENCODING: [0x40,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ce f9 25 
fdup z0.d, #1.18750000 // CHECK-INST: fmov z0.d, #1.18750000 // CHECK-ENCODING: [0x60,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ce f9 25 fdup z0.d, #1.25000000 // CHECK-INST: fmov z0.d, #1.25000000 // CHECK-ENCODING: [0x80,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ce f9 25 fdup z0.d, #1.31250000 // CHECK-INST: fmov z0.d, #1.31250000 // CHECK-ENCODING: [0xa0,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ce f9 25 fdup z0.d, #1.37500000 // CHECK-INST: fmov z0.d, #1.37500000 // CHECK-ENCODING: [0xc0,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 ce f9 25 fdup z0.d, #1.43750000 // CHECK-INST: fmov z0.d, #1.43750000 // CHECK-ENCODING: [0xe0,0xce,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ce f9 25 fdup z0.d, #1.50000000 // CHECK-INST: fmov z0.d, #1.50000000 // CHECK-ENCODING: [0x00,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cf f9 25 fdup z0.d, #1.56250000 // CHECK-INST: fmov z0.d, #1.56250000 // CHECK-ENCODING: [0x20,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cf f9 25 fdup z0.d, #1.62500000 // CHECK-INST: fmov z0.d, #1.62500000 // CHECK-ENCODING: [0x40,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cf f9 25 fdup z0.d, #1.68750000 // CHECK-INST: fmov z0.d, #1.68750000 // 
CHECK-ENCODING: [0x60,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cf f9 25 fdup z0.d, #1.75000000 // CHECK-INST: fmov z0.d, #1.75000000 // CHECK-ENCODING: [0x80,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cf f9 25 fdup z0.d, #1.81250000 // CHECK-INST: fmov z0.d, #1.81250000 // CHECK-ENCODING: [0xa0,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cf f9 25 fdup z0.d, #1.87500000 // CHECK-INST: fmov z0.d, #1.87500000 // CHECK-ENCODING: [0xc0,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cf f9 25 fdup z0.d, #1.93750000 // CHECK-INST: fmov z0.d, #1.93750000 // CHECK-ENCODING: [0xe0,0xcf,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf f9 25 fdup z0.d, #2.00000000 // CHECK-INST: fmov z0.d, #2.00000000 // CHECK-ENCODING: [0x00,0xc0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 f9 25 fdup z0.d, #2.12500000 // CHECK-INST: fmov z0.d, #2.12500000 // CHECK-ENCODING: [0x20,0xc0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c0 f9 25 fdup z0.d, #2.25000000 // CHECK-INST: fmov z0.d, #2.25000000 // CHECK-ENCODING: [0x40,0xc0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c0 f9 25 fdup z0.d, #2.37500000 // CHECK-INST: fmov z0.d, #2.37500000 // CHECK-ENCODING: [0x60,0xc0,0xf9,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c0 f9 25 fdup z0.d, #2.50000000 // CHECK-INST: fmov z0.d, #2.50000000 // CHECK-ENCODING: [0x80,0xc0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c0 f9 25 fdup z0.d, #2.62500000 // CHECK-INST: fmov z0.d, #2.62500000 // CHECK-ENCODING: [0xa0,0xc0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c0 f9 25 fdup z0.d, #2.75000000 // CHECK-INST: fmov z0.d, #2.75000000 // CHECK-ENCODING: [0xc0,0xc0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c0 f9 25 fdup z0.d, #2.87500000 // CHECK-INST: fmov z0.d, #2.87500000 // CHECK-ENCODING: [0xe0,0xc0,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c0 f9 25 fdup z0.d, #3.00000000 // CHECK-INST: fmov z0.d, #3.00000000 // CHECK-ENCODING: [0x00,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c1 f9 25 fdup z0.d, #3.12500000 // CHECK-INST: fmov z0.d, #3.12500000 // CHECK-ENCODING: [0x20,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c1 f9 25 fdup z0.d, #3.25000000 // CHECK-INST: fmov z0.d, #3.25000000 // CHECK-ENCODING: [0x40,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c1 f9 25 fdup z0.d, #3.37500000 // CHECK-INST: fmov z0.d, #3.37500000 // CHECK-ENCODING: [0x60,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 60 c1 f9 25 fdup z0.d, #3.50000000 // CHECK-INST: fmov z0.d, #3.50000000 // CHECK-ENCODING: [0x80,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c1 f9 25 fdup z0.d, #3.62500000 // CHECK-INST: fmov z0.d, #3.62500000 // CHECK-ENCODING: [0xa0,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c1 f9 25 fdup z0.d, #3.75000000 // CHECK-INST: fmov z0.d, #3.75000000 // CHECK-ENCODING: [0xc0,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c1 f9 25 fdup z0.d, #3.87500000 // CHECK-INST: fmov z0.d, #3.87500000 // CHECK-ENCODING: [0xe0,0xc1,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c1 f9 25 fdup z0.d, #4.00000000 // CHECK-INST: fmov z0.d, #4.00000000 // CHECK-ENCODING: [0x00,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c2 f9 25 fdup z0.d, #4.25000000 // CHECK-INST: fmov z0.d, #4.25000000 // CHECK-ENCODING: [0x20,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c2 f9 25 fdup z0.d, #4.50000000 // CHECK-INST: fmov z0.d, #4.50000000 // CHECK-ENCODING: [0x40,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c2 f9 25 fdup z0.d, #4.75000000 // CHECK-INST: fmov z0.d, #4.75000000 // CHECK-ENCODING: [0x60,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c2 f9 25 
fdup z0.d, #5.00000000 // CHECK-INST: fmov z0.d, #5.00000000 // CHECK-ENCODING: [0x80,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c2 f9 25 fdup z0.d, #5.25000000 // CHECK-INST: fmov z0.d, #5.25000000 // CHECK-ENCODING: [0xa0,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c2 f9 25 fdup z0.d, #5.50000000 // CHECK-INST: fmov z0.d, #5.50000000 // CHECK-ENCODING: [0xc0,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c2 f9 25 fdup z0.d, #5.75000000 // CHECK-INST: fmov z0.d, #5.75000000 // CHECK-ENCODING: [0xe0,0xc2,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c2 f9 25 fdup z0.d, #6.00000000 // CHECK-INST: fmov z0.d, #6.00000000 // CHECK-ENCODING: [0x00,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c3 f9 25 fdup z0.d, #6.25000000 // CHECK-INST: fmov z0.d, #6.25000000 // CHECK-ENCODING: [0x20,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c3 f9 25 fdup z0.d, #6.50000000 // CHECK-INST: fmov z0.d, #6.50000000 // CHECK-ENCODING: [0x40,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c3 f9 25 fdup z0.d, #6.75000000 // CHECK-INST: fmov z0.d, #6.75000000 // CHECK-ENCODING: [0x60,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c3 f9 25 fdup z0.d, #7.00000000 // CHECK-INST: fmov z0.d, #7.00000000 // 
CHECK-ENCODING: [0x80,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c3 f9 25 fdup z0.d, #7.25000000 // CHECK-INST: fmov z0.d, #7.25000000 // CHECK-ENCODING: [0xa0,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c3 f9 25 fdup z0.d, #7.50000000 // CHECK-INST: fmov z0.d, #7.50000000 // CHECK-ENCODING: [0xc0,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c3 f9 25 fdup z0.d, #7.75000000 // CHECK-INST: fmov z0.d, #7.75000000 // CHECK-ENCODING: [0xe0,0xc3,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 f9 25 fdup z0.d, #8.00000000 // CHECK-INST: fmov z0.d, #8.00000000 // CHECK-ENCODING: [0x00,0xc4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 f9 25 fdup z0.d, #8.50000000 // CHECK-INST: fmov z0.d, #8.50000000 // CHECK-ENCODING: [0x20,0xc4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c4 f9 25 fdup z0.d, #9.00000000 // CHECK-INST: fmov z0.d, #9.00000000 // CHECK-ENCODING: [0x40,0xc4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c4 f9 25 fdup z0.d, #9.50000000 // CHECK-INST: fmov z0.d, #9.50000000 // CHECK-ENCODING: [0x60,0xc4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c4 f9 25 fdup z0.d, #10.00000000 // CHECK-INST: fmov z0.d, #10.00000000 // CHECK-ENCODING: [0x80,0xc4,0xf9,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c4 f9 25 fdup z0.d, #10.50000000 // CHECK-INST: fmov z0.d, #10.50000000 // CHECK-ENCODING: [0xa0,0xc4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c4 f9 25 fdup z0.d, #11.00000000 // CHECK-INST: fmov z0.d, #11.00000000 // CHECK-ENCODING: [0xc0,0xc4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c4 f9 25 fdup z0.d, #11.50000000 // CHECK-INST: fmov z0.d, #11.50000000 // CHECK-ENCODING: [0xe0,0xc4,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c4 f9 25 fdup z0.d, #12.00000000 // CHECK-INST: fmov z0.d, #12.00000000 // CHECK-ENCODING: [0x00,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c5 f9 25 fdup z0.d, #12.50000000 // CHECK-INST: fmov z0.d, #12.50000000 // CHECK-ENCODING: [0x20,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c5 f9 25 fdup z0.d, #13.00000000 // CHECK-INST: fmov z0.d, #13.00000000 // CHECK-ENCODING: [0x40,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c5 f9 25 fdup z0.d, #13.50000000 // CHECK-INST: fmov z0.d, #13.50000000 // CHECK-ENCODING: [0x60,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c5 f9 25 fdup z0.d, #14.00000000 // CHECK-INST: fmov z0.d, #14.00000000 // CHECK-ENCODING: [0x80,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c5 f9 25 fdup z0.d, #14.50000000 // CHECK-INST: fmov z0.d, #14.50000000 // CHECK-ENCODING: [0xa0,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c5 f9 25 fdup z0.d, #15.00000000 // CHECK-INST: fmov z0.d, #15.00000000 // CHECK-ENCODING: [0xc0,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c5 f9 25 fdup z0.d, #15.50000000 // CHECK-INST: fmov z0.d, #15.50000000 // CHECK-ENCODING: [0xe0,0xc5,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c5 f9 25 fdup z0.d, #16.00000000 // CHECK-INST: fmov z0.d, #16.00000000 // CHECK-ENCODING: [0x00,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c6 f9 25 fdup z0.d, #17.00000000 // CHECK-INST: fmov z0.d, #17.00000000 // CHECK-ENCODING: [0x20,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c6 f9 25 fdup z0.d, #18.00000000 // CHECK-INST: fmov z0.d, #18.00000000 // CHECK-ENCODING: [0x40,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c6 f9 25 fdup z0.d, #19.00000000 // CHECK-INST: fmov z0.d, #19.00000000 // CHECK-ENCODING: [0x60,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c6 f9 25 fdup z0.d, #20.00000000 // CHECK-INST: fmov z0.d, #20.00000000 // CHECK-ENCODING: [0x80,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme 
// CHECK-UNKNOWN: 80 c6 f9 25 fdup z0.d, #21.00000000 // CHECK-INST: fmov z0.d, #21.00000000 // CHECK-ENCODING: [0xa0,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c6 f9 25 fdup z0.d, #22.00000000 // CHECK-INST: fmov z0.d, #22.00000000 // CHECK-ENCODING: [0xc0,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c6 f9 25 fdup z0.d, #23.00000000 // CHECK-INST: fmov z0.d, #23.00000000 // CHECK-ENCODING: [0xe0,0xc6,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c6 f9 25 fdup z0.d, #24.00000000 // CHECK-INST: fmov z0.d, #24.00000000 // CHECK-ENCODING: [0x00,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c7 f9 25 fdup z0.d, #25.00000000 // CHECK-INST: fmov z0.d, #25.00000000 // CHECK-ENCODING: [0x20,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c7 f9 25 fdup z0.d, #26.00000000 // CHECK-INST: fmov z0.d, #26.00000000 // CHECK-ENCODING: [0x40,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c7 f9 25 fdup z0.d, #27.00000000 // CHECK-INST: fmov z0.d, #27.00000000 // CHECK-ENCODING: [0x60,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c7 f9 25 fdup z0.d, #28.00000000 // CHECK-INST: fmov z0.d, #28.00000000 // CHECK-ENCODING: [0x80,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c7 f9 25 fdup z0.d, 
#29.00000000 // CHECK-INST: fmov z0.d, #29.00000000 // CHECK-ENCODING: [0xa0,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c7 f9 25 fdup z0.d, #30.00000000 // CHECK-INST: fmov z0.d, #30.00000000 // CHECK-ENCODING: [0xc0,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c7 f9 25 fdup z0.d, #31.00000000 // CHECK-INST: fmov z0.d, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 f9 25 diff --git a/llvm/test/MC/AArch64/SVE/fmad.s b/llvm/test/MC/AArch64/SVE/fmad.s index 3ee0abe883c40..3b8bdedc378d4 100644 --- a/llvm/test/MC/AArch64/SVE/fmad.s +++ b/llvm/test/MC/AArch64/SVE/fmad.s @@ -12,19 +12,19 @@ fmad z0.h, p7/m, z1.h, z31.h // CHECK-INST: fmad z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0x9c,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 9c 7f 65 fmad z0.s, p7/m, z1.s, z31.s // CHECK-INST: fmad z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0x9c,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 9c bf 65 fmad z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x9c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 9c ff 65 @@ -34,23 +34,23 @@ fmad z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmad 
z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x9c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 9c ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmad z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x9c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 9c ff 65 diff --git a/llvm/test/MC/AArch64/SVE/fmax.s b/llvm/test/MC/AArch64/SVE/fmax.s index 29616ddc92cee..be2f012f40d9f 100644 --- a/llvm/test/MC/AArch64/SVE/fmax.s +++ b/llvm/test/MC/AArch64/SVE/fmax.s @@ -12,61 +12,61 @@ fmax z0.h, p0/m, z0.h, #0.000000000000000 // CHECK-INST: fmax z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5e 65 fmax z0.h, p0/m, z0.h, #0.0 // CHECK-INST: fmax z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5e 65 fmax z0.s, p0/m, z0.s, #0.0 // CHECK-INST: fmax z0.s, p0/m, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x80,0x9e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 9e 65 fmax z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fmax z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xde,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c de 65 fmax z31.h, p7/m, z31.h, 
#1.0 // CHECK-INST: fmax z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5e 65 fmax z31.s, p7/m, z31.s, #1.0 // CHECK-INST: fmax z31.s, p7/m, z31.s, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x9e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 9e 65 fmax z0.d, p0/m, z0.d, #0.0 // CHECK-INST: fmax z0.d, p0/m, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x80,0xde,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 de 65 fmax z0.h, p7/m, z0.h, z31.h // CHECK-INST: fmax z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x46,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 46 65 fmax z0.s, p7/m, z0.s, z31.s // CHECK-INST: fmax z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x86,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 86 65 fmax z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmax z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c6 65 @@ -76,47 +76,47 @@ fmax z0.d, p7/m, z0.d, z31.d movprfx z0.d, p0/z, z7.d // CHECK-INST: movprfx z0.d, p0/z, z7.d // CHECK-ENCODING: [0xe0,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 20 d0 04 fmax z0.d, p0/m, z0.d, #0.0 // CHECK-INST: fmax z0.d, p0/m, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x80,0xde,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 00 80 de 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmax z0.d, p0/m, z0.d, #0.0 // CHECK-INST: fmax z0.d, p0/m, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x80,0xde,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 de 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmax z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmax z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c6 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmax z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmax z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c6 65 diff --git a/llvm/test/MC/AArch64/SVE/fmaxnm.s b/llvm/test/MC/AArch64/SVE/fmaxnm.s index 0ec38de5fe824..f5b10bf20c94c 100644 --- a/llvm/test/MC/AArch64/SVE/fmaxnm.s +++ b/llvm/test/MC/AArch64/SVE/fmaxnm.s @@ -12,67 +12,67 @@ fmaxnm z0.h, p0/m, z0.h, #0.000000000000000 // CHECK-INST: fmaxnm z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 
5c 65 fmaxnm z0.h, p0/m, z0.h, #0.0 // CHECK-INST: fmaxnm z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5c 65 fmaxnm z0.s, p0/m, z0.s, #0.0 // CHECK-INST: fmaxnm z0.s, p0/m, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x80,0x9c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 9c 65 fmaxnm z0.d, p0/m, z0.d, #0.0 // CHECK-INST: fmaxnm z0.d, p0/m, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x80,0xdc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 dc 65 fmaxnm z31.h, p7/m, z31.h, #1.000000000000000 // CHECK-INST: fmaxnm z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5c 65 fmaxnm z31.h, p7/m, z31.h, #1.0 // CHECK-INST: fmaxnm z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5c 65 fmaxnm z31.s, p7/m, z31.s, #1.0 // CHECK-INST: fmaxnm z31.s, p7/m, z31.s, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x9c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 9c 65 fmaxnm z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fmaxnm z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c dc 65 fmaxnm z0.h, p7/m, z0.h, z31.h // CHECK-INST: fmaxnm z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x44,0x65] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 44 65 fmaxnm z0.s, p7/m, z0.s, z31.s // CHECK-INST: fmaxnm z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x84,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 84 65 fmaxnm z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmaxnm z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c4 65 @@ -82,47 +82,47 @@ fmaxnm z0.d, p7/m, z0.d, z31.d movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 fmaxnm z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fmaxnm z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c dc 65 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fmaxnm z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fmaxnm z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c dc 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmaxnm z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmaxnm z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: 
[0xe0,0x9f,0xc4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c4 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmaxnm z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmaxnm z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c4 65 diff --git a/llvm/test/MC/AArch64/SVE/fmaxnmv.s b/llvm/test/MC/AArch64/SVE/fmaxnmv.s index ee37d1cfe04cf..92f4fd7294a84 100644 --- a/llvm/test/MC/AArch64/SVE/fmaxnmv.s +++ b/llvm/test/MC/AArch64/SVE/fmaxnmv.s @@ -12,17 +12,17 @@ fmaxnmv h0, p7, z31.h // CHECK-INST: fmaxnmv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x44,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 44 65 fmaxnmv s0, p7, z31.s // CHECK-INST: fmaxnmv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x84,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 84 65 fmaxnmv d0, p7, z31.d // CHECK-INST: fmaxnmv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c4 65 diff --git a/llvm/test/MC/AArch64/SVE/fmaxv.s b/llvm/test/MC/AArch64/SVE/fmaxv.s index 922df3e8a79a2..09ea407e8b2ae 100644 --- a/llvm/test/MC/AArch64/SVE/fmaxv.s +++ b/llvm/test/MC/AArch64/SVE/fmaxv.s @@ -12,17 +12,17 @@ fmaxv h0, p7, z31.h // CHECK-INST: fmaxv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x46,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 46 65 fmaxv s0, p7, z31.s // CHECK-INST: fmaxv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x86,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 86 65 fmaxv d0, p7, z31.d // CHECK-INST: fmaxv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c6 65 diff --git a/llvm/test/MC/AArch64/SVE/fmin.s b/llvm/test/MC/AArch64/SVE/fmin.s index 57231302ffc14..16717fd704a42 100644 --- a/llvm/test/MC/AArch64/SVE/fmin.s +++ b/llvm/test/MC/AArch64/SVE/fmin.s @@ -12,67 +12,67 @@ fmin z0.h, p0/m, z0.h, #0.000000000000000 // CHECK-INST: fmin z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5f 65 fmin z0.h, p0/m, z0.h, #0.0 // CHECK-INST: fmin z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5f 65 fmin z0.s, p0/m, z0.s, #0.0 // CHECK-INST: fmin z0.s, p0/m, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x80,0x9f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 9f 65 fmin z0.d, p0/m, z0.d, #0.0 // CHECK-INST: fmin z0.d, p0/m, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x80,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 df 65 fmin z31.h, p7/m, z31.h, #1.000000000000000 // CHECK-INST: fmin z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5f 65 fmin z31.h, p7/m, z31.h, #1.0 // CHECK-INST: fmin z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5f 65 fmin z31.s, p7/m, z31.s, #1.0 // CHECK-INST: fmin z31.s, p7/m, z31.s, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x9f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 9f 65 fmin z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fmin z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c df 65 fmin z0.h, p7/m, z0.h, z31.h // CHECK-INST: fmin z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x47,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 47 65 fmin z0.s, p7/m, z0.s, z31.s // CHECK-INST: fmin z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x87,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 87 65 fmin z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmin z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c7 65 @@ -82,47 +82,47 @@ fmin z0.d, p7/m, z0.d, z31.d movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 fmin z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fmin z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdf,0x65] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c df 65 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fmin z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fmin z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c df 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmin z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmin z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c7 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmin z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmin z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c7 65 diff --git a/llvm/test/MC/AArch64/SVE/fminnm.s b/llvm/test/MC/AArch64/SVE/fminnm.s index cb5bcae4fd466..968a4d3cf00a6 100644 --- a/llvm/test/MC/AArch64/SVE/fminnm.s +++ b/llvm/test/MC/AArch64/SVE/fminnm.s @@ -12,67 +12,67 @@ fminnm z0.h, p0/m, z0.h, #0.000000000000000 // CHECK-INST: fminnm z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5d,0x65] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5d 65 fminnm z0.h, p0/m, z0.h, #0.0 // CHECK-INST: fminnm z0.h, p0/m, z0.h, #0.0 // CHECK-ENCODING: [0x00,0x80,0x5d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5d 65 fminnm z0.s, p0/m, z0.s, #0.0 // CHECK-INST: fminnm z0.s, p0/m, z0.s, #0.0 // CHECK-ENCODING: [0x00,0x80,0x9d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 9d 65 fminnm z0.d, p0/m, z0.d, #0.0 // CHECK-INST: fminnm z0.d, p0/m, z0.d, #0.0 // CHECK-ENCODING: [0x00,0x80,0xdd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 dd 65 fminnm z31.h, p7/m, z31.h, #1.000000000000000 // CHECK-INST: fminnm z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5d 65 fminnm z31.h, p7/m, z31.h, #1.0 // CHECK-INST: fminnm z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5d 65 fminnm z31.s, p7/m, z31.s, #1.0 // CHECK-INST: fminnm z31.s, p7/m, z31.s, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x9d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 9d 65 fminnm z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fminnm z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c dd 65 fminnm z0.h, p7/m, z0.h, z31.h // CHECK-INST: fminnm z0.h, p7/m, z0.h, z31.h 
// CHECK-ENCODING: [0xe0,0x9f,0x45,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 45 65 fminnm z0.s, p7/m, z0.s, z31.s // CHECK-INST: fminnm z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x85,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 85 65 fminnm z0.d, p7/m, z0.d, z31.d // CHECK-INST: fminnm z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc5,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c5 65 @@ -82,47 +82,47 @@ fminnm z0.d, p7/m, z0.d, z31.d movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 fminnm z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fminnm z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c dd 65 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fminnm z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fminnm z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c dd 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fminnm 
z0.d, p7/m, z0.d, z31.d // CHECK-INST: fminnm z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc5,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c5 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fminnm z0.d, p7/m, z0.d, z31.d // CHECK-INST: fminnm z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc5,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c5 65 diff --git a/llvm/test/MC/AArch64/SVE/fminnmv.s b/llvm/test/MC/AArch64/SVE/fminnmv.s index 7e80f8cdfb062..0c152446bb109 100644 --- a/llvm/test/MC/AArch64/SVE/fminnmv.s +++ b/llvm/test/MC/AArch64/SVE/fminnmv.s @@ -12,17 +12,17 @@ fminnmv h0, p7, z31.h // CHECK-INST: fminnmv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x45,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 45 65 fminnmv s0, p7, z31.s // CHECK-INST: fminnmv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x85,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 85 65 fminnmv d0, p7, z31.d // CHECK-INST: fminnmv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc5,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c5 65 diff --git a/llvm/test/MC/AArch64/SVE/fminv.s b/llvm/test/MC/AArch64/SVE/fminv.s index 0c1dde056d39f..67079ebee7201 100644 --- a/llvm/test/MC/AArch64/SVE/fminv.s +++ b/llvm/test/MC/AArch64/SVE/fminv.s @@ -12,17 +12,17 @@ fminv h0, p7, z31.h // CHECK-INST: fminv h0, p7, z31.h // CHECK-ENCODING: 
[0xe0,0x3f,0x47,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 47 65 fminv s0, p7, z31.s // CHECK-INST: fminv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x87,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 87 65 fminv d0, p7, z31.d // CHECK-INST: fminv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c7 65 diff --git a/llvm/test/MC/AArch64/SVE/fmla.s b/llvm/test/MC/AArch64/SVE/fmla.s index 3791a498c2312..17f32f4b5c400 100644 --- a/llvm/test/MC/AArch64/SVE/fmla.s +++ b/llvm/test/MC/AArch64/SVE/fmla.s @@ -12,37 +12,37 @@ fmla z0.h, p7/m, z1.h, z31.h // CHECK-INST: fmla z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0x1c,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c 7f 65 fmla z0.s, p7/m, z1.s, z31.s // CHECK-INST: fmla z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0x1c,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c bf 65 fmla z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x1c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c ff 65 fmla z0.h, z1.h, z7.h[7] // CHECK-INST: fmla z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x00,0x7f,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 7f 64 fmla z0.s, z1.s, z7.s[3] // CHECK-INST: fmla z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0x00,0xbf,0x64] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 bf 64 fmla z0.d, z1.d, z7.d[1] // CHECK-INST: fmla z0.d, z1.d, z7.d[1] // CHECK-ENCODING: [0x20,0x00,0xf7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 f7 64 @@ -52,35 +52,35 @@ fmla z0.d, z1.d, z7.d[1] movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmla z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x1c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmla z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x1c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmla z0.d, z1.d, z7.d[1] // CHECK-INST: fmla z0.d, z1.d, z7.d[1] // CHECK-ENCODING: [0x20,0x00,0xf7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 f7 64 diff --git a/llvm/test/MC/AArch64/SVE/fmls.s b/llvm/test/MC/AArch64/SVE/fmls.s index 860cd3c845d08..3725ee1a4cf97 100644 --- 
a/llvm/test/MC/AArch64/SVE/fmls.s +++ b/llvm/test/MC/AArch64/SVE/fmls.s @@ -12,37 +12,37 @@ fmls z0.h, p7/m, z1.h, z31.h // CHECK-INST: fmls z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0x3c,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 3c 7f 65 fmls z0.s, p7/m, z1.s, z31.s // CHECK-INST: fmls z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0x3c,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 3c bf 65 fmls z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x3c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 3c ff 65 fmls z0.h, z1.h, z7.h[7] // CHECK-INST: fmls z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x04,0x7f,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 7f 64 fmls z0.s, z1.s, z7.s[3] // CHECK-INST: fmls z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0x04,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 bf 64 fmls z0.d, z1.d, z7.d[1] // CHECK-INST: fmls z0.d, z1.d, z7.d[1] // CHECK-ENCODING: [0x20,0x04,0xf7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 f7 64 @@ -52,35 +52,35 @@ fmls z0.d, z1.d, z7.d[1] movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmls z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x3c,0xff,0x65] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 3c ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmls z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x3c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 3c ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmls z0.d, z1.d, z7.d[1] // CHECK-INST: fmls z0.d, z1.d, z7.d[1] // CHECK-ENCODING: [0x20,0x04,0xf7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 f7 64 diff --git a/llvm/test/MC/AArch64/SVE/fmov.s b/llvm/test/MC/AArch64/SVE/fmov.s index 9209fc64819a9..41453acab3813 100644 --- a/llvm/test/MC/AArch64/SVE/fmov.s +++ b/llvm/test/MC/AArch64/SVE/fmov.s @@ -12,1591 +12,1591 @@ fmov z0.h, #0.0 // CHECK-INST: mov z0.h, #0 // CHECK-ENCODING: [0x00,0xc0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 78 25 fmov z0.s, #0.0 // CHECK-INST: mov z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 b8 25 fmov z0.d, #0.0 // CHECK-INST: mov z0.d, #0 // CHECK-ENCODING: [0x00,0xc0,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 f8 25 fmov z0.h, 
#-0.12500000 // CHECK-INST: fmov z0.h, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0x79,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 79 25 fmov z0.s, #-0.12500000 // CHECK-INST: fmov z0.s, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0xb9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 b9 25 fmov z0.d, #-0.12500000 // CHECK-INST: fmov z0.d, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 f9 25 fmov z0.d, #31.00000000 // CHECK-INST: fmov z0.d, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xf9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 f9 25 fmov z0.h, p0/m, #-0.12500000 // CHECK-INST: fmov z0.h, p0/m, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 50 05 fmov z0.s, p0/m, #-0.12500000 // CHECK-INST: fmov z0.s, p0/m, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 90 05 fmov z0.d, p0/m, #-0.12500000 // CHECK-INST: fmov z0.d, p0/m, #-0.12500000 // CHECK-ENCODING: [0x00,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d8 d0 05 fmov z0.d, p0/m, #-0.13281250 // CHECK-INST: fmov z0.d, p0/m, #-0.13281250 // CHECK-ENCODING: [0x20,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d8 d0 05 fmov z0.d, 
p0/m, #-0.14062500 // CHECK-INST: fmov z0.d, p0/m, #-0.14062500 // CHECK-ENCODING: [0x40,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d8 d0 05 fmov z0.d, p0/m, #-0.14843750 // CHECK-INST: fmov z0.d, p0/m, #-0.14843750 // CHECK-ENCODING: [0x60,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d8 d0 05 fmov z0.d, p0/m, #-0.15625000 // CHECK-INST: fmov z0.d, p0/m, #-0.15625000 // CHECK-ENCODING: [0x80,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d8 d0 05 fmov z0.d, p0/m, #-0.16406250 // CHECK-INST: fmov z0.d, p0/m, #-0.16406250 // CHECK-ENCODING: [0xa0,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d8 d0 05 fmov z0.d, p0/m, #-0.17187500 // CHECK-INST: fmov z0.d, p0/m, #-0.17187500 // CHECK-ENCODING: [0xc0,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d8 d0 05 fmov z0.d, p0/m, #-0.17968750 // CHECK-INST: fmov z0.d, p0/m, #-0.17968750 // CHECK-ENCODING: [0xe0,0xd8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d8 d0 05 fmov z0.d, p0/m, #-0.18750000 // CHECK-INST: fmov z0.d, p0/m, #-0.18750000 // CHECK-ENCODING: [0x00,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d9 d0 05 fmov z0.d, p0/m, #-0.19531250 // CHECK-INST: fmov z0.d, p0/m, #-0.19531250 // CHECK-ENCODING: [0x20,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: 20 d9 d0 05 fmov z0.d, p0/m, #-0.20312500 // CHECK-INST: fmov z0.d, p0/m, #-0.20312500 // CHECK-ENCODING: [0x40,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d9 d0 05 fmov z0.d, p0/m, #-0.21093750 // CHECK-INST: fmov z0.d, p0/m, #-0.21093750 // CHECK-ENCODING: [0x60,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d9 d0 05 fmov z0.d, p0/m, #-0.21875000 // CHECK-INST: fmov z0.d, p0/m, #-0.21875000 // CHECK-ENCODING: [0x80,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d9 d0 05 fmov z0.d, p0/m, #-0.22656250 // CHECK-INST: fmov z0.d, p0/m, #-0.22656250 // CHECK-ENCODING: [0xa0,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d9 d0 05 fmov z0.d, p0/m, #-0.23437500 // CHECK-INST: fmov z0.d, p0/m, #-0.23437500 // CHECK-ENCODING: [0xc0,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d9 d0 05 fmov z0.d, p0/m, #-0.24218750 // CHECK-INST: fmov z0.d, p0/m, #-0.24218750 // CHECK-ENCODING: [0xe0,0xd9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d9 d0 05 fmov z0.d, p0/m, #-0.25000000 // CHECK-INST: fmov z0.d, p0/m, #-0.25000000 // CHECK-ENCODING: [0x00,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 da d0 05 fmov z0.d, p0/m, #-0.26562500 // CHECK-INST: fmov z0.d, p0/m, #-0.26562500 // CHECK-ENCODING: [0x20,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 da d0 05 fmov z0.d, p0/m, #-0.28125000 // CHECK-INST: fmov z0.d, p0/m, #-0.28125000 // CHECK-ENCODING: [0x40,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 da d0 05 fmov z0.d, p0/m, #-0.29687500 // CHECK-INST: fmov z0.d, p0/m, #-0.29687500 // CHECK-ENCODING: [0x60,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 da d0 05 fmov z0.d, p0/m, #-0.31250000 // CHECK-INST: fmov z0.d, p0/m, #-0.31250000 // CHECK-ENCODING: [0x80,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 da d0 05 fmov z0.d, p0/m, #-0.32812500 // CHECK-INST: fmov z0.d, p0/m, #-0.32812500 // CHECK-ENCODING: [0xa0,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 da d0 05 fmov z0.d, p0/m, #-0.34375000 // CHECK-INST: fmov z0.d, p0/m, #-0.34375000 // CHECK-ENCODING: [0xc0,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 da d0 05 fmov z0.d, p0/m, #-0.35937500 // CHECK-INST: fmov z0.d, p0/m, #-0.35937500 // CHECK-ENCODING: [0xe0,0xda,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 da d0 05 fmov z0.d, p0/m, #-0.37500000 // CHECK-INST: fmov z0.d, p0/m, #-0.37500000 // CHECK-ENCODING: [0x00,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 db d0 05 fmov z0.d, p0/m, #-0.39062500 // CHECK-INST: fmov z0.d, p0/m, #-0.39062500 // CHECK-ENCODING: [0x20,0xdb,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 db d0 05 fmov z0.d, p0/m, #-0.40625000 // CHECK-INST: fmov z0.d, p0/m, #-0.40625000 // CHECK-ENCODING: [0x40,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 db d0 05 fmov z0.d, p0/m, #-0.42187500 // CHECK-INST: fmov z0.d, p0/m, #-0.42187500 // CHECK-ENCODING: [0x60,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 db d0 05 fmov z0.d, p0/m, #-0.43750000 // CHECK-INST: fmov z0.d, p0/m, #-0.43750000 // CHECK-ENCODING: [0x80,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 db d0 05 fmov z0.d, p0/m, #-0.45312500 // CHECK-INST: fmov z0.d, p0/m, #-0.45312500 // CHECK-ENCODING: [0xa0,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 db d0 05 fmov z0.d, p0/m, #-0.46875000 // CHECK-INST: fmov z0.d, p0/m, #-0.46875000 // CHECK-ENCODING: [0xc0,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 db d0 05 fmov z0.d, p0/m, #-0.48437500 // CHECK-INST: fmov z0.d, p0/m, #-0.48437500 // CHECK-ENCODING: [0xe0,0xdb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 db d0 05 fmov z0.d, p0/m, #-0.50000000 // CHECK-INST: fmov z0.d, p0/m, #-0.50000000 // CHECK-ENCODING: [0x00,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 dc d0 05 fmov z0.d, p0/m, #-0.53125000 // CHECK-INST: fmov z0.d, p0/m, 
#-0.53125000 // CHECK-ENCODING: [0x20,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dc d0 05 fmov z0.d, p0/m, #-0.56250000 // CHECK-INST: fmov z0.d, p0/m, #-0.56250000 // CHECK-ENCODING: [0x40,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 dc d0 05 fmov z0.d, p0/m, #-0.59375000 // CHECK-INST: fmov z0.d, p0/m, #-0.59375000 // CHECK-ENCODING: [0x60,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 dc d0 05 fmov z0.d, p0/m, #-0.62500000 // CHECK-INST: fmov z0.d, p0/m, #-0.62500000 // CHECK-ENCODING: [0x80,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 dc d0 05 fmov z0.d, p0/m, #-0.65625000 // CHECK-INST: fmov z0.d, p0/m, #-0.65625000 // CHECK-ENCODING: [0xa0,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 dc d0 05 fmov z0.d, p0/m, #-0.68750000 // CHECK-INST: fmov z0.d, p0/m, #-0.68750000 // CHECK-ENCODING: [0xc0,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 dc d0 05 fmov z0.d, p0/m, #-0.71875000 // CHECK-INST: fmov z0.d, p0/m, #-0.71875000 // CHECK-ENCODING: [0xe0,0xdc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 dc d0 05 fmov z0.d, p0/m, #-0.75000000 // CHECK-INST: fmov z0.d, p0/m, #-0.75000000 // CHECK-ENCODING: [0x00,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 dd d0 05 fmov z0.d, 
p0/m, #-0.78125000 // CHECK-INST: fmov z0.d, p0/m, #-0.78125000 // CHECK-ENCODING: [0x20,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dd d0 05 fmov z0.d, p0/m, #-0.81250000 // CHECK-INST: fmov z0.d, p0/m, #-0.81250000 // CHECK-ENCODING: [0x40,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 dd d0 05 fmov z0.d, p0/m, #-0.84375000 // CHECK-INST: fmov z0.d, p0/m, #-0.84375000 // CHECK-ENCODING: [0x60,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 dd d0 05 fmov z0.d, p0/m, #-0.87500000 // CHECK-INST: fmov z0.d, p0/m, #-0.87500000 // CHECK-ENCODING: [0x80,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 dd d0 05 fmov z0.d, p0/m, #-0.90625000 // CHECK-INST: fmov z0.d, p0/m, #-0.90625000 // CHECK-ENCODING: [0xa0,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 dd d0 05 fmov z0.d, p0/m, #-0.93750000 // CHECK-INST: fmov z0.d, p0/m, #-0.93750000 // CHECK-ENCODING: [0xc0,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 dd d0 05 fmov z0.d, p0/m, #-0.96875000 // CHECK-INST: fmov z0.d, p0/m, #-0.96875000 // CHECK-ENCODING: [0xe0,0xdd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 dd d0 05 fmov z0.d, p0/m, #-1.00000000 // CHECK-INST: fmov z0.d, p0/m, #-1.00000000 // CHECK-ENCODING: [0x00,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: 00 de d0 05 fmov z0.d, p0/m, #-1.06250000 // CHECK-INST: fmov z0.d, p0/m, #-1.06250000 // CHECK-ENCODING: [0x20,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 de d0 05 fmov z0.d, p0/m, #-1.12500000 // CHECK-INST: fmov z0.d, p0/m, #-1.12500000 // CHECK-ENCODING: [0x40,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 de d0 05 fmov z0.d, p0/m, #-1.18750000 // CHECK-INST: fmov z0.d, p0/m, #-1.18750000 // CHECK-ENCODING: [0x60,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 de d0 05 fmov z0.d, p0/m, #-1.25000000 // CHECK-INST: fmov z0.d, p0/m, #-1.25000000 // CHECK-ENCODING: [0x80,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 de d0 05 fmov z0.d, p0/m, #-1.31250000 // CHECK-INST: fmov z0.d, p0/m, #-1.31250000 // CHECK-ENCODING: [0xa0,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 de d0 05 fmov z0.d, p0/m, #-1.37500000 // CHECK-INST: fmov z0.d, p0/m, #-1.37500000 // CHECK-ENCODING: [0xc0,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 de d0 05 fmov z0.d, p0/m, #-1.43750000 // CHECK-INST: fmov z0.d, p0/m, #-1.43750000 // CHECK-ENCODING: [0xe0,0xde,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 de d0 05 fmov z0.d, p0/m, #-1.50000000 // CHECK-INST: fmov z0.d, p0/m, #-1.50000000 // CHECK-ENCODING: [0x00,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 df d0 05 fmov z0.d, p0/m, #-1.56250000 // CHECK-INST: fmov z0.d, p0/m, #-1.56250000 // CHECK-ENCODING: [0x20,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 df d0 05 fmov z0.d, p0/m, #-1.62500000 // CHECK-INST: fmov z0.d, p0/m, #-1.62500000 // CHECK-ENCODING: [0x40,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 df d0 05 fmov z0.d, p0/m, #-1.68750000 // CHECK-INST: fmov z0.d, p0/m, #-1.68750000 // CHECK-ENCODING: [0x60,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 df d0 05 fmov z0.d, p0/m, #-1.75000000 // CHECK-INST: fmov z0.d, p0/m, #-1.75000000 // CHECK-ENCODING: [0x80,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 df d0 05 fmov z0.d, p0/m, #-1.81250000 // CHECK-INST: fmov z0.d, p0/m, #-1.81250000 // CHECK-ENCODING: [0xa0,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 df d0 05 fmov z0.d, p0/m, #-1.87500000 // CHECK-INST: fmov z0.d, p0/m, #-1.87500000 // CHECK-ENCODING: [0xc0,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 df d0 05 fmov z0.d, p0/m, #-1.93750000 // CHECK-INST: fmov z0.d, p0/m, #-1.93750000 // CHECK-ENCODING: [0xe0,0xdf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df d0 05 fmov z0.d, p0/m, #-2.00000000 // CHECK-INST: fmov z0.d, p0/m, #-2.00000000 // CHECK-ENCODING: [0x00,0xd0,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 d0 05 fmov z0.d, p0/m, #-2.12500000 // CHECK-INST: fmov z0.d, p0/m, #-2.12500000 // CHECK-ENCODING: [0x20,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d0 d0 05 fmov z0.d, p0/m, #-2.25000000 // CHECK-INST: fmov z0.d, p0/m, #-2.25000000 // CHECK-ENCODING: [0x40,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d0 d0 05 fmov z0.d, p0/m, #-2.37500000 // CHECK-INST: fmov z0.d, p0/m, #-2.37500000 // CHECK-ENCODING: [0x60,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d0 d0 05 fmov z0.d, p0/m, #-2.50000000 // CHECK-INST: fmov z0.d, p0/m, #-2.50000000 // CHECK-ENCODING: [0x80,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d0 d0 05 fmov z0.d, p0/m, #-2.62500000 // CHECK-INST: fmov z0.d, p0/m, #-2.62500000 // CHECK-ENCODING: [0xa0,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d0 d0 05 fmov z0.d, p0/m, #-2.75000000 // CHECK-INST: fmov z0.d, p0/m, #-2.75000000 // CHECK-ENCODING: [0xc0,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d0 d0 05 fmov z0.d, p0/m, #-2.87500000 // CHECK-INST: fmov z0.d, p0/m, #-2.87500000 // CHECK-ENCODING: [0xe0,0xd0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d0 d0 05 fmov z0.d, p0/m, #-3.00000000 // CHECK-INST: fmov z0.d, p0/m, 
#-3.00000000 // CHECK-ENCODING: [0x00,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d1 d0 05 fmov z0.d, p0/m, #-3.12500000 // CHECK-INST: fmov z0.d, p0/m, #-3.12500000 // CHECK-ENCODING: [0x20,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d1 d0 05 fmov z0.d, p0/m, #-3.25000000 // CHECK-INST: fmov z0.d, p0/m, #-3.25000000 // CHECK-ENCODING: [0x40,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d1 d0 05 fmov z0.d, p0/m, #-3.37500000 // CHECK-INST: fmov z0.d, p0/m, #-3.37500000 // CHECK-ENCODING: [0x60,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d1 d0 05 fmov z0.d, p0/m, #-3.50000000 // CHECK-INST: fmov z0.d, p0/m, #-3.50000000 // CHECK-ENCODING: [0x80,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d1 d0 05 fmov z0.d, p0/m, #-3.62500000 // CHECK-INST: fmov z0.d, p0/m, #-3.62500000 // CHECK-ENCODING: [0xa0,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d1 d0 05 fmov z0.d, p0/m, #-3.75000000 // CHECK-INST: fmov z0.d, p0/m, #-3.75000000 // CHECK-ENCODING: [0xc0,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d1 d0 05 fmov z0.d, p0/m, #-3.87500000 // CHECK-INST: fmov z0.d, p0/m, #-3.87500000 // CHECK-ENCODING: [0xe0,0xd1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d1 d0 05 fmov z0.d, 
p0/m, #-4.00000000 // CHECK-INST: fmov z0.d, p0/m, #-4.00000000 // CHECK-ENCODING: [0x00,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d2 d0 05 fmov z0.d, p0/m, #-4.25000000 // CHECK-INST: fmov z0.d, p0/m, #-4.25000000 // CHECK-ENCODING: [0x20,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d2 d0 05 fmov z0.d, p0/m, #-4.50000000 // CHECK-INST: fmov z0.d, p0/m, #-4.50000000 // CHECK-ENCODING: [0x40,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d2 d0 05 fmov z0.d, p0/m, #-4.75000000 // CHECK-INST: fmov z0.d, p0/m, #-4.75000000 // CHECK-ENCODING: [0x60,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d2 d0 05 fmov z0.d, p0/m, #-5.00000000 // CHECK-INST: fmov z0.d, p0/m, #-5.00000000 // CHECK-ENCODING: [0x80,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d2 d0 05 fmov z0.d, p0/m, #-5.25000000 // CHECK-INST: fmov z0.d, p0/m, #-5.25000000 // CHECK-ENCODING: [0xa0,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d2 d0 05 fmov z0.d, p0/m, #-5.50000000 // CHECK-INST: fmov z0.d, p0/m, #-5.50000000 // CHECK-ENCODING: [0xc0,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d2 d0 05 fmov z0.d, p0/m, #-5.75000000 // CHECK-INST: fmov z0.d, p0/m, #-5.75000000 // CHECK-ENCODING: [0xe0,0xd2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: e0 d2 d0 05 fmov z0.d, p0/m, #-6.00000000 // CHECK-INST: fmov z0.d, p0/m, #-6.00000000 // CHECK-ENCODING: [0x00,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d3 d0 05 fmov z0.d, p0/m, #-6.25000000 // CHECK-INST: fmov z0.d, p0/m, #-6.25000000 // CHECK-ENCODING: [0x20,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d3 d0 05 fmov z0.d, p0/m, #-6.50000000 // CHECK-INST: fmov z0.d, p0/m, #-6.50000000 // CHECK-ENCODING: [0x40,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d3 d0 05 fmov z0.d, p0/m, #-6.75000000 // CHECK-INST: fmov z0.d, p0/m, #-6.75000000 // CHECK-ENCODING: [0x60,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d3 d0 05 fmov z0.d, p0/m, #-7.00000000 // CHECK-INST: fmov z0.d, p0/m, #-7.00000000 // CHECK-ENCODING: [0x80,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d3 d0 05 fmov z0.d, p0/m, #-7.25000000 // CHECK-INST: fmov z0.d, p0/m, #-7.25000000 // CHECK-ENCODING: [0xa0,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d3 d0 05 fmov z0.d, p0/m, #-7.50000000 // CHECK-INST: fmov z0.d, p0/m, #-7.50000000 // CHECK-ENCODING: [0xc0,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d3 d0 05 fmov z0.d, p0/m, #-7.75000000 // CHECK-INST: fmov z0.d, p0/m, #-7.75000000 // CHECK-ENCODING: [0xe0,0xd3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d3 d0 05 fmov z0.d, p0/m, #-8.00000000 // CHECK-INST: fmov z0.d, p0/m, #-8.00000000 // CHECK-ENCODING: [0x00,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d4 d0 05 fmov z0.d, p0/m, #-8.50000000 // CHECK-INST: fmov z0.d, p0/m, #-8.50000000 // CHECK-ENCODING: [0x20,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d4 d0 05 fmov z0.d, p0/m, #-9.00000000 // CHECK-INST: fmov z0.d, p0/m, #-9.00000000 // CHECK-ENCODING: [0x40,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d4 d0 05 fmov z0.d, p0/m, #-9.50000000 // CHECK-INST: fmov z0.d, p0/m, #-9.50000000 // CHECK-ENCODING: [0x60,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d4 d0 05 fmov z0.d, p0/m, #-10.00000000 // CHECK-INST: fmov z0.d, p0/m, #-10.00000000 // CHECK-ENCODING: [0x80,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d4 d0 05 fmov z0.d, p0/m, #-10.50000000 // CHECK-INST: fmov z0.d, p0/m, #-10.50000000 // CHECK-ENCODING: [0xa0,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d4 d0 05 fmov z0.d, p0/m, #-11.00000000 // CHECK-INST: fmov z0.d, p0/m, #-11.00000000 // CHECK-ENCODING: [0xc0,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d4 d0 05 fmov z0.d, p0/m, #-11.50000000 // CHECK-INST: fmov z0.d, p0/m, #-11.50000000 // CHECK-ENCODING: 
[0xe0,0xd4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d4 d0 05 fmov z0.d, p0/m, #-12.00000000 // CHECK-INST: fmov z0.d, p0/m, #-12.00000000 // CHECK-ENCODING: [0x00,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d5 d0 05 fmov z0.d, p0/m, #-12.50000000 // CHECK-INST: fmov z0.d, p0/m, #-12.50000000 // CHECK-ENCODING: [0x20,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d5 d0 05 fmov z0.d, p0/m, #-13.00000000 // CHECK-INST: fmov z0.d, p0/m, #-13.00000000 // CHECK-ENCODING: [0x40,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d5 d0 05 fmov z0.d, p0/m, #-13.50000000 // CHECK-INST: fmov z0.d, p0/m, #-13.50000000 // CHECK-ENCODING: [0x60,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d5 d0 05 fmov z0.d, p0/m, #-14.00000000 // CHECK-INST: fmov z0.d, p0/m, #-14.00000000 // CHECK-ENCODING: [0x80,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d5 d0 05 fmov z0.d, p0/m, #-14.50000000 // CHECK-INST: fmov z0.d, p0/m, #-14.50000000 // CHECK-ENCODING: [0xa0,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d5 d0 05 fmov z0.d, p0/m, #-15.00000000 // CHECK-INST: fmov z0.d, p0/m, #-15.00000000 // CHECK-ENCODING: [0xc0,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d5 d0 05 fmov z0.d, p0/m, #-15.50000000 
// CHECK-INST: fmov z0.d, p0/m, #-15.50000000 // CHECK-ENCODING: [0xe0,0xd5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d5 d0 05 fmov z0.d, p0/m, #-16.00000000 // CHECK-INST: fmov z0.d, p0/m, #-16.00000000 // CHECK-ENCODING: [0x00,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d6 d0 05 fmov z0.d, p0/m, #-17.00000000 // CHECK-INST: fmov z0.d, p0/m, #-17.00000000 // CHECK-ENCODING: [0x20,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d6 d0 05 fmov z0.d, p0/m, #-18.00000000 // CHECK-INST: fmov z0.d, p0/m, #-18.00000000 // CHECK-ENCODING: [0x40,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d6 d0 05 fmov z0.d, p0/m, #-19.00000000 // CHECK-INST: fmov z0.d, p0/m, #-19.00000000 // CHECK-ENCODING: [0x60,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d6 d0 05 fmov z0.d, p0/m, #-20.00000000 // CHECK-INST: fmov z0.d, p0/m, #-20.00000000 // CHECK-ENCODING: [0x80,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d6 d0 05 fmov z0.d, p0/m, #-21.00000000 // CHECK-INST: fmov z0.d, p0/m, #-21.00000000 // CHECK-ENCODING: [0xa0,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d6 d0 05 fmov z0.d, p0/m, #-22.00000000 // CHECK-INST: fmov z0.d, p0/m, #-22.00000000 // CHECK-ENCODING: [0xc0,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: c0 d6 d0 05 fmov z0.d, p0/m, #-23.00000000 // CHECK-INST: fmov z0.d, p0/m, #-23.00000000 // CHECK-ENCODING: [0xe0,0xd6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d6 d0 05 fmov z0.d, p0/m, #-24.00000000 // CHECK-INST: fmov z0.d, p0/m, #-24.00000000 // CHECK-ENCODING: [0x00,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d7 d0 05 fmov z0.d, p0/m, #-25.00000000 // CHECK-INST: fmov z0.d, p0/m, #-25.00000000 // CHECK-ENCODING: [0x20,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 d7 d0 05 fmov z0.d, p0/m, #-26.00000000 // CHECK-INST: fmov z0.d, p0/m, #-26.00000000 // CHECK-ENCODING: [0x40,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 d7 d0 05 fmov z0.d, p0/m, #-27.00000000 // CHECK-INST: fmov z0.d, p0/m, #-27.00000000 // CHECK-ENCODING: [0x60,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 d7 d0 05 fmov z0.d, p0/m, #-28.00000000 // CHECK-INST: fmov z0.d, p0/m, #-28.00000000 // CHECK-ENCODING: [0x80,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 d7 d0 05 fmov z0.d, p0/m, #-29.00000000 // CHECK-INST: fmov z0.d, p0/m, #-29.00000000 // CHECK-ENCODING: [0xa0,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 d7 d0 05 fmov z0.d, p0/m, #-30.00000000 // CHECK-INST: fmov z0.d, p0/m, #-30.00000000 // CHECK-ENCODING: [0xc0,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 d7 d0 05 fmov z0.d, p0/m, #-31.00000000 // CHECK-INST: fmov z0.d, p0/m, #-31.00000000 // CHECK-ENCODING: [0xe0,0xd7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 d7 d0 05 fmov z0.d, p0/m, #0.12500000 // CHECK-INST: fmov z0.d, p0/m, #0.12500000 // CHECK-ENCODING: [0x00,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 d0 05 fmov z0.d, p0/m, #0.13281250 // CHECK-INST: fmov z0.d, p0/m, #0.13281250 // CHECK-ENCODING: [0x20,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c8 d0 05 fmov z0.d, p0/m, #0.14062500 // CHECK-INST: fmov z0.d, p0/m, #0.14062500 // CHECK-ENCODING: [0x40,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c8 d0 05 fmov z0.d, p0/m, #0.14843750 // CHECK-INST: fmov z0.d, p0/m, #0.14843750 // CHECK-ENCODING: [0x60,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c8 d0 05 fmov z0.d, p0/m, #0.15625000 // CHECK-INST: fmov z0.d, p0/m, #0.15625000 // CHECK-ENCODING: [0x80,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c8 d0 05 fmov z0.d, p0/m, #0.16406250 // CHECK-INST: fmov z0.d, p0/m, #0.16406250 // CHECK-ENCODING: [0xa0,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c8 d0 05 fmov z0.d, p0/m, #0.17187500 // CHECK-INST: fmov z0.d, p0/m, #0.17187500 // CHECK-ENCODING: [0xc0,0xc8,0xd0,0x05] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c8 d0 05 fmov z0.d, p0/m, #0.17968750 // CHECK-INST: fmov z0.d, p0/m, #0.17968750 // CHECK-ENCODING: [0xe0,0xc8,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c8 d0 05 fmov z0.d, p0/m, #0.18750000 // CHECK-INST: fmov z0.d, p0/m, #0.18750000 // CHECK-ENCODING: [0x00,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c9 d0 05 fmov z0.d, p0/m, #0.19531250 // CHECK-INST: fmov z0.d, p0/m, #0.19531250 // CHECK-ENCODING: [0x20,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c9 d0 05 fmov z0.d, p0/m, #0.20312500 // CHECK-INST: fmov z0.d, p0/m, #0.20312500 // CHECK-ENCODING: [0x40,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c9 d0 05 fmov z0.d, p0/m, #0.21093750 // CHECK-INST: fmov z0.d, p0/m, #0.21093750 // CHECK-ENCODING: [0x60,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c9 d0 05 fmov z0.d, p0/m, #0.21875000 // CHECK-INST: fmov z0.d, p0/m, #0.21875000 // CHECK-ENCODING: [0x80,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c9 d0 05 fmov z0.d, p0/m, #0.22656250 // CHECK-INST: fmov z0.d, p0/m, #0.22656250 // CHECK-ENCODING: [0xa0,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c9 d0 05 fmov z0.d, p0/m, #0.23437500 // CHECK-INST: fmov z0.d, p0/m, #0.23437500 // 
CHECK-ENCODING: [0xc0,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c9 d0 05 fmov z0.d, p0/m, #0.24218750 // CHECK-INST: fmov z0.d, p0/m, #0.24218750 // CHECK-ENCODING: [0xe0,0xc9,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c9 d0 05 fmov z0.d, p0/m, #0.25000000 // CHECK-INST: fmov z0.d, p0/m, #0.25000000 // CHECK-ENCODING: [0x00,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ca d0 05 fmov z0.d, p0/m, #0.26562500 // CHECK-INST: fmov z0.d, p0/m, #0.26562500 // CHECK-ENCODING: [0x20,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ca d0 05 fmov z0.d, p0/m, #0.28125000 // CHECK-INST: fmov z0.d, p0/m, #0.28125000 // CHECK-ENCODING: [0x40,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ca d0 05 fmov z0.d, p0/m, #0.29687500 // CHECK-INST: fmov z0.d, p0/m, #0.29687500 // CHECK-ENCODING: [0x60,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ca d0 05 fmov z0.d, p0/m, #0.31250000 // CHECK-INST: fmov z0.d, p0/m, #0.31250000 // CHECK-ENCODING: [0x80,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ca d0 05 fmov z0.d, p0/m, #0.32812500 // CHECK-INST: fmov z0.d, p0/m, #0.32812500 // CHECK-ENCODING: [0xa0,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ca d0 05 fmov z0.d, p0/m, #0.34375000 // CHECK-INST: 
fmov z0.d, p0/m, #0.34375000 // CHECK-ENCODING: [0xc0,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 ca d0 05 fmov z0.d, p0/m, #0.35937500 // CHECK-INST: fmov z0.d, p0/m, #0.35937500 // CHECK-ENCODING: [0xe0,0xca,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ca d0 05 fmov z0.d, p0/m, #0.37500000 // CHECK-INST: fmov z0.d, p0/m, #0.37500000 // CHECK-ENCODING: [0x00,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cb d0 05 fmov z0.d, p0/m, #0.39062500 // CHECK-INST: fmov z0.d, p0/m, #0.39062500 // CHECK-ENCODING: [0x20,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cb d0 05 fmov z0.d, p0/m, #0.40625000 // CHECK-INST: fmov z0.d, p0/m, #0.40625000 // CHECK-ENCODING: [0x40,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cb d0 05 fmov z0.d, p0/m, #0.42187500 // CHECK-INST: fmov z0.d, p0/m, #0.42187500 // CHECK-ENCODING: [0x60,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cb d0 05 fmov z0.d, p0/m, #0.43750000 // CHECK-INST: fmov z0.d, p0/m, #0.43750000 // CHECK-ENCODING: [0x80,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cb d0 05 fmov z0.d, p0/m, #0.45312500 // CHECK-INST: fmov z0.d, p0/m, #0.45312500 // CHECK-ENCODING: [0xa0,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cb d0 05 fmov z0.d, 
p0/m, #0.46875000 // CHECK-INST: fmov z0.d, p0/m, #0.46875000 // CHECK-ENCODING: [0xc0,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cb d0 05 fmov z0.d, p0/m, #0.48437500 // CHECK-INST: fmov z0.d, p0/m, #0.48437500 // CHECK-ENCODING: [0xe0,0xcb,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb d0 05 fmov z0.d, p0/m, #0.50000000 // CHECK-INST: fmov z0.d, p0/m, #0.50000000 // CHECK-ENCODING: [0x00,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc d0 05 fmov z0.d, p0/m, #0.53125000 // CHECK-INST: fmov z0.d, p0/m, #0.53125000 // CHECK-ENCODING: [0x20,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cc d0 05 fmov z0.d, p0/m, #0.56250000 // CHECK-INST: fmov z0.d, p0/m, #0.56250000 // CHECK-ENCODING: [0x40,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cc d0 05 fmov z0.d, p0/m, #0.59375000 // CHECK-INST: fmov z0.d, p0/m, #0.59375000 // CHECK-ENCODING: [0x60,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cc d0 05 fmov z0.d, p0/m, #0.62500000 // CHECK-INST: fmov z0.d, p0/m, #0.62500000 // CHECK-ENCODING: [0x80,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cc d0 05 fmov z0.d, p0/m, #0.65625000 // CHECK-INST: fmov z0.d, p0/m, #0.65625000 // CHECK-ENCODING: [0xa0,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: a0 cc d0 05 fmov z0.d, p0/m, #0.68750000 // CHECK-INST: fmov z0.d, p0/m, #0.68750000 // CHECK-ENCODING: [0xc0,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cc d0 05 fmov z0.d, p0/m, #0.71875000 // CHECK-INST: fmov z0.d, p0/m, #0.71875000 // CHECK-ENCODING: [0xe0,0xcc,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cc d0 05 fmov z0.d, p0/m, #0.75000000 // CHECK-INST: fmov z0.d, p0/m, #0.75000000 // CHECK-ENCODING: [0x00,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cd d0 05 fmov z0.d, p0/m, #0.78125000 // CHECK-INST: fmov z0.d, p0/m, #0.78125000 // CHECK-ENCODING: [0x20,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cd d0 05 fmov z0.d, p0/m, #0.81250000 // CHECK-INST: fmov z0.d, p0/m, #0.81250000 // CHECK-ENCODING: [0x40,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cd d0 05 fmov z0.d, p0/m, #0.84375000 // CHECK-INST: fmov z0.d, p0/m, #0.84375000 // CHECK-ENCODING: [0x60,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cd d0 05 fmov z0.d, p0/m, #0.87500000 // CHECK-INST: fmov z0.d, p0/m, #0.87500000 // CHECK-ENCODING: [0x80,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cd d0 05 fmov z0.d, p0/m, #0.90625000 // CHECK-INST: fmov z0.d, p0/m, #0.90625000 // CHECK-ENCODING: [0xa0,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: a0 cd d0 05 fmov z0.d, p0/m, #0.93750000 // CHECK-INST: fmov z0.d, p0/m, #0.93750000 // CHECK-ENCODING: [0xc0,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cd d0 05 fmov z0.d, p0/m, #0.96875000 // CHECK-INST: fmov z0.d, p0/m, #0.96875000 // CHECK-ENCODING: [0xe0,0xcd,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cd d0 05 fmov z0.d, p0/m, #1.00000000 // CHECK-INST: fmov z0.d, p0/m, #1.00000000 // CHECK-ENCODING: [0x00,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ce d0 05 fmov z0.d, p0/m, #1.06250000 // CHECK-INST: fmov z0.d, p0/m, #1.06250000 // CHECK-ENCODING: [0x20,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ce d0 05 fmov z0.d, p0/m, #1.12500000 // CHECK-INST: fmov z0.d, p0/m, #1.12500000 // CHECK-ENCODING: [0x40,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ce d0 05 fmov z0.d, p0/m, #1.18750000 // CHECK-INST: fmov z0.d, p0/m, #1.18750000 // CHECK-ENCODING: [0x60,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ce d0 05 fmov z0.d, p0/m, #1.25000000 // CHECK-INST: fmov z0.d, p0/m, #1.25000000 // CHECK-ENCODING: [0x80,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ce d0 05 fmov z0.d, p0/m, #1.31250000 // CHECK-INST: fmov z0.d, p0/m, #1.31250000 // CHECK-ENCODING: [0xa0,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ce d0 05 fmov z0.d, p0/m, #1.37500000 // CHECK-INST: fmov z0.d, p0/m, #1.37500000 // CHECK-ENCODING: [0xc0,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 ce d0 05 fmov z0.d, p0/m, #1.43750000 // CHECK-INST: fmov z0.d, p0/m, #1.43750000 // CHECK-ENCODING: [0xe0,0xce,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ce d0 05 fmov z0.d, p0/m, #1.50000000 // CHECK-INST: fmov z0.d, p0/m, #1.50000000 // CHECK-ENCODING: [0x00,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cf d0 05 fmov z0.d, p0/m, #1.56250000 // CHECK-INST: fmov z0.d, p0/m, #1.56250000 // CHECK-ENCODING: [0x20,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 cf d0 05 fmov z0.d, p0/m, #1.62500000 // CHECK-INST: fmov z0.d, p0/m, #1.62500000 // CHECK-ENCODING: [0x40,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 cf d0 05 fmov z0.d, p0/m, #1.68750000 // CHECK-INST: fmov z0.d, p0/m, #1.68750000 // CHECK-ENCODING: [0x60,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 cf d0 05 fmov z0.d, p0/m, #1.75000000 // CHECK-INST: fmov z0.d, p0/m, #1.75000000 // CHECK-ENCODING: [0x80,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 cf d0 05 fmov z0.d, p0/m, #1.81250000 // CHECK-INST: fmov z0.d, p0/m, #1.81250000 // CHECK-ENCODING: [0xa0,0xcf,0xd0,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 cf d0 05 fmov z0.d, p0/m, #1.87500000 // CHECK-INST: fmov z0.d, p0/m, #1.87500000 // CHECK-ENCODING: [0xc0,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 cf d0 05 fmov z0.d, p0/m, #1.93750000 // CHECK-INST: fmov z0.d, p0/m, #1.93750000 // CHECK-ENCODING: [0xe0,0xcf,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf d0 05 fmov z0.d, p0/m, #2.00000000 // CHECK-INST: fmov z0.d, p0/m, #2.00000000 // CHECK-ENCODING: [0x00,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 d0 05 fmov z0.d, p0/m, #2.12500000 // CHECK-INST: fmov z0.d, p0/m, #2.12500000 // CHECK-ENCODING: [0x20,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c0 d0 05 fmov z0.d, p0/m, #2.25000000 // CHECK-INST: fmov z0.d, p0/m, #2.25000000 // CHECK-ENCODING: [0x40,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c0 d0 05 fmov z0.d, p0/m, #2.37500000 // CHECK-INST: fmov z0.d, p0/m, #2.37500000 // CHECK-ENCODING: [0x60,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c0 d0 05 fmov z0.d, p0/m, #2.50000000 // CHECK-INST: fmov z0.d, p0/m, #2.50000000 // CHECK-ENCODING: [0x80,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c0 d0 05 fmov z0.d, p0/m, #2.62500000 // CHECK-INST: fmov z0.d, p0/m, #2.62500000 // 
CHECK-ENCODING: [0xa0,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c0 d0 05 fmov z0.d, p0/m, #2.75000000 // CHECK-INST: fmov z0.d, p0/m, #2.75000000 // CHECK-ENCODING: [0xc0,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c0 d0 05 fmov z0.d, p0/m, #2.87500000 // CHECK-INST: fmov z0.d, p0/m, #2.87500000 // CHECK-ENCODING: [0xe0,0xc0,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c0 d0 05 fmov z0.d, p0/m, #3.00000000 // CHECK-INST: fmov z0.d, p0/m, #3.00000000 // CHECK-ENCODING: [0x00,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c1 d0 05 fmov z0.d, p0/m, #3.12500000 // CHECK-INST: fmov z0.d, p0/m, #3.12500000 // CHECK-ENCODING: [0x20,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c1 d0 05 fmov z0.d, p0/m, #3.25000000 // CHECK-INST: fmov z0.d, p0/m, #3.25000000 // CHECK-ENCODING: [0x40,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c1 d0 05 fmov z0.d, p0/m, #3.37500000 // CHECK-INST: fmov z0.d, p0/m, #3.37500000 // CHECK-ENCODING: [0x60,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c1 d0 05 fmov z0.d, p0/m, #3.50000000 // CHECK-INST: fmov z0.d, p0/m, #3.50000000 // CHECK-ENCODING: [0x80,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c1 d0 05 fmov z0.d, p0/m, #3.62500000 // CHECK-INST: 
fmov z0.d, p0/m, #3.62500000 // CHECK-ENCODING: [0xa0,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c1 d0 05 fmov z0.d, p0/m, #3.75000000 // CHECK-INST: fmov z0.d, p0/m, #3.75000000 // CHECK-ENCODING: [0xc0,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c1 d0 05 fmov z0.d, p0/m, #3.87500000 // CHECK-INST: fmov z0.d, p0/m, #3.87500000 // CHECK-ENCODING: [0xe0,0xc1,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c1 d0 05 fmov z0.d, p0/m, #4.00000000 // CHECK-INST: fmov z0.d, p0/m, #4.00000000 // CHECK-ENCODING: [0x00,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c2 d0 05 fmov z0.d, p0/m, #4.25000000 // CHECK-INST: fmov z0.d, p0/m, #4.25000000 // CHECK-ENCODING: [0x20,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c2 d0 05 fmov z0.d, p0/m, #4.50000000 // CHECK-INST: fmov z0.d, p0/m, #4.50000000 // CHECK-ENCODING: [0x40,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c2 d0 05 fmov z0.d, p0/m, #4.75000000 // CHECK-INST: fmov z0.d, p0/m, #4.75000000 // CHECK-ENCODING: [0x60,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c2 d0 05 fmov z0.d, p0/m, #5.00000000 // CHECK-INST: fmov z0.d, p0/m, #5.00000000 // CHECK-ENCODING: [0x80,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c2 d0 05 fmov z0.d, 
p0/m, #5.25000000 // CHECK-INST: fmov z0.d, p0/m, #5.25000000 // CHECK-ENCODING: [0xa0,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c2 d0 05 fmov z0.d, p0/m, #5.50000000 // CHECK-INST: fmov z0.d, p0/m, #5.50000000 // CHECK-ENCODING: [0xc0,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c2 d0 05 fmov z0.d, p0/m, #5.75000000 // CHECK-INST: fmov z0.d, p0/m, #5.75000000 // CHECK-ENCODING: [0xe0,0xc2,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c2 d0 05 fmov z0.d, p0/m, #6.00000000 // CHECK-INST: fmov z0.d, p0/m, #6.00000000 // CHECK-ENCODING: [0x00,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c3 d0 05 fmov z0.d, p0/m, #6.25000000 // CHECK-INST: fmov z0.d, p0/m, #6.25000000 // CHECK-ENCODING: [0x20,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c3 d0 05 fmov z0.d, p0/m, #6.50000000 // CHECK-INST: fmov z0.d, p0/m, #6.50000000 // CHECK-ENCODING: [0x40,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c3 d0 05 fmov z0.d, p0/m, #6.75000000 // CHECK-INST: fmov z0.d, p0/m, #6.75000000 // CHECK-ENCODING: [0x60,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c3 d0 05 fmov z0.d, p0/m, #7.00000000 // CHECK-INST: fmov z0.d, p0/m, #7.00000000 // CHECK-ENCODING: [0x80,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 80 c3 d0 05 fmov z0.d, p0/m, #7.25000000 // CHECK-INST: fmov z0.d, p0/m, #7.25000000 // CHECK-ENCODING: [0xa0,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c3 d0 05 fmov z0.d, p0/m, #7.50000000 // CHECK-INST: fmov z0.d, p0/m, #7.50000000 // CHECK-ENCODING: [0xc0,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c3 d0 05 fmov z0.d, p0/m, #7.75000000 // CHECK-INST: fmov z0.d, p0/m, #7.75000000 // CHECK-ENCODING: [0xe0,0xc3,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 d0 05 fmov z0.d, p0/m, #8.00000000 // CHECK-INST: fmov z0.d, p0/m, #8.00000000 // CHECK-ENCODING: [0x00,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 d0 05 fmov z0.d, p0/m, #8.50000000 // CHECK-INST: fmov z0.d, p0/m, #8.50000000 // CHECK-ENCODING: [0x20,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c4 d0 05 fmov z0.d, p0/m, #9.00000000 // CHECK-INST: fmov z0.d, p0/m, #9.00000000 // CHECK-ENCODING: [0x40,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c4 d0 05 fmov z0.d, p0/m, #9.50000000 // CHECK-INST: fmov z0.d, p0/m, #9.50000000 // CHECK-ENCODING: [0x60,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c4 d0 05 fmov z0.d, p0/m, #10.00000000 // CHECK-INST: fmov z0.d, p0/m, #10.00000000 // CHECK-ENCODING: [0x80,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 80 c4 d0 05 fmov z0.d, p0/m, #10.50000000 // CHECK-INST: fmov z0.d, p0/m, #10.50000000 // CHECK-ENCODING: [0xa0,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c4 d0 05 fmov z0.d, p0/m, #11.00000000 // CHECK-INST: fmov z0.d, p0/m, #11.00000000 // CHECK-ENCODING: [0xc0,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c4 d0 05 fmov z0.d, p0/m, #11.50000000 // CHECK-INST: fmov z0.d, p0/m, #11.50000000 // CHECK-ENCODING: [0xe0,0xc4,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c4 d0 05 fmov z0.d, p0/m, #12.00000000 // CHECK-INST: fmov z0.d, p0/m, #12.00000000 // CHECK-ENCODING: [0x00,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c5 d0 05 fmov z0.d, p0/m, #12.50000000 // CHECK-INST: fmov z0.d, p0/m, #12.50000000 // CHECK-ENCODING: [0x20,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c5 d0 05 fmov z0.d, p0/m, #13.00000000 // CHECK-INST: fmov z0.d, p0/m, #13.00000000 // CHECK-ENCODING: [0x40,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c5 d0 05 fmov z0.d, p0/m, #13.50000000 // CHECK-INST: fmov z0.d, p0/m, #13.50000000 // CHECK-ENCODING: [0x60,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c5 d0 05 fmov z0.d, p0/m, #14.00000000 // CHECK-INST: fmov z0.d, p0/m, #14.00000000 // CHECK-ENCODING: [0x80,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c5 d0 05 fmov z0.d, p0/m, #14.50000000 // CHECK-INST: fmov z0.d, p0/m, #14.50000000 // CHECK-ENCODING: [0xa0,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c5 d0 05 fmov z0.d, p0/m, #15.00000000 // CHECK-INST: fmov z0.d, p0/m, #15.00000000 // CHECK-ENCODING: [0xc0,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c5 d0 05 fmov z0.d, p0/m, #15.50000000 // CHECK-INST: fmov z0.d, p0/m, #15.50000000 // CHECK-ENCODING: [0xe0,0xc5,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c5 d0 05 fmov z0.d, p0/m, #16.00000000 // CHECK-INST: fmov z0.d, p0/m, #16.00000000 // CHECK-ENCODING: [0x00,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c6 d0 05 fmov z0.d, p0/m, #17.00000000 // CHECK-INST: fmov z0.d, p0/m, #17.00000000 // CHECK-ENCODING: [0x20,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c6 d0 05 fmov z0.d, p0/m, #18.00000000 // CHECK-INST: fmov z0.d, p0/m, #18.00000000 // CHECK-ENCODING: [0x40,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c6 d0 05 fmov z0.d, p0/m, #19.00000000 // CHECK-INST: fmov z0.d, p0/m, #19.00000000 // CHECK-ENCODING: [0x60,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c6 d0 05 fmov z0.d, p0/m, #20.00000000 // CHECK-INST: fmov z0.d, p0/m, #20.00000000 // CHECK-ENCODING: 
[0x80,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c6 d0 05 fmov z0.d, p0/m, #21.00000000 // CHECK-INST: fmov z0.d, p0/m, #21.00000000 // CHECK-ENCODING: [0xa0,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c6 d0 05 fmov z0.d, p0/m, #22.00000000 // CHECK-INST: fmov z0.d, p0/m, #22.00000000 // CHECK-ENCODING: [0xc0,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c6 d0 05 fmov z0.d, p0/m, #23.00000000 // CHECK-INST: fmov z0.d, p0/m, #23.00000000 // CHECK-ENCODING: [0xe0,0xc6,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c6 d0 05 fmov z0.d, p0/m, #24.00000000 // CHECK-INST: fmov z0.d, p0/m, #24.00000000 // CHECK-ENCODING: [0x00,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c7 d0 05 fmov z0.d, p0/m, #25.00000000 // CHECK-INST: fmov z0.d, p0/m, #25.00000000 // CHECK-ENCODING: [0x20,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 c7 d0 05 fmov z0.d, p0/m, #26.00000000 // CHECK-INST: fmov z0.d, p0/m, #26.00000000 // CHECK-ENCODING: [0x40,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 c7 d0 05 fmov z0.d, p0/m, #27.00000000 // CHECK-INST: fmov z0.d, p0/m, #27.00000000 // CHECK-ENCODING: [0x60,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 c7 d0 05 fmov z0.d, p0/m, #28.00000000 // CHECK-INST: 
fmov z0.d, p0/m, #28.00000000 // CHECK-ENCODING: [0x80,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 c7 d0 05 fmov z0.d, p0/m, #29.00000000 // CHECK-INST: fmov z0.d, p0/m, #29.00000000 // CHECK-ENCODING: [0xa0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 c7 d0 05 fmov z0.d, p0/m, #30.00000000 // CHECK-INST: fmov z0.d, p0/m, #30.00000000 // CHECK-ENCODING: [0xc0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 c7 d0 05 fmov z0.d, p0/m, #31.00000000 // CHECK-INST: fmov z0.d, p0/m, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 d0 05 @@ -1606,23 +1606,23 @@ fmov z0.d, p0/m, #31.00000000 movprfx z0.d, p0/z, z7.d // CHECK-INST: movprfx z0.d, p0/z, z7.d // CHECK-ENCODING: [0xe0,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 20 d0 04 fmov z0.d, p0/m, #31.00000000 // CHECK-INST: fmov z0.d, p0/m, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 d0 05 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmov z0.d, p0/m, #31.00000000 // CHECK-INST: fmov z0.d, p0/m, #31.00000000 // CHECK-ENCODING: [0xe0,0xc7,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 c7 d0 05 diff --git a/llvm/test/MC/AArch64/SVE/fmsb.s b/llvm/test/MC/AArch64/SVE/fmsb.s index e894ef630f88d..58413f2387a2f 100644 --- a/llvm/test/MC/AArch64/SVE/fmsb.s +++ b/llvm/test/MC/AArch64/SVE/fmsb.s @@ -12,19 +12,19 @@ fmsb z0.h, p7/m, z1.h, z31.h // CHECK-INST: fmsb z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0xbc,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 bc 7f 65 fmsb z0.s, p7/m, z1.s, z31.s // CHECK-INST: fmsb z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0xbc,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 bc bf 65 fmsb z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmsb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xbc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 bc ff 65 @@ -34,23 +34,23 @@ fmsb z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmsb z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmsb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xbc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 bc ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmsb z0.d, p7/m, z1.d, z31.d // CHECK-INST: fmsb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xbc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: 20 bc ff 65 diff --git a/llvm/test/MC/AArch64/SVE/fmul.s b/llvm/test/MC/AArch64/SVE/fmul.s index 8101f92a07f6e..9d5c7ff683ecc 100644 --- a/llvm/test/MC/AArch64/SVE/fmul.s +++ b/llvm/test/MC/AArch64/SVE/fmul.s @@ -12,115 +12,115 @@ fmul z0.h, p0/m, z0.h, #0.5000000000000 // CHECK-INST: fmul z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x5a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5a 65 fmul z0.h, p0/m, z0.h, #0.5 // CHECK-INST: fmul z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x5a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5a 65 fmul z0.s, p0/m, z0.s, #0.5 // CHECK-INST: fmul z0.s, p0/m, z0.s, #0.5 // CHECK-ENCODING: [0x00,0x80,0x9a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 9a 65 fmul z0.d, p0/m, z0.d, #0.5 // CHECK-INST: fmul z0.d, p0/m, z0.d, #0.5 // CHECK-ENCODING: [0x00,0x80,0xda,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 da 65 fmul z31.h, p7/m, z31.h, #2.0 // CHECK-INST: fmul z31.h, p7/m, z31.h, #2.0 // CHECK-ENCODING: [0x3f,0x9c,0x5a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5a 65 fmul z31.s, p7/m, z31.s, #2.0 // CHECK-INST: fmul z31.s, p7/m, z31.s, #2.0 // CHECK-ENCODING: [0x3f,0x9c,0x9a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 9a 65 fmul z31.d, p7/m, z31.d, #2.0 // CHECK-INST: fmul z31.d, p7/m, z31.d, #2.0 // CHECK-ENCODING: [0x3f,0x9c,0xda,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 3f 9c da 65 fmul z0.h, z0.h, z0.h[0] // CHECK-INST: fmul z0.h, z0.h, z0.h[0] // CHECK-ENCODING: [0x00,0x20,0x20,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 20 64 fmul z0.s, z0.s, z0.s[0] // CHECK-INST: fmul z0.s, z0.s, z0.s[0] // CHECK-ENCODING: [0x00,0x20,0xa0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 a0 64 fmul z0.d, z0.d, z0.d[0] // CHECK-INST: fmul z0.d, z0.d, z0.d[0] // CHECK-ENCODING: [0x00,0x20,0xe0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 e0 64 fmul z31.h, z31.h, z7.h[7] // CHECK-INST: fmul z31.h, z31.h, z7.h[7] // CHECK-ENCODING: [0xff,0x23,0x7f,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 7f 64 fmul z31.s, z31.s, z7.s[3] // CHECK-INST: fmul z31.s, z31.s, z7.s[3] // CHECK-ENCODING: [0xff,0x23,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 bf 64 fmul z31.d, z31.d, z15.d[1] // CHECK-INST: fmul z31.d, z31.d, z15.d[1] // CHECK-ENCODING: [0xff,0x23,0xff,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 ff 64 fmul z0.h, p7/m, z0.h, z31.h // CHECK-INST: fmul z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x42,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 42 65 fmul z0.s, p7/m, z0.s, z31.s // CHECK-INST: fmul z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x82,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e0 9f 82 65 fmul z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmul z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc2,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c2 65 fmul z0.h, z1.h, z31.h // CHECK-INST: fmul z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x08,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 08 5f 65 fmul z0.s, z1.s, z31.s // CHECK-INST: fmul z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0x08,0x9f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 08 9f 65 fmul z0.d, z1.d, z31.d // CHECK-INST: fmul z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x08,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 08 df 65 @@ -130,47 +130,47 @@ fmul z0.d, z1.d, z31.d movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 fmul z31.d, p7/m, z31.d, #2.0 // CHECK-INST: fmul z31.d, p7/m, z31.d, #2.0 // CHECK-ENCODING: [0x3f,0x9c,0xda,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c da 65 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fmul z31.d, p7/m, z31.d, #2.0 // CHECK-INST: fmul z31.d, p7/m, z31.d, #2.0 // CHECK-ENCODING: [0x3f,0x9c,0xda,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c da 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmul z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmul z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc2,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c2 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmul z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmul z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc2,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c2 65 diff --git a/llvm/test/MC/AArch64/SVE/fmulx.s b/llvm/test/MC/AArch64/SVE/fmulx.s index ed8913a78a2ff..e2f5a00895976 100644 --- a/llvm/test/MC/AArch64/SVE/fmulx.s +++ b/llvm/test/MC/AArch64/SVE/fmulx.s @@ -12,19 +12,19 @@ fmulx z0.h, p7/m, z0.h, z31.h // CHECK-INST: fmulx z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x4a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 4a 65 fmulx z0.s, p7/m, z0.s, z31.s // CHECK-INST: fmulx z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x8a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 8a 65 fmulx z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmulx z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xca,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e0 9f ca 65 @@ -34,23 +34,23 @@ fmulx z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fmulx z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmulx z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xca,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f ca 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fmulx z0.d, p7/m, z0.d, z31.d // CHECK-INST: fmulx z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xca,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f ca 65 diff --git a/llvm/test/MC/AArch64/SVE/fneg.s b/llvm/test/MC/AArch64/SVE/fneg.s index 850dab58fb57a..96b4ebd2fbddf 100644 --- a/llvm/test/MC/AArch64/SVE/fneg.s +++ b/llvm/test/MC/AArch64/SVE/fneg.s @@ -12,19 +12,19 @@ fneg z31.h, p7/m, z31.h // CHECK-INST: fneg z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x5d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 5d 04 fneg z31.s, p7/m, z31.s // CHECK-INST: fneg z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x9d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 9d 04 fneg z31.d, p7/m, z31.d // CHECK-INST: fneg z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xdd,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: ff bf dd 04 @@ -34,23 +34,23 @@ fneg z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 fneg z4.d, p7/m, z31.d // CHECK-INST: fneg z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xdd,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf dd 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 fneg z4.d, p7/m, z31.d // CHECK-INST: fneg z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xdd,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf dd 04 diff --git a/llvm/test/MC/AArch64/SVE/fnmad.s b/llvm/test/MC/AArch64/SVE/fnmad.s index 2ced184793d94..c2efaf73a4456 100644 --- a/llvm/test/MC/AArch64/SVE/fnmad.s +++ b/llvm/test/MC/AArch64/SVE/fnmad.s @@ -12,19 +12,19 @@ fnmad z0.h, p7/m, z1.h, z31.h // CHECK-INST: fnmad z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0xdc,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dc 7f 65 fnmad z0.s, p7/m, z1.s, z31.s // CHECK-INST: fnmad z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0xdc,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dc bf 65 fnmad z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xdc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 20 dc ff 65 @@ -34,23 +34,23 @@ fnmad z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fnmad z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xdc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dc ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fnmad z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xdc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 dc ff 65 diff --git a/llvm/test/MC/AArch64/SVE/fnmla.s b/llvm/test/MC/AArch64/SVE/fnmla.s index 86dcbb427e9af..5dc4da4960fa1 100644 --- a/llvm/test/MC/AArch64/SVE/fnmla.s +++ b/llvm/test/MC/AArch64/SVE/fnmla.s @@ -12,19 +12,19 @@ fnmla z0.h, p7/m, z1.h, z31.h // CHECK-INST: fnmla z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0x5c,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c 7f 65 fnmla z0.s, p7/m, z1.s, z31.s // CHECK-INST: fnmla z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0x5c,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c bf 65 fnmla z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x5c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 20 5c ff 65 @@ -34,23 +34,23 @@ fnmla z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fnmla z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x5c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fnmla z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x5c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c ff 65 diff --git a/llvm/test/MC/AArch64/SVE/fnmls.s b/llvm/test/MC/AArch64/SVE/fnmls.s index b46e1ab0f6416..09b53ae18d408 100644 --- a/llvm/test/MC/AArch64/SVE/fnmls.s +++ b/llvm/test/MC/AArch64/SVE/fnmls.s @@ -12,19 +12,19 @@ fnmls z0.h, p7/m, z1.h, z31.h // CHECK-INST: fnmls z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0x7c,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c 7f 65 fnmls z0.s, p7/m, z1.s, z31.s // CHECK-INST: fnmls z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0x7c,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c bf 65 fnmls z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x7c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c ff 65 @@ -34,23 +34,23 @@ fnmls z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fnmls z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x7c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fnmls z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x7c,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c ff 65 diff --git a/llvm/test/MC/AArch64/SVE/fnmsb.s b/llvm/test/MC/AArch64/SVE/fnmsb.s index eb0c6e5b60e00..c114e1e646820 100644 --- a/llvm/test/MC/AArch64/SVE/fnmsb.s +++ b/llvm/test/MC/AArch64/SVE/fnmsb.s @@ -12,19 +12,19 @@ fnmsb z0.h, p7/m, z1.h, z31.h // CHECK-INST: fnmsb z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0xfc,0x7f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc 7f 65 fnmsb z0.s, p7/m, z1.s, z31.s // CHECK-INST: fnmsb z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0xfc,0xbf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc bf 65 fnmsb z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmsb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xfc,0xff,0x65] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc ff 65 @@ -34,23 +34,23 @@ fnmsb z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fnmsb z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmsb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xfc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc ff 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fnmsb z0.d, p7/m, z1.d, z31.d // CHECK-INST: fnmsb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0xfc,0xff,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc ff 65 diff --git a/llvm/test/MC/AArch64/SVE/frecpe.s b/llvm/test/MC/AArch64/SVE/frecpe.s index eae3114b9d094..61883653cfd96 100644 --- a/llvm/test/MC/AArch64/SVE/frecpe.s +++ b/llvm/test/MC/AArch64/SVE/frecpe.s @@ -12,17 +12,17 @@ frecpe z0.h, z31.h // CHECK-INST: frecpe z0.h, z31.h // CHECK-ENCODING: [0xe0,0x33,0x4e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 33 4e 65 frecpe z0.s, z31.s // CHECK-INST: frecpe z0.s, z31.s // CHECK-ENCODING: [0xe0,0x33,0x8e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 33 8e 65 frecpe z0.d, z31.d // CHECK-INST: frecpe z0.d, z31.d // CHECK-ENCODING: [0xe0,0x33,0xce,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: e0 33 ce 65 diff --git a/llvm/test/MC/AArch64/SVE/frecps.s b/llvm/test/MC/AArch64/SVE/frecps.s index a09ce28885099..97ac6d1fa8835 100644 --- a/llvm/test/MC/AArch64/SVE/frecps.s +++ b/llvm/test/MC/AArch64/SVE/frecps.s @@ -12,17 +12,17 @@ frecps z0.h, z1.h, z31.h // CHECK-INST: frecps z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x18,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 18 5f 65 frecps z0.s, z1.s, z31.s // CHECK-INST: frecps z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0x18,0x9f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 18 9f 65 frecps z0.d, z1.d, z31.d // CHECK-INST: frecps z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x18,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 18 df 65 diff --git a/llvm/test/MC/AArch64/SVE/frecpx.s b/llvm/test/MC/AArch64/SVE/frecpx.s index 8504632e1d28b..66c717ec005c6 100644 --- a/llvm/test/MC/AArch64/SVE/frecpx.s +++ b/llvm/test/MC/AArch64/SVE/frecpx.s @@ -12,19 +12,19 @@ frecpx z31.h, p7/m, z31.h // CHECK-INST: frecpx z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x4c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 4c 65 frecpx z31.s, p7/m, z31.s // CHECK-INST: frecpx z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x8c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 8c 65 frecpx z31.d, p7/m, z31.d // CHECK-INST: frecpx z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xcc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf cc 65 @@ 
-34,23 +34,23 @@ frecpx z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frecpx z4.d, p7/m, z31.d // CHECK-INST: frecpx z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xcc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf cc 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frecpx z4.d, p7/m, z31.d // CHECK-INST: frecpx z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xcc,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf cc 65 diff --git a/llvm/test/MC/AArch64/SVE/frinta.s b/llvm/test/MC/AArch64/SVE/frinta.s index d687e79a292df..e30b83f417b30 100644 --- a/llvm/test/MC/AArch64/SVE/frinta.s +++ b/llvm/test/MC/AArch64/SVE/frinta.s @@ -12,19 +12,19 @@ frinta z31.h, p7/m, z31.h // CHECK-INST: frinta z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x44,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 44 65 frinta z31.s, p7/m, z31.s // CHECK-INST: frinta z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x84,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 84 65 frinta z31.d, p7/m, z31.d // CHECK-INST: frinta z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf c4 65 @@ -34,23 +34,23 @@ 
frinta z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frinta z4.d, p7/m, z31.d // CHECK-INST: frinta z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c4 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frinta z4.d, p7/m, z31.d // CHECK-INST: frinta z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c4 65 diff --git a/llvm/test/MC/AArch64/SVE/frinti.s b/llvm/test/MC/AArch64/SVE/frinti.s index 210bb58d83df8..82392b67abb8c 100644 --- a/llvm/test/MC/AArch64/SVE/frinti.s +++ b/llvm/test/MC/AArch64/SVE/frinti.s @@ -12,19 +12,19 @@ frinti z31.h, p7/m, z31.h // CHECK-INST: frinti z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x47,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 47 65 frinti z31.s, p7/m, z31.s // CHECK-INST: frinti z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x87,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 87 65 frinti z31.d, p7/m, z31.d // CHECK-INST: frinti z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf c7 65 @@ -34,23 +34,23 @@ frinti z31.d, p7/m, 
z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frinti z4.d, p7/m, z31.d // CHECK-INST: frinti z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c7 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frinti z4.d, p7/m, z31.d // CHECK-INST: frinti z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c7 65 diff --git a/llvm/test/MC/AArch64/SVE/frintm.s b/llvm/test/MC/AArch64/SVE/frintm.s index ed1968655b10a..b5271fc52f111 100644 --- a/llvm/test/MC/AArch64/SVE/frintm.s +++ b/llvm/test/MC/AArch64/SVE/frintm.s @@ -12,19 +12,19 @@ frintm z31.h, p7/m, z31.h // CHECK-INST: frintm z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x42,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 42 65 frintm z31.s, p7/m, z31.s // CHECK-INST: frintm z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x82,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 82 65 frintm z31.d, p7/m, z31.d // CHECK-INST: frintm z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc2,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf c2 65 @@ -34,23 +34,23 @@ frintm z31.d, p7/m, z31.d movprfx z4.d, 
p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frintm z4.d, p7/m, z31.d // CHECK-INST: frintm z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc2,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c2 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frintm z4.d, p7/m, z31.d // CHECK-INST: frintm z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc2,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c2 65 diff --git a/llvm/test/MC/AArch64/SVE/frintn.s b/llvm/test/MC/AArch64/SVE/frintn.s index f0549be1d9436..47666251ebff4 100644 --- a/llvm/test/MC/AArch64/SVE/frintn.s +++ b/llvm/test/MC/AArch64/SVE/frintn.s @@ -12,19 +12,19 @@ frintn z31.h, p7/m, z31.h // CHECK-INST: frintn z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x40,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 40 65 frintn z31.s, p7/m, z31.s // CHECK-INST: frintn z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x80,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 80 65 frintn z31.d, p7/m, z31.d // CHECK-INST: frintn z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf c0 65 @@ -34,23 +34,23 @@ frintn z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // 
CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frintn z4.d, p7/m, z31.d // CHECK-INST: frintn z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c0 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frintn z4.d, p7/m, z31.d // CHECK-INST: frintn z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c0 65 diff --git a/llvm/test/MC/AArch64/SVE/frintp.s b/llvm/test/MC/AArch64/SVE/frintp.s index 9a79bf722f884..06da956fcc1b4 100644 --- a/llvm/test/MC/AArch64/SVE/frintp.s +++ b/llvm/test/MC/AArch64/SVE/frintp.s @@ -12,19 +12,19 @@ frintp z31.h, p7/m, z31.h // CHECK-INST: frintp z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 41 65 frintp z31.s, p7/m, z31.s // CHECK-INST: frintp z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 81 65 frintp z31.d, p7/m, z31.d // CHECK-INST: frintp z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf c1 65 @@ -34,23 +34,23 @@ frintp z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx 
z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frintp z4.d, p7/m, z31.d // CHECK-INST: frintp z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c1 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frintp z4.d, p7/m, z31.d // CHECK-INST: frintp z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c1 65 diff --git a/llvm/test/MC/AArch64/SVE/frintx.s b/llvm/test/MC/AArch64/SVE/frintx.s index 73680454402a1..e0099ed03f875 100644 --- a/llvm/test/MC/AArch64/SVE/frintx.s +++ b/llvm/test/MC/AArch64/SVE/frintx.s @@ -12,19 +12,19 @@ frintx z31.h, p7/m, z31.h // CHECK-INST: frintx z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x46,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 46 65 frintx z31.s, p7/m, z31.s // CHECK-INST: frintx z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x86,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 86 65 frintx z31.d, p7/m, z31.d // CHECK-INST: frintx z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf c6 65 @@ -34,23 +34,23 @@ frintx z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // 
CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frintx z4.d, p7/m, z31.d // CHECK-INST: frintx z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c6 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frintx z4.d, p7/m, z31.d // CHECK-INST: frintx z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c6 65 diff --git a/llvm/test/MC/AArch64/SVE/frintz.s b/llvm/test/MC/AArch64/SVE/frintz.s index 634c4a2c95e43..35e6df4c39c8d 100644 --- a/llvm/test/MC/AArch64/SVE/frintz.s +++ b/llvm/test/MC/AArch64/SVE/frintz.s @@ -12,19 +12,19 @@ frintz z31.h, p7/m, z31.h // CHECK-INST: frintz z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x43,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 43 65 frintz z31.s, p7/m, z31.s // CHECK-INST: frintz z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x83,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 83 65 frintz z31.d, p7/m, z31.d // CHECK-INST: frintz z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc3,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf c3 65 @@ -34,23 +34,23 @@ frintz z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: 
[0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 frintz z4.d, p7/m, z31.d // CHECK-INST: frintz z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc3,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c3 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 frintz z4.d, p7/m, z31.d // CHECK-INST: frintz z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xc3,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf c3 65 diff --git a/llvm/test/MC/AArch64/SVE/frsqrte.s b/llvm/test/MC/AArch64/SVE/frsqrte.s index b0771fd85c6c3..49a81f4419a2d 100644 --- a/llvm/test/MC/AArch64/SVE/frsqrte.s +++ b/llvm/test/MC/AArch64/SVE/frsqrte.s @@ -12,17 +12,17 @@ frsqrte z0.h, z31.h // CHECK-INST: frsqrte z0.h, z31.h // CHECK-ENCODING: [0xe0,0x33,0x4f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 33 4f 65 frsqrte z0.s, z31.s // CHECK-INST: frsqrte z0.s, z31.s // CHECK-ENCODING: [0xe0,0x33,0x8f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 33 8f 65 frsqrte z0.d, z31.d // CHECK-INST: frsqrte z0.d, z31.d // CHECK-ENCODING: [0xe0,0x33,0xcf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 33 cf 65 diff --git a/llvm/test/MC/AArch64/SVE/frsqrts.s b/llvm/test/MC/AArch64/SVE/frsqrts.s index 58232b842dc21..2064fc330ac80 100644 --- a/llvm/test/MC/AArch64/SVE/frsqrts.s +++ 
b/llvm/test/MC/AArch64/SVE/frsqrts.s @@ -12,17 +12,17 @@ frsqrts z0.h, z1.h, z31.h // CHECK-INST: frsqrts z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x1c,0x5f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c 5f 65 frsqrts z0.s, z1.s, z31.s // CHECK-INST: frsqrts z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0x1c,0x9f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c 9f 65 frsqrts z0.d, z1.d, z31.d // CHECK-INST: frsqrts z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x1c,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 1c df 65 diff --git a/llvm/test/MC/AArch64/SVE/fscale.s b/llvm/test/MC/AArch64/SVE/fscale.s index 6d03003ae062a..68f24d75a7e86 100644 --- a/llvm/test/MC/AArch64/SVE/fscale.s +++ b/llvm/test/MC/AArch64/SVE/fscale.s @@ -12,19 +12,19 @@ fscale z0.h, p7/m, z0.h, z31.h // CHECK-INST: fscale z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x49,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 49 65 fscale z0.s, p7/m, z0.s, z31.s // CHECK-INST: fscale z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x89,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 89 65 fscale z0.d, p7/m, z0.d, z31.d // CHECK-INST: fscale z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c9 65 @@ -34,23 +34,23 @@ fscale z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fscale z0.d, p7/m, z0.d, z31.d // CHECK-INST: fscale z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c9 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fscale z0.d, p7/m, z0.d, z31.d // CHECK-INST: fscale z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c9 65 diff --git a/llvm/test/MC/AArch64/SVE/fsqrt.s b/llvm/test/MC/AArch64/SVE/fsqrt.s index e453f42fee6ad..235bd63af4522 100644 --- a/llvm/test/MC/AArch64/SVE/fsqrt.s +++ b/llvm/test/MC/AArch64/SVE/fsqrt.s @@ -12,19 +12,19 @@ fsqrt z31.h, p7/m, z31.h // CHECK-INST: fsqrt z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x4d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 4d 65 fsqrt z31.s, p7/m, z31.s // CHECK-INST: fsqrt z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x8d,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 8d 65 fsqrt z31.d, p7/m, z31.d // CHECK-INST: fsqrt z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xcd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf cd 65 @@ -34,23 +34,23 @@ fsqrt z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 fsqrt z4.d, p7/m, z31.d // CHECK-INST: fsqrt z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xcd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf cd 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 fsqrt z4.d, p7/m, z31.d // CHECK-INST: fsqrt z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xcd,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf cd 65 diff --git a/llvm/test/MC/AArch64/SVE/fsub.s b/llvm/test/MC/AArch64/SVE/fsub.s index 9f8fbe53c6380..51daccc437fad 100644 --- a/llvm/test/MC/AArch64/SVE/fsub.s +++ b/llvm/test/MC/AArch64/SVE/fsub.s @@ -12,85 +12,85 @@ fsub z0.h, p0/m, z0.h, #0.500000000000000 // CHECK-INST: fsub z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x59,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 59 65 fsub z0.h, p0/m, z0.h, #0.5 // CHECK-INST: fsub z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x59,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 59 65 fsub z0.s, p0/m, z0.s, #0.5 // CHECK-INST: fsub z0.s, p0/m, z0.s, #0.5 // CHECK-ENCODING: [0x00,0x80,0x99,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 99 65 fsub z0.d, p0/m, z0.d, #0.5 // CHECK-INST: fsub z0.d, p0/m, z0.d, #0.5 // CHECK-ENCODING: [0x00,0x80,0xd9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d9 65 fsub z31.h, p7/m, z31.h, #1.000000000000000 // CHECK-INST: fsub z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x59,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 59 65 fsub z31.h, p7/m, z31.h, #1.0 // CHECK-INST: fsub z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x59,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 59 65 fsub z31.s, p7/m, z31.s, #1.0 // CHECK-INST: fsub z31.s, p7/m, z31.s, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x99,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 99 65 fsub z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fsub z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xd9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c d9 65 fsub z0.h, p7/m, z0.h, z31.h // CHECK-INST: fsub z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x41,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 41 65 fsub z0.s, p7/m, z0.s, z31.s // CHECK-INST: fsub z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x81,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 81 65 fsub z0.d, p7/m, z0.d, z31.d // CHECK-INST: fsub z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c1 65 fsub z0.h, z1.h, z31.h // CHECK-INST: fsub z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x04,0x5f,0x65] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 5f 65 fsub z0.s, z1.s, z31.s // CHECK-INST: fsub z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0x04,0x9f,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 9f 65 fsub z0.d, z1.d, z31.d // CHECK-INST: fsub z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x04,0xdf,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 df 65 @@ -100,47 +100,47 @@ fsub z0.d, z1.d, z31.d movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 fsub z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fsub z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xd9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c d9 65 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fsub z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fsub z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xd9,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c d9 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fsub z0.d, p7/m, z0.d, z31.d // CHECK-INST: fsub z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc1,0x65] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c1 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fsub z0.d, p7/m, z0.d, z31.d // CHECK-INST: fsub z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c1 65 diff --git a/llvm/test/MC/AArch64/SVE/fsubr.s b/llvm/test/MC/AArch64/SVE/fsubr.s index e7a24dce78def..975a9285c92e0 100644 --- a/llvm/test/MC/AArch64/SVE/fsubr.s +++ b/llvm/test/MC/AArch64/SVE/fsubr.s @@ -12,67 +12,67 @@ fsubr z0.h, p0/m, z0.h, #0.500000000000000 // CHECK-INST: fsubr z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x5b,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5b 65 fsubr z0.h, p0/m, z0.h, #0.5 // CHECK-INST: fsubr z0.h, p0/m, z0.h, #0.5 // CHECK-ENCODING: [0x00,0x80,0x5b,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 5b 65 fsubr z0.s, p0/m, z0.s, #0.5 // CHECK-INST: fsubr z0.s, p0/m, z0.s, #0.5 // CHECK-ENCODING: [0x00,0x80,0x9b,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 9b 65 fsubr z0.d, p0/m, z0.d, #0.5 // CHECK-INST: fsubr z0.d, p0/m, z0.d, #0.5 // CHECK-ENCODING: [0x00,0x80,0xdb,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 db 65 fsubr z31.h, p7/m, z31.h, #1.000000000000000 // CHECK-INST: fsubr z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5b,0x65] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5b 65 fsubr z31.h, p7/m, z31.h, #1.0 // CHECK-INST: fsubr z31.h, p7/m, z31.h, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x5b,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 5b 65 fsubr z31.s, p7/m, z31.s, #1.0 // CHECK-INST: fsubr z31.s, p7/m, z31.s, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0x9b,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c 9b 65 fsubr z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fsubr z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdb,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c db 65 fsubr z0.h, p7/m, z0.h, z31.h // CHECK-INST: fsubr z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x43,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 43 65 fsubr z0.s, p7/m, z0.s, z31.s // CHECK-INST: fsubr z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x9f,0x83,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 83 65 fsubr z0.d, p7/m, z0.d, z31.d // CHECK-INST: fsubr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc3,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c3 65 @@ -82,47 +82,47 @@ fsubr z0.d, p7/m, z0.d, z31.d movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 fsubr z31.d, p7/m, 
z31.d, #1.0 // CHECK-INST: fsubr z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdb,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c db 65 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fsubr z31.d, p7/m, z31.d, #1.0 // CHECK-INST: fsubr z31.d, p7/m, z31.d, #1.0 // CHECK-ENCODING: [0x3f,0x9c,0xdb,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 3f 9c db 65 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 fsubr z0.d, p7/m, z0.d, z31.d // CHECK-INST: fsubr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc3,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c3 65 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 fsubr z0.d, p7/m, z0.d, z31.d // CHECK-INST: fsubr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xc3,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f c3 65 diff --git a/llvm/test/MC/AArch64/SVE/ftmad.s b/llvm/test/MC/AArch64/SVE/ftmad.s index fc6f5c1f22c8a..c97b4058d54f4 100644 --- a/llvm/test/MC/AArch64/SVE/ftmad.s +++ b/llvm/test/MC/AArch64/SVE/ftmad.s @@ -32,7 +32,7 @@ ftmad z0.d, z0.d, z31.d, #7 movprfx z0, z7 // CHECK-INST: movprfx z0, 
z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 ftmad z0.d, z0.d, z31.d, #7 diff --git a/llvm/test/MC/AArch64/SVE/incb.s b/llvm/test/MC/AArch64/SVE/incb.s index 27b347a31f25c..6dbdb8b2ac91c 100644 --- a/llvm/test/MC/AArch64/SVE/incb.s +++ b/llvm/test/MC/AArch64/SVE/incb.s @@ -12,197 +12,197 @@ incb x0 // CHECK-INST: incb x0 // CHECK-ENCODING: [0xe0,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 30 04 incb x0, all // CHECK-INST: incb x0 // CHECK-ENCODING: [0xe0,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 30 04 incb x0, all, mul #1 // CHECK-INST: incb x0 // CHECK-ENCODING: [0xe0,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 30 04 incb x0, all, mul #16 // CHECK-INST: incb x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 3f 04 incb x0, pow2 // CHECK-INST: incb x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 30 04 incb x0, vl1 // CHECK-INST: incb x0, vl1 // CHECK-ENCODING: [0x20,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e0 30 04 incb x0, vl2 // CHECK-INST: incb x0, vl2 // CHECK-ENCODING: [0x40,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e0 30 04 incb 
x0, vl3 // CHECK-INST: incb x0, vl3 // CHECK-ENCODING: [0x60,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e0 30 04 incb x0, vl4 // CHECK-INST: incb x0, vl4 // CHECK-ENCODING: [0x80,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e0 30 04 incb x0, vl5 // CHECK-INST: incb x0, vl5 // CHECK-ENCODING: [0xa0,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e0 30 04 incb x0, vl6 // CHECK-INST: incb x0, vl6 // CHECK-ENCODING: [0xc0,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e0 30 04 incb x0, vl7 // CHECK-INST: incb x0, vl7 // CHECK-ENCODING: [0xe0,0xe0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e0 30 04 incb x0, vl8 // CHECK-INST: incb x0, vl8 // CHECK-ENCODING: [0x00,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e1 30 04 incb x0, vl16 // CHECK-INST: incb x0, vl16 // CHECK-ENCODING: [0x20,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e1 30 04 incb x0, vl32 // CHECK-INST: incb x0, vl32 // CHECK-ENCODING: [0x40,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e1 30 04 incb x0, vl64 // CHECK-INST: incb x0, vl64 // CHECK-ENCODING: [0x60,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e1 30 04 
incb x0, vl128 // CHECK-INST: incb x0, vl128 // CHECK-ENCODING: [0x80,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e1 30 04 incb x0, vl256 // CHECK-INST: incb x0, vl256 // CHECK-ENCODING: [0xa0,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e1 30 04 incb x0, #14 // CHECK-INST: incb x0, #14 // CHECK-ENCODING: [0xc0,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e1 30 04 incb x0, #15 // CHECK-INST: incb x0, #15 // CHECK-ENCODING: [0xe0,0xe1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e1 30 04 incb x0, #16 // CHECK-INST: incb x0, #16 // CHECK-ENCODING: [0x00,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e2 30 04 incb x0, #17 // CHECK-INST: incb x0, #17 // CHECK-ENCODING: [0x20,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e2 30 04 incb x0, #18 // CHECK-INST: incb x0, #18 // CHECK-ENCODING: [0x40,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e2 30 04 incb x0, #19 // CHECK-INST: incb x0, #19 // CHECK-ENCODING: [0x60,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e2 30 04 incb x0, #20 // CHECK-INST: incb x0, #20 // CHECK-ENCODING: [0x80,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e2 
30 04 incb x0, #21 // CHECK-INST: incb x0, #21 // CHECK-ENCODING: [0xa0,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e2 30 04 incb x0, #22 // CHECK-INST: incb x0, #22 // CHECK-ENCODING: [0xc0,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e2 30 04 incb x0, #23 // CHECK-INST: incb x0, #23 // CHECK-ENCODING: [0xe0,0xe2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e2 30 04 incb x0, #24 // CHECK-INST: incb x0, #24 // CHECK-ENCODING: [0x00,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e3 30 04 incb x0, #25 // CHECK-INST: incb x0, #25 // CHECK-ENCODING: [0x20,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e3 30 04 incb x0, #26 // CHECK-INST: incb x0, #26 // CHECK-ENCODING: [0x40,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e3 30 04 incb x0, #27 // CHECK-INST: incb x0, #27 // CHECK-ENCODING: [0x60,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e3 30 04 incb x0, #28 // CHECK-INST: incb x0, #28 // CHECK-ENCODING: [0x80,0xe3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 30 04 diff --git a/llvm/test/MC/AArch64/SVE/incd.s b/llvm/test/MC/AArch64/SVE/incd.s index c2e3961453513..5275ed749947a 100644 --- a/llvm/test/MC/AArch64/SVE/incd.s +++ b/llvm/test/MC/AArch64/SVE/incd.s @@ -16,25 +16,25 @@ 
incd z0.d // CHECK-INST: incd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 f0 04 incd z0.d, all // CHECK-INST: incd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 f0 04 incd z0.d, all, mul #1 // CHECK-INST: incd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 f0 04 incd z0.d, all, mul #16 // CHECK-INST: incd z0.d, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 ff 04 @@ -45,25 +45,25 @@ incd z0.d, all, mul #16 incd x0 // CHECK-INST: incd x0 // CHECK-ENCODING: [0xe0,0xe3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 f0 04 incd x0, all // CHECK-INST: incd x0 // CHECK-ENCODING: [0xe0,0xe3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 f0 04 incd x0, all, mul #1 // CHECK-INST: incd x0 // CHECK-ENCODING: [0xe0,0xe3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 f0 04 incd x0, all, mul #16 // CHECK-INST: incd x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 ff 04 @@ -74,97 +74,97 @@ incd x0, all, mul #16 incd x0, pow2 // CHECK-INST: incd x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0xf0,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 f0 04 incd x0, vl1 // CHECK-INST: incd x0, vl1 // CHECK-ENCODING: [0x20,0xe0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e0 f0 04 incd x0, vl2 // CHECK-INST: incd x0, vl2 // CHECK-ENCODING: [0x40,0xe0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e0 f0 04 incd x0, vl3 // CHECK-INST: incd x0, vl3 // CHECK-ENCODING: [0x60,0xe0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e0 f0 04 incd x0, vl4 // CHECK-INST: incd x0, vl4 // CHECK-ENCODING: [0x80,0xe0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e0 f0 04 incd x0, vl5 // CHECK-INST: incd x0, vl5 // CHECK-ENCODING: [0xa0,0xe0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e0 f0 04 incd x0, vl6 // CHECK-INST: incd x0, vl6 // CHECK-ENCODING: [0xc0,0xe0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e0 f0 04 incd x0, vl7 // CHECK-INST: incd x0, vl7 // CHECK-ENCODING: [0xe0,0xe0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e0 f0 04 incd x0, vl8 // CHECK-INST: incd x0, vl8 // CHECK-ENCODING: [0x00,0xe1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e1 f0 04 incd x0, vl16 // CHECK-INST: incd x0, vl16 // CHECK-ENCODING: [0x20,0xe1,0xf0,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e1 f0 04 incd x0, vl32 // CHECK-INST: incd x0, vl32 // CHECK-ENCODING: [0x40,0xe1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e1 f0 04 incd x0, vl64 // CHECK-INST: incd x0, vl64 // CHECK-ENCODING: [0x60,0xe1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e1 f0 04 incd x0, vl128 // CHECK-INST: incd x0, vl128 // CHECK-ENCODING: [0x80,0xe1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e1 f0 04 incd x0, vl256 // CHECK-INST: incd x0, vl256 // CHECK-ENCODING: [0xa0,0xe1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e1 f0 04 incd x0, #14 // CHECK-INST: incd x0, #14 // CHECK-ENCODING: [0xc0,0xe1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e1 f0 04 incd x0, #28 // CHECK-INST: incd x0, #28 // CHECK-ENCODING: [0x80,0xe3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 f0 04 @@ -174,35 +174,35 @@ incd x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 incd z0.d // CHECK-INST: incd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 f0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // 
CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 incd z0.d, all, mul #16 // CHECK-INST: incd z0.d, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 ff 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 incd z0.d, all // CHECK-INST: incd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 f0 04 diff --git a/llvm/test/MC/AArch64/SVE/inch.s b/llvm/test/MC/AArch64/SVE/inch.s index e273f1455d668..6032830045a3d 100644 --- a/llvm/test/MC/AArch64/SVE/inch.s +++ b/llvm/test/MC/AArch64/SVE/inch.s @@ -16,25 +16,25 @@ inch z0.h // CHECK-INST: inch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 70 04 inch z0.h, all // CHECK-INST: inch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 70 04 inch z0.h, all, mul #1 // CHECK-INST: inch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 70 04 inch z0.h, all, mul #16 // CHECK-INST: inch z0.h, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 7f 04 @@ 
-45,25 +45,25 @@ inch z0.h, all, mul #16 inch x0 // CHECK-INST: inch x0 // CHECK-ENCODING: [0xe0,0xe3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 70 04 inch x0, all // CHECK-INST: inch x0 // CHECK-ENCODING: [0xe0,0xe3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 70 04 inch x0, all, mul #1 // CHECK-INST: inch x0 // CHECK-ENCODING: [0xe0,0xe3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 70 04 inch x0, all, mul #16 // CHECK-INST: inch x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 7f 04 @@ -74,97 +74,97 @@ inch x0, all, mul #16 inch x0, pow2 // CHECK-INST: inch x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 70 04 inch x0, vl1 // CHECK-INST: inch x0, vl1 // CHECK-ENCODING: [0x20,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e0 70 04 inch x0, vl2 // CHECK-INST: inch x0, vl2 // CHECK-ENCODING: [0x40,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e0 70 04 inch x0, vl3 // CHECK-INST: inch x0, vl3 // CHECK-ENCODING: [0x60,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e0 70 04 inch x0, vl4 // CHECK-INST: inch x0, vl4 // CHECK-ENCODING: [0x80,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e0 70 04 inch x0, vl5 // CHECK-INST: inch x0, vl5 // CHECK-ENCODING: [0xa0,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e0 70 04 inch x0, vl6 // CHECK-INST: inch x0, vl6 // CHECK-ENCODING: [0xc0,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e0 70 04 inch x0, vl7 // CHECK-INST: inch x0, vl7 // CHECK-ENCODING: [0xe0,0xe0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e0 70 04 inch x0, vl8 // CHECK-INST: inch x0, vl8 // CHECK-ENCODING: [0x00,0xe1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e1 70 04 inch x0, vl16 // CHECK-INST: inch x0, vl16 // CHECK-ENCODING: [0x20,0xe1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e1 70 04 inch x0, vl32 // CHECK-INST: inch x0, vl32 // CHECK-ENCODING: [0x40,0xe1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e1 70 04 inch x0, vl64 // CHECK-INST: inch x0, vl64 // CHECK-ENCODING: [0x60,0xe1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e1 70 04 inch x0, vl128 // CHECK-INST: inch x0, vl128 // CHECK-ENCODING: [0x80,0xe1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e1 70 04 inch x0, vl256 // CHECK-INST: inch x0, vl256 // CHECK-ENCODING: [0xa0,0xe1,0x70,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e1 70 04 inch x0, #14 // CHECK-INST: inch x0, #14 // CHECK-ENCODING: [0xc0,0xe1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e1 70 04 inch x0, #28 // CHECK-INST: inch x0, #28 // CHECK-ENCODING: [0x80,0xe3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 70 04 @@ -174,35 +174,35 @@ inch x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 inch z0.h // CHECK-INST: inch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 70 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 inch z0.h, all, mul #16 // CHECK-INST: inch z0.h, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 7f 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 inch z0.h, all // CHECK-INST: inch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 70 04 diff --git 
a/llvm/test/MC/AArch64/SVE/incp.s b/llvm/test/MC/AArch64/SVE/incp.s index 21707711e157b..902befe4f03d8 100644 --- a/llvm/test/MC/AArch64/SVE/incp.s +++ b/llvm/test/MC/AArch64/SVE/incp.s @@ -12,85 +12,85 @@ incp x0, p0.b // CHECK-INST: incp x0, p0.b // CHECK-ENCODING: [0x00,0x88,0x2c,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 2c 25 incp x0, p0.h // CHECK-INST: incp x0, p0.h // CHECK-ENCODING: [0x00,0x88,0x6c,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 6c 25 incp x0, p0.s // CHECK-INST: incp x0, p0.s // CHECK-ENCODING: [0x00,0x88,0xac,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 ac 25 incp x0, p0.d // CHECK-INST: incp x0, p0.d // CHECK-ENCODING: [0x00,0x88,0xec,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 ec 25 incp xzr, p15.b // CHECK-INST: incp xzr, p15.b // CHECK-ENCODING: [0xff,0x89,0x2c,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 2c 25 incp xzr, p15.h // CHECK-INST: incp xzr, p15.h // CHECK-ENCODING: [0xff,0x89,0x6c,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 6c 25 incp xzr, p15.s // CHECK-INST: incp xzr, p15.s // CHECK-ENCODING: [0xff,0x89,0xac,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 ac 25 incp xzr, p15.d // CHECK-INST: incp xzr, p15.d // CHECK-ENCODING: [0xff,0x89,0xec,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: ff 89 ec 25 incp z31.h, p15 // CHECK-INST: incp z31.h, p15.h // CHECK-ENCODING: [0xff,0x81,0x6c,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 6c 25 incp z31.h, p15.h // CHECK-INST: incp z31.h, p15.h // CHECK-ENCODING: [0xff,0x81,0x6c,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 6c 25 incp z31.s, p15 // CHECK-INST: incp z31.s, p15.s // CHECK-ENCODING: [0xff,0x81,0xac,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ac 25 incp z31.s, p15.s // CHECK-INST: incp z31.s, p15.s // CHECK-ENCODING: [0xff,0x81,0xac,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ac 25 incp z31.d, p15 // CHECK-INST: incp z31.d, p15.d // CHECK-ENCODING: [0xff,0x81,0xec,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ec 25 incp z31.d, p15.d // CHECK-INST: incp z31.d, p15.d // CHECK-ENCODING: [0xff,0x81,0xec,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ec 25 @@ -100,11 +100,11 @@ incp z31.d, p15.d movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 incp z31.d, p15.d // CHECK-INST: incp z31.d, p15.d // CHECK-ENCODING: [0xff,0x81,0xec,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 ec 25 diff --git a/llvm/test/MC/AArch64/SVE/incw.s b/llvm/test/MC/AArch64/SVE/incw.s index 
34285f172e485..c4514f369fcc8 100644 --- a/llvm/test/MC/AArch64/SVE/incw.s +++ b/llvm/test/MC/AArch64/SVE/incw.s @@ -16,25 +16,25 @@ incw z0.s // CHECK-INST: incw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 b0 04 incw z0.s, all // CHECK-INST: incw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 b0 04 incw z0.s, all, mul #1 // CHECK-INST: incw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 b0 04 incw z0.s, all, mul #16 // CHECK-INST: incw z0.s, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 bf 04 @@ -45,25 +45,25 @@ incw z0.s, all, mul #16 incw x0 // CHECK-INST: incw x0 // CHECK-ENCODING: [0xe0,0xe3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 b0 04 incw x0, all // CHECK-INST: incw x0 // CHECK-ENCODING: [0xe0,0xe3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 b0 04 incw x0, all, mul #1 // CHECK-INST: incw x0 // CHECK-ENCODING: [0xe0,0xe3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 b0 04 incw x0, all, mul #16 // CHECK-INST: incw x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xe3,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e3 bf 04 @@ -75,97 +75,97 @@ 
incw x0, all, mul #16 incw x0, pow2 // CHECK-INST: incw x0, pow2 // CHECK-ENCODING: [0x00,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 b0 04 incw x0, vl1 // CHECK-INST: incw x0, vl1 // CHECK-ENCODING: [0x20,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e0 b0 04 incw x0, vl2 // CHECK-INST: incw x0, vl2 // CHECK-ENCODING: [0x40,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e0 b0 04 incw x0, vl3 // CHECK-INST: incw x0, vl3 // CHECK-ENCODING: [0x60,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e0 b0 04 incw x0, vl4 // CHECK-INST: incw x0, vl4 // CHECK-ENCODING: [0x80,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e0 b0 04 incw x0, vl5 // CHECK-INST: incw x0, vl5 // CHECK-ENCODING: [0xa0,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e0 b0 04 incw x0, vl6 // CHECK-INST: incw x0, vl6 // CHECK-ENCODING: [0xc0,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e0 b0 04 incw x0, vl7 // CHECK-INST: incw x0, vl7 // CHECK-ENCODING: [0xe0,0xe0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 e0 b0 04 incw x0, vl8 // CHECK-INST: incw x0, vl8 // CHECK-ENCODING: [0x00,0xe1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 e1 b0 04 incw x0, vl16 // CHECK-INST: incw x0, vl16 // CHECK-ENCODING: [0x20,0xe1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 e1 b0 04 incw x0, vl32 // CHECK-INST: incw x0, vl32 // CHECK-ENCODING: [0x40,0xe1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 e1 b0 04 incw x0, vl64 // CHECK-INST: incw x0, vl64 // CHECK-ENCODING: [0x60,0xe1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 e1 b0 04 incw x0, vl128 // CHECK-INST: incw x0, vl128 // CHECK-ENCODING: [0x80,0xe1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e1 b0 04 incw x0, vl256 // CHECK-INST: incw x0, vl256 // CHECK-ENCODING: [0xa0,0xe1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 e1 b0 04 incw x0, #14 // CHECK-INST: incw x0, #14 // CHECK-ENCODING: [0xc0,0xe1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 e1 b0 04 incw x0, #28 // CHECK-INST: incw x0, #28 // CHECK-ENCODING: [0x80,0xe3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 e3 b0 04 @@ -175,35 +175,35 @@ incw x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 incw z0.s // CHECK-INST: incw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 b0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 incw z0.s, all, mul #16 // CHECK-INST: incw z0.s, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 bf 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 incw z0.s, all // CHECK-INST: incw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 b0 04 diff --git a/llvm/test/MC/AArch64/SVE/index.s b/llvm/test/MC/AArch64/SVE/index.s index 10723e786d792..f4aed61b32863 100644 --- a/llvm/test/MC/AArch64/SVE/index.s +++ b/llvm/test/MC/AArch64/SVE/index.s @@ -15,49 +15,49 @@ index z0.b, #0, #0 // CHECK-INST: index z0.b, #0, #0 // CHECK-ENCODING: [0x00,0x40,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 20 04 index z31.b, #-1, #-1 // CHECK-INST: index z31.b, #-1, #-1 // CHECK-ENCODING: [0xff,0x43,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 43 3f 04 index z0.h, #0, #0 // CHECK-INST: index z0.h, #0, #0 // CHECK-ENCODING: [0x00,0x40,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 60 04 index z31.h, #-1, #-1 // CHECK-INST: index z31.h, #-1, #-1 // CHECK-ENCODING: 
[0xff,0x43,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 43 7f 04 index z0.s, #0, #0 // CHECK-INST: index z0.s, #0, #0 // CHECK-ENCODING: [0x00,0x40,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 a0 04 index z31.s, #-1, #-1 // CHECK-INST: index z31.s, #-1, #-1 // CHECK-ENCODING: [0xff,0x43,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 43 bf 04 index z0.d, #0, #0 // CHECK-INST: index z0.d, #0, #0 // CHECK-ENCODING: [0x00,0x40,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 e0 04 index z31.d, #-1, #-1 // CHECK-INST: index z31.d, #-1, #-1 // CHECK-ENCODING: [0xff,0x43,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 43 ff 04 // --------------------------------------------------------------------------// @@ -66,49 +66,49 @@ index z31.d, #-1, #-1 index z31.b, #-1, wzr // CHECK-INST: index z31.b, #-1, wzr // CHECK-ENCODING: [0xff,0x4b,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 4b 3f 04 index z23.b, #13, w8 // CHECK-INST: index z23.b, #13, w8 // CHECK-ENCODING: [0xb7,0x49,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 49 28 04 index z31.h, #-1, wzr // CHECK-INST: index z31.h, #-1, wzr // CHECK-ENCODING: [0xff,0x4b,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 4b 7f 04 index z23.h, #13, w8 // CHECK-INST: index z23.h, 
#13, w8 // CHECK-ENCODING: [0xb7,0x49,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 49 68 04 index z31.s, #-1, wzr // CHECK-INST: index z31.s, #-1, wzr // CHECK-ENCODING: [0xff,0x4b,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 4b bf 04 index z23.s, #13, w8 // CHECK-INST: index z23.s, #13, w8 // CHECK-ENCODING: [0xb7,0x49,0xa8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 49 a8 04 index z31.d, #-1, xzr // CHECK-INST: index z31.d, #-1, xzr // CHECK-ENCODING: [0xff,0x4b,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 4b ff 04 index z23.d, #13, x8 // CHECK-INST: index z23.d, #13, x8 // CHECK-ENCODING: [0xb7,0x49,0xe8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 49 e8 04 @@ -118,49 +118,49 @@ index z23.d, #13, x8 index z31.b, wzr, #-1 // CHECK-INST: index z31.b, wzr, #-1 // CHECK-ENCODING: [0xff,0x47,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 47 3f 04 index z23.b, w13, #8 // CHECK-INST: index z23.b, w13, #8 // CHECK-ENCODING: [0xb7,0x45,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 45 28 04 index z31.h, wzr, #-1 // CHECK-INST: index z31.h, wzr, #-1 // CHECK-ENCODING: [0xff,0x47,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 47 7f 04 index z23.h, w13, #8 // CHECK-INST: index z23.h, w13, #8 // CHECK-ENCODING: 
[0xb7,0x45,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 45 68 04 index z31.s, wzr, #-1 // CHECK-INST: index z31.s, wzr, #-1 // CHECK-ENCODING: [0xff,0x47,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 47 bf 04 index z23.s, w13, #8 // CHECK-INST: index z23.s, w13, #8 // CHECK-ENCODING: [0xb7,0x45,0xa8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 45 a8 04 index z31.d, xzr, #-1 // CHECK-INST: index z31.d, xzr, #-1 // CHECK-ENCODING: [0xff,0x47,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 47 ff 04 index z23.d, x13, #8 // CHECK-INST: index z23.d, x13, #8 // CHECK-ENCODING: [0xb7,0x45,0xe8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 45 e8 04 @@ -170,47 +170,47 @@ index z23.d, x13, #8 index z31.b, wzr, wzr // CHECK-INST: index z31.b, wzr, wzr // CHECK-ENCODING: [0xff,0x4f,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 4f 3f 04 index z21.b, w10, w21 // CHECK-INST: index z21.b, w10, w21 // CHECK-ENCODING: [0x55,0x4d,0x35,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 4d 35 04 index z31.h, wzr, wzr // check-inst: index z31.h, wzr, wzr // check-encoding: [0xff,0x4f,0x7f,0x04] -// check-error: instruction requires: streaming-sve or sve +// check-error: instruction requires: sve or sme // check-unknown: ff 4f 7f 04 index z0.h, w0, w0 // check-inst: index z0.h, w0, w0 // check-encoding: [0x00,0x4c,0x60,0x04] -// check-error: 
instruction requires: streaming-sve or sve +// check-error: instruction requires: sve or sme // check-unknown: 00 4c 60 04 index z31.s, wzr, wzr // CHECK-INST: index z31.s, wzr, wzr // CHECK-ENCODING: [0xff,0x4f,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 4f bf 04 index z21.s, w10, w21 // CHECK-INST: index z21.s, w10, w21 // CHECK-ENCODING: [0x55,0x4d,0xb5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 4d b5 04 index z31.d, xzr, xzr // CHECK-INST: index z31.d, xzr, xzr // CHECK-ENCODING: [0xff,0x4f,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 4f ff 04 index z21.d, x10, x21 // CHECK-INST: index z21.d, x10, x21 // CHECK-ENCODING: [0x55,0x4d,0xf5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 4d f5 04 diff --git a/llvm/test/MC/AArch64/SVE/insr.s b/llvm/test/MC/AArch64/SVE/insr.s index 3687a601985aa..37b00db8cdd9f 100644 --- a/llvm/test/MC/AArch64/SVE/insr.s +++ b/llvm/test/MC/AArch64/SVE/insr.s @@ -12,73 +12,73 @@ insr z0.b, w0 // CHECK-INST: insr z0.b, w0 // CHECK-ENCODING: [0x00,0x38,0x24,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 24 05 insr z0.h, w0 // CHECK-INST: insr z0.h, w0 // CHECK-ENCODING: [0x00,0x38,0x64,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 64 05 insr z0.s, w0 // CHECK-INST: insr z0.s, w0 // CHECK-ENCODING: [0x00,0x38,0xa4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 a4 05 insr z0.d, x0 // 
CHECK-INST: insr z0.d, x0 // CHECK-ENCODING: [0x00,0x38,0xe4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 e4 05 insr z31.b, wzr // CHECK-INST: insr z31.b, wzr // CHECK-ENCODING: [0xff,0x3b,0x24,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 24 05 insr z31.h, wzr // CHECK-INST: insr z31.h, wzr // CHECK-ENCODING: [0xff,0x3b,0x64,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 64 05 insr z31.s, wzr // CHECK-INST: insr z31.s, wzr // CHECK-ENCODING: [0xff,0x3b,0xa4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b a4 05 insr z31.d, xzr // CHECK-INST: insr z31.d, xzr // CHECK-ENCODING: [0xff,0x3b,0xe4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b e4 05 insr z31.b, b31 // CHECK-INST: insr z31.b, b31 // CHECK-ENCODING: [0xff,0x3b,0x34,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 34 05 insr z31.h, h31 // CHECK-INST: insr z31.h, h31 // CHECK-ENCODING: [0xff,0x3b,0x74,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 74 05 insr z31.s, s31 // CHECK-INST: insr z31.s, s31 // CHECK-ENCODING: [0xff,0x3b,0xb4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b b4 05 insr z31.d, d31 // CHECK-INST: insr z31.d, d31 // CHECK-ENCODING: [0xff,0x3b,0xf4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: ff 3b f4 05 @@ -88,23 +88,23 @@ insr z31.d, d31 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 insr z31.d, xzr // CHECK-INST: insr z31.d, xzr // CHECK-ENCODING: [0xff,0x3b,0xe4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b e4 05 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 insr z4.d, d31 // CHECK-INST: insr z4.d, d31 // CHECK-ENCODING: [0xe4,0x3b,0xf4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 3b f4 05 diff --git a/llvm/test/MC/AArch64/SVE/lasta.s b/llvm/test/MC/AArch64/SVE/lasta.s index 73b436fc7da9e..d6beda10b8b10 100644 --- a/llvm/test/MC/AArch64/SVE/lasta.s +++ b/llvm/test/MC/AArch64/SVE/lasta.s @@ -12,47 +12,47 @@ lasta w0, p7, z31.b // CHECK-INST: lasta w0, p7, z31.b // CHECK-ENCODING: [0xe0,0xbf,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 20 05 lasta w0, p7, z31.h // CHECK-INST: lasta w0, p7, z31.h // CHECK-ENCODING: [0xe0,0xbf,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 60 05 lasta w0, p7, z31.s // CHECK-INST: lasta w0, p7, z31.s // CHECK-ENCODING: [0xe0,0xbf,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf a0 05 lasta x0, p7, z31.d // CHECK-INST: lasta x0, p7, z31.d // CHECK-ENCODING: 
[0xe0,0xbf,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf e0 05 lasta b0, p7, z31.b // CHECK-INST: lasta b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x9f,0x22,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 22 05 lasta h0, p7, z31.h // CHECK-INST: lasta h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x62,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 62 05 lasta s0, p7, z31.s // CHECK-INST: lasta s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xa2,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f a2 05 lasta d0, p7, z31.d // CHECK-INST: lasta d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe2,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e2 05 diff --git a/llvm/test/MC/AArch64/SVE/lastb.s b/llvm/test/MC/AArch64/SVE/lastb.s index ae4e077d52750..7f581fcb42c4c 100644 --- a/llvm/test/MC/AArch64/SVE/lastb.s +++ b/llvm/test/MC/AArch64/SVE/lastb.s @@ -12,47 +12,47 @@ lastb w0, p7, z31.b // CHECK-INST: lastb w0, p7, z31.b // CHECK-ENCODING: [0xe0,0xbf,0x21,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 21 05 lastb w0, p7, z31.h // CHECK-INST: lastb w0, p7, z31.h // CHECK-ENCODING: [0xe0,0xbf,0x61,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf 61 05 lastb w0, p7, z31.s // CHECK-INST: lastb w0, p7, z31.s // CHECK-ENCODING: [0xe0,0xbf,0xa1,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: e0 bf a1 05 lastb x0, p7, z31.d // CHECK-INST: lastb x0, p7, z31.d // CHECK-ENCODING: [0xe0,0xbf,0xe1,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bf e1 05 lastb b0, p7, z31.b // CHECK-INST: lastb b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x9f,0x23,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 23 05 lastb h0, p7, z31.h // CHECK-INST: lastb h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x63,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 63 05 lastb s0, p7, z31.s // CHECK-INST: lastb s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xa3,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f a3 05 lastb d0, p7, z31.d // CHECK-INST: lastb d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe3,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e3 05 diff --git a/llvm/test/MC/AArch64/SVE/ld1b.s b/llvm/test/MC/AArch64/SVE/ld1b.s index 0dc1deaa249ac..cd07e49c2fa1b 100644 --- a/llvm/test/MC/AArch64/SVE/ld1b.s +++ b/llvm/test/MC/AArch64/SVE/ld1b.s @@ -12,131 +12,131 @@ ld1b z0.b, p0/z, [x0] // CHECK-INST: ld1b { z0.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 00 a4 ld1b z0.h, p0/z, [x0] // CHECK-INST: ld1b { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x20,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 20 a4 ld1b z0.s, p0/z, [x0] // CHECK-INST: ld1b { z0.s }, p0/z, [x0] // 
CHECK-ENCODING: [0x00,0xa0,0x40,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 40 a4 ld1b z0.d, p0/z, [x0] // CHECK-INST: ld1b { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x60,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 60 a4 ld1b { z0.b }, p0/z, [x0] // CHECK-INST: ld1b { z0.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 00 a4 ld1b { z0.h }, p0/z, [x0] // CHECK-INST: ld1b { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x20,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 20 a4 ld1b { z0.s }, p0/z, [x0] // CHECK-INST: ld1b { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x40,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 40 a4 ld1b { z0.d }, p0/z, [x0] // CHECK-INST: ld1b { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x60,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 60 a4 ld1b { z31.b }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1b { z31.b }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x0f,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 0f a4 ld1b { z21.b }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1b { z21.b }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x05,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 05 a4 ld1b { z31.h }, p7/z, [sp, #-1, 
mul vl] // CHECK-INST: ld1b { z31.h }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x2f,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 2f a4 ld1b { z21.h }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1b { z21.h }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x25,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 25 a4 ld1b { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1b { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x4f,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 4f a4 ld1b { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1b { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x45,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 45 a4 ld1b { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1b { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x6f,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 6f a4 ld1b { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1b { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x65,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 65 a4 ld1b { z0.b }, p0/z, [sp, x0] // CHECK-INST: ld1b { z0.b }, p0/z, [sp, x0] // CHECK-ENCODING: [0xe0,0x43,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 43 00 a4 ld1b { z0.b }, p0/z, [x0, x0] // CHECK-INST: ld1b { z0.b }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0x00,0xa4] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 00 a4 ld1b { z0.b }, p0/z, [x0, x0, lsl #0] // CHECK-INST: ld1b { z0.b }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 00 a4 ld1b { z5.h }, p3/z, [x17, x16] // CHECK-INST: ld1b { z5.h }, p3/z, [x17, x16] // CHECK-ENCODING: [0x25,0x4e,0x30,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 4e 30 a4 ld1b { z21.s }, p5/z, [x10, x21] // CHECK-INST: ld1b { z21.s }, p5/z, [x10, x21] // CHECK-ENCODING: [0x55,0x55,0x55,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 55 55 a4 ld1b { z23.d }, p3/z, [x13, x8] // CHECK-INST: ld1b { z23.d }, p3/z, [x13, x8] // CHECK-ENCODING: [0xb7,0x4d,0x68,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 4d 68 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld1d.s b/llvm/test/MC/AArch64/SVE/ld1d.s index 9289d9e9b17a2..e3ce674fc0bb1 100644 --- a/llvm/test/MC/AArch64/SVE/ld1d.s +++ b/llvm/test/MC/AArch64/SVE/ld1d.s @@ -12,35 +12,35 @@ ld1d z0.d, p0/z, [x0] // CHECK-INST: ld1d { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xe0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 e0 a5 ld1d { z0.d }, p0/z, [x0] // CHECK-INST: ld1d { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xe0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 e0 a5 ld1d { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1d { z31.d }, p7/z, [sp, #-1, mul vl] // 
CHECK-ENCODING: [0xff,0xbf,0xef,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf ef a5 ld1d { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1d { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0xe5,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 e5 a5 ld1d { z23.d }, p3/z, [sp, x8, lsl #3] // CHECK-INST: ld1d { z23.d }, p3/z, [sp, x8, lsl #3] // CHECK-ENCODING: [0xf7,0x4f,0xe8,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f7 4f e8 a5 ld1d { z23.d }, p3/z, [x13, x8, lsl #3] // CHECK-INST: ld1d { z23.d }, p3/z, [x13, x8, lsl #3] // CHECK-ENCODING: [0xb7,0x4d,0xe8,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 4d e8 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld1h.s b/llvm/test/MC/AArch64/SVE/ld1h.s index b7ec9a5ed9d01..45cfabfb84f91 100644 --- a/llvm/test/MC/AArch64/SVE/ld1h.s +++ b/llvm/test/MC/AArch64/SVE/ld1h.s @@ -12,95 +12,95 @@ ld1h z0.h, p0/z, [x0] // CHECK-INST: ld1h { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xa0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 a0 a4 ld1h z0.s, p0/z, [x0] // CHECK-INST: ld1h { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xc0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c0 a4 ld1h z0.d, p0/z, [x0] // CHECK-INST: ld1h { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xe0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 e0 a4 ld1h { z0.h }, p0/z, [x0] // CHECK-INST: ld1h 
{ z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xa0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 a0 a4 ld1h { z0.s }, p0/z, [x0] // CHECK-INST: ld1h { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xc0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c0 a4 ld1h { z0.d }, p0/z, [x0] // CHECK-INST: ld1h { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xe0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 e0 a4 ld1h { z31.h }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1h { z31.h }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0xaf,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf af a4 ld1h { z21.h }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1h { z21.h }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0xa5,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 a5 a4 ld1h { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1h { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0xcf,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf cf a4 ld1h { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1h { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0xc5,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 c5 a4 ld1h { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1h { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0xef,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf ef a4 ld1h { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1h { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0xe5,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 e5 a4 ld1h { z5.h }, p3/z, [sp, x16, lsl #1] // CHECK-INST: ld1h { z5.h }, p3/z, [sp, x16, lsl #1] // CHECK-ENCODING: [0xe5,0x4f,0xb0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 4f b0 a4 ld1h { z5.h }, p3/z, [x17, x16, lsl #1] // CHECK-INST: ld1h { z5.h }, p3/z, [x17, x16, lsl #1] // CHECK-ENCODING: [0x25,0x4e,0xb0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 4e b0 a4 ld1h { z21.s }, p5/z, [x10, x21, lsl #1] // CHECK-INST: ld1h { z21.s }, p5/z, [x10, x21, lsl #1] // CHECK-ENCODING: [0x55,0x55,0xd5,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 55 d5 a4 ld1h { z23.d }, p3/z, [x13, x8, lsl #1] // CHECK-INST: ld1h { z23.d }, p3/z, [x13, x8, lsl #1] // CHECK-ENCODING: [0xb7,0x4d,0xe8,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 4d e8 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld1rb.s b/llvm/test/MC/AArch64/SVE/ld1rb.s index 59a8b25a5db87..536b9841bf3c0 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rb.s +++ b/llvm/test/MC/AArch64/SVE/ld1rb.s @@ -12,47 +12,47 @@ ld1rb { z0.b }, p0/z, [x0] // CHECK-INST: ld1rb { z0.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x80,0x40,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 40 84 ld1rb { z0.h }, p0/z, [x0] // CHECK-INST: ld1rb { z0.h }, p0/z, [x0] 
// CHECK-ENCODING: [0x00,0xa0,0x40,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 40 84 ld1rb { z0.s }, p0/z, [x0] // CHECK-INST: ld1rb { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xc0,0x40,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 40 84 ld1rb { z0.d }, p0/z, [x0] // CHECK-INST: ld1rb { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 84 ld1rb { z31.b }, p7/z, [sp, #63] // CHECK-INST: ld1rb { z31.b }, p7/z, [sp, #63] // CHECK-ENCODING: [0xff,0x9f,0x7f,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 7f 84 ld1rb { z31.h }, p7/z, [sp, #63] // CHECK-INST: ld1rb { z31.h }, p7/z, [sp, #63] // CHECK-ENCODING: [0xff,0xbf,0x7f,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 7f 84 ld1rb { z31.s }, p7/z, [sp, #63] // CHECK-INST: ld1rb { z31.s }, p7/z, [sp, #63] // CHECK-ENCODING: [0xff,0xdf,0x7f,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 7f 84 ld1rb { z31.d }, p7/z, [sp, #63] // CHECK-INST: ld1rb { z31.d }, p7/z, [sp, #63] // CHECK-ENCODING: [0xff,0xff,0x7f,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 7f 84 diff --git a/llvm/test/MC/AArch64/SVE/ld1rd.s b/llvm/test/MC/AArch64/SVE/ld1rd.s index 793f36cd6d4bf..9a32d2dc6e774 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rd.s +++ b/llvm/test/MC/AArch64/SVE/ld1rd.s @@ -12,11 +12,11 @@ ld1rd { z0.d }, p0/z, [x0] // CHECK-INST: ld1rd { 
z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 c0 85 ld1rd { z31.d }, p7/z, [sp, #504] // CHECK-INST: ld1rd { z31.d }, p7/z, [sp, #504] // CHECK-ENCODING: [0xff,0xff,0xff,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff ff 85 diff --git a/llvm/test/MC/AArch64/SVE/ld1rh.s b/llvm/test/MC/AArch64/SVE/ld1rh.s index 063c4b53be7f5..9d5544e6c364f 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rh.s +++ b/llvm/test/MC/AArch64/SVE/ld1rh.s @@ -12,35 +12,35 @@ ld1rh { z0.h }, p0/z, [x0] // CHECK-INST: ld1rh { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xc0,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c0 84 ld1rh { z0.s }, p0/z, [x0] // CHECK-INST: ld1rh { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xc0,0xc0,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 c0 84 ld1rh { z0.d }, p0/z, [x0] // CHECK-INST: ld1rh { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xc0,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 c0 84 ld1rh { z31.h }, p7/z, [sp, #126] // CHECK-INST: ld1rh { z31.h }, p7/z, [sp, #126] // CHECK-ENCODING: [0xff,0xbf,0xff,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf ff 84 ld1rh { z31.s }, p7/z, [sp, #126] // CHECK-INST: ld1rh { z31.s }, p7/z, [sp, #126] // CHECK-ENCODING: [0xff,0xdf,0xff,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df ff 84 ld1rh { z31.d }, p7/z, [sp, #126] // 
CHECK-INST: ld1rh { z31.d }, p7/z, [sp, #126] // CHECK-ENCODING: [0xff,0xff,0xff,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff ff 84 diff --git a/llvm/test/MC/AArch64/SVE/ld1rqb.s b/llvm/test/MC/AArch64/SVE/ld1rqb.s index 6eaacabb1427a..9ed90e8bdaa2d 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rqb.s +++ b/llvm/test/MC/AArch64/SVE/ld1rqb.s @@ -12,29 +12,29 @@ ld1rqb { z0.b }, p0/z, [x0] // CHECK-INST: ld1rqb { z0.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x20,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 00 a4 ld1rqb { z0.b }, p0/z, [x0, x0] // CHECK-INST: ld1rqb { z0.b }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0x00,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 00 a4 ld1rqb { z31.b }, p7/z, [sp, #-16] // CHECK-INST: ld1rqb { z31.b }, p7/z, [sp, #-16] // CHECK-ENCODING: [0xff,0x3f,0x0f,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3f 0f a4 ld1rqb { z23.b }, p3/z, [x13, #-128] // CHECK-INST: ld1rqb { z23.b }, p3/z, [x13, #-128] // CHECK-ENCODING: [0xb7,0x2d,0x08,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 2d 08 a4 ld1rqb { z21.b }, p5/z, [x10, #112] // CHECK-INST: ld1rqb { z21.b }, p5/z, [x10, #112] // CHECK-ENCODING: [0x55,0x35,0x07,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 35 07 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld1rqd.s b/llvm/test/MC/AArch64/SVE/ld1rqd.s index adf4bfb4bc664..f9a1abe0df8ef 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rqd.s +++ b/llvm/test/MC/AArch64/SVE/ld1rqd.s @@ -12,29 +12,29 @@ ld1rqd 
{ z0.d }, p0/z, [x0] // CHECK-INST: ld1rqd { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x20,0x80,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 80 a5 ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] // CHECK-INST: ld1rqd { z0.d }, p0/z, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0x00,0x80,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 80 a5 ld1rqd { z31.d }, p7/z, [sp, #-16] // CHECK-INST: ld1rqd { z31.d }, p7/z, [sp, #-16] // CHECK-ENCODING: [0xff,0x3f,0x8f,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3f 8f a5 ld1rqd { z23.d }, p3/z, [x13, #-128] // CHECK-INST: ld1rqd { z23.d }, p3/z, [x13, #-128] // CHECK-ENCODING: [0xb7,0x2d,0x88,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 2d 88 a5 ld1rqd { z23.d }, p3/z, [x13, #112] // CHECK-INST: ld1rqd { z23.d }, p3/z, [x13, #112] // CHECK-ENCODING: [0xb7,0x2d,0x87,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 2d 87 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld1rqh.s b/llvm/test/MC/AArch64/SVE/ld1rqh.s index 4914a37d3eddb..87fb94a0154f2 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rqh.s +++ b/llvm/test/MC/AArch64/SVE/ld1rqh.s @@ -12,29 +12,29 @@ ld1rqh { z0.h }, p0/z, [x0] // CHECK-INST: ld1rqh { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x20,0x80,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 80 a4 ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] // CHECK-INST: ld1rqh { z0.h }, p0/z, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x00,0x80,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 80 a4 ld1rqh { z31.h }, p7/z, [sp, #-16] // CHECK-INST: ld1rqh { z31.h }, p7/z, [sp, #-16] // CHECK-ENCODING: [0xff,0x3f,0x8f,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3f 8f a4 ld1rqh { z23.h }, p3/z, [x13, #-128] // CHECK-INST: ld1rqh { z23.h }, p3/z, [x13, #-128] // CHECK-ENCODING: [0xb7,0x2d,0x88,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 2d 88 a4 ld1rqh { z23.h }, p3/z, [x13, #112] // CHECK-INST: ld1rqh { z23.h }, p3/z, [x13, #112] // CHECK-ENCODING: [0xb7,0x2d,0x87,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 2d 87 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld1rqw.s b/llvm/test/MC/AArch64/SVE/ld1rqw.s index 4f4c3501bda21..ed2d25da5a8cd 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rqw.s +++ b/llvm/test/MC/AArch64/SVE/ld1rqw.s @@ -12,29 +12,29 @@ ld1rqw { z0.s }, p0/z, [x0] // CHECK-INST: ld1rqw { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x20,0x00,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 00 a5 ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] // CHECK-INST: ld1rqw { z0.s }, p0/z, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0x00,0x00,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 00 a5 ld1rqw { z31.s }, p7/z, [sp, #-16] // CHECK-INST: ld1rqw { z31.s }, p7/z, [sp, #-16] // CHECK-ENCODING: [0xff,0x3f,0x0f,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3f 0f a5 ld1rqw { z23.s }, p3/z, [x13, #-128] // CHECK-INST: ld1rqw { z23.s }, p3/z, [x13, #-128] // 
CHECK-ENCODING: [0xb7,0x2d,0x08,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 2d 08 a5 ld1rqw { z23.s }, p3/z, [x13, #112] // CHECK-INST: ld1rqw { z23.s }, p3/z, [x13, #112] // CHECK-ENCODING: [0xb7,0x2d,0x07,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 2d 07 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld1rsb.s b/llvm/test/MC/AArch64/SVE/ld1rsb.s index a05ae173a288c..923869afacd08 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rsb.s +++ b/llvm/test/MC/AArch64/SVE/ld1rsb.s @@ -12,35 +12,35 @@ ld1rsb { z0.h }, p0/z, [x0] // CHECK-INST: ld1rsb { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xc0,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 c0 85 ld1rsb { z0.s }, p0/z, [x0] // CHECK-INST: ld1rsb { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c0 85 ld1rsb { z0.d }, p0/z, [x0] // CHECK-INST: ld1rsb { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x80,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 c0 85 ld1rsb { z31.h }, p7/z, [sp, #63] // CHECK-INST: ld1rsb { z31.h }, p7/z, [sp, #63] // CHECK-ENCODING: [0xff,0xdf,0xff,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df ff 85 ld1rsb { z31.s }, p7/z, [sp, #63] // CHECK-INST: ld1rsb { z31.s }, p7/z, [sp, #63] // CHECK-ENCODING: [0xff,0xbf,0xff,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf ff 85 ld1rsb { z31.d }, p7/z, [sp, #63] // 
CHECK-INST: ld1rsb { z31.d }, p7/z, [sp, #63] // CHECK-ENCODING: [0xff,0x9f,0xff,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f ff 85 diff --git a/llvm/test/MC/AArch64/SVE/ld1rsh.s b/llvm/test/MC/AArch64/SVE/ld1rsh.s index 26c96f1fe187d..5f5ca153691ba 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rsh.s +++ b/llvm/test/MC/AArch64/SVE/ld1rsh.s @@ -12,23 +12,23 @@ ld1rsh { z0.s }, p0/z, [x0] // CHECK-INST: ld1rsh { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x40,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 40 85 ld1rsh { z0.d }, p0/z, [x0] // CHECK-INST: ld1rsh { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x80,0x40,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 40 85 ld1rsh { z31.s }, p7/z, [sp, #126] // CHECK-INST: ld1rsh { z31.s }, p7/z, [sp, #126] // CHECK-ENCODING: [0xff,0xbf,0x7f,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 7f 85 ld1rsh { z31.d }, p7/z, [sp, #126] // CHECK-INST: ld1rsh { z31.d }, p7/z, [sp, #126] // CHECK-ENCODING: [0xff,0x9f,0x7f,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 7f 85 diff --git a/llvm/test/MC/AArch64/SVE/ld1rsw.s b/llvm/test/MC/AArch64/SVE/ld1rsw.s index 7e155f1061843..c7f9be9cfbafd 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rsw.s +++ b/llvm/test/MC/AArch64/SVE/ld1rsw.s @@ -12,11 +12,11 @@ ld1rsw { z0.d }, p0/z, [x0] // CHECK-INST: ld1rsw { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0x80,0xc0,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 c0 84 ld1rsw { z31.d }, p7/z, [sp, #252] 
// CHECK-INST: ld1rsw { z31.d }, p7/z, [sp, #252] // CHECK-ENCODING: [0xff,0x9f,0xff,0x84] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f ff 84 diff --git a/llvm/test/MC/AArch64/SVE/ld1rw.s b/llvm/test/MC/AArch64/SVE/ld1rw.s index 82c25d954cf99..f632f6cb5e094 100644 --- a/llvm/test/MC/AArch64/SVE/ld1rw.s +++ b/llvm/test/MC/AArch64/SVE/ld1rw.s @@ -12,23 +12,23 @@ ld1rw { z0.s }, p0/z, [x0] // CHECK-INST: ld1rw { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xc0,0x40,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 40 85 ld1rw { z0.d }, p0/z, [x0] // CHECK-INST: ld1rw { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 85 ld1rw { z31.s }, p7/z, [sp, #252] // CHECK-INST: ld1rw { z31.s }, p7/z, [sp, #252] // CHECK-ENCODING: [0xff,0xdf,0x7f,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 7f 85 ld1rw { z31.d }, p7/z, [sp, #252] // CHECK-INST: ld1rw { z31.d }, p7/z, [sp, #252] // CHECK-ENCODING: [0xff,0xff,0x7f,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 7f 85 diff --git a/llvm/test/MC/AArch64/SVE/ld1sb.s b/llvm/test/MC/AArch64/SVE/ld1sb.s index b945700b26c7d..9f1f47654c5ce 100644 --- a/llvm/test/MC/AArch64/SVE/ld1sb.s +++ b/llvm/test/MC/AArch64/SVE/ld1sb.s @@ -12,101 +12,101 @@ ld1sb z0.h, p0/z, [x0] // CHECK-INST: ld1sb { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xc0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c0 a5 ld1sb z0.s, p0/z, [x0] // CHECK-INST: ld1sb { z0.s 
}, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xa0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 a0 a5 ld1sb z0.d, p0/z, [x0] // CHECK-INST: ld1sb { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x80,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 80 a5 ld1sb { z0.h }, p0/z, [x0] // CHECK-INST: ld1sb { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xc0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 c0 a5 ld1sb { z0.s }, p0/z, [x0] // CHECK-INST: ld1sb { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0xa0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 a0 a5 ld1sb { z0.d }, p0/z, [x0] // CHECK-INST: ld1sb { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x80,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 80 a5 ld1sb { z31.h }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1sb { z31.h }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0xcf,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf cf a5 ld1sb { z21.h }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1sb { z21.h }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0xc5,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 c5 a5 ld1sb { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1sb { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0xaf,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: ff bf af a5 ld1sb { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1sb { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0xa5,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 a5 a5 ld1sb { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1sb { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x8f,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 8f a5 ld1sb { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1sb { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x85,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 85 a5 ld1sb { z0.h }, p0/z, [sp, x0] // CHECK-INST: ld1sb { z0.h }, p0/z, [sp, x0] // CHECK-ENCODING: [0xe0,0x43,0xc0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 43 c0 a5 ld1sb { z0.h }, p0/z, [x0, x0] // CHECK-INST: ld1sb { z0.h }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0xc0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c0 a5 ld1sb { z0.h }, p0/z, [x0, x0, lsl #0] // CHECK-INST: ld1sb { z0.h }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0xc0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c0 a5 ld1sb { z21.s }, p5/z, [x10, x21] // CHECK-INST: ld1sb { z21.s }, p5/z, [x10, x21] // CHECK-ENCODING: [0x55,0x55,0xb5,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 55 b5 a5 ld1sb { z23.d }, p3/z, [x13, x8] // CHECK-INST: ld1sb { z23.d }, p3/z, [x13, x8] // 
CHECK-ENCODING: [0xb7,0x4d,0x88,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 4d 88 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld1sh.s b/llvm/test/MC/AArch64/SVE/ld1sh.s index 411cff2d0ae59..872e03fe10b1e 100644 --- a/llvm/test/MC/AArch64/SVE/ld1sh.s +++ b/llvm/test/MC/AArch64/SVE/ld1sh.s @@ -12,65 +12,65 @@ ld1sh z0.s, p0/z, [x0] // CHECK-INST: ld1sh { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x20,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 20 a5 ld1sh z0.d, p0/z, [x0] // CHECK-INST: ld1sh { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x00,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 00 a5 ld1sh { z0.s }, p0/z, [x0] // CHECK-INST: ld1sh { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x20,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 20 a5 ld1sh { z0.d }, p0/z, [x0] // CHECK-INST: ld1sh { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x00,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 00 a5 ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x2f,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 2f a5 ld1sh { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1sh { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x25,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 25 a5 ld1sh { z31.d }, p7/z, [sp, #-1, mul vl] // 
CHECK-INST: ld1sh { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x0f,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 0f a5 ld1sh { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1sh { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x05,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 05 a5 ld1sh { z21.s }, p5/z, [sp, x21, lsl #1] // CHECK-INST: ld1sh { z21.s }, p5/z, [sp, x21, lsl #1] // CHECK-ENCODING: [0xf5,0x57,0x35,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 57 35 a5 ld1sh { z21.s }, p5/z, [x10, x21, lsl #1] // CHECK-INST: ld1sh { z21.s }, p5/z, [x10, x21, lsl #1] // CHECK-ENCODING: [0x55,0x55,0x35,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 55 35 a5 ld1sh { z23.d }, p3/z, [x13, x8, lsl #1] // CHECK-INST: ld1sh { z23.d }, p3/z, [x13, x8, lsl #1] // CHECK-ENCODING: [0xb7,0x4d,0x08,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 4d 08 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld1sw.s b/llvm/test/MC/AArch64/SVE/ld1sw.s index e5e4e856b507a..645f6b4fed8a0 100644 --- a/llvm/test/MC/AArch64/SVE/ld1sw.s +++ b/llvm/test/MC/AArch64/SVE/ld1sw.s @@ -12,35 +12,35 @@ ld1sw z0.d, p0/z, [x0] // CHECK-INST: ld1sw { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x80,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 80 a4 ld1sw { z0.d }, p0/z, [x0] // CHECK-INST: ld1sw { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x80,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 80 a4 ld1sw { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1sw { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x8f,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 8f a4 ld1sw { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1sw { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x85,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 85 a4 ld1sw { z23.d }, p3/z, [sp, x8, lsl #2] // CHECK-INST: ld1sw { z23.d }, p3/z, [sp, x8, lsl #2] // CHECK-ENCODING: [0xf7,0x4f,0x88,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f7 4f 88 a4 ld1sw { z23.d }, p3/z, [x13, x8, lsl #2] // CHECK-INST: ld1sw { z23.d }, p3/z, [x13, x8, lsl #2] // CHECK-ENCODING: [0xb7,0x4d,0x88,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 4d 88 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld1w.s b/llvm/test/MC/AArch64/SVE/ld1w.s index 1ca44289e3b81..8f82b3a433fcf 100644 --- a/llvm/test/MC/AArch64/SVE/ld1w.s +++ b/llvm/test/MC/AArch64/SVE/ld1w.s @@ -12,65 +12,65 @@ ld1w z0.s, p0/z, [x0] // CHECK-INST: ld1w { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x40,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 40 a5 ld1w z0.d, p0/z, [x0] // CHECK-INST: ld1w { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x60,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 60 a5 ld1w { z0.s }, p0/z, [x0] // CHECK-INST: ld1w { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x40,0xa5] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 40 a5 ld1w { z0.d }, p0/z, [x0] // CHECK-INST: ld1w { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xa0,0x60,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 60 a5 ld1w { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1w { z31.s }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x4f,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 4f a5 ld1w { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1w { z21.s }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x45,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 45 a5 ld1w { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-INST: ld1w { z31.d }, p7/z, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xbf,0x6f,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 6f a5 ld1w { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-INST: ld1w { z21.d }, p5/z, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xb5,0x65,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 b5 65 a5 ld1w { z21.s }, p5/z, [sp, x21, lsl #2] // CHECK-INST: ld1w { z21.s }, p5/z, [sp, x21, lsl #2] // CHECK-ENCODING: [0xf5,0x57,0x55,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 57 55 a5 ld1w { z21.s }, p5/z, [x10, x21, lsl #2] // CHECK-INST: ld1w { z21.s }, p5/z, [x10, x21, lsl #2] // CHECK-ENCODING: [0x55,0x55,0x55,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 55 55 55 a5 ld1w { z23.d }, p3/z, [x13, x8, lsl #2] // CHECK-INST: ld1w { z23.d }, p3/z, [x13, x8, lsl #2] // CHECK-ENCODING: [0xb7,0x4d,0x68,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 4d 68 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld2b.s b/llvm/test/MC/AArch64/SVE/ld2b.s index 229935ae8fae2..a756b4afb5597 100644 --- a/llvm/test/MC/AArch64/SVE/ld2b.s +++ b/llvm/test/MC/AArch64/SVE/ld2b.s @@ -12,29 +12,29 @@ ld2b { z0.b, z1.b }, p0/z, [x0, x0] // CHECK-INST: ld2b { z0.b, z1.b }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0xc0,0x20,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 20 a4 ld2b { z5.b, z6.b }, p3/z, [x17, x16] // CHECK-INST: ld2b { z5.b, z6.b }, p3/z, [x17, x16] // CHECK-ENCODING: [0x25,0xce,0x30,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce 30 a4 ld2b { z0.b, z1.b }, p0/z, [x0] // CHECK-INST: ld2b { z0.b, z1.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x20,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 20 a4 ld2b { z23.b, z24.b }, p3/z, [x13, #-16, mul vl] // CHECK-INST: ld2b { z23.b, z24.b }, p3/z, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x28,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 28 a4 ld2b { z21.b, z22.b }, p5/z, [x10, #10, mul vl] // CHECK-INST: ld2b { z21.b, z22.b }, p5/z, [x10, #10, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x25,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 25 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld2d.s 
b/llvm/test/MC/AArch64/SVE/ld2d.s index 3ac0f6561740d..6142a3a664b11 100644 --- a/llvm/test/MC/AArch64/SVE/ld2d.s +++ b/llvm/test/MC/AArch64/SVE/ld2d.s @@ -12,29 +12,29 @@ ld2d { z0.d, z1.d }, p0/z, [x0, x0, lsl #3] // CHECK-INST: ld2d { z0.d, z1.d }, p0/z, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0xc0,0xa0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a0 a5 ld2d { z5.d, z6.d }, p3/z, [x17, x16, lsl #3] // CHECK-INST: ld2d { z5.d, z6.d }, p3/z, [x17, x16, lsl #3] // CHECK-ENCODING: [0x25,0xce,0xb0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce b0 a5 ld2d { z0.d, z1.d }, p0/z, [x0] // CHECK-INST: ld2d { z0.d, z1.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xa0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a0 a5 ld2d { z23.d, z24.d }, p3/z, [x13, #-16, mul vl] // CHECK-INST: ld2d { z23.d, z24.d }, p3/z, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xa8,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed a8 a5 ld2d { z21.d, z22.d }, p5/z, [x10, #10, mul vl] // CHECK-INST: ld2d { z21.d, z22.d }, p5/z, [x10, #10, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xa5,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 a5 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld2h.s b/llvm/test/MC/AArch64/SVE/ld2h.s index c8e1ff91388bc..f92efb7099235 100644 --- a/llvm/test/MC/AArch64/SVE/ld2h.s +++ b/llvm/test/MC/AArch64/SVE/ld2h.s @@ -12,29 +12,29 @@ ld2h { z0.h, z1.h }, p0/z, [x0, x0, lsl #1] // CHECK-INST: ld2h { z0.h, z1.h }, p0/z, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0xc0,0xa0,0xa4] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a0 a4 ld2h { z5.h, z6.h }, p3/z, [x17, x16, lsl #1] // CHECK-INST: ld2h { z5.h, z6.h }, p3/z, [x17, x16, lsl #1] // CHECK-ENCODING: [0x25,0xce,0xb0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce b0 a4 ld2h { z0.h, z1.h }, p0/z, [x0] // CHECK-INST: ld2h { z0.h, z1.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xa0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a0 a4 ld2h { z23.h, z24.h }, p3/z, [x13, #-16, mul vl] // CHECK-INST: ld2h { z23.h, z24.h }, p3/z, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xa8,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed a8 a4 ld2h { z21.h, z22.h }, p5/z, [x10, #10, mul vl] // CHECK-INST: ld2h { z21.h, z22.h }, p5/z, [x10, #10, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xa5,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 a5 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld2w.s b/llvm/test/MC/AArch64/SVE/ld2w.s index caf2d60a3911c..355071df8b53a 100644 --- a/llvm/test/MC/AArch64/SVE/ld2w.s +++ b/llvm/test/MC/AArch64/SVE/ld2w.s @@ -12,29 +12,29 @@ ld2w { z0.s, z1.s }, p0/z, [x0, x0, lsl #2] // CHECK-INST: ld2w { z0.s, z1.s }, p0/z, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0xc0,0x20,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 20 a5 ld2w { z5.s, z6.s }, p3/z, [x17, x16, lsl #2] // CHECK-INST: ld2w { z5.s, z6.s }, p3/z, [x17, x16, lsl #2] // CHECK-ENCODING: [0x25,0xce,0x30,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 25 ce 30 a5 ld2w { z0.s, z1.s }, p0/z, [x0] // CHECK-INST: ld2w { z0.s, z1.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x20,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 20 a5 ld2w { z23.s, z24.s }, p3/z, [x13, #-16, mul vl] // CHECK-INST: ld2w { z23.s, z24.s }, p3/z, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x28,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 28 a5 ld2w { z21.s, z22.s }, p5/z, [x10, #10, mul vl] // CHECK-INST: ld2w { z21.s, z22.s }, p5/z, [x10, #10, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x25,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 25 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld3b.s b/llvm/test/MC/AArch64/SVE/ld3b.s index 0e074c0e53c10..b16e7b0e3e60e 100644 --- a/llvm/test/MC/AArch64/SVE/ld3b.s +++ b/llvm/test/MC/AArch64/SVE/ld3b.s @@ -12,29 +12,29 @@ ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x0] // CHECK-INST: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0xc0,0x40,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 40 a4 ld3b { z5.b, z6.b, z7.b }, p3/z, [x17, x16] // CHECK-INST: ld3b { z5.b, z6.b, z7.b }, p3/z, [x17, x16] // CHECK-ENCODING: [0x25,0xce,0x50,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce 50 a4 ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] // CHECK-INST: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 a4 ld3b { z23.b, z24.b, z25.b }, p3/z, [x13, #-24, mul vl] // CHECK-INST: 
ld3b { z23.b, z24.b, z25.b }, p3/z, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x48,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 48 a4 ld3b { z21.b, z22.b, z23.b }, p5/z, [x10, #15, mul vl] // CHECK-INST: ld3b { z21.b, z22.b, z23.b }, p5/z, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x45,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 45 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld3d.s b/llvm/test/MC/AArch64/SVE/ld3d.s index 9f8a8ac746623..7840a3e0f8e4a 100644 --- a/llvm/test/MC/AArch64/SVE/ld3d.s +++ b/llvm/test/MC/AArch64/SVE/ld3d.s @@ -12,29 +12,29 @@ ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x0, lsl #3] // CHECK-INST: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0xc0,0xc0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 c0 a5 ld3d { z5.d, z6.d, z7.d }, p3/z, [x17, x16, lsl #3] // CHECK-INST: ld3d { z5.d, z6.d, z7.d }, p3/z, [x17, x16, lsl #3] // CHECK-ENCODING: [0x25,0xce,0xd0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce d0 a5 ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] // CHECK-INST: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xc0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 c0 a5 ld3d { z23.d, z24.d, z25.d }, p3/z, [x13, #-24, mul vl] // CHECK-INST: ld3d { z23.d, z24.d, z25.d }, p3/z, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xc8,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed c8 a5 ld3d { z21.d, z22.d, z23.d }, p5/z, [x10, #15, mul vl] // 
CHECK-INST: ld3d { z21.d, z22.d, z23.d }, p5/z, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xc5,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 c5 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld3h.s b/llvm/test/MC/AArch64/SVE/ld3h.s index 92cab32e41604..5304481ebc515 100644 --- a/llvm/test/MC/AArch64/SVE/ld3h.s +++ b/llvm/test/MC/AArch64/SVE/ld3h.s @@ -12,29 +12,29 @@ ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x0, lsl #1] // CHECK-INST: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0xc0,0xc0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 c0 a4 ld3h { z5.h, z6.h, z7.h }, p3/z, [x17, x16, lsl #1] // CHECK-INST: ld3h { z5.h, z6.h, z7.h }, p3/z, [x17, x16, lsl #1] // CHECK-ENCODING: [0x25,0xce,0xd0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce d0 a4 ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] // CHECK-INST: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xc0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 c0 a4 ld3h { z23.h, z24.h, z25.h }, p3/z, [x13, #-24, mul vl] // CHECK-INST: ld3h { z23.h, z24.h, z25.h }, p3/z, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xc8,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed c8 a4 ld3h { z21.h, z22.h, z23.h }, p5/z, [x10, #15, mul vl] // CHECK-INST: ld3h { z21.h, z22.h, z23.h }, p5/z, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xc5,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 c5 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld3w.s 
b/llvm/test/MC/AArch64/SVE/ld3w.s index 6c7170dcb05ea..46d514ab95aae 100644 --- a/llvm/test/MC/AArch64/SVE/ld3w.s +++ b/llvm/test/MC/AArch64/SVE/ld3w.s @@ -12,29 +12,29 @@ ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x0, lsl #2] // CHECK-INST: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0xc0,0x40,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 40 a5 ld3w { z5.s, z6.s, z7.s }, p3/z, [x17, x16, lsl #2] // CHECK-INST: ld3w { z5.s, z6.s, z7.s }, p3/z, [x17, x16, lsl #2] // CHECK-ENCODING: [0x25,0xce,0x50,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce 50 a5 ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] // CHECK-INST: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 a5 ld3w { z23.s, z24.s, z25.s }, p3/z, [x13, #-24, mul vl] // CHECK-INST: ld3w { z23.s, z24.s, z25.s }, p3/z, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x48,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 48 a5 ld3w { z21.s, z22.s, z23.s }, p5/z, [x10, #15, mul vl] // CHECK-INST: ld3w { z21.s, z22.s, z23.s }, p5/z, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x45,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 45 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld4b.s b/llvm/test/MC/AArch64/SVE/ld4b.s index c656bcdf26f21..768d366882988 100644 --- a/llvm/test/MC/AArch64/SVE/ld4b.s +++ b/llvm/test/MC/AArch64/SVE/ld4b.s @@ -12,29 +12,29 @@ ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x0] // CHECK-INST: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x0] // 
CHECK-ENCODING: [0x00,0xc0,0x60,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 60 a4 ld4b { z5.b, z6.b, z7.b, z8.b }, p3/z, [x17, x16] // CHECK-INST: ld4b { z5.b, z6.b, z7.b, z8.b }, p3/z, [x17, x16] // CHECK-ENCODING: [0x25,0xce,0x70,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce 70 a4 ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] // CHECK-INST: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x60,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 a4 ld4b { z23.b, z24.b, z25.b, z26.b }, p3/z, [x13, #-32, mul vl] // CHECK-INST: ld4b { z23.b, z24.b, z25.b, z26.b }, p3/z, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x68,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 68 a4 ld4b { z21.b, z22.b, z23.b, z24.b }, p5/z, [x10, #20, mul vl] // CHECK-INST: ld4b { z21.b, z22.b, z23.b, z24.b }, p5/z, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x65,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 65 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld4d.s b/llvm/test/MC/AArch64/SVE/ld4d.s index 46a35db517b5a..9e077d78613fb 100644 --- a/llvm/test/MC/AArch64/SVE/ld4d.s +++ b/llvm/test/MC/AArch64/SVE/ld4d.s @@ -12,29 +12,29 @@ ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x0, lsl #3] // CHECK-INST: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0xc0,0xe0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e0 a5 ld4d { z5.d, z6.d, z7.d, z8.d }, p3/z, [x17, x16, lsl #3] // CHECK-INST: 
ld4d { z5.d, z6.d, z7.d, z8.d }, p3/z, [x17, x16, lsl #3] // CHECK-ENCODING: [0x25,0xce,0xf0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce f0 a5 ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] // CHECK-INST: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xe0,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e0 a5 ld4d { z23.d, z24.d, z25.d, z26.d }, p3/z, [x13, #-32, mul vl] // CHECK-INST: ld4d { z23.d, z24.d, z25.d, z26.d }, p3/z, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xe8,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed e8 a5 ld4d { z21.d, z22.d, z23.d, z24.d }, p5/z, [x10, #20, mul vl] // CHECK-INST: ld4d { z21.d, z22.d, z23.d, z24.d }, p5/z, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xe5,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 e5 a5 diff --git a/llvm/test/MC/AArch64/SVE/ld4h.s b/llvm/test/MC/AArch64/SVE/ld4h.s index c173ed140b2a0..db73e6c825f1b 100644 --- a/llvm/test/MC/AArch64/SVE/ld4h.s +++ b/llvm/test/MC/AArch64/SVE/ld4h.s @@ -12,29 +12,29 @@ ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x0, lsl #1] // CHECK-INST: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0xc0,0xe0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e0 a4 ld4h { z5.h, z6.h, z7.h, z8.h }, p3/z, [x17, x16, lsl #1] // CHECK-INST: ld4h { z5.h, z6.h, z7.h, z8.h }, p3/z, [x17, x16, lsl #1] // CHECK-ENCODING: [0x25,0xce,0xf0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce f0 
a4 ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] // CHECK-INST: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0xe0,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e0 a4 ld4h { z23.h, z24.h, z25.h, z26.h }, p3/z, [x13, #-32, mul vl] // CHECK-INST: ld4h { z23.h, z24.h, z25.h, z26.h }, p3/z, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xe8,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed e8 a4 ld4h { z21.h, z22.h, z23.h, z24.h }, p5/z, [x10, #20, mul vl] // CHECK-INST: ld4h { z21.h, z22.h, z23.h, z24.h }, p5/z, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xe5,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 e5 a4 diff --git a/llvm/test/MC/AArch64/SVE/ld4w.s b/llvm/test/MC/AArch64/SVE/ld4w.s index 29092e710d747..130309c44f8d7 100644 --- a/llvm/test/MC/AArch64/SVE/ld4w.s +++ b/llvm/test/MC/AArch64/SVE/ld4w.s @@ -12,29 +12,29 @@ ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x0, lsl #2] // CHECK-INST: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0xc0,0x60,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 60 a5 ld4w { z5.s, z6.s, z7.s, z8.s }, p3/z, [x17, x16, lsl #2] // CHECK-INST: ld4w { z5.s, z6.s, z7.s, z8.s }, p3/z, [x17, x16, lsl #2] // CHECK-ENCODING: [0x25,0xce,0x70,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 ce 70 a5 ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] // CHECK-INST: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x60,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 a5 ld4w { z23.s, z24.s, z25.s, z26.s }, p3/z, [x13, #-32, mul vl] // CHECK-INST: ld4w { z23.s, z24.s, z25.s, z26.s }, p3/z, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x68,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 68 a5 ld4w { z21.s, z22.s, z23.s, z24.s }, p5/z, [x10, #20, mul vl] // CHECK-INST: ld4w { z21.s, z22.s, z23.s, z24.s }, p5/z, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x65,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 65 a5 diff --git a/llvm/test/MC/AArch64/SVE/ldnt1b.s b/llvm/test/MC/AArch64/SVE/ldnt1b.s index 0ad86883b3c9c..55fd4139198f2 100644 --- a/llvm/test/MC/AArch64/SVE/ldnt1b.s +++ b/llvm/test/MC/AArch64/SVE/ldnt1b.s @@ -12,29 +12,29 @@ ldnt1b z0.b, p0/z, [x0] // CHECK-INST: ldnt1b { z0.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 00 a4 ldnt1b { z0.b }, p0/z, [x0] // CHECK-INST: ldnt1b { z0.b }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 00 a4 ldnt1b { z23.b }, p3/z, [x13, #-8, mul vl] // CHECK-INST: ldnt1b { z23.b }, p3/z, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x08,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 08 a4 ldnt1b { z21.b }, p5/z, [x10, #7, mul vl] // CHECK-INST: ldnt1b { z21.b }, p5/z, [x10, #7, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x07,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 07 a4 ldnt1b { z0.b }, 
p0/z, [x0, x0] // CHECK-INST: ldnt1b { z0.b }, p0/z, [x0, x0] // CHECK-ENCODING: [0x00,0xc0,0x00,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 00 a4 diff --git a/llvm/test/MC/AArch64/SVE/ldnt1d.s b/llvm/test/MC/AArch64/SVE/ldnt1d.s index 7e881b6ef43a6..627bec9917072 100644 --- a/llvm/test/MC/AArch64/SVE/ldnt1d.s +++ b/llvm/test/MC/AArch64/SVE/ldnt1d.s @@ -12,29 +12,29 @@ ldnt1d z0.d, p0/z, [x0] // CHECK-INST: ldnt1d { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x80,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 80 a5 ldnt1d { z0.d }, p0/z, [x0] // CHECK-INST: ldnt1d { z0.d }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x80,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 80 a5 ldnt1d { z23.d }, p3/z, [x13, #-8, mul vl] // CHECK-INST: ldnt1d { z23.d }, p3/z, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x88,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 88 a5 ldnt1d { z21.d }, p5/z, [x10, #7, mul vl] // CHECK-INST: ldnt1d { z21.d }, p5/z, [x10, #7, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x87,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 87 a5 ldnt1d { z0.d }, p0/z, [x0, x0, lsl #3] // CHECK-INST: ldnt1d { z0.d }, p0/z, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0xc0,0x80,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 80 a5 diff --git a/llvm/test/MC/AArch64/SVE/ldnt1h.s b/llvm/test/MC/AArch64/SVE/ldnt1h.s index 139fd5257cad7..51e597ae1ed44 100644 --- a/llvm/test/MC/AArch64/SVE/ldnt1h.s +++ 
b/llvm/test/MC/AArch64/SVE/ldnt1h.s @@ -12,29 +12,29 @@ ldnt1h z0.h, p0/z, [x0] // CHECK-INST: ldnt1h { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x80,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 80 a4 ldnt1h { z0.h }, p0/z, [x0] // CHECK-INST: ldnt1h { z0.h }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x80,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 80 a4 ldnt1h { z23.h }, p3/z, [x13, #-8, mul vl] // CHECK-INST: ldnt1h { z23.h }, p3/z, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x88,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 88 a4 ldnt1h { z21.h }, p5/z, [x10, #7, mul vl] // CHECK-INST: ldnt1h { z21.h }, p5/z, [x10, #7, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x87,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 87 a4 ldnt1h { z0.h }, p0/z, [x0, x0, lsl #1] // CHECK-INST: ldnt1h { z0.h }, p0/z, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0xc0,0x80,0xa4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 80 a4 diff --git a/llvm/test/MC/AArch64/SVE/ldnt1w.s b/llvm/test/MC/AArch64/SVE/ldnt1w.s index f0e10ee10c82c..be21f828ca763 100644 --- a/llvm/test/MC/AArch64/SVE/ldnt1w.s +++ b/llvm/test/MC/AArch64/SVE/ldnt1w.s @@ -12,29 +12,29 @@ ldnt1w z0.s, p0/z, [x0] // CHECK-INST: ldnt1w { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x00,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 00 a5 ldnt1w { z0.s }, p0/z, [x0] // CHECK-INST: ldnt1w { z0.s }, p0/z, [x0] // CHECK-ENCODING: [0x00,0xe0,0x00,0xa5] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 00 a5 ldnt1w { z23.s }, p3/z, [x13, #-8, mul vl] // CHECK-INST: ldnt1w { z23.s }, p3/z, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x08,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 08 a5 ldnt1w { z21.s }, p5/z, [x10, #7, mul vl] // CHECK-INST: ldnt1w { z21.s }, p5/z, [x10, #7, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x07,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 07 a5 ldnt1w { z0.s }, p0/z, [x0, x0, lsl #2] // CHECK-INST: ldnt1w { z0.s }, p0/z, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0xc0,0x00,0xa5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 00 a5 diff --git a/llvm/test/MC/AArch64/SVE/ldr.s b/llvm/test/MC/AArch64/SVE/ldr.s index 1e86e0056f896..857e9f37c6fc4 100644 --- a/llvm/test/MC/AArch64/SVE/ldr.s +++ b/llvm/test/MC/AArch64/SVE/ldr.s @@ -12,35 +12,35 @@ ldr z0, [x0] // CHECK-INST: ldr z0, [x0] // CHECK-ENCODING: [0x00,0x40,0x80,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 80 85 ldr z31, [sp, #-256, mul vl] // CHECK-INST: ldr z31, [sp, #-256, mul vl] // CHECK-ENCODING: [0xff,0x43,0xa0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 43 a0 85 ldr z23, [x13, #255, mul vl] // CHECK-INST: ldr z23, [x13, #255, mul vl] // CHECK-ENCODING: [0xb7,0x5d,0x9f,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 5d 9f 85 ldr p0, [x0] // CHECK-INST: ldr p0, [x0] // CHECK-ENCODING: [0x00,0x00,0x80,0x85] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 80 85 ldr p7, [x13, #-256, mul vl] // CHECK-INST: ldr p7, [x13, #-256, mul vl] // CHECK-ENCODING: [0xa7,0x01,0xa0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 01 a0 85 ldr p5, [x10, #255, mul vl] // CHECK-INST: ldr p5, [x10, #255, mul vl] // CHECK-ENCODING: [0x45,0x1d,0x9f,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 45 1d 9f 85 diff --git a/llvm/test/MC/AArch64/SVE/lsl.s b/llvm/test/MC/AArch64/SVE/lsl.s index b95d973949ec2..3befa0bd5c164 100644 --- a/llvm/test/MC/AArch64/SVE/lsl.s +++ b/llvm/test/MC/AArch64/SVE/lsl.s @@ -12,157 +12,157 @@ lsl z0.b, z0.b, #0 // CHECK-INST: lsl z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0x9c,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 9c 28 04 lsl z31.b, z31.b, #7 // CHECK-INST: lsl z31.b, z31.b, #7 // CHECK-ENCODING: [0xff,0x9f,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 2f 04 lsl z0.h, z0.h, #0 // CHECK-INST: lsl z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0x9c,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 9c 30 04 lsl z31.h, z31.h, #15 // CHECK-INST: lsl z31.h, z31.h, #15 // CHECK-ENCODING: [0xff,0x9f,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 3f 04 lsl z0.s, z0.s, #0 // CHECK-INST: lsl z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0x9c,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: 00 9c 60 04 lsl z31.s, z31.s, #31 // CHECK-INST: lsl z31.s, z31.s, #31 // CHECK-ENCODING: [0xff,0x9f,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 7f 04 lsl z0.d, z0.d, #0 // CHECK-INST: lsl z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0x9c,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 9c a0 04 lsl z31.d, z31.d, #63 // CHECK-INST: lsl z31.d, z31.d, #63 // CHECK-ENCODING: [0xff,0x9f,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f ff 04 lsl z0.b, p0/m, z0.b, #0 // CHECK-INST: lsl z0.b, p0/m, z0.b, #0 // CHECK-ENCODING: [0x00,0x81,0x03,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 81 03 04 lsl z31.b, p0/m, z31.b, #7 // CHECK-INST: lsl z31.b, p0/m, z31.b, #7 // CHECK-ENCODING: [0xff,0x81,0x03,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 81 03 04 lsl z0.h, p0/m, z0.h, #0 // CHECK-INST: lsl z0.h, p0/m, z0.h, #0 // CHECK-ENCODING: [0x00,0x82,0x03,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 82 03 04 lsl z31.h, p0/m, z31.h, #15 // CHECK-INST: lsl z31.h, p0/m, z31.h, #15 // CHECK-ENCODING: [0xff,0x83,0x03,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 83 03 04 lsl z0.s, p0/m, z0.s, #0 // CHECK-INST: lsl z0.s, p0/m, z0.s, #0 // CHECK-ENCODING: [0x00,0x80,0x43,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 43 04 lsl 
z31.s, p0/m, z31.s, #31 // CHECK-INST: lsl z31.s, p0/m, z31.s, #31 // CHECK-ENCODING: [0xff,0x83,0x43,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 83 43 04 lsl z0.d, p0/m, z0.d, #0 // CHECK-INST: lsl z0.d, p0/m, z0.d, #0 // CHECK-ENCODING: [0x00,0x80,0x83,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 83 04 lsl z31.d, p0/m, z31.d, #63 // CHECK-INST: lsl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 83 c3 04 lsl z0.b, p0/m, z0.b, z0.b // CHECK-INST: lsl z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x80,0x13,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 13 04 lsl z0.h, p0/m, z0.h, z0.h // CHECK-INST: lsl z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x80,0x53,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 53 04 lsl z0.s, p0/m, z0.s, z0.s // CHECK-INST: lsl z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x80,0x93,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 93 04 lsl z0.d, p0/m, z0.d, z0.d // CHECK-INST: lsl z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x80,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d3 04 lsl z0.b, p0/m, z0.b, z1.d // CHECK-INST: lsl z0.b, p0/m, z0.b, z1.d // CHECK-ENCODING: [0x20,0x80,0x1b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 1b 04 lsl 
z0.h, p0/m, z0.h, z1.d // CHECK-INST: lsl z0.h, p0/m, z0.h, z1.d // CHECK-ENCODING: [0x20,0x80,0x5b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 5b 04 lsl z0.s, p0/m, z0.s, z1.d // CHECK-INST: lsl z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x9b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 9b 04 lsl z0.b, z1.b, z2.d // CHECK-INST: lsl z0.b, z1.b, z2.d // CHECK-ENCODING: [0x20,0x8c,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 8c 22 04 lsl z0.h, z1.h, z2.d // CHECK-INST: lsl z0.h, z1.h, z2.d // CHECK-ENCODING: [0x20,0x8c,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 8c 62 04 lsl z0.s, z1.s, z2.d // CHECK-INST: lsl z0.s, z1.s, z2.d // CHECK-ENCODING: [0x20,0x8c,0xa2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 8c a2 04 @@ -172,47 +172,47 @@ lsl z0.s, z1.s, z2.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 lsl z31.d, p0/m, z31.d, #63 // CHECK-INST: lsl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 83 c3 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 lsl z31.d, p0/m, 
z31.d, #63 // CHECK-INST: lsl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 83 c3 04 movprfx z0.s, p0/z, z7.s // CHECK-INST: movprfx z0.s, p0/z, z7.s // CHECK-ENCODING: [0xe0,0x20,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 20 90 04 lsl z0.s, p0/m, z0.s, z1.d // CHECK-INST: lsl z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x9b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 9b 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 lsl z0.s, p0/m, z0.s, z1.d // CHECK-INST: lsl z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x9b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 9b 04 diff --git a/llvm/test/MC/AArch64/SVE/lslr.s b/llvm/test/MC/AArch64/SVE/lslr.s index e53e09eaa0c9b..5fbb8d62f7d11 100644 --- a/llvm/test/MC/AArch64/SVE/lslr.s +++ b/llvm/test/MC/AArch64/SVE/lslr.s @@ -12,25 +12,25 @@ lslr z0.b, p0/m, z0.b, z0.b // CHECK-INST: lslr z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x80,0x17,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 17 04 lslr z0.h, p0/m, z0.h, z0.h // CHECK-INST: lslr z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x80,0x57,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 57 04 lslr z0.s, p0/m, z0.s, z0.s // CHECK-INST: lslr z0.s, p0/m, z0.s, z0.s 
// CHECK-ENCODING: [0x00,0x80,0x97,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 97 04 lslr z0.d, p0/m, z0.d, z0.d // CHECK-INST: lslr z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x80,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d7 04 @@ -40,23 +40,23 @@ lslr z0.d, p0/m, z0.d, z0.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 lslr z5.d, p0/m, z5.d, z0.d // CHECK-INST: lslr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x80,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 80 d7 04 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 lslr z5.d, p0/m, z5.d, z0.d // CHECK-INST: lslr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x80,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 80 d7 04 diff --git a/llvm/test/MC/AArch64/SVE/lsr.s b/llvm/test/MC/AArch64/SVE/lsr.s index d0706c8fcb53e..d0edca12d5517 100644 --- a/llvm/test/MC/AArch64/SVE/lsr.s +++ b/llvm/test/MC/AArch64/SVE/lsr.s @@ -12,157 +12,157 @@ lsr z0.b, z0.b, #1 // CHECK-INST: lsr z0.b, z0.b, #1 // CHECK-ENCODING: [0x00,0x94,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 94 2f 04 lsr z31.b, z31.b, #8 // CHECK-INST: lsr z31.b, z31.b, #8 // CHECK-ENCODING: 
[0xff,0x97,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 97 28 04 lsr z0.h, z0.h, #1 // CHECK-INST: lsr z0.h, z0.h, #1 // CHECK-ENCODING: [0x00,0x94,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 94 3f 04 lsr z31.h, z31.h, #16 // CHECK-INST: lsr z31.h, z31.h, #16 // CHECK-ENCODING: [0xff,0x97,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 97 30 04 lsr z0.s, z0.s, #1 // CHECK-INST: lsr z0.s, z0.s, #1 // CHECK-ENCODING: [0x00,0x94,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 94 7f 04 lsr z31.s, z31.s, #32 // CHECK-INST: lsr z31.s, z31.s, #32 // CHECK-ENCODING: [0xff,0x97,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 97 60 04 lsr z0.d, z0.d, #1 // CHECK-INST: lsr z0.d, z0.d, #1 // CHECK-ENCODING: [0x00,0x94,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 94 ff 04 lsr z31.d, z31.d, #64 // CHECK-INST: lsr z31.d, z31.d, #64 // CHECK-ENCODING: [0xff,0x97,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 97 a0 04 lsr z0.b, p0/m, z0.b, #1 // CHECK-INST: lsr z0.b, p0/m, z0.b, #1 // CHECK-ENCODING: [0xe0,0x81,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 81 01 04 lsr z31.b, p0/m, z31.b, #8 // CHECK-INST: lsr z31.b, p0/m, z31.b, #8 // CHECK-ENCODING: [0x1f,0x81,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 81 01 04 lsr z0.h, p0/m, z0.h, #1 // CHECK-INST: lsr z0.h, p0/m, z0.h, #1 // CHECK-ENCODING: [0xe0,0x83,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 01 04 lsr z31.h, p0/m, z31.h, #16 // CHECK-INST: lsr z31.h, p0/m, z31.h, #16 // CHECK-ENCODING: [0x1f,0x82,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 82 01 04 lsr z0.s, p0/m, z0.s, #1 // CHECK-INST: lsr z0.s, p0/m, z0.s, #1 // CHECK-ENCODING: [0xe0,0x83,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 41 04 lsr z31.s, p0/m, z31.s, #32 // CHECK-INST: lsr z31.s, p0/m, z31.s, #32 // CHECK-ENCODING: [0x1f,0x80,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 41 04 lsr z0.d, p0/m, z0.d, #1 // CHECK-INST: lsr z0.d, p0/m, z0.d, #1 // CHECK-ENCODING: [0xe0,0x83,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 83 c1 04 lsr z31.d, p0/m, z31.d, #64 // CHECK-INST: lsr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 81 04 lsr z0.b, p0/m, z0.b, z0.b // CHECK-INST: lsr z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x80,0x11,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 11 04 lsr z0.h, p0/m, z0.h, z0.h // CHECK-INST: lsr z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x80,0x51,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 51 04 lsr z0.s, p0/m, z0.s, z0.s // CHECK-INST: lsr z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x80,0x91,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 91 04 lsr z0.d, p0/m, z0.d, z0.d // CHECK-INST: lsr z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x80,0xd1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d1 04 lsr z0.b, p0/m, z0.b, z1.d // CHECK-INST: lsr z0.b, p0/m, z0.b, z1.d // CHECK-ENCODING: [0x20,0x80,0x19,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 19 04 lsr z0.h, p0/m, z0.h, z1.d // CHECK-INST: lsr z0.h, p0/m, z0.h, z1.d // CHECK-ENCODING: [0x20,0x80,0x59,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 59 04 lsr z0.s, p0/m, z0.s, z1.d // CHECK-INST: lsr z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x99,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 99 04 lsr z0.b, z1.b, z2.d // CHECK-INST: lsr z0.b, z1.b, z2.d // CHECK-ENCODING: [0x20,0x84,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 84 22 04 lsr z0.h, z1.h, z2.d // CHECK-INST: lsr z0.h, z1.h, z2.d // CHECK-ENCODING: [0x20,0x84,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 84 62 04 lsr z0.s, z1.s, z2.d // CHECK-INST: lsr z0.s, z1.s, z2.d // CHECK-ENCODING: [0x20,0x84,0xa2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: 
sve or sme // CHECK-UNKNOWN: 20 84 a2 04 @@ -172,47 +172,47 @@ lsr z0.s, z1.s, z2.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 lsr z31.d, p0/m, z31.d, #64 // CHECK-INST: lsr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 81 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 lsr z31.d, p0/m, z31.d, #64 // CHECK-INST: lsr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 80 81 04 movprfx z0.s, p0/z, z7.s // CHECK-INST: movprfx z0.s, p0/z, z7.s // CHECK-ENCODING: [0xe0,0x20,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 20 90 04 lsr z0.s, p0/m, z0.s, z1.d // CHECK-INST: lsr z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x99,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 80 99 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 lsr z0.s, p0/m, z0.s, z1.d // CHECK-INST: lsr z0.s, p0/m, z0.s, z1.d // CHECK-ENCODING: [0x20,0x80,0x99,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: 20 80 99 04 diff --git a/llvm/test/MC/AArch64/SVE/lsrr.s b/llvm/test/MC/AArch64/SVE/lsrr.s index 9ab593abd9162..b5d4fca5382ba 100644 --- a/llvm/test/MC/AArch64/SVE/lsrr.s +++ b/llvm/test/MC/AArch64/SVE/lsrr.s @@ -12,25 +12,25 @@ lsrr z0.b, p0/m, z0.b, z0.b // CHECK-INST: lsrr z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x80,0x15,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 15 04 lsrr z0.h, p0/m, z0.h, z0.h // CHECK-INST: lsrr z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x80,0x55,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 55 04 lsrr z0.s, p0/m, z0.s, z0.s // CHECK-INST: lsrr z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x80,0x95,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 95 04 lsrr z0.d, p0/m, z0.d, z0.d // CHECK-INST: lsrr z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x80,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 d5 04 @@ -40,23 +40,23 @@ lsrr z0.d, p0/m, z0.d, z0.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 lsrr z5.d, p0/m, z5.d, z0.d // CHECK-INST: lsrr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x80,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 80 d5 04 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: e5 bc 20 04 lsrr z5.d, p0/m, z5.d, z0.d // CHECK-INST: lsrr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x80,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 80 d5 04 diff --git a/llvm/test/MC/AArch64/SVE/mad.s b/llvm/test/MC/AArch64/SVE/mad.s index 417c8561f4dbf..a92e26262b687 100644 --- a/llvm/test/MC/AArch64/SVE/mad.s +++ b/llvm/test/MC/AArch64/SVE/mad.s @@ -12,25 +12,25 @@ mad z0.b, p7/m, z1.b, z31.b // CHECK-INST: mad z0.b, p7/m, z1.b, z31.b // CHECK-ENCODING: [0xe0,0xdf,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df 01 04 mad z0.h, p7/m, z1.h, z31.h // CHECK-INST: mad z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0xe0,0xdf,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df 41 04 mad z0.s, p7/m, z1.s, z31.s // CHECK-INST: mad z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0xe0,0xdf,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df 81 04 mad z0.d, p7/m, z1.d, z31.d // CHECK-INST: mad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0xe0,0xdf,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df c1 04 @@ -40,23 +40,23 @@ mad z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 mad z0.d, p7/m, z1.d, z31.d // CHECK-INST: mad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0xe0,0xdf,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: e0 df c1 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 mad z0.d, p7/m, z1.d, z31.d // CHECK-INST: mad z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0xe0,0xdf,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 df c1 04 diff --git a/llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s b/llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s index 9477199742edb..b80398076ab89 100644 --- a/llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s +++ b/llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s @@ -239,13 +239,13 @@ ld1rod z0.d, p1/z, [x2, x3, lsl #3] zip1 z0.q, z1.q, z2.q // CHECK-INST: zip1 z0.q, z1.q, z2.q // CHECK-ENCODING: [0x20,0x00,0xa2,0x05] -// CHECK-ERROR: instruction requires: f64mm streaming-sve +// CHECK-ERROR: instruction requires: f64mm sve or sme // CHECK-UNKNOWN: 20 00 a2 05 zip2 z0.q, z1.q, z2.q // CHECK-INST: zip2 z0.q, z1.q, z2.q // CHECK-ENCODING: [0x20,0x04,0xa2,0x05] -// CHECK-ERROR: instruction requires: f64mm streaming-sve +// CHECK-ERROR: instruction requires: f64mm sve or sme // CHECK-UNKNOWN: 20 04 a2 05 @@ -255,13 +255,13 @@ zip2 z0.q, z1.q, z2.q uzp1 z0.q, z1.q, z2.q // CHECK-INST: uzp1 z0.q, z1.q, z2.q // CHECK-ENCODING: [0x20,0x08,0xa2,0x05] -// CHECK-ERROR: instruction requires: f64mm streaming-sve +// CHECK-ERROR: instruction requires: f64mm sve or sme // CHECK-UNKNOWN: 20 08 a2 05 uzp2 z0.q, z1.q, z2.q // CHECK-INST: uzp2 z0.q, z1.q, z2.q // CHECK-ENCODING: [0x20,0x0c,0xa2,0x05] -// CHECK-ERROR: instruction requires: f64mm streaming-sve +// CHECK-ERROR: instruction requires: f64mm sve or sme // CHECK-UNKNOWN: 20 0c a2 05 @@ -271,11 +271,11 @@ uzp2 z0.q, z1.q, z2.q trn1 z0.q, z1.q, z2.q // CHECK-INST: trn1 z0.q, z1.q, z2.q // 
CHECK-ENCODING: [0x20,0x18,0xa2,0x05] -// CHECK-ERROR: instruction requires: f64mm streaming-sve +// CHECK-ERROR: instruction requires: f64mm sve or sme // CHECK-UNKNOWN: 20 18 a2 05 trn2 z0.q, z1.q, z2.q // CHECK-INST: trn2 z0.q, z1.q, z2.q // CHECK-ENCODING: [0x20,0x1c,0xa2,0x05] -// CHECK-ERROR: instruction requires: f64mm streaming-sve +// CHECK-ERROR: instruction requires: f64mm sve or sme // CHECK-UNKNOWN: 20 1c a2 05 diff --git a/llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s b/llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s index ef0260a8d06f0..04152bd37aeb4 100644 --- a/llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s +++ b/llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s @@ -72,7 +72,7 @@ usmmla z0.s, z1.b, z2.b usdot z0.s, z1.b, z2.b // CHECK-INST: usdot z0.s, z1.b, z2.b // CHECK-ENCODING: [0x20,0x78,0x82,0x44] -// CHECK-ERROR: instruction requires: i8mm streaming-sve +// CHECK-ERROR: instruction requires: i8mm sve or sme // CHECK-UNKNOWN: 20 78 82 44 // Test compatibility with MOVPRFX instruction. @@ -85,7 +85,7 @@ movprfx z0, z7 usdot z0.s, z1.b, z2.b // CHECK-INST: usdot z0.s, z1.b, z2.b // CHECK-ENCODING: [0x20,0x78,0x82,0x44] -// CHECK-ERROR: instruction requires: i8mm streaming-sve +// CHECK-ERROR: instruction requires: i8mm sve or sme // CHECK-UNKNOWN: 20 78 82 44 @@ -95,13 +95,13 @@ usdot z0.s, z1.b, z2.b usdot z0.s, z1.b, z2.b[0] // CHECK-INST: usdot z0.s, z1.b, z2.b[0] // CHECK-ENCODING: [0x20,0x18,0xa2,0x44] -// CHECK-ERROR: instruction requires: i8mm streaming-sve +// CHECK-ERROR: instruction requires: i8mm sve or sme // CHECK-UNKNOWN: 20 18 a2 44 sudot z0.s, z1.b, z2.b[3] // CHECK-INST: sudot z0.s, z1.b, z2.b[3] // CHECK-ENCODING: [0x20,0x1c,0xba,0x44] -// CHECK-ERROR: instruction requires: i8mm streaming-sve +// CHECK-ERROR: instruction requires: i8mm sve or sme // CHECK-UNKNOWN: 20 1c ba 44 // Test compatibility with MOVPRFX instruction. 
@@ -114,7 +114,7 @@ movprfx z0, z7 usdot z0.s, z1.b, z2.b[0] // CHECK-INST: usdot z0.s, z1.b, z2.b[0] // CHECK-ENCODING: [0x20,0x18,0xa2,0x44] -// CHECK-ERROR: instruction requires: i8mm streaming-sve +// CHECK-ERROR: instruction requires: i8mm sve or sme // CHECK-UNKNOWN: 20 18 a2 44 movprfx z0, z7 @@ -125,5 +125,5 @@ movprfx z0, z7 sudot z0.s, z1.b, z2.b[0] // CHECK-INST: sudot z0.s, z1.b, z2.b[0] // CHECK-ENCODING: [0x20,0x1c,0xa2,0x44] -// CHECK-ERROR: instruction requires: i8mm streaming-sve +// CHECK-ERROR: instruction requires: i8mm sve or sme // CHECK-UNKNOWN: 20 1c a2 44 diff --git a/llvm/test/MC/AArch64/SVE/mla.s b/llvm/test/MC/AArch64/SVE/mla.s index 99ec77dd1c403..5d21bb92b3402 100644 --- a/llvm/test/MC/AArch64/SVE/mla.s +++ b/llvm/test/MC/AArch64/SVE/mla.s @@ -12,25 +12,25 @@ mla z0.b, p7/m, z1.b, z31.b // CHECK-INST: mla z0.b, p7/m, z1.b, z31.b // CHECK-ENCODING: [0x20,0x5c,0x1f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c 1f 04 mla z0.h, p7/m, z1.h, z31.h // CHECK-INST: mla z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0x5c,0x5f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c 5f 04 mla z0.s, p7/m, z1.s, z31.s // CHECK-INST: mla z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0x5c,0x9f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c 9f 04 mla z0.d, p7/m, z1.d, z31.d // CHECK-INST: mla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x5c,0xdf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c df 04 @@ -40,23 +40,23 @@ mla z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 mla z0.d, p7/m, z1.d, z31.d // CHECK-INST: mla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x5c,0xdf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c df 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 mla z0.d, p7/m, z1.d, z31.d // CHECK-INST: mla z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x5c,0xdf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 5c df 04 diff --git a/llvm/test/MC/AArch64/SVE/mls.s b/llvm/test/MC/AArch64/SVE/mls.s index dade119e9dc9a..3068c2da049cd 100644 --- a/llvm/test/MC/AArch64/SVE/mls.s +++ b/llvm/test/MC/AArch64/SVE/mls.s @@ -12,25 +12,25 @@ mls z0.b, p7/m, z1.b, z31.b // CHECK-INST: mls z0.b, p7/m, z1.b, z31.b // CHECK-ENCODING: [0x20,0x7c,0x1f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c 1f 04 mls z0.h, p7/m, z1.h, z31.h // CHECK-INST: mls z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: [0x20,0x7c,0x5f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c 5f 04 mls z0.s, p7/m, z1.s, z31.s // CHECK-INST: mls z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0x20,0x7c,0x9f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c 9f 04 mls z0.d, p7/m, z1.d, z31.d // CHECK-INST: mls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x7c,0xdf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c df 04 @@ -40,23 +40,23 @@ mls z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 mls z0.d, p7/m, z1.d, z31.d // CHECK-INST: mls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x7c,0xdf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c df 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 mls z0.d, p7/m, z1.d, z31.d // CHECK-INST: mls z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0x20,0x7c,0xdf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 7c df 04 diff --git a/llvm/test/MC/AArch64/SVE/mov.s b/llvm/test/MC/AArch64/SVE/mov.s index 72b25a447e341..f6af81375d9d8 100644 --- a/llvm/test/MC/AArch64/SVE/mov.s +++ b/llvm/test/MC/AArch64/SVE/mov.s @@ -12,367 +12,367 @@ mov z0.b, w0 // CHECK-INST: mov z0.b, w0 // CHECK-ENCODING: [0x00,0x38,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 20 05 mov z0.h, w0 // CHECK-INST: mov z0.h, w0 // CHECK-ENCODING: [0x00,0x38,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 60 05 mov z0.s, w0 // CHECK-INST: mov z0.s, w0 // CHECK-ENCODING: [0x00,0x38,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 a0 05 mov z0.d, x0 // CHECK-INST: mov 
z0.d, x0 // CHECK-ENCODING: [0x00,0x38,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 38 e0 05 mov z31.h, wsp // CHECK-INST: mov z31.h, wsp // CHECK-ENCODING: [0xff,0x3b,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 60 05 mov z31.s, wsp // CHECK-INST: mov z31.s, wsp // CHECK-ENCODING: [0xff,0x3b,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b a0 05 mov z31.d, sp // CHECK-INST: mov z31.d, sp // CHECK-ENCODING: [0xff,0x3b,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b e0 05 mov z31.b, wsp // CHECK-INST: mov z31.b, wsp // CHECK-ENCODING: [0xff,0x3b,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 20 05 mov z0.d, z0.d // CHECK-INST: mov z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 60 04 mov z31.d, z0.d // CHECK-INST: mov z31.d, z0.d // CHECK-ENCODING: [0x1f,0x30,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 30 60 04 mov z5.b, #-128 // CHECK-INST: mov z5.b, #-128 // CHECK-ENCODING: [0x05,0xd0,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 d0 38 25 mov z5.b, #127 // CHECK-INST: mov z5.b, #127 // CHECK-ENCODING: [0xe5,0xcf,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 cf 38 25 
mov z5.b, #255 // CHECK-INST: mov z5.b, #-1 // CHECK-ENCODING: [0xe5,0xdf,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 df 38 25 mov z21.h, #-128 // CHECK-INST: mov z21.h, #-128 // CHECK-ENCODING: [0x15,0xd0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 d0 78 25 mov z21.h, #-128, lsl #8 // CHECK-INST: mov z21.h, #-32768 // CHECK-ENCODING: [0x15,0xf0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 78 25 mov z21.h, #-32768 // CHECK-INST: mov z21.h, #-32768 // CHECK-ENCODING: [0x15,0xf0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 78 25 mov z21.h, #127 // CHECK-INST: mov z21.h, #127 // CHECK-ENCODING: [0xf5,0xcf,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 cf 78 25 mov z21.h, #127, lsl #8 // CHECK-INST: mov z21.h, #32512 // CHECK-ENCODING: [0xf5,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef 78 25 mov z21.h, #32512 // CHECK-INST: mov z21.h, #32512 // CHECK-ENCODING: [0xf5,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef 78 25 mov z21.s, #-128 // CHECK-INST: mov z21.s, #-128 // CHECK-ENCODING: [0x15,0xd0,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 d0 b8 25 mov z21.s, #-128, lsl #8 // CHECK-INST: mov z21.s, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xb8,0x25] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 b8 25 mov z21.s, #-32768 // CHECK-INST: mov z21.s, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 b8 25 mov z21.s, #127 // CHECK-INST: mov z21.s, #127 // CHECK-ENCODING: [0xf5,0xcf,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 cf b8 25 mov z21.s, #127, lsl #8 // CHECK-INST: mov z21.s, #32512 // CHECK-ENCODING: [0xf5,0xef,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef b8 25 mov z21.s, #32512 // CHECK-INST: mov z21.s, #32512 // CHECK-ENCODING: [0xf5,0xef,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef b8 25 mov z21.d, #-128 // CHECK-INST: mov z21.d, #-128 // CHECK-ENCODING: [0x15,0xd0,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 d0 f8 25 mov z21.d, #-128, lsl #8 // CHECK-INST: mov z21.d, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 f8 25 mov z21.d, #-32768 // CHECK-INST: mov z21.d, #-32768 // CHECK-ENCODING: [0x15,0xf0,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 f0 f8 25 mov z21.d, #127 // CHECK-INST: mov z21.d, #127 // CHECK-ENCODING: [0xf5,0xcf,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 cf f8 25 mov z21.d, #127, lsl #8 // 
CHECK-INST: mov z21.d, #32512 // CHECK-ENCODING: [0xf5,0xef,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef f8 25 mov z21.d, #32512 // CHECK-INST: mov z21.d, #32512 // CHECK-ENCODING: [0xf5,0xef,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 ef f8 25 mov z0.h, #32768 // CHECK-INST: mov z0.h, #-32768 // CHECK-ENCODING: [0x00,0xf0,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 78 25 mov z0.h, #65280 // CHECK-INST: mov z0.h, #-256 // CHECK-ENCODING: [0xe0,0xff,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 78 25 mov z0.h, #-33024 // CHECK-INST: mov z0.h, #32512 // CHECK-ENCODING: [0xe0,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ef 78 25 mov z0.h, #-32769 // CHECK-INST: mov z0.h, #32767 // CHECK-ENCODING: [0xc0,0x05,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 05 c0 05 mov z0.s, #-32769 // CHECK-INST: mov z0.s, #0xffff7fff // CHECK-ENCODING: [0xc0,0x83,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 83 c0 05 mov z0.s, #32768 // CHECK-INST: mov z0.s, #32768 // CHECK-ENCODING: [0x00,0x88,0xc0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 c0 05 mov z0.d, #-32769 // CHECK-INST: mov z0.d, #0xffffffffffff7fff // CHECK-ENCODING: [0xc0,0x87,0xc3,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 87 c3 05 mov z0.d, #32768 // CHECK-INST: mov z0.d, #32768 // CHECK-ENCODING: [0x00,0x88,0xc3,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 88 c3 05 mov z0.d, #0xe0000000000003ff // CHECK-INST: mov z0.d, #0xe0000000000003ff // CHECK-ENCODING: [0x80,0x19,0xc2,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 19 c2 05 mov z5.b, p0/z, #-128 // CHECK-INST: mov z5.b, p0/z, #-128 // CHECK-ENCODING: [0x05,0x10,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 10 10 05 mov z5.b, p0/z, #127 // CHECK-INST: mov z5.b, p0/z, #127 // CHECK-ENCODING: [0xe5,0x0f,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 0f 10 05 mov z5.b, p0/z, #255 // CHECK-INST: mov z5.b, p0/z, #-1 // CHECK-ENCODING: [0xe5,0x1f,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 1f 10 05 mov z21.h, p0/z, #-128 // CHECK-INST: mov z21.h, p0/z, #-128 // CHECK-ENCODING: [0x15,0x10,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 10 50 05 mov z21.h, p0/z, #-128, lsl #8 // CHECK-INST: mov z21.h, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 50 05 mov z21.h, p0/z, #-32768 // CHECK-INST: mov z21.h, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 15 30 50 05 mov z21.h, p0/z, #127 // CHECK-INST: mov z21.h, p0/z, #127 // CHECK-ENCODING: [0xf5,0x0f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 0f 50 05 mov z21.h, p0/z, #127, lsl #8 // CHECK-INST: mov z21.h, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 50 05 mov z21.h, p0/z, #32512 // CHECK-INST: mov z21.h, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 50 05 mov z21.s, p0/z, #-128 // CHECK-INST: mov z21.s, p0/z, #-128 // CHECK-ENCODING: [0x15,0x10,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 10 90 05 mov z21.s, p0/z, #-128, lsl #8 // CHECK-INST: mov z21.s, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 90 05 mov z21.s, p0/z, #-32768 // CHECK-INST: mov z21.s, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 90 05 mov z21.s, p0/z, #127 // CHECK-INST: mov z21.s, p0/z, #127 // CHECK-ENCODING: [0xf5,0x0f,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 0f 90 05 mov z21.s, p0/z, #127, lsl #8 // CHECK-INST: mov z21.s, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 90 05 mov 
z21.s, p0/z, #32512 // CHECK-INST: mov z21.s, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0x90,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f 90 05 mov z21.d, p0/z, #-128 // CHECK-INST: mov z21.d, p0/z, #-128 // CHECK-ENCODING: [0x15,0x10,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 10 d0 05 mov z21.d, p0/z, #-128, lsl #8 // CHECK-INST: mov z21.d, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 d0 05 mov z21.d, p0/z, #-32768 // CHECK-INST: mov z21.d, p0/z, #-32768 // CHECK-ENCODING: [0x15,0x30,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 30 d0 05 mov z21.d, p0/z, #127 // CHECK-INST: mov z21.d, p0/z, #127 // CHECK-ENCODING: [0xf5,0x0f,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 0f d0 05 mov z21.d, p0/z, #127, lsl #8 // CHECK-INST: mov z21.d, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f d0 05 mov z21.d, p0/z, #32512 // CHECK-INST: mov z21.d, p0/z, #32512 // CHECK-ENCODING: [0xf5,0x2f,0xd0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f5 2f d0 05 @@ -383,49 +383,49 @@ mov z21.d, p0/z, #32512 mov z0.b, #-129 // CHECK-INST: mov z0.b, #127 // CHECK-ENCODING: [0xe0,0xcf,0x38,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf 38 25 mov z0.h, 
#-129, lsl #8 // CHECK-INST: mov z0.h, #32512 // CHECK-ENCODING: [0xe0,0xef,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ef 78 25 mov z5.h, #0xfffa // CHECK-INST: mov z5.h, #-6 // CHECK-ENCODING: [0x45,0xdf,0x78,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 45 df 78 25 mov z5.s, #0xfffffffa // CHECK-INST: mov z5.s, #-6 // CHECK-ENCODING: [0x45,0xdf,0xb8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 45 df b8 25 mov z5.d, #0xfffffffffffffffa // CHECK-INST: mov z5.d, #-6 // CHECK-ENCODING: [0x45,0xdf,0xf8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 45 df f8 25 mov z0.b, p0/z, #-129 // CHECK-INST: mov z0.b, p0/z, #127 // CHECK-ENCODING: [0xe0,0x0f,0x10,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 0f 10 05 mov z0.h, p0/z, #-33024 // CHECK-INST: mov z0.h, p0/z, #32512 // CHECK-ENCODING: [0xe0,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 2f 50 05 mov z0.h, p0/z, #-129, lsl #8 // CHECK-INST: mov z0.h, p0/z, #32512 // CHECK-ENCODING: [0xe0,0x2f,0x50,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 2f 50 05 // --------------------------------------------------------------------------// @@ -435,43 +435,43 @@ mov z0.h, p0/z, #-129, lsl #8 mov z5.b, p15/m, #-128 // CHECK-INST: mov z5.b, p15/m, #-128 // CHECK-ENCODING: [0x05,0x50,0x1f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: 05 50 1f 05 mov z21.h, p15/m, #-128 // CHECK-INST: mov z21.h, p15/m, #-128 // CHECK-ENCODING: [0x15,0x50,0x5f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 5f 05 mov z21.h, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.h, p15/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0x5f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 5f 05 mov z21.s, p15/m, #-128 // CHECK-INST: mov z21.s, p15/m, #-128 // CHECK-ENCODING: [0x15,0x50,0x9f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 9f 05 mov z21.s, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.s, p15/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0x9f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 9f 05 mov z21.d, p15/m, #-128 // CHECK-INST: mov z21.d, p15/m, #-128 // CHECK-ENCODING: [0x15,0x50,0xdf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 50 df 05 mov z21.d, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.d, p15/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0xdf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 df 05 // --------------------------------------------------------------------------// @@ -480,91 +480,91 @@ mov z21.d, p15/m, #-128, lsl #8 mov z0.b, z0.b[0] // CHECK-INST: mov z0.b, b0 // CHECK-ENCODING: [0x00,0x20,0x21,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 21 05 mov z0.h, z0.h[0] // CHECK-INST: mov z0.h, h0 // CHECK-ENCODING: [0x00,0x20,0x22,0x05] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 22 05 mov z0.s, z0.s[0] // CHECK-INST: mov z0.s, s0 // CHECK-ENCODING: [0x00,0x20,0x24,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 24 05 mov z0.d, z0.d[0] // CHECK-INST: mov z0.d, d0 // CHECK-ENCODING: [0x00,0x20,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 28 05 mov z0.q, z0.q[0] // CHECK-INST: mov z0.q, q0 // CHECK-ENCODING: [0x00,0x20,0x30,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 30 05 mov z0.b, b0 // CHECK-INST: mov z0.b, b0 // CHECK-ENCODING: [0x00,0x20,0x21,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 21 05 mov z0.h, h0 // CHECK-INST: mov z0.h, h0 // CHECK-ENCODING: [0x00,0x20,0x22,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 22 05 mov z0.s, s0 // CHECK-INST: mov z0.s, s0 // CHECK-ENCODING: [0x00,0x20,0x24,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 24 05 mov z0.d, d0 // CHECK-INST: mov z0.d, d0 // CHECK-ENCODING: [0x00,0x20,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 28 05 mov z0.q, q0 // CHECK-INST: mov z0.q, q0 // CHECK-ENCODING: [0x00,0x20,0x30,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 30 05 mov z31.b, z31.b[63] // CHECK-INST: mov z31.b, z31.b[63] // CHECK-ENCODING: 
[0xff,0x23,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 ff 05 mov z31.h, z31.h[31] // CHECK-INST: mov z31.h, z31.h[31] // CHECK-ENCODING: [0xff,0x23,0xfe,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 fe 05 mov z31.s, z31.s[15] // CHECK-INST: mov z31.s, z31.s[15] // CHECK-ENCODING: [0xff,0x23,0xfc,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 fc 05 mov z31.d, z31.d[7] // CHECK-INST: mov z31.d, z31.d[7] // CHECK-ENCODING: [0xff,0x23,0xf8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 23 f8 05 mov z5.q, z17.q[3] // CHECK-INST: mov z5.q, z17.q[3] // CHECK-ENCODING: [0x25,0x22,0xf0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 22 f0 05 @@ -574,157 +574,157 @@ mov z5.q, z17.q[3] mov z0.b, p0/m, w0 // CHECK-INST: mov z0.b, p0/m, w0 // CHECK-ENCODING: [0x00,0xa0,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 28 05 mov z0.h, p0/m, w0 // CHECK-INST: mov z0.h, p0/m, w0 // CHECK-ENCODING: [0x00,0xa0,0x68,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 68 05 mov z0.s, p0/m, w0 // CHECK-INST: mov z0.s, p0/m, w0 // CHECK-ENCODING: [0x00,0xa0,0xa8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 a8 05 mov z0.d, p0/m, x0 // CHECK-INST: mov z0.d, p0/m, x0 // CHECK-ENCODING: [0x00,0xa0,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 e8 05 mov z31.b, p7/m, wsp // CHECK-INST: mov z31.b, p7/m, wsp // CHECK-ENCODING: [0xff,0xbf,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 28 05 mov z31.h, p7/m, wsp // CHECK-INST: mov z31.h, p7/m, wsp // CHECK-ENCODING: [0xff,0xbf,0x68,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 68 05 mov z31.s, p7/m, wsp // CHECK-INST: mov z31.s, p7/m, wsp // CHECK-ENCODING: [0xff,0xbf,0xa8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf a8 05 mov z31.d, p7/m, sp // CHECK-INST: mov z31.d, p7/m, sp // CHECK-ENCODING: [0xff,0xbf,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf e8 05 mov z0.b, p0/m, b0 // CHECK-INST: mov z0.b, p0/m, b0 // CHECK-ENCODING: [0x00,0x80,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 20 05 mov z31.b, p7/m, b31 // CHECK-INST: mov z31.b, p7/m, b31 // CHECK-ENCODING: [0xff,0x9f,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 20 05 mov z0.h, p0/m, h0 // CHECK-INST: mov z0.h, p0/m, h0 // CHECK-ENCODING: [0x00,0x80,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 60 05 mov z31.h, p7/m, h31 // CHECK-INST: mov z31.h, p7/m, h31 // CHECK-ENCODING: [0xff,0x9f,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 60 05 mov z0.s, p0/m, s0 
// CHECK-INST: mov z0.s, p0/m, s0 // CHECK-ENCODING: [0x00,0x80,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 a0 05 mov z31.s, p7/m, s31 // CHECK-INST: mov z31.s, p7/m, s31 // CHECK-ENCODING: [0xff,0x9f,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f a0 05 mov z0.d, p0/m, d0 // CHECK-INST: mov z0.d, p0/m, d0 // CHECK-ENCODING: [0x00,0x80,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e0 05 mov z31.d, p7/m, d31 // CHECK-INST: mov z31.d, p7/m, d31 // CHECK-ENCODING: [0xff,0x9f,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f e0 05 mov p0.b, p0/m, p0.b // CHECK-INST: mov p0.b, p0/m, p0.b // CHECK-ENCODING: [0x10,0x42,0x00,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 42 00 25 mov p15.b, p15/m, p15.b // CHECK-INST: mov p15.b, p15/m, p15.b // CHECK-ENCODING: [0xff,0x7f,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7f 0f 25 mov z31.b, p15/m, z31.b // CHECK-INST: mov z31.b, p15/m, z31.b // CHECK-ENCODING: [0xff,0xff,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 3f 05 mov z31.h, p15/m, z31.h // CHECK-INST: mov z31.h, p15/m, z31.h // CHECK-ENCODING: [0xff,0xff,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 7f 05 mov z31.s, p15/m, z31.s // CHECK-INST: mov z31.s, p15/m, z31.s // CHECK-ENCODING: [0xff,0xff,0xbf,0x05] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff bf 05 mov z31.d, p15/m, z31.d // CHECK-INST: mov z31.d, p15/m, z31.d // CHECK-ENCODING: [0xff,0xff,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff ff 05 mov p0.b, p0.b // CHECK-INST: mov p0.b, p0.b // CHECK-ENCODING: [0x00,0x40,0x80,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 80 25 mov p15.b, p15.b // CHECK-INST: mov p15.b, p15.b // CHECK-ENCODING: [0xef,0x7d,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 8f 25 mov p0.b, p0/z, p0.b // CHECK-INST: mov p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x40,0x00,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 00 25 mov p15.b, p15/z, p15.b // CHECK-INST: mov p15.b, p15/z, p15.b // CHECK-ENCODING: [0xef,0x7d,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 0f 25 @@ -734,71 +734,71 @@ mov p15.b, p15/z, p15.b movprfx z31.d, p7/z, z6.d // CHECK-INST: movprfx z31.d, p7/z, z6.d // CHECK-ENCODING: [0xdf,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 3c d0 04 mov z31.d, p7/m, sp // CHECK-INST: mov z31.d, p7/m, sp // CHECK-ENCODING: [0xff,0xbf,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf e8 05 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 mov z31.d, p7/m, sp // CHECK-INST: mov z31.d, p7/m, sp // CHECK-ENCODING: [0xff,0xbf,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf e8 05 movprfx z21.d, p7/z, z28.d // CHECK-INST: movprfx z21.d, p7/z, z28.d // CHECK-ENCODING: [0x95,0x3f,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 3f d0 04 mov z21.d, p7/m, #-128, lsl #8 // CHECK-INST: mov z21.d, p7/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0xd7,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 d7 05 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 mov z21.d, p15/m, #-128, lsl #8 // CHECK-INST: mov z21.d, p15/m, #-32768 // CHECK-ENCODING: [0x15,0x70,0xdf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 70 df 05 movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 mov z4.d, p7/m, d31 // CHECK-INST: mov z4.d, p7/m, d31 // CHECK-ENCODING: [0xe4,0x9f,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 9f e0 05 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 
c4 bc 20 04 mov z4.d, p7/m, d31 // CHECK-INST: mov z4.d, p7/m, d31 // CHECK-ENCODING: [0xe4,0x9f,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 9f e0 05 diff --git a/llvm/test/MC/AArch64/SVE/movprfx.s b/llvm/test/MC/AArch64/SVE/movprfx.s index 869189e1874ff..52111b6f1073a 100644 --- a/llvm/test/MC/AArch64/SVE/movprfx.s +++ b/llvm/test/MC/AArch64/SVE/movprfx.s @@ -40,7 +40,7 @@ movprfx z0, z1 // CHECK-INST: movprfx z0, z1 // CHECK-ENCODING: [0x20,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 bc 20 04 hlt #1 @@ -50,7 +50,7 @@ hlt #1 movprfx z0.d, p0/z, z1.d // CHECK-INST: movprfx z0.d, p0/z, z1.d // CHECK-ENCODING: [0x20,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 20 d0 04 hlt #1 @@ -60,7 +60,7 @@ hlt #1 movprfx z0, z1 // CHECK-INST: movprfx z0, z1 // CHECK-ENCODING: [0x20,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 bc 20 04 brk #1 @@ -70,7 +70,7 @@ brk #1 movprfx z0.d, p0/z, z1.d // CHECK-INST: movprfx z0.d, p0/z, z1.d // CHECK-ENCODING: [0x20,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 20 d0 04 brk #1 @@ -83,17 +83,17 @@ brk #1 movprfx z0, z1 // CHECK-INST: movprfx z0, z1 // CHECK-ENCODING: [0x20,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 bc 20 04 add z0.d, p0/m, z0.d, z1.d // CHECK-INST: add z0.d, p0/m, z0.d, z1.d // CHECK-ENCODING: [0x20,0x00,0xc0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 20 00 c0 04 add z0.d, p0/m, z0.d, z1.d // CHECK-INST: add z0.d, p0/m, z0.d, z1.d // CHECK-ENCODING: [0x20,0x00,0xc0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 c0 04 diff --git a/llvm/test/MC/AArch64/SVE/movs.s b/llvm/test/MC/AArch64/SVE/movs.s index d2d14d9312622..465ae9d4f5322 100644 --- a/llvm/test/MC/AArch64/SVE/movs.s +++ b/llvm/test/MC/AArch64/SVE/movs.s @@ -12,23 +12,23 @@ movs p0.b, p0.b // CHECK-INST: movs p0.b, p0.b // CHECK-ENCODING: [0x00,0x40,0xc0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c0 25 movs p15.b, p15.b // CHECK-INST: movs p15.b, p15.b // CHECK-ENCODING: [0xef,0x7d,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d cf 25 movs p0.b, p0/z, p0.b // CHECK-INST: movs p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x40,0x40,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 40 25 movs p15.b, p15/z, p15.b // CHECK-INST: movs p15.b, p15/z, p15.b // CHECK-ENCODING: [0xef,0x7d,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 4f 25 diff --git a/llvm/test/MC/AArch64/SVE/msb.s b/llvm/test/MC/AArch64/SVE/msb.s index 0a08783b0bb86..9a3179d80e910 100644 --- a/llvm/test/MC/AArch64/SVE/msb.s +++ b/llvm/test/MC/AArch64/SVE/msb.s @@ -12,25 +12,25 @@ msb z0.b, p7/m, z1.b, z31.b // CHECK-INST: msb z0.b, p7/m, z1.b, z31.b // CHECK-ENCODING: [0xe0,0xff,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 01 04 msb z0.h, p7/m, z1.h, z31.h // CHECK-INST: msb z0.h, p7/m, z1.h, z31.h // CHECK-ENCODING: 
[0xe0,0xff,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 41 04 msb z0.s, p7/m, z1.s, z31.s // CHECK-INST: msb z0.s, p7/m, z1.s, z31.s // CHECK-ENCODING: [0xe0,0xff,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 81 04 msb z0.d, p7/m, z1.d, z31.d // CHECK-INST: msb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0xe0,0xff,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff c1 04 @@ -40,23 +40,23 @@ msb z0.d, p7/m, z1.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 msb z0.d, p7/m, z1.d, z31.d // CHECK-INST: msb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0xe0,0xff,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff c1 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 msb z0.d, p7/m, z1.d, z31.d // CHECK-INST: msb z0.d, p7/m, z1.d, z31.d // CHECK-ENCODING: [0xe0,0xff,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff c1 04 diff --git a/llvm/test/MC/AArch64/SVE/mul.s b/llvm/test/MC/AArch64/SVE/mul.s index 5016db43d6b05..77ee314c17701 100644 --- a/llvm/test/MC/AArch64/SVE/mul.s +++ b/llvm/test/MC/AArch64/SVE/mul.s @@ -12,73 +12,73 @@ mul z0.b, p7/m, z0.b, z31.b // CHECK-INST: mul z0.b, p7/m, z0.b, z31.b // CHECK-ENCODING: 
[0xe0,0x1f,0x10,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 10 04 mul z0.h, p7/m, z0.h, z31.h // CHECK-INST: mul z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x1f,0x50,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 50 04 mul z0.s, p7/m, z0.s, z31.s // CHECK-INST: mul z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x1f,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 90 04 mul z0.d, p7/m, z0.d, z31.d // CHECK-INST: mul z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d0 04 mul z31.b, z31.b, #-128 // CHECK-INST: mul z31.b, z31.b, #-128 // CHECK-ENCODING: [0x1f,0xd0,0x30,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f d0 30 25 mul z31.b, z31.b, #127 // CHECK-INST: mul z31.b, z31.b, #127 // CHECK-ENCODING: [0xff,0xcf,0x30,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf 30 25 mul z31.h, z31.h, #-128 // CHECK-INST: mul z31.h, z31.h, #-128 // CHECK-ENCODING: [0x1f,0xd0,0x70,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f d0 70 25 mul z31.h, z31.h, #127 // CHECK-INST: mul z31.h, z31.h, #127 // CHECK-ENCODING: [0xff,0xcf,0x70,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf 70 25 mul z31.s, z31.s, #-128 // CHECK-INST: mul z31.s, z31.s, #-128 // CHECK-ENCODING: [0x1f,0xd0,0xb0,0x25] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f d0 b0 25 mul z31.s, z31.s, #127 // CHECK-INST: mul z31.s, z31.s, #127 // CHECK-ENCODING: [0xff,0xcf,0xb0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf b0 25 mul z31.d, z31.d, #-128 // CHECK-INST: mul z31.d, z31.d, #-128 // CHECK-ENCODING: [0x1f,0xd0,0xf0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f d0 f0 25 mul z31.d, z31.d, #127 // CHECK-INST: mul z31.d, z31.d, #127 // CHECK-ENCODING: [0xff,0xcf,0xf0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf f0 25 @@ -88,35 +88,35 @@ mul z31.d, z31.d, #127 movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 mul z0.d, p7/m, z0.d, z31.d // CHECK-INST: mul z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 mul z0.d, p7/m, z0.d, z31.d // CHECK-INST: mul z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d0 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 mul z31.d, z31.d, #127 // CHECK-INST: mul z31.d, z31.d, #127 // CHECK-ENCODING: [0xff,0xcf,0xf0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf f0 25 diff --git a/llvm/test/MC/AArch64/SVE/nand.s b/llvm/test/MC/AArch64/SVE/nand.s index cc2831c7bf513..0e6cef662a6fe 100644 --- a/llvm/test/MC/AArch64/SVE/nand.s +++ b/llvm/test/MC/AArch64/SVE/nand.s @@ -12,11 +12,11 @@ nand p0.b, p0/z, p0.b, p0.b // CHECK-INST: nand p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: [0x10,0x42,0x80,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 42 80 25 nand p15.b, p15/z, p15.b, p15.b // CHECK-INST: nand p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xff,0x7f,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7f 8f 25 diff --git a/llvm/test/MC/AArch64/SVE/nands.s b/llvm/test/MC/AArch64/SVE/nands.s index d5540e3d897ea..0b80f861ef980 100644 --- a/llvm/test/MC/AArch64/SVE/nands.s +++ b/llvm/test/MC/AArch64/SVE/nands.s @@ -12,11 +12,11 @@ nands p0.b, p0/z, p0.b, p0.b // CHECK-INST: nands p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: [0x10,0x42,0xc0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 42 c0 25 nands p15.b, p15/z, p15.b, p15.b // CHECK-INST: nands p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xff,0x7f,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7f cf 25 diff --git a/llvm/test/MC/AArch64/SVE/neg.s b/llvm/test/MC/AArch64/SVE/neg.s index 11881e795f9b0..b69bb5d22d8ad 100644 --- a/llvm/test/MC/AArch64/SVE/neg.s +++ 
b/llvm/test/MC/AArch64/SVE/neg.s @@ -12,49 +12,49 @@ neg z0.b, p0/m, z0.b // CHECK-INST: neg z0.b, p0/m, z0.b // CHECK-ENCODING: [0x00,0xa0,0x17,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 17 04 neg z0.h, p0/m, z0.h // CHECK-INST: neg z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x57,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 57 04 neg z0.s, p0/m, z0.s // CHECK-INST: neg z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x97,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 97 04 neg z0.d, p0/m, z0.d // CHECK-INST: neg z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d7 04 neg z31.b, p7/m, z31.b // CHECK-INST: neg z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x17,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 17 04 neg z31.h, p7/m, z31.h // CHECK-INST: neg z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x57,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 57 04 neg z31.s, p7/m, z31.s // CHECK-INST: neg z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x97,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 97 04 neg z31.d, p7/m, z31.d // CHECK-INST: neg z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d7 04 @@ -64,23 +64,23 @@ neg 
z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 neg z4.d, p7/m, z31.d // CHECK-INST: neg z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d7 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 neg z4.d, p7/m, z31.d // CHECK-INST: neg z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d7 04 diff --git a/llvm/test/MC/AArch64/SVE/nor.s b/llvm/test/MC/AArch64/SVE/nor.s index a78074c6c6c58..70d6a67c7b0e1 100644 --- a/llvm/test/MC/AArch64/SVE/nor.s +++ b/llvm/test/MC/AArch64/SVE/nor.s @@ -12,11 +12,11 @@ nor p0.b, p0/z, p0.b, p0.b // CHECK-INST: nor p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: [0x00,0x42,0x80,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 80 25 nor p15.b, p15/z, p15.b, p15.b // CHECK-INST: nor p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xef,0x7f,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7f 8f 25 diff --git a/llvm/test/MC/AArch64/SVE/nors.s b/llvm/test/MC/AArch64/SVE/nors.s index 76492f2603b42..2551b30e77216 100644 --- a/llvm/test/MC/AArch64/SVE/nors.s +++ b/llvm/test/MC/AArch64/SVE/nors.s @@ -12,11 +12,11 @@ nors p0.b, p0/z, p0.b, p0.b // CHECK-INST: nors p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: 
[0x00,0x42,0xc0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 c0 25 nors p15.b, p15/z, p15.b, p15.b // CHECK-INST: nors p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xef,0x7f,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7f cf 25 diff --git a/llvm/test/MC/AArch64/SVE/not.s b/llvm/test/MC/AArch64/SVE/not.s index b13eb6fdf58cb..0fc7b36881172 100644 --- a/llvm/test/MC/AArch64/SVE/not.s +++ b/llvm/test/MC/AArch64/SVE/not.s @@ -12,37 +12,37 @@ not z31.b, p7/m, z31.b // CHECK-INST: not z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x1e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 1e 04 not z31.h, p7/m, z31.h // CHECK-INST: not z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x5e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 5e 04 not z31.s, p7/m, z31.s // CHECK-INST: not z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x9e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 9e 04 not z31.d, p7/m, z31.d // CHECK-INST: not z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xde,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf de 04 not p0.b, p0/z, p0.b // CHECK-INST: not p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x42,0x00,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 00 25 not p15.b, p15/z, p15.b // CHECK-INST: not p15.b, p15/z, p15.b // CHECK-ENCODING: [0xef,0x7f,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7f 0f 25 @@ -52,23 +52,23 @@ not p15.b, p15/z, p15.b movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 not z4.d, p7/m, z31.d // CHECK-INST: not z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xde,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf de 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 not z4.d, p7/m, z31.d // CHECK-INST: not z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xde,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf de 04 diff --git a/llvm/test/MC/AArch64/SVE/nots.s b/llvm/test/MC/AArch64/SVE/nots.s index 2a3d32ccd33cd..4b6c79c36fa90 100644 --- a/llvm/test/MC/AArch64/SVE/nots.s +++ b/llvm/test/MC/AArch64/SVE/nots.s @@ -12,11 +12,11 @@ nots p0.b, p0/z, p0.b // CHECK-INST: nots p0.b, p0/z, p0.b // CHECK-ENCODING: [0x00,0x42,0x40,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 42 40 25 nots p15.b, p15/z, p15.b // CHECK-INST: nots p15.b, p15/z, p15.b // CHECK-ENCODING: [0xef,0x7f,0x4f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7f 4f 25 diff --git a/llvm/test/MC/AArch64/SVE/orn.s b/llvm/test/MC/AArch64/SVE/orn.s index 1eb726e2c2070..ce1b9c9f6fcd8 100644 --- a/llvm/test/MC/AArch64/SVE/orn.s +++ b/llvm/test/MC/AArch64/SVE/orn.s @@ -12,61 +12,61 @@ 
orn z5.b, z5.b, #0xf9 // CHECK-INST: orr z5.b, z5.b, #0x6 // CHECK-ENCODING: [0x25,0x3e,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 3e 00 05 orn z23.h, z23.h, #0xfff9 // CHECK-INST: orr z23.h, z23.h, #0x6 // CHECK-ENCODING: [0x37,0x7c,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 37 7c 00 05 orn z0.s, z0.s, #0xfffffff9 // CHECK-INST: orr z0.s, z0.s, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 00 05 orn z0.d, z0.d, #0xfffffffffffffff9 // CHECK-INST: orr z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x03,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 03 05 orn z5.b, z5.b, #0x6 // CHECK-INST: orr z5.b, z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e 00 05 orn z23.h, z23.h, #0x6 // CHECK-INST: orr z23.h, z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d 00 05 orn z0.s, z0.s, #0x6 // CHECK-INST: orr z0.s, z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb 00 05 orn z0.d, z0.d, #0x6 // CHECK-INST: orr z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x03,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 03 05 orn p0.b, p0/z, p0.b, p0.b // 
CHECK-INST: orn p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: [0x10,0x40,0x80,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 80 25 orn p15.b, p15/z, p15.b, p15.b // CHECK-INST: orn p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xff,0x7d,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7d 8f 25 @@ -76,11 +76,11 @@ orn p15.b, p15/z, p15.b, p15.b movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 orn z0.d, z0.d, #0x6 // CHECK-INST: orr z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x03,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 03 05 diff --git a/llvm/test/MC/AArch64/SVE/orns.s b/llvm/test/MC/AArch64/SVE/orns.s index f0806a95892ce..de8aa4a4b736a 100644 --- a/llvm/test/MC/AArch64/SVE/orns.s +++ b/llvm/test/MC/AArch64/SVE/orns.s @@ -12,11 +12,11 @@ orns p0.b, p0/z, p0.b, p0.b // CHECK-INST: orns p0.b, p0/z, p0.b, p0.b // CHECK-ENCODING: [0x10,0x40,0xc0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 40 c0 25 orns p15.b, p15/z, p15.b, p15.b // CHECK-INST: orns p15.b, p15/z, p15.b, p15.b // CHECK-ENCODING: [0xff,0x7d,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7d cf 25 diff --git a/llvm/test/MC/AArch64/SVE/orr.s b/llvm/test/MC/AArch64/SVE/orr.s index 88d24b310daef..c848a34654be2 100644 --- a/llvm/test/MC/AArch64/SVE/orr.s +++ b/llvm/test/MC/AArch64/SVE/orr.s @@ -14,103 +14,103 @@ orr z5.b, z5.b, #0xf9 // CHECK-INST: orr z5.b, 
z5.b, #0xf9 // CHECK-ENCODING: [0xa5,0x2e,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a5 2e 00 05 orr z23.h, z23.h, #0xfff9 // CHECK-INST: orr z23.h, z23.h, #0xfff9 // CHECK-ENCODING: [0xb7,0x6d,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 6d 00 05 orr z0.s, z0.s, #0xfffffff9 // CHECK-INST: orr z0.s, z0.s, #0xfffffff9 // CHECK-ENCODING: [0xa0,0xeb,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 eb 00 05 orr z0.d, z0.d, #0xfffffffffffffff9 // CHECK-INST: orr z0.d, z0.d, #0xfffffffffffffff9 // CHECK-ENCODING: [0xa0,0xef,0x03,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 ef 03 05 orr z5.b, z5.b, #0x6 // CHECK-INST: orr z5.b, z5.b, #0x6 // CHECK-ENCODING: [0x25,0x3e,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 3e 00 05 orr z23.h, z23.h, #0x6 // CHECK-INST: orr z23.h, z23.h, #0x6 // CHECK-ENCODING: [0x37,0x7c,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 37 7c 00 05 orr z0.s, z0.s, #0x6 // CHECK-INST: orr z0.s, z0.s, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x00,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 00 05 orr z0.d, z0.d, #0x6 // CHECK-INST: orr z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x03,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 03 05 orr z0.d, z0.d, z0.d // should use mov-alias // CHECK-INST: mov z0.d, z0.d // 
CHECK-ENCODING: [0x00,0x30,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 60 04 orr z23.d, z13.d, z8.d // should not use mov-alias // CHECK-INST: orr z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x31,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 31 68 04 orr z31.b, p7/m, z31.b, z31.b // CHECK-INST: orr z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x18,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 18 04 orr z31.h, p7/m, z31.h, z31.h // CHECK-INST: orr z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x58,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 58 04 orr z31.s, p7/m, z31.s, z31.s // CHECK-INST: orr z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x98,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 98 04 orr z31.d, p7/m, z31.d, z31.d // CHECK-INST: orr z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xd8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f d8 04 orr p0.b, p0/z, p0.b, p1.b // CHECK-INST: orr p0.b, p0/z, p0.b, p1.b // CHECK-ENCODING: [0x00,0x40,0x81,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 81 25 orr p0.b, p0/z, p0.b, p0.b // CHECK-INST: mov p0.b, p0.b // CHECK-ENCODING: [0x00,0x40,0x80,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 80 25 orr p15.b, p15/z, p15.b, p15.b // 
CHECK-INST: mov p15.b, p15.b // CHECK-ENCODING: [0xef,0x7d,0x8f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d 8f 25 @@ -120,37 +120,37 @@ orr p15.b, p15/z, p15.b, p15.b orr z0.s, z0.s, z0.s // CHECK-INST: mov z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 60 04 orr z0.h, z0.h, z0.h // CHECK-INST: mov z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 60 04 orr z0.b, z0.b, z0.b // CHECK-INST: mov z0.d, z0.d // CHECK-ENCODING: [0x00,0x30,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 30 60 04 orr z23.s, z13.s, z8.s // should not use mov-alias // CHECK-INST: orr z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x31,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 31 68 04 orr z23.h, z13.h, z8.h // should not use mov-alias // CHECK-INST: orr z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x31,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 31 68 04 orr z23.b, z13.b, z8.b // should not use mov-alias // CHECK-INST: orr z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x31,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 31 68 04 @@ -160,35 +160,35 @@ orr z23.b, z13.b, z8.b // should not use mov-alias movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 orr z4.d, p7/m, z4.d, z31.d // CHECK-INST: orr z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xd8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f d8 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 orr z4.d, p7/m, z4.d, z31.d // CHECK-INST: orr z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xd8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f d8 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 orr z0.d, z0.d, #0x6 // CHECK-INST: orr z0.d, z0.d, #0x6 // CHECK-ENCODING: [0x20,0xf8,0x03,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 03 05 diff --git a/llvm/test/MC/AArch64/SVE/orrs.s b/llvm/test/MC/AArch64/SVE/orrs.s index 1faab3826d913..2c6a4881b384f 100644 --- a/llvm/test/MC/AArch64/SVE/orrs.s +++ b/llvm/test/MC/AArch64/SVE/orrs.s @@ -12,17 +12,17 @@ orrs p0.b, p0/z, p0.b, p1.b // CHECK-INST: orrs p0.b, p0/z, p0.b, p1.b // CHECK-ENCODING: [0x00,0x40,0xc1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c1 25 orrs p0.b, p0/z, p0.b, p0.b // CHECK-INST: movs p0.b, p0.b // CHECK-ENCODING: [0x00,0x40,0xc0,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 
c0 25 orrs p15.b, p15/z, p15.b, p15.b // CHECK-INST: movs p15.b, p15.b // CHECK-ENCODING: [0xef,0x7d,0xcf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 7d cf 25 diff --git a/llvm/test/MC/AArch64/SVE/orv.s b/llvm/test/MC/AArch64/SVE/orv.s index 3f07878d8de07..fefdf238d3757 100644 --- a/llvm/test/MC/AArch64/SVE/orv.s +++ b/llvm/test/MC/AArch64/SVE/orv.s @@ -12,23 +12,23 @@ orv b0, p7, z31.b // CHECK-INST: orv b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x18,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 18 04 orv h0, p7, z31.h // CHECK-INST: orv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x58,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 58 04 orv s0, p7, z31.s // CHECK-INST: orv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x98,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 98 04 orv d0, p7, z31.d // CHECK-INST: orv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xd8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f d8 04 diff --git a/llvm/test/MC/AArch64/SVE/pfalse.s b/llvm/test/MC/AArch64/SVE/pfalse.s index 3eb4488cb8208..d87b12344561a 100644 --- a/llvm/test/MC/AArch64/SVE/pfalse.s +++ b/llvm/test/MC/AArch64/SVE/pfalse.s @@ -12,5 +12,5 @@ pfalse p15.b // CHECK-INST: pfalse p15.b // CHECK-ENCODING: [0x0f,0xe4,0x18,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f e4 18 25 diff --git a/llvm/test/MC/AArch64/SVE/pfirst.s b/llvm/test/MC/AArch64/SVE/pfirst.s index 2bf001d37a284..89db229d52663 100644 --- 
a/llvm/test/MC/AArch64/SVE/pfirst.s +++ b/llvm/test/MC/AArch64/SVE/pfirst.s @@ -12,11 +12,11 @@ pfirst p0.b, p15, p0.b // CHECK-INST: pfirst p0.b, p15, p0.b // CHECK-ENCODING: [0xe0,0xc1,0x58,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c1 58 25 pfirst p15.b, p15, p15.b // CHECK-INST: pfirst p15.b, p15, p15.b // CHECK-ENCODING: [0xef,0xc1,0x58,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef c1 58 25 diff --git a/llvm/test/MC/AArch64/SVE/pnext.s b/llvm/test/MC/AArch64/SVE/pnext.s index 26f524959695d..74e9830c991d1 100644 --- a/llvm/test/MC/AArch64/SVE/pnext.s +++ b/llvm/test/MC/AArch64/SVE/pnext.s @@ -12,29 +12,29 @@ pnext p15.b, p15, p15.b // CHECK-INST: pnext p15.b, p15, p15.b // CHECK-ENCODING: [0xef,0xc5,0x19,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef c5 19 25 pnext p0.b, p15, p0.b // CHECK-INST: pnext p0.b, p15, p0.b // CHECK-ENCODING: [0xe0,0xc5,0x19,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c5 19 25 pnext p0.h, p15, p0.h // CHECK-INST: pnext p0.h, p15, p0.h // CHECK-ENCODING: [0xe0,0xc5,0x59,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c5 59 25 pnext p0.s, p15, p0.s // CHECK-INST: pnext p0.s, p15, p0.s // CHECK-ENCODING: [0xe0,0xc5,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c5 99 25 pnext p0.d, p15, p0.d // CHECK-INST: pnext p0.d, p15, p0.d // CHECK-ENCODING: [0xe0,0xc5,0xd9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 
c5 d9 25 diff --git a/llvm/test/MC/AArch64/SVE/prfb.s b/llvm/test/MC/AArch64/SVE/prfb.s index 9f4e78f01237d..ea7e43b963f76 100644 --- a/llvm/test/MC/AArch64/SVE/prfb.s +++ b/llvm/test/MC/AArch64/SVE/prfb.s @@ -15,169 +15,169 @@ prfb #0, p0, [x0] // CHECK-INST: prfb pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 c0 85 prfb pldl1keep, p0, [x0] // CHECK-INST: prfb pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 c0 85 prfb #1, p0, [x0] // CHECK-INST: prfb pldl1strm, p0, [x0] // CHECK-ENCODING: [0x01,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 00 c0 85 prfb pldl1strm, p0, [x0] // CHECK-INST: prfb pldl1strm, p0, [x0] // CHECK-ENCODING: [0x01,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 00 c0 85 prfb #2, p0, [x0] // CHECK-INST: prfb pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 00 c0 85 prfb pldl2keep, p0, [x0] // CHECK-INST: prfb pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 00 c0 85 prfb #3, p0, [x0] // CHECK-INST: prfb pldl2strm, p0, [x0] // CHECK-ENCODING: [0x03,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 00 c0 85 prfb pldl2strm, p0, [x0] // CHECK-INST: prfb pldl2strm, p0, [x0] // 
CHECK-ENCODING: [0x03,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 00 c0 85 prfb #4, p0, [x0] // CHECK-INST: prfb pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 04 00 c0 85 prfb pldl3keep, p0, [x0] // CHECK-INST: prfb pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 04 00 c0 85 prfb #5, p0, [x0] // CHECK-INST: prfb pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 00 c0 85 prfb pldl3strm, p0, [x0] // CHECK-INST: prfb pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 00 c0 85 prfb #6, p0, [x0] // CHECK-INST: prfb #6, p0, [x0] // CHECK-ENCODING: [0x06,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 06 00 c0 85 prfb #7, p0, [x0] // CHECK-INST: prfb #7, p0, [x0] // CHECK-ENCODING: [0x07,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 00 c0 85 prfb #8, p0, [x0] // CHECK-INST: prfb pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 00 c0 85 prfb pstl1keep, p0, [x0] // CHECK-INST: prfb pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 00 c0 85 prfb #9, p0, [x0] // CHECK-INST: prfb pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 00 c0 85 prfb pstl1strm, p0, [x0] // CHECK-INST: prfb pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 00 c0 85 prfb #10, p0, [x0] // CHECK-INST: prfb pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0a 00 c0 85 prfb pstl2keep, p0, [x0] // CHECK-INST: prfb pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0a 00 c0 85 prfb #11, p0, [x0] // CHECK-INST: prfb pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 00 c0 85 prfb pstl2strm, p0, [x0] // CHECK-INST: prfb pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 00 c0 85 prfb #12, p0, [x0] // CHECK-INST: prfb pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0c 00 c0 85 prfb pstl3keep, p0, [x0] // CHECK-INST: prfb pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 0c 00 c0 85 prfb #13, p0, [x0] // CHECK-INST: prfb pstl3strm, p0, [x0] // CHECK-ENCODING: [0x0d,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 00 c0 85 prfb pstl3strm, p0, [x0] // CHECK-INST: prfb pstl3strm, p0, [x0] // CHECK-ENCODING: [0x0d,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 00 c0 85 prfb #14, p0, [x0] // CHECK-INST: prfb #14, p0, [x0] // CHECK-ENCODING: [0x0e,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0e 00 c0 85 prfb #15, p0, [x0] // CHECK-INST: prfb #15, p0, [x0] // CHECK-ENCODING: [0x0f,0x00,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 00 c0 85 // --------------------------------------------------------------------------// @@ -186,11 +186,11 @@ prfb #15, p0, [x0] prfb #1, p0, [x0, #-32, mul vl] // CHECK-INST: prfb pldl1strm, p0, [x0, #-32, mul vl] // CHECK-ENCODING: [0x01,0x00,0xe0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 00 e0 85 prfb #1, p0, [x0, #31, mul vl] // CHECK-INST: prfb pldl1strm, p0, [x0, #31, mul vl] // CHECK-ENCODING: [0x01,0x00,0xdf,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 00 df 85 diff --git a/llvm/test/MC/AArch64/SVE/prfd.s b/llvm/test/MC/AArch64/SVE/prfd.s index 579fcb082629f..df2dfb5825f2b 100644 --- a/llvm/test/MC/AArch64/SVE/prfd.s +++ b/llvm/test/MC/AArch64/SVE/prfd.s @@ -15,169 +15,169 @@ prfd #0, p0, [x0] // CHECK-INST: prfd pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x60,0xc0,0x85] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 c0 85 prfd pldl1keep, p0, [x0] // CHECK-INST: prfd pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 c0 85 prfd #1, p0, [x0] // CHECK-INST: prfd pldl1strm, p0, [x0] // CHECK-ENCODING: [0x01,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 60 c0 85 prfd pldl1strm, p0, [x0] // CHECK-INST: prfd pldl1strm, p0, [x0] // CHECK-ENCODING: [0x01,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 60 c0 85 prfd #2, p0, [x0] // CHECK-INST: prfd pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 60 c0 85 prfd pldl2keep, p0, [x0] // CHECK-INST: prfd pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 60 c0 85 prfd #3, p0, [x0] // CHECK-INST: prfd pldl2strm, p0, [x0] // CHECK-ENCODING: [0x03,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 60 c0 85 prfd pldl2strm, p0, [x0] // CHECK-INST: prfd pldl2strm, p0, [x0] // CHECK-ENCODING: [0x03,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 60 c0 85 prfd #4, p0, [x0] // CHECK-INST: prfd pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 04 60 c0 85 prfd pldl3keep, p0, [x0] // CHECK-INST: prfd pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 04 60 c0 85 prfd #5, p0, [x0] // CHECK-INST: prfd pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 60 c0 85 prfd pldl3strm, p0, [x0] // CHECK-INST: prfd pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 60 c0 85 prfd #6, p0, [x0] // CHECK-INST: prfd #6, p0, [x0] // CHECK-ENCODING: [0x06,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 06 60 c0 85 prfd #7, p0, [x0] // CHECK-INST: prfd #7, p0, [x0] // CHECK-ENCODING: [0x07,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 60 c0 85 prfd #8, p0, [x0] // CHECK-INST: prfd pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 60 c0 85 prfd pstl1keep, p0, [x0] // CHECK-INST: prfd pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 60 c0 85 prfd #9, p0, [x0] // CHECK-INST: prfd pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 60 c0 85 prfd pstl1strm, 
p0, [x0] // CHECK-INST: prfd pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 60 c0 85 prfd #10, p0, [x0] // CHECK-INST: prfd pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0a 60 c0 85 prfd pstl2keep, p0, [x0] // CHECK-INST: prfd pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0a 60 c0 85 prfd #11, p0, [x0] // CHECK-INST: prfd pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 60 c0 85 prfd pstl2strm, p0, [x0] // CHECK-INST: prfd pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 60 c0 85 prfd #12, p0, [x0] // CHECK-INST: prfd pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0c 60 c0 85 prfd pstl3keep, p0, [x0] // CHECK-INST: prfd pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0c 60 c0 85 prfd #13, p0, [x0] // CHECK-INST: prfd pstl3strm, p0, [x0] // CHECK-ENCODING: [0x0d,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 60 c0 85 prfd pstl3strm, p0, [x0] // CHECK-INST: prfd pstl3strm, p0, [x0] // 
CHECK-ENCODING: [0x0d,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 60 c0 85 prfd #14, p0, [x0] // CHECK-INST: prfd #14, p0, [x0] // CHECK-ENCODING: [0x0e,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0e 60 c0 85 prfd #15, p0, [x0] // CHECK-INST: prfd #15, p0, [x0] // CHECK-ENCODING: [0x0f,0x60,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 60 c0 85 // --------------------------------------------------------------------------// @@ -186,11 +186,11 @@ prfd #15, p0, [x0] prfd pldl1strm, p0, [x0, #-32, mul vl] // CHECK-INST: prfd pldl1strm, p0, [x0, #-32, mul vl] // CHECK-ENCODING: [0x01,0x60,0xe0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 60 e0 85 prfd pldl1strm, p0, [x0, #31, mul vl] // CHECK-INST: prfd pldl1strm, p0, [x0, #31, mul vl] // CHECK-ENCODING: [0x01,0x60,0xdf,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 60 df 85 diff --git a/llvm/test/MC/AArch64/SVE/prfh.s b/llvm/test/MC/AArch64/SVE/prfh.s index ddefe7797a891..211de5096eaf3 100644 --- a/llvm/test/MC/AArch64/SVE/prfh.s +++ b/llvm/test/MC/AArch64/SVE/prfh.s @@ -15,169 +15,169 @@ prfh #0, p0, [x0] // CHECK-INST: prfh pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 20 c0 85 prfh pldl1keep, p0, [x0] // CHECK-INST: prfh pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 20 c0 85 prfh #1, p0, [x0] // CHECK-INST: prfh pldl1strm, p0, [x0] // CHECK-ENCODING: [0x01,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 20 c0 85 prfh pldl1strm, p0, [x0] // CHECK-INST: prfh pldl1strm, p0, [x0] // CHECK-ENCODING: [0x01,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 20 c0 85 prfh #2, p0, [x0] // CHECK-INST: prfh pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 20 c0 85 prfh pldl2keep, p0, [x0] // CHECK-INST: prfh pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 20 c0 85 prfh #3, p0, [x0] // CHECK-INST: prfh pldl2strm, p0, [x0] // CHECK-ENCODING: [0x03,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 20 c0 85 prfh pldl2strm, p0, [x0] // CHECK-INST: prfh pldl2strm, p0, [x0] // CHECK-ENCODING: [0x03,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 20 c0 85 prfh #4, p0, [x0] // CHECK-INST: prfh pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 04 20 c0 85 prfh pldl3keep, p0, [x0] // CHECK-INST: prfh pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 04 20 c0 85 prfh #5, p0, [x0] // CHECK-INST: 
prfh pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 20 c0 85 prfh pldl3strm, p0, [x0] // CHECK-INST: prfh pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 20 c0 85 prfh #6, p0, [x0] // CHECK-INST: prfh #6, p0, [x0] // CHECK-ENCODING: [0x06,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 06 20 c0 85 prfh #7, p0, [x0] // CHECK-INST: prfh #7, p0, [x0] // CHECK-ENCODING: [0x07,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 20 c0 85 prfh #8, p0, [x0] // CHECK-INST: prfh pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 20 c0 85 prfh pstl1keep, p0, [x0] // CHECK-INST: prfh pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 20 c0 85 prfh #9, p0, [x0] // CHECK-INST: prfh pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 20 c0 85 prfh pstl1strm, p0, [x0] // CHECK-INST: prfh pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 20 c0 85 prfh #10, p0, [x0] // CHECK-INST: prfh pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x20,0xc0,0x85] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0a 20 c0 85 prfh pstl2keep, p0, [x0] // CHECK-INST: prfh pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0a 20 c0 85 prfh #11, p0, [x0] // CHECK-INST: prfh pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 20 c0 85 prfh pstl2strm, p0, [x0] // CHECK-INST: prfh pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 20 c0 85 prfh #12, p0, [x0] // CHECK-INST: prfh pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0c 20 c0 85 prfh pstl3keep, p0, [x0] // CHECK-INST: prfh pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0c 20 c0 85 prfh #13, p0, [x0] // CHECK-INST: prfh pstl3strm, p0, [x0] // CHECK-ENCODING: [0x0d,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 20 c0 85 prfh pstl3strm, p0, [x0] // CHECK-INST: prfh pstl3strm, p0, [x0] // CHECK-ENCODING: [0x0d,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 20 c0 85 prfh #14, p0, [x0] // CHECK-INST: prfh #14, p0, [x0] // CHECK-ENCODING: [0x0e,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0e 20 c0 85 prfh #15, p0, [x0] // CHECK-INST: prfh #15, p0, [x0] // CHECK-ENCODING: [0x0f,0x20,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 20 c0 85 // --------------------------------------------------------------------------// @@ -186,11 +186,11 @@ prfh #15, p0, [x0] prfh pldl1strm, p0, [x0, #-32, mul vl] // CHECK-INST: prfh pldl1strm, p0, [x0, #-32, mul vl] // CHECK-ENCODING: [0x01,0x20,0xe0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 20 e0 85 prfh pldl1strm, p0, [x0, #31, mul vl] // CHECK-INST: prfh pldl1strm, p0, [x0, #31, mul vl] // CHECK-ENCODING: [0x01,0x20,0xdf,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 20 df 85 diff --git a/llvm/test/MC/AArch64/SVE/prfw.s b/llvm/test/MC/AArch64/SVE/prfw.s index 2805dd0f3d177..46d1ee2dc7910 100644 --- a/llvm/test/MC/AArch64/SVE/prfw.s +++ b/llvm/test/MC/AArch64/SVE/prfw.s @@ -15,169 +15,169 @@ prfw #0, p0, [x0] // CHECK-INST: prfw pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c0 85 prfw pldl1keep, p0, [x0] // CHECK-INST: prfw pldl1keep, p0, [x0] // CHECK-ENCODING: [0x00,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c0 85 prfw #1, p0, [x0] // CHECK-INST: prfw pldl1strm, p0, [x0] // CHECK-ENCODING: [0x01,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 40 c0 85 prfw pldl1strm, p0, [x0] // CHECK-INST: prfw pldl1strm, p0, [x0] // 
CHECK-ENCODING: [0x01,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 40 c0 85 prfw #2, p0, [x0] // CHECK-INST: prfw pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 40 c0 85 prfw pldl2keep, p0, [x0] // CHECK-INST: prfw pldl2keep, p0, [x0] // CHECK-ENCODING: [0x02,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 02 40 c0 85 prfw #3, p0, [x0] // CHECK-INST: prfw pldl2strm, p0, [x0] // CHECK-ENCODING: [0x03,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 40 c0 85 prfw pldl2strm, p0, [x0] // CHECK-INST: prfw pldl2strm, p0, [x0] // CHECK-ENCODING: [0x03,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 03 40 c0 85 prfw #4, p0, [x0] // CHECK-INST: prfw pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 04 40 c0 85 prfw pldl3keep, p0, [x0] // CHECK-INST: prfw pldl3keep, p0, [x0] // CHECK-ENCODING: [0x04,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 04 40 c0 85 prfw #5, p0, [x0] // CHECK-INST: prfw pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 40 c0 85 prfw pldl3strm, p0, [x0] // CHECK-INST: prfw pldl3strm, p0, [x0] // CHECK-ENCODING: [0x05,0x40,0xc0,0x85] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 40 c0 85 prfw #6, p0, [x0] // CHECK-INST: prfw #6, p0, [x0] // CHECK-ENCODING: [0x06,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 06 40 c0 85 prfw #7, p0, [x0] // CHECK-INST: prfw #7, p0, [x0] // CHECK-ENCODING: [0x07,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 40 c0 85 prfw #8, p0, [x0] // CHECK-INST: prfw pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 40 c0 85 prfw pstl1keep, p0, [x0] // CHECK-INST: prfw pstl1keep, p0, [x0] // CHECK-ENCODING: [0x08,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 08 40 c0 85 prfw #9, p0, [x0] // CHECK-INST: prfw pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 40 c0 85 prfw pstl1strm, p0, [x0] // CHECK-INST: prfw pstl1strm, p0, [x0] // CHECK-ENCODING: [0x09,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 09 40 c0 85 prfw #10, p0, [x0] // CHECK-INST: prfw pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0a 40 c0 85 prfw pstl2keep, p0, [x0] // CHECK-INST: prfw pstl2keep, p0, [x0] // CHECK-ENCODING: [0x0a,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 0a 40 c0 85 prfw #11, p0, [x0] // CHECK-INST: prfw pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 40 c0 85 prfw pstl2strm, p0, [x0] // CHECK-INST: prfw pstl2strm, p0, [x0] // CHECK-ENCODING: [0x0b,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0b 40 c0 85 prfw #12, p0, [x0] // CHECK-INST: prfw pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0c 40 c0 85 prfw pstl3keep, p0, [x0] // CHECK-INST: prfw pstl3keep, p0, [x0] // CHECK-ENCODING: [0x0c,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0c 40 c0 85 prfw #13, p0, [x0] // CHECK-INST: prfw pstl3strm, p0, [x0] // CHECK-ENCODING: [0x0d,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 40 c0 85 prfw pstl3strm, p0, [x0] // CHECK-INST: prfw pstl3strm, p0, [x0] // CHECK-ENCODING: [0x0d,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0d 40 c0 85 prfw #14, p0, [x0] // CHECK-INST: prfw #14, p0, [x0] // CHECK-ENCODING: [0x0e,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0e 40 c0 85 prfw #15, p0, [x0] // CHECK-INST: prfw #15, p0, [x0] // CHECK-ENCODING: [0x0f,0x40,0xc0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 40 c0 85 // 
--------------------------------------------------------------------------// @@ -186,11 +186,11 @@ prfw #15, p0, [x0] prfw pldl1strm, p0, [x0, #-32, mul vl] // CHECK-INST: prfw pldl1strm, p0, [x0, #-32, mul vl] // CHECK-ENCODING: [0x01,0x40,0xe0,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 40 e0 85 prfw pldl1strm, p0, [x0, #31, mul vl] // CHECK-INST: prfw pldl1strm, p0, [x0, #31, mul vl] // CHECK-ENCODING: [0x01,0x40,0xdf,0x85] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 01 40 df 85 diff --git a/llvm/test/MC/AArch64/SVE/ptest.s b/llvm/test/MC/AArch64/SVE/ptest.s index 449f89dae79f7..42ed050848dc2 100644 --- a/llvm/test/MC/AArch64/SVE/ptest.s +++ b/llvm/test/MC/AArch64/SVE/ptest.s @@ -12,11 +12,11 @@ ptest p15, p0.b // CHECK-INST: ptest p15, p0.b // CHECK-ENCODING: [0x00,0xfc,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc 50 25 ptest p15, p15.b // CHECK-INST: ptest p15, p15.b // CHECK-ENCODING: [0xe0,0xfd,0x50,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fd 50 25 diff --git a/llvm/test/MC/AArch64/SVE/ptrue.s b/llvm/test/MC/AArch64/SVE/ptrue.s index a9118b0901ab6..f230b1feec6d8 100644 --- a/llvm/test/MC/AArch64/SVE/ptrue.s +++ b/llvm/test/MC/AArch64/SVE/ptrue.s @@ -16,25 +16,25 @@ ptrue p0.b, pow2 // CHECK-INST: ptrue p0.b, pow2 // CHECK-ENCODING: [0x00,0xe0,0x18,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 18 25 ptrue p0.h, pow2 // CHECK-INST: ptrue p0.h, pow2 // CHECK-ENCODING: [0x00,0xe0,0x58,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 e0 58 25 ptrue p0.s, pow2 // CHECK-INST: ptrue p0.s, pow2 // CHECK-ENCODING: [0x00,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 98 25 ptrue p0.d, pow2 // CHECK-INST: ptrue p0.d, pow2 // CHECK-ENCODING: [0x00,0xe0,0xd8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 d8 25 // ---------------------------------------------------------------------------// @@ -44,25 +44,25 @@ ptrue p0.d, pow2 ptrue p15.b // CHECK-INST: ptrue p15.b // CHECK-ENCODING: [0xef,0xe3,0x18,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 18 25 ptrue p15.h // CHECK-INST: ptrue p15.h // CHECK-ENCODING: [0xef,0xe3,0x58,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 58 25 ptrue p15.s // CHECK-INST: ptrue p15.s // CHECK-ENCODING: [0xef,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 98 25 ptrue p15.d // CHECK-INST: ptrue p15.d // CHECK-ENCODING: [0xef,0xe3,0xd8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 d8 25 // ---------------------------------------------------------------------------// @@ -72,103 +72,103 @@ ptrue p15.d ptrue p7.s, #1 // CHECK-INST: ptrue p7.s, vl1 // CHECK-ENCODING: [0x27,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e0 98 25 ptrue p7.s, vl1 // CHECK-INST: ptrue p7.s, vl1 // CHECK-ENCODING: [0x27,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 27 e0 98 25 ptrue p7.s, vl2 // CHECK-INST: ptrue p7.s, vl2 // CHECK-ENCODING: [0x47,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 47 e0 98 25 ptrue p7.s, vl3 // CHECK-INST: ptrue p7.s, vl3 // CHECK-ENCODING: [0x67,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e0 98 25 ptrue p7.s, vl4 // CHECK-INST: ptrue p7.s, vl4 // CHECK-ENCODING: [0x87,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e0 98 25 ptrue p7.s, vl5 // CHECK-INST: ptrue p7.s, vl5 // CHECK-ENCODING: [0xa7,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e0 98 25 ptrue p7.s, vl6 // CHECK-INST: ptrue p7.s, vl6 // CHECK-ENCODING: [0xc7,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e0 98 25 ptrue p7.s, vl7 // CHECK-INST: ptrue p7.s, vl7 // CHECK-ENCODING: [0xe7,0xe0,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e0 98 25 ptrue p7.s, vl8 // CHECK-INST: ptrue p7.s, vl8 // CHECK-ENCODING: [0x07,0xe1,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 e1 98 25 ptrue p7.s, vl16 // CHECK-INST: ptrue p7.s, vl16 // CHECK-ENCODING: [0x27,0xe1,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e1 98 25 ptrue p7.s, vl32 // CHECK-INST: ptrue p7.s, vl32 // CHECK-ENCODING: [0x47,0xe1,0x98,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 47 e1 98 25 ptrue p7.s, vl64 // CHECK-INST: ptrue p7.s, vl64 // CHECK-ENCODING: [0x67,0xe1,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e1 98 25 ptrue p7.s, vl128 // CHECK-INST: ptrue p7.s, vl128 // CHECK-ENCODING: [0x87,0xe1,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e1 98 25 ptrue p7.s, vl256 // CHECK-INST: ptrue p7.s, vl256 // CHECK-ENCODING: [0xa7,0xe1,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e1 98 25 ptrue p7.s, mul4 // CHECK-INST: ptrue p7.s, mul4 // CHECK-ENCODING: [0xa7,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e3 98 25 ptrue p7.s, mul3 // CHECK-INST: ptrue p7.s, mul3 // CHECK-ENCODING: [0xc7,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e3 98 25 ptrue p7.s, all // CHECK-INST: ptrue p7.s // CHECK-ENCODING: [0xe7,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e3 98 25 // ---------------------------------------------------------------------------// @@ -178,89 +178,89 @@ ptrue p7.s, all ptrue p7.s, #14 // CHECK-INST: ptrue p7.s, #14 // CHECK-ENCODING: [0xc7,0xe1,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e1 98 25 ptrue p7.s, #15 // CHECK-INST: ptrue p7.s, #15 // CHECK-ENCODING: [0xe7,0xe1,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e1 98 25 ptrue p7.s, #16 // CHECK-INST: ptrue p7.s, #16 // CHECK-ENCODING: [0x07,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 e2 98 25 ptrue p7.s, #17 // CHECK-INST: ptrue p7.s, #17 // CHECK-ENCODING: [0x27,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e2 98 25 ptrue p7.s, #18 // CHECK-INST: ptrue p7.s, #18 // CHECK-ENCODING: [0x47,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 47 e2 98 25 ptrue p7.s, #19 // CHECK-INST: ptrue p7.s, #19 // CHECK-ENCODING: [0x67,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e2 98 25 ptrue p7.s, #20 // CHECK-INST: ptrue p7.s, #20 // CHECK-ENCODING: [0x87,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e2 98 25 ptrue p7.s, #21 // CHECK-INST: ptrue p7.s, #21 // CHECK-ENCODING: [0xa7,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e2 98 25 ptrue p7.s, #22 // CHECK-INST: ptrue p7.s, #22 // CHECK-ENCODING: [0xc7,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e2 98 25 ptrue p7.s, #23 // CHECK-INST: ptrue p7.s, #23 // CHECK-ENCODING: [0xe7,0xe2,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e2 98 25 ptrue p7.s, #24 // CHECK-INST: ptrue p7.s, #24 // CHECK-ENCODING: [0x07,0xe3,0x98,0x25] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 e3 98 25 ptrue p7.s, #25 // CHECK-INST: ptrue p7.s, #25 // CHECK-ENCODING: [0x27,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e3 98 25 ptrue p7.s, #26 // CHECK-INST: ptrue p7.s, #26 // CHECK-ENCODING: [0x47,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 47 e3 98 25 ptrue p7.s, #27 // CHECK-INST: ptrue p7.s, #27 // CHECK-ENCODING: [0x67,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e3 98 25 ptrue p7.s, #28 // CHECK-INST: ptrue p7.s, #28 // CHECK-ENCODING: [0x87,0xe3,0x98,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e3 98 25 diff --git a/llvm/test/MC/AArch64/SVE/ptrues.s b/llvm/test/MC/AArch64/SVE/ptrues.s index f9fcb5d70a103..b8cff9e678cc5 100644 --- a/llvm/test/MC/AArch64/SVE/ptrues.s +++ b/llvm/test/MC/AArch64/SVE/ptrues.s @@ -16,25 +16,25 @@ ptrues p0.b, pow2 // CHECK-INST: ptrues p0.b, pow2 // CHECK-ENCODING: [0x00,0xe0,0x19,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 19 25 ptrues p0.h, pow2 // CHECK-INST: ptrues p0.h, pow2 // CHECK-ENCODING: [0x00,0xe0,0x59,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 59 25 ptrues p0.s, pow2 // CHECK-INST: ptrues p0.s, pow2 // CHECK-ENCODING: [0x00,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 99 25 ptrues p0.d, pow2 // 
CHECK-INST: ptrues p0.d, pow2 // CHECK-ENCODING: [0x00,0xe0,0xd9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 d9 25 // ---------------------------------------------------------------------------// @@ -44,25 +44,25 @@ ptrues p0.d, pow2 ptrues p15.b // CHECK-INST: ptrues p15.b // CHECK-ENCODING: [0xef,0xe3,0x19,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 19 25 ptrues p15.h // CHECK-INST: ptrues p15.h // CHECK-ENCODING: [0xef,0xe3,0x59,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 59 25 ptrues p15.s // CHECK-INST: ptrues p15.s // CHECK-ENCODING: [0xef,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 99 25 ptrues p15.d // CHECK-INST: ptrues p15.d // CHECK-ENCODING: [0xef,0xe3,0xd9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef e3 d9 25 // ---------------------------------------------------------------------------// @@ -72,103 +72,103 @@ ptrues p15.d ptrues p7.s, #1 // CHECK-INST: ptrues p7.s, vl1 // CHECK-ENCODING: [0x27,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e0 99 25 ptrues p7.s, vl1 // CHECK-INST: ptrues p7.s, vl1 // CHECK-ENCODING: [0x27,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e0 99 25 ptrues p7.s, vl2 // CHECK-INST: ptrues p7.s, vl2 // CHECK-ENCODING: [0x47,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 47 e0 99 25 ptrues p7.s, vl3 // CHECK-INST: ptrues p7.s, vl3 // CHECK-ENCODING: [0x67,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e0 99 25 ptrues p7.s, vl4 // CHECK-INST: ptrues p7.s, vl4 // CHECK-ENCODING: [0x87,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e0 99 25 ptrues p7.s, vl5 // CHECK-INST: ptrues p7.s, vl5 // CHECK-ENCODING: [0xa7,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e0 99 25 ptrues p7.s, vl6 // CHECK-INST: ptrues p7.s, vl6 // CHECK-ENCODING: [0xc7,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e0 99 25 ptrues p7.s, vl7 // CHECK-INST: ptrues p7.s, vl7 // CHECK-ENCODING: [0xe7,0xe0,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e0 99 25 ptrues p7.s, vl8 // CHECK-INST: ptrues p7.s, vl8 // CHECK-ENCODING: [0x07,0xe1,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 e1 99 25 ptrues p7.s, vl16 // CHECK-INST: ptrues p7.s, vl16 // CHECK-ENCODING: [0x27,0xe1,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e1 99 25 ptrues p7.s, vl32 // CHECK-INST: ptrues p7.s, vl32 // CHECK-ENCODING: [0x47,0xe1,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 47 e1 99 25 ptrues p7.s, vl64 // CHECK-INST: ptrues p7.s, vl64 // CHECK-ENCODING: [0x67,0xe1,0x99,0x25] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e1 99 25 ptrues p7.s, vl128 // CHECK-INST: ptrues p7.s, vl128 // CHECK-ENCODING: [0x87,0xe1,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e1 99 25 ptrues p7.s, vl256 // CHECK-INST: ptrues p7.s, vl256 // CHECK-ENCODING: [0xa7,0xe1,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e1 99 25 ptrues p7.s, mul4 // CHECK-INST: ptrues p7.s, mul4 // CHECK-ENCODING: [0xa7,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e3 99 25 ptrues p7.s, mul3 // CHECK-INST: ptrues p7.s, mul3 // CHECK-ENCODING: [0xc7,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e3 99 25 ptrues p7.s, all // CHECK-INST: ptrues p7.s // CHECK-ENCODING: [0xe7,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e3 99 25 // ---------------------------------------------------------------------------// @@ -178,89 +178,89 @@ ptrues p7.s, all ptrues p7.s, #14 // CHECK-INST: ptrues p7.s, #14 // CHECK-ENCODING: [0xc7,0xe1,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e1 99 25 ptrues p7.s, #15 // CHECK-INST: ptrues p7.s, #15 // CHECK-ENCODING: [0xe7,0xe1,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e1 99 25 ptrues p7.s, #16 // CHECK-INST: ptrues p7.s, #16 // CHECK-ENCODING: [0x07,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 e2 99 25 ptrues p7.s, #17 // CHECK-INST: ptrues p7.s, #17 // CHECK-ENCODING: [0x27,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e2 99 25 ptrues p7.s, #18 // CHECK-INST: ptrues p7.s, #18 // CHECK-ENCODING: [0x47,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 47 e2 99 25 ptrues p7.s, #19 // CHECK-INST: ptrues p7.s, #19 // CHECK-ENCODING: [0x67,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e2 99 25 ptrues p7.s, #20 // CHECK-INST: ptrues p7.s, #20 // CHECK-ENCODING: [0x87,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e2 99 25 ptrues p7.s, #21 // CHECK-INST: ptrues p7.s, #21 // CHECK-ENCODING: [0xa7,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a7 e2 99 25 ptrues p7.s, #22 // CHECK-INST: ptrues p7.s, #22 // CHECK-ENCODING: [0xc7,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c7 e2 99 25 ptrues p7.s, #23 // CHECK-INST: ptrues p7.s, #23 // CHECK-ENCODING: [0xe7,0xe2,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e7 e2 99 25 ptrues p7.s, #24 // CHECK-INST: ptrues p7.s, #24 // CHECK-ENCODING: [0x07,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 07 e3 99 25 ptrues p7.s, #25 // CHECK-INST: ptrues p7.s, #25 // CHECK-ENCODING: 
[0x27,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 27 e3 99 25 ptrues p7.s, #26 // CHECK-INST: ptrues p7.s, #26 // CHECK-ENCODING: [0x47,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 47 e3 99 25 ptrues p7.s, #27 // CHECK-INST: ptrues p7.s, #27 // CHECK-ENCODING: [0x67,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 67 e3 99 25 ptrues p7.s, #28 // CHECK-INST: ptrues p7.s, #28 // CHECK-ENCODING: [0x87,0xe3,0x99,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 87 e3 99 25 diff --git a/llvm/test/MC/AArch64/SVE/punpkhi.s b/llvm/test/MC/AArch64/SVE/punpkhi.s index 0c27f30c49749..9a368b87c2e96 100644 --- a/llvm/test/MC/AArch64/SVE/punpkhi.s +++ b/llvm/test/MC/AArch64/SVE/punpkhi.s @@ -12,11 +12,11 @@ punpkhi p0.h, p0.b // CHECK-INST: punpkhi p0.h, p0.b // CHECK-ENCODING: [0x00,0x40,0x31,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 31 05 punpkhi p15.h, p15.b // CHECK-INST: punpkhi p15.h, p15.b // CHECK-ENCODING: [0xef,0x41,0x31,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 41 31 05 diff --git a/llvm/test/MC/AArch64/SVE/punpklo.s b/llvm/test/MC/AArch64/SVE/punpklo.s index 3a9f076ed7dac..8ca8bb2f75678 100644 --- a/llvm/test/MC/AArch64/SVE/punpklo.s +++ b/llvm/test/MC/AArch64/SVE/punpklo.s @@ -12,11 +12,11 @@ punpklo p0.h, p0.b // CHECK-INST: punpklo p0.h, p0.b // CHECK-ENCODING: [0x00,0x40,0x30,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 40 30 05 punpklo p15.h, p15.b // CHECK-INST: punpklo p15.h, p15.b // CHECK-ENCODING: [0xef,0x41,0x30,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 41 30 05 diff --git a/llvm/test/MC/AArch64/SVE/rbit.s b/llvm/test/MC/AArch64/SVE/rbit.s index 70e2036727486..ac299bf1cbe08 100644 --- a/llvm/test/MC/AArch64/SVE/rbit.s +++ b/llvm/test/MC/AArch64/SVE/rbit.s @@ -12,25 +12,25 @@ rbit z0.b, p7/m, z31.b // CHECK-INST: rbit z0.b, p7/m, z31.b // CHECK-ENCODING: [0xe0,0x9f,0x27,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 27 05 rbit z0.h, p7/m, z31.h // CHECK-INST: rbit z0.h, p7/m, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x67,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 67 05 rbit z0.s, p7/m, z31.s // CHECK-INST: rbit z0.s, p7/m, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xa7,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f a7 05 rbit z0.d, p7/m, z31.d // CHECK-INST: rbit z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe7,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e7 05 @@ -40,23 +40,23 @@ rbit z0.d, p7/m, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 rbit z0.d, p7/m, z31.d // CHECK-INST: rbit z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe7,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e7 05 movprfx z0, z7 // 
CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 rbit z0.d, p7/m, z31.d // CHECK-INST: rbit z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe7,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e7 05 diff --git a/llvm/test/MC/AArch64/SVE/rdvl.s b/llvm/test/MC/AArch64/SVE/rdvl.s index 5cf8c368e9077..50877f5c7342d 100644 --- a/llvm/test/MC/AArch64/SVE/rdvl.s +++ b/llvm/test/MC/AArch64/SVE/rdvl.s @@ -12,23 +12,23 @@ rdvl x0, #0 // CHECK-INST: rdvl x0, #0 // CHECK-ENCODING: [0x00,0x50,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 50 bf 04 rdvl xzr, #-1 // CHECK-INST: rdvl xzr, #-1 // CHECK-ENCODING: [0xff,0x57,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 57 bf 04 rdvl x23, #31 // CHECK-INST: rdvl x23, #31 // CHECK-ENCODING: [0xf7,0x53,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: f7 53 bf 04 rdvl x21, #-32 // CHECK-INST: rdvl x21, #-32 // CHECK-ENCODING: [0x15,0x54,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 15 54 bf 04 diff --git a/llvm/test/MC/AArch64/SVE/rev.s b/llvm/test/MC/AArch64/SVE/rev.s index 47f9b758d5318..f562401e9567c 100644 --- a/llvm/test/MC/AArch64/SVE/rev.s +++ b/llvm/test/MC/AArch64/SVE/rev.s @@ -12,23 +12,23 @@ rev z0.b, z31.b // CHECK-INST: rev z0.b, z31.b // CHECK-ENCODING: [0xe0,0x3b,0x38,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3b 38 
05 rev z0.h, z31.h // CHECK-INST: rev z0.h, z31.h // CHECK-ENCODING: [0xe0,0x3b,0x78,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3b 78 05 rev z0.s, z31.s // CHECK-INST: rev z0.s, z31.s // CHECK-ENCODING: [0xe0,0x3b,0xb8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3b b8 05 rev z0.d, z31.d // CHECK-INST: rev z0.d, z31.d // CHECK-ENCODING: [0xe0,0x3b,0xf8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3b f8 05 diff --git a/llvm/test/MC/AArch64/SVE/revb.s b/llvm/test/MC/AArch64/SVE/revb.s index c71da4bd90a6f..d8e490dbe8eb6 100644 --- a/llvm/test/MC/AArch64/SVE/revb.s +++ b/llvm/test/MC/AArch64/SVE/revb.s @@ -12,19 +12,19 @@ revb z0.h, p7/m, z31.h // CHECK-INST: revb z0.h, p7/m, z31.h // CHECK-ENCODING: [0xe0,0x9f,0x64,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f 64 05 revb z0.s, p7/m, z31.s // CHECK-INST: revb z0.s, p7/m, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xa4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f a4 05 revb z0.d, p7/m, z31.d // CHECK-INST: revb z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e4 05 @@ -34,23 +34,23 @@ revb z0.d, p7/m, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 revb z0.d, p7/m, z31.d // CHECK-INST: revb z0.d, p7/m, z31.d // CHECK-ENCODING: 
[0xe0,0x9f,0xe4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e4 05 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 revb z0.d, p7/m, z31.d // CHECK-INST: revb z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe4,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e4 05 diff --git a/llvm/test/MC/AArch64/SVE/revh.s b/llvm/test/MC/AArch64/SVE/revh.s index 1cc8f81b73640..687fe9e08e7ce 100644 --- a/llvm/test/MC/AArch64/SVE/revh.s +++ b/llvm/test/MC/AArch64/SVE/revh.s @@ -12,13 +12,13 @@ revh z0.s, p7/m, z31.s // CHECK-INST: revh z0.s, p7/m, z31.s // CHECK-ENCODING: [0xe0,0x9f,0xa5,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f a5 05 revh z0.d, p7/m, z31.d // CHECK-INST: revh z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe5,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e5 05 @@ -28,23 +28,23 @@ revh z0.d, p7/m, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 revh z0.d, p7/m, z31.d // CHECK-INST: revh z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe5,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e5 05 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 revh z0.d, p7/m, z31.d // CHECK-INST: revh z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe5,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e5 05 diff --git a/llvm/test/MC/AArch64/SVE/revw.s b/llvm/test/MC/AArch64/SVE/revw.s index 2a7afdcfdffb6..3f50f4a95f4c7 100644 --- a/llvm/test/MC/AArch64/SVE/revw.s +++ b/llvm/test/MC/AArch64/SVE/revw.s @@ -12,7 +12,7 @@ revw z0.d, p7/m, z31.d // CHECK-INST: revw z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe6,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e6 05 @@ -22,23 +22,23 @@ revw z0.d, p7/m, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 revw z0.d, p7/m, z31.d // CHECK-INST: revw z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe6,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e6 05 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 revw z0.d, p7/m, z31.d // CHECK-INST: revw z0.d, p7/m, z31.d // CHECK-ENCODING: [0xe0,0x9f,0xe6,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 9f e6 05 diff --git a/llvm/test/MC/AArch64/SVE/sabd.s b/llvm/test/MC/AArch64/SVE/sabd.s index 421d46d78699d..e03136aa64021 100644 --- a/llvm/test/MC/AArch64/SVE/sabd.s +++ b/llvm/test/MC/AArch64/SVE/sabd.s @@ 
-12,25 +12,25 @@ sabd z31.b, p7/m, z31.b, z31.b // CHECK-INST: sabd z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x0c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 0c 04 sabd z31.h, p7/m, z31.h, z31.h // CHECK-INST: sabd z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x4c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 4c 04 sabd z31.s, p7/m, z31.s, z31.s // CHECK-INST: sabd z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x8c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 8c 04 sabd z31.d, p7/m, z31.d, z31.d // CHECK-INST: sabd z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xcc,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f cc 04 @@ -40,23 +40,23 @@ sabd z31.d, p7/m, z31.d, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 sabd z4.d, p7/m, z4.d, z31.d // CHECK-INST: sabd z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xcc,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f cc 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sabd z4.d, p7/m, z4.d, z31.d // CHECK-INST: sabd z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xcc,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f cc 04 diff --git a/llvm/test/MC/AArch64/SVE/saddv.s b/llvm/test/MC/AArch64/SVE/saddv.s index e449aae094579..f2ce4d91220c4 100644 --- a/llvm/test/MC/AArch64/SVE/saddv.s +++ b/llvm/test/MC/AArch64/SVE/saddv.s @@ -12,17 +12,17 @@ saddv d0, p7, z31.b // CHECK-INST: saddv d0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x00,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 00 04 saddv d0, p7, z31.h // CHECK-INST: saddv d0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x40,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 40 04 saddv d0, p7, z31.s // CHECK-INST: saddv d0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x80,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 80 04 diff --git a/llvm/test/MC/AArch64/SVE/scvtf.s b/llvm/test/MC/AArch64/SVE/scvtf.s index f10f158bd850c..a7cd2079453f5 100644 --- a/llvm/test/MC/AArch64/SVE/scvtf.s +++ b/llvm/test/MC/AArch64/SVE/scvtf.s @@ -12,43 +12,43 @@ scvtf z0.h, p0/m, z0.h // CHECK-INST: scvtf z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x52,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 52 65 scvtf z0.h, p0/m, z0.s // CHECK-INST: scvtf z0.h, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x54,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 54 65 scvtf z0.h, p0/m, z0.d // CHECK-INST: scvtf z0.h, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0x56,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 56 65 scvtf z0.s, p0/m, z0.s // CHECK-INST: scvtf 
z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x94,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 94 65 scvtf z0.s, p0/m, z0.d // CHECK-INST: scvtf z0.s, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd4,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d4 65 scvtf z0.d, p0/m, z0.s // CHECK-INST: scvtf z0.d, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0xd0,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d0 65 scvtf z0.d, p0/m, z0.d // CHECK-INST: scvtf z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d6 65 @@ -58,23 +58,23 @@ scvtf z0.d, p0/m, z0.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 scvtf z5.d, p0/m, z0.d // CHECK-INST: scvtf z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xd6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 d6 65 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 scvtf z5.d, p0/m, z0.d // CHECK-INST: scvtf z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xd6,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 d6 65 diff --git a/llvm/test/MC/AArch64/SVE/sdiv.s 
b/llvm/test/MC/AArch64/SVE/sdiv.s index dffad1b1198c1..d75e449292cf4 100644 --- a/llvm/test/MC/AArch64/SVE/sdiv.s +++ b/llvm/test/MC/AArch64/SVE/sdiv.s @@ -12,13 +12,13 @@ sdiv z0.s, p7/m, z0.s, z31.s // CHECK-INST: sdiv z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x1f,0x94,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 94 04 sdiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: sdiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d4 04 @@ -28,23 +28,23 @@ sdiv z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 sdiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: sdiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d4 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sdiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: sdiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d4 04 diff --git a/llvm/test/MC/AArch64/SVE/sdivr.s b/llvm/test/MC/AArch64/SVE/sdivr.s index 317b61a5d4c7b..6f8a75b9fa78f 100644 --- a/llvm/test/MC/AArch64/SVE/sdivr.s +++ b/llvm/test/MC/AArch64/SVE/sdivr.s @@ -12,13 +12,13 @@ sdivr z0.s, p7/m, z0.s, z31.s // CHECK-INST: sdivr z0.s, p7/m, z0.s, 
z31.s // CHECK-ENCODING: [0xe0,0x1f,0x96,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 96 04 sdivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: sdivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d6 04 @@ -28,23 +28,23 @@ sdivr z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 sdivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: sdivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d6 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sdivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: sdivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d6 04 diff --git a/llvm/test/MC/AArch64/SVE/sdot.s b/llvm/test/MC/AArch64/SVE/sdot.s index 63d4671592cc0..0fe300c399379 100644 --- a/llvm/test/MC/AArch64/SVE/sdot.s +++ b/llvm/test/MC/AArch64/SVE/sdot.s @@ -12,25 +12,25 @@ sdot z0.s, z1.b, z31.b // CHECK-INST: sdot z0.s, z1.b, z31.b // CHECK-ENCODING: [0x20,0x00,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 9f 44 sdot z0.d, z1.h, z31.h // CHECK-INST: sdot z0.d, z1.h, z31.h // 
CHECK-ENCODING: [0x20,0x00,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 df 44 sdot z0.s, z1.b, z7.b[3] // CHECK-INST: sdot z0.s, z1.b, z7.b[3] // CHECK-ENCODING: [0x20,0x00,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 bf 44 sdot z0.d, z1.h, z15.h[1] // CHECK-INST: sdot z0.d, z1.h, z15.h[1] // CHECK-ENCODING: [0x20,0x00,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 ff 44 @@ -40,23 +40,23 @@ sdot z0.d, z1.h, z15.h[1] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sdot z0.d, z1.h, z31.h // CHECK-INST: sdot z0.d, z1.h, z31.h // CHECK-ENCODING: [0x20,0x00,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 df 44 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sdot z0.d, z1.h, z15.h[1] // CHECK-INST: sdot z0.d, z1.h, z15.h[1] // CHECK-ENCODING: [0x20,0x00,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 00 ff 44 diff --git a/llvm/test/MC/AArch64/SVE/sel.s b/llvm/test/MC/AArch64/SVE/sel.s index c5c2ce435adb1..3cb4dbab39f43 100644 --- a/llvm/test/MC/AArch64/SVE/sel.s +++ b/llvm/test/MC/AArch64/SVE/sel.s @@ -12,59 +12,59 @@ sel p0.b, p0, p0.b, p0.b // CHECK-INST: mov p0.b, p0/m, p0.b // CHECK-ENCODING: [0x10,0x42,0x00,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 10 42 00 25 sel p15.b, p15, p15.b, p15.b // CHECK-INST: mov p15.b, p15/m, p15.b // CHECK-ENCODING: [0xff,0x7f,0x0f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 7f 0f 25 sel z31.b, p15, z31.b, z31.b // CHECK-INST: mov z31.b, p15/m, z31.b // CHECK-ENCODING: [0xff,0xff,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 3f 05 sel z31.h, p15, z31.h, z31.h // CHECK-INST: mov z31.h, p15/m, z31.h // CHECK-ENCODING: [0xff,0xff,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 7f 05 sel z31.s, p15, z31.s, z31.s // CHECK-INST: mov z31.s, p15/m, z31.s // CHECK-ENCODING: [0xff,0xff,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff bf 05 sel z31.d, p15, z31.d, z31.d // CHECK-INST: mov z31.d, p15/m, z31.d // CHECK-ENCODING: [0xff,0xff,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff ff 05 sel z23.s, p11, z13.s, z8.s // CHECK-INST: sel z23.s, p11, z13.s, z8.s // CHECK-ENCODING: [0xb7,0xed,0xa8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed a8 05 sel z23.d, p11, z13.d, z8.d // CHECK-INST: sel z23.d, p11, z13.d, z8.d // CHECK-ENCODING: [0xb7,0xed,0xe8,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed e8 05 sel z23.h, p11, z13.h, z8.h // CHECK-INST: sel z23.h, p11, z13.h, z8.h // CHECK-ENCODING: [0xb7,0xed,0x68,0x05] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 68 05 sel z23.b, p11, z13.b, z8.b // CHECK-INST: sel z23.b, p11, z13.b, z8.b // CHECK-ENCODING: [0xb7,0xed,0x28,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 28 05 diff --git a/llvm/test/MC/AArch64/SVE/smax.s b/llvm/test/MC/AArch64/SVE/smax.s index f668029c328c2..1de6ccf907ba1 100644 --- a/llvm/test/MC/AArch64/SVE/smax.s +++ b/llvm/test/MC/AArch64/SVE/smax.s @@ -12,73 +12,73 @@ smax z0.b, z0.b, #-128 // CHECK-INST: smax z0.b, z0.b, #-128 // CHECK-ENCODING: [0x00,0xd0,0x28,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 28 25 smax z31.b, z31.b, #127 // CHECK-INST: smax z31.b, z31.b, #127 // CHECK-ENCODING: [0xff,0xcf,0x28,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf 28 25 smax z0.h, z0.h, #-128 // CHECK-INST: smax z0.h, z0.h, #-128 // CHECK-ENCODING: [0x00,0xd0,0x68,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 68 25 smax z31.h, z31.h, #127 // CHECK-INST: smax z31.h, z31.h, #127 // CHECK-ENCODING: [0xff,0xcf,0x68,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf 68 25 smax z0.s, z0.s, #-128 // CHECK-INST: smax z0.s, z0.s, #-128 // CHECK-ENCODING: [0x00,0xd0,0xa8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 a8 25 smax z31.s, z31.s, #127 // CHECK-INST: smax z31.s, z31.s, #127 // CHECK-ENCODING: [0xff,0xcf,0xa8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: ff cf a8 25 smax z0.d, z0.d, #-128 // CHECK-INST: smax z0.d, z0.d, #-128 // CHECK-ENCODING: [0x00,0xd0,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 e8 25 smax z31.d, z31.d, #127 // CHECK-INST: smax z31.d, z31.d, #127 // CHECK-ENCODING: [0xff,0xcf,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf e8 25 smax z31.b, p7/m, z31.b, z31.b // CHECK-INST: smax z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x08,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 08 04 smax z31.h, p7/m, z31.h, z31.h // CHECK-INST: smax z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x48,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 48 04 smax z31.s, p7/m, z31.s, z31.s // CHECK-INST: smax z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x88,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 88 04 smax z31.d, p7/m, z31.d, z31.d // CHECK-INST: smax z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xc8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f c8 04 @@ -88,35 +88,35 @@ smax z31.d, p7/m, z31.d, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 smax z4.d, p7/m, z4.d, z31.d // CHECK-INST: smax z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xc8,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f c8 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 smax z4.d, p7/m, z4.d, z31.d // CHECK-INST: smax z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xc8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f c8 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 smax z31.d, z31.d, #127 // CHECK-INST: smax z31.d, z31.d, #127 // CHECK-ENCODING: [0xff,0xcf,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf e8 25 diff --git a/llvm/test/MC/AArch64/SVE/smaxv.s b/llvm/test/MC/AArch64/SVE/smaxv.s index f3867cf05127a..01d864c8f5e64 100644 --- a/llvm/test/MC/AArch64/SVE/smaxv.s +++ b/llvm/test/MC/AArch64/SVE/smaxv.s @@ -12,23 +12,23 @@ smaxv b0, p7, z31.b // CHECK-INST: smaxv b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x08,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 08 04 smaxv h0, p7, z31.h // CHECK-INST: smaxv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x48,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 48 04 smaxv s0, p7, z31.s // CHECK-INST: smaxv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x88,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 3f 88 04 smaxv d0, p7, z31.d // CHECK-INST: smaxv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c8 04 diff --git a/llvm/test/MC/AArch64/SVE/smin.s b/llvm/test/MC/AArch64/SVE/smin.s index 4b8528167d3f1..5df5aeeb05a89 100644 --- a/llvm/test/MC/AArch64/SVE/smin.s +++ b/llvm/test/MC/AArch64/SVE/smin.s @@ -12,73 +12,73 @@ smin z0.b, z0.b, #-128 // CHECK-INST: smin z0.b, z0.b, #-128 // CHECK-ENCODING: [0x00,0xd0,0x2a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 2a 25 smin z31.b, z31.b, #127 // CHECK-INST: smin z31.b, z31.b, #127 // CHECK-ENCODING: [0xff,0xcf,0x2a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf 2a 25 smin z0.h, z0.h, #-128 // CHECK-INST: smin z0.h, z0.h, #-128 // CHECK-ENCODING: [0x00,0xd0,0x6a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 6a 25 smin z31.h, z31.h, #127 // CHECK-INST: smin z31.h, z31.h, #127 // CHECK-ENCODING: [0xff,0xcf,0x6a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf 6a 25 smin z0.s, z0.s, #-128 // CHECK-INST: smin z0.s, z0.s, #-128 // CHECK-ENCODING: [0x00,0xd0,0xaa,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 aa 25 smin z31.s, z31.s, #127 // CHECK-INST: smin z31.s, z31.s, #127 // CHECK-ENCODING: [0xff,0xcf,0xaa,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf aa 25 smin z0.d, z0.d, #-128 // CHECK-INST: smin z0.d, z0.d, #-128 
// CHECK-ENCODING: [0x00,0xd0,0xea,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 d0 ea 25 smin z31.d, z31.d, #127 // CHECK-INST: smin z31.d, z31.d, #127 // CHECK-ENCODING: [0xff,0xcf,0xea,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf ea 25 smin z31.b, p7/m, z31.b, z31.b // CHECK-INST: smin z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x0a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 0a 04 smin z31.h, p7/m, z31.h, z31.h // CHECK-INST: smin z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x4a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 4a 04 smin z31.s, p7/m, z31.s, z31.s // CHECK-INST: smin z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x8a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 8a 04 smin z31.d, p7/m, z31.d, z31.d // CHECK-INST: smin z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xca,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f ca 04 @@ -88,35 +88,35 @@ smin z31.d, p7/m, z31.d, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 smin z4.d, p7/m, z4.d, z31.d // CHECK-INST: smin z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xca,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 
1f ca 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 smin z4.d, p7/m, z4.d, z31.d // CHECK-INST: smin z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xca,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f ca 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 smin z31.d, z31.d, #127 // CHECK-INST: smin z31.d, z31.d, #127 // CHECK-ENCODING: [0xff,0xcf,0xea,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff cf ea 25 diff --git a/llvm/test/MC/AArch64/SVE/sminv.s b/llvm/test/MC/AArch64/SVE/sminv.s index 3ac72b9720561..c0ab800e73df4 100644 --- a/llvm/test/MC/AArch64/SVE/sminv.s +++ b/llvm/test/MC/AArch64/SVE/sminv.s @@ -12,23 +12,23 @@ sminv b0, p7, z31.b // CHECK-INST: sminv b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x0a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 0a 04 sminv h0, p7, z31.h // CHECK-INST: sminv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x4a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 4a 04 sminv s0, p7, z31.s // CHECK-INST: sminv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x8a,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 8a 04 sminv d0, p7, z31.d // CHECK-INST: sminv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xca,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f ca 04 diff --git a/llvm/test/MC/AArch64/SVE/smulh.s b/llvm/test/MC/AArch64/SVE/smulh.s index 53a67673acc68..3c531620e0f68 100644 --- a/llvm/test/MC/AArch64/SVE/smulh.s +++ b/llvm/test/MC/AArch64/SVE/smulh.s @@ -12,25 +12,25 @@ smulh z0.b, p7/m, z0.b, z31.b // CHECK-INST: smulh z0.b, p7/m, z0.b, z31.b // CHECK-ENCODING: [0xe0,0x1f,0x12,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 12 04 smulh z0.h, p7/m, z0.h, z31.h // CHECK-INST: smulh z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x1f,0x52,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 52 04 smulh z0.s, p7/m, z0.s, z31.s // CHECK-INST: smulh z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x1f,0x92,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 92 04 smulh z0.d, p7/m, z0.d, z31.d // CHECK-INST: smulh z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d2 04 @@ -40,23 +40,23 @@ smulh z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 smulh z0.d, p7/m, z0.d, z31.d // CHECK-INST: smulh z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d2 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: 
[0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 smulh z0.d, p7/m, z0.d, z31.d // CHECK-INST: smulh z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d2 04 diff --git a/llvm/test/MC/AArch64/SVE/splice.s b/llvm/test/MC/AArch64/SVE/splice.s index 83af81e204ff4..64c7959cc2f88 100644 --- a/llvm/test/MC/AArch64/SVE/splice.s +++ b/llvm/test/MC/AArch64/SVE/splice.s @@ -12,25 +12,25 @@ splice z31.b, p7, z31.b, z31.b // CHECK-INST: splice z31.b, p7, z31.b, z31.b // CHECK-ENCODING: [0xff,0x9f,0x2c,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 2c 05 splice z31.h, p7, z31.h, z31.h // CHECK-INST: splice z31.h, p7, z31.h, z31.h // CHECK-ENCODING: [0xff,0x9f,0x6c,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f 6c 05 splice z31.s, p7, z31.s, z31.s // CHECK-INST: splice z31.s, p7, z31.s, z31.s // CHECK-ENCODING: [0xff,0x9f,0xac,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f ac 05 splice z31.d, p7, z31.d, z31.d // CHECK-INST: splice z31.d, p7, z31.d, z31.d // CHECK-ENCODING: [0xff,0x9f,0xec,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 9f ec 05 @@ -40,11 +40,11 @@ splice z31.d, p7, z31.d, z31.d movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 splice z4.d, p7, z4.d, z31.d // CHECK-INST: 
splice z4.d, p7, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x9f,0xec,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 9f ec 05 diff --git a/llvm/test/MC/AArch64/SVE/sqadd.s b/llvm/test/MC/AArch64/SVE/sqadd.s index 0535766d99cf1..d20343d5fa70d 100644 --- a/llvm/test/MC/AArch64/SVE/sqadd.s +++ b/llvm/test/MC/AArch64/SVE/sqadd.s @@ -13,109 +13,109 @@ sqadd z0.b, z0.b, z0.b // CHECK-INST: sqadd z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x10,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 10 20 04 sqadd z0.h, z0.h, z0.h // CHECK-INST: sqadd z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x10,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 10 60 04 sqadd z0.s, z0.s, z0.s // CHECK-INST: sqadd z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x10,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 10 a0 04 sqadd z0.d, z0.d, z0.d // CHECK-INST: sqadd z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x10,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 10 e0 04 sqadd z0.b, z0.b, #0 // CHECK-INST: sqadd z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x24,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 24 25 sqadd z31.b, z31.b, #255 // CHECK-INST: sqadd z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x24,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 24 25 sqadd z0.h, z0.h, #0 // CHECK-INST: sqadd z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0xc0,0x64,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 64 25 sqadd z0.h, z0.h, #0, lsl #8 // CHECK-INST: sqadd z0.h, z0.h, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0x64,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 64 25 sqadd z31.h, z31.h, #255, lsl #8 // CHECK-INST: sqadd z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x64,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 64 25 sqadd z31.h, z31.h, #65280 // CHECK-INST: sqadd z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x64,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 64 25 sqadd z0.s, z0.s, #0 // CHECK-INST: sqadd z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xa4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a4 25 sqadd z0.s, z0.s, #0, lsl #8 // CHECK-INST: sqadd z0.s, z0.s, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xa4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a4 25 sqadd z31.s, z31.s, #255, lsl #8 // CHECK-INST: sqadd z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a4 25 sqadd z31.s, z31.s, #65280 // CHECK-INST: sqadd z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a4 25 sqadd z0.d, z0.d, #0 // CHECK-INST: sqadd z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0xc0,0xe4,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e4 25 sqadd z0.d, z0.d, #0, lsl #8 // CHECK-INST: sqadd z0.d, z0.d, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xe4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e4 25 sqadd z31.d, z31.d, #255, lsl #8 // CHECK-INST: sqadd z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e4 25 sqadd z31.d, z31.d, #65280 // CHECK-INST: sqadd z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e4 25 @@ -125,11 +125,11 @@ sqadd z31.d, z31.d, #65280 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqadd z31.d, z31.d, #65280 // CHECK-INST: sqadd z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe4,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e4 25 diff --git a/llvm/test/MC/AArch64/SVE/sqdecb.s b/llvm/test/MC/AArch64/SVE/sqdecb.s index 8f4f65be2b5c4..9c9a75d315842 100644 --- a/llvm/test/MC/AArch64/SVE/sqdecb.s +++ b/llvm/test/MC/AArch64/SVE/sqdecb.s @@ -16,25 +16,25 @@ sqdecb x0 // CHECK-INST: sqdecb x0 // CHECK-ENCODING: [0xe0,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 30 04 sqdecb x0, all // CHECK-INST: sqdecb x0 // CHECK-ENCODING: [0xe0,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 30 04 sqdecb x0, all, mul #1 // CHECK-INST: sqdecb x0 // CHECK-ENCODING: [0xe0,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 30 04 sqdecb x0, all, mul #16 // CHECK-INST: sqdecb x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 3f 04 @@ -45,37 +45,37 @@ sqdecb x0, all, mul #16 sqdecb x0, w0 // CHECK-INST: sqdecb x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 20 04 sqdecb x0, w0, all // CHECK-INST: sqdecb x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 20 04 sqdecb x0, w0, all, mul #1 // CHECK-INST: sqdecb x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 20 04 sqdecb x0, w0, all, mul #16 // CHECK-INST: sqdecb x0, w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 2f 04 sqdecb x0, w0, pow2 // CHECK-INST: sqdecb x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf8,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 20 04 sqdecb x0, w0, pow2, mul #16 // CHECK-INST: sqdecb x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf8,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 f8 2f 04 @@ -86,173 +86,173 @@ sqdecb x0, w0, pow2, mul #16 sqdecb x0, pow2 // CHECK-INST: sqdecb x0, pow2 // CHECK-ENCODING: [0x00,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 30 04 sqdecb x0, vl1 // CHECK-INST: sqdecb x0, vl1 // CHECK-ENCODING: [0x20,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 30 04 sqdecb x0, vl2 // CHECK-INST: sqdecb x0, vl2 // CHECK-ENCODING: [0x40,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f8 30 04 sqdecb x0, vl3 // CHECK-INST: sqdecb x0, vl3 // CHECK-ENCODING: [0x60,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f8 30 04 sqdecb x0, vl4 // CHECK-INST: sqdecb x0, vl4 // CHECK-ENCODING: [0x80,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f8 30 04 sqdecb x0, vl5 // CHECK-INST: sqdecb x0, vl5 // CHECK-ENCODING: [0xa0,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f8 30 04 sqdecb x0, vl6 // CHECK-INST: sqdecb x0, vl6 // CHECK-ENCODING: [0xc0,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f8 30 04 sqdecb x0, vl7 // CHECK-INST: sqdecb x0, vl7 // CHECK-ENCODING: [0xe0,0xf8,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f8 30 04 sqdecb x0, vl8 // CHECK-INST: sqdecb x0, vl8 // CHECK-ENCODING: [0x00,0xf9,0x30,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f9 30 04 sqdecb x0, vl16 // CHECK-INST: sqdecb x0, vl16 // CHECK-ENCODING: [0x20,0xf9,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f9 30 04 sqdecb x0, vl32 // CHECK-INST: sqdecb x0, vl32 // CHECK-ENCODING: [0x40,0xf9,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f9 30 04 sqdecb x0, vl64 // CHECK-INST: sqdecb x0, vl64 // CHECK-ENCODING: [0x60,0xf9,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f9 30 04 sqdecb x0, vl128 // CHECK-INST: sqdecb x0, vl128 // CHECK-ENCODING: [0x80,0xf9,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f9 30 04 sqdecb x0, vl256 // CHECK-INST: sqdecb x0, vl256 // CHECK-ENCODING: [0xa0,0xf9,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f9 30 04 sqdecb x0, #14 // CHECK-INST: sqdecb x0, #14 // CHECK-ENCODING: [0xc0,0xf9,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f9 30 04 sqdecb x0, #15 // CHECK-INST: sqdecb x0, #15 // CHECK-ENCODING: [0xe0,0xf9,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f9 30 04 sqdecb x0, #16 // CHECK-INST: sqdecb x0, #16 // CHECK-ENCODING: [0x00,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fa 30 04 sqdecb x0, #17 // CHECK-INST: sqdecb x0, #17 // 
CHECK-ENCODING: [0x20,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fa 30 04 sqdecb x0, #18 // CHECK-INST: sqdecb x0, #18 // CHECK-ENCODING: [0x40,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fa 30 04 sqdecb x0, #19 // CHECK-INST: sqdecb x0, #19 // CHECK-ENCODING: [0x60,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fa 30 04 sqdecb x0, #20 // CHECK-INST: sqdecb x0, #20 // CHECK-ENCODING: [0x80,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fa 30 04 sqdecb x0, #21 // CHECK-INST: sqdecb x0, #21 // CHECK-ENCODING: [0xa0,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fa 30 04 sqdecb x0, #22 // CHECK-INST: sqdecb x0, #22 // CHECK-ENCODING: [0xc0,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fa 30 04 sqdecb x0, #23 // CHECK-INST: sqdecb x0, #23 // CHECK-ENCODING: [0xe0,0xfa,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fa 30 04 sqdecb x0, #24 // CHECK-INST: sqdecb x0, #24 // CHECK-ENCODING: [0x00,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fb 30 04 sqdecb x0, #25 // CHECK-INST: sqdecb x0, #25 // CHECK-ENCODING: [0x20,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fb 30 04 sqdecb x0, 
#26 // CHECK-INST: sqdecb x0, #26 // CHECK-ENCODING: [0x40,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fb 30 04 sqdecb x0, #27 // CHECK-INST: sqdecb x0, #27 // CHECK-ENCODING: [0x60,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fb 30 04 sqdecb x0, #28 // CHECK-INST: sqdecb x0, #28 // CHECK-ENCODING: [0x80,0xfb,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fb 30 04 diff --git a/llvm/test/MC/AArch64/SVE/sqdecd.s b/llvm/test/MC/AArch64/SVE/sqdecd.s index cfe3b7e52b3fa..a107438889024 100644 --- a/llvm/test/MC/AArch64/SVE/sqdecd.s +++ b/llvm/test/MC/AArch64/SVE/sqdecd.s @@ -16,25 +16,25 @@ sqdecd x0 // CHECK-INST: sqdecd x0 // CHECK-ENCODING: [0xe0,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb f0 04 sqdecd x0, all // CHECK-INST: sqdecd x0 // CHECK-ENCODING: [0xe0,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb f0 04 sqdecd x0, all, mul #1 // CHECK-INST: sqdecd x0 // CHECK-ENCODING: [0xe0,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb f0 04 sqdecd x0, all, mul #16 // CHECK-INST: sqdecd x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb ff 04 @@ -45,37 +45,37 @@ sqdecd x0, all, mul #16 sqdecd x0, w0 // CHECK-INST: sqdecd x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb e0 04 sqdecd x0, w0, all // CHECK-INST: sqdecd x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb e0 04 sqdecd x0, w0, all, mul #1 // CHECK-INST: sqdecd x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb e0 04 sqdecd x0, w0, all, mul #16 // CHECK-INST: sqdecd x0, w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb ef 04 sqdecd x0, w0, pow2 // CHECK-INST: sqdecd x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf8,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 e0 04 sqdecd x0, w0, pow2, mul #16 // CHECK-INST: sqdecd x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf8,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 ef 04 @@ -85,37 +85,37 @@ sqdecd x0, w0, pow2, mul #16 sqdecd z0.d // CHECK-INST: sqdecd z0.d // CHECK-ENCODING: [0xe0,0xcb,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb e0 04 sqdecd z0.d, all // CHECK-INST: sqdecd z0.d // CHECK-ENCODING: [0xe0,0xcb,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb e0 04 sqdecd z0.d, all, mul #1 // CHECK-INST: sqdecd z0.d // CHECK-ENCODING: [0xe0,0xcb,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 
e0 cb e0 04 sqdecd z0.d, all, mul #16 // CHECK-INST: sqdecd z0.d, all, mul #16 // CHECK-ENCODING: [0xe0,0xcb,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb ef 04 sqdecd z0.d, pow2 // CHECK-INST: sqdecd z0.d, pow2 // CHECK-ENCODING: [0x00,0xc8,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 e0 04 sqdecd z0.d, pow2, mul #16 // CHECK-INST: sqdecd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc8,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 ef 04 @@ -126,175 +126,175 @@ sqdecd z0.d, pow2, mul #16 sqdecd x0, pow2 // CHECK-INST: sqdecd x0, pow2 // CHECK-ENCODING: [0x00,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 f0 04 sqdecd x0, vl1 // CHECK-INST: sqdecd x0, vl1 // CHECK-ENCODING: [0x20,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 f0 04 sqdecd x0, vl2 // CHECK-INST: sqdecd x0, vl2 // CHECK-ENCODING: [0x40,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f8 f0 04 sqdecd x0, vl3 // CHECK-INST: sqdecd x0, vl3 // CHECK-ENCODING: [0x60,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f8 f0 04 sqdecd x0, vl4 // CHECK-INST: sqdecd x0, vl4 // CHECK-ENCODING: [0x80,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f8 f0 04 sqdecd x0, vl5 // CHECK-INST: sqdecd x0, vl5 // CHECK-ENCODING: 
[0xa0,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f8 f0 04 sqdecd x0, vl6 // CHECK-INST: sqdecd x0, vl6 // CHECK-ENCODING: [0xc0,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f8 f0 04 sqdecd x0, vl7 // CHECK-INST: sqdecd x0, vl7 // CHECK-ENCODING: [0xe0,0xf8,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f8 f0 04 sqdecd x0, vl8 // CHECK-INST: sqdecd x0, vl8 // CHECK-ENCODING: [0x00,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f9 f0 04 sqdecd x0, vl16 // CHECK-INST: sqdecd x0, vl16 // CHECK-ENCODING: [0x20,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f9 f0 04 sqdecd x0, vl32 // CHECK-INST: sqdecd x0, vl32 // CHECK-ENCODING: [0x40,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f9 f0 04 sqdecd x0, vl64 // CHECK-INST: sqdecd x0, vl64 // CHECK-ENCODING: [0x60,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f9 f0 04 sqdecd x0, vl128 // CHECK-INST: sqdecd x0, vl128 // CHECK-ENCODING: [0x80,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f9 f0 04 sqdecd x0, vl256 // CHECK-INST: sqdecd x0, vl256 // CHECK-ENCODING: [0xa0,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f9 f0 04 sqdecd x0, #14 
// CHECK-INST: sqdecd x0, #14 // CHECK-ENCODING: [0xc0,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f9 f0 04 sqdecd x0, #15 // CHECK-INST: sqdecd x0, #15 // CHECK-ENCODING: [0xe0,0xf9,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f9 f0 04 sqdecd x0, #16 // CHECK-INST: sqdecd x0, #16 // CHECK-ENCODING: [0x00,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fa f0 04 sqdecd x0, #17 // CHECK-INST: sqdecd x0, #17 // CHECK-ENCODING: [0x20,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fa f0 04 sqdecd x0, #18 // CHECK-INST: sqdecd x0, #18 // CHECK-ENCODING: [0x40,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fa f0 04 sqdecd x0, #19 // CHECK-INST: sqdecd x0, #19 // CHECK-ENCODING: [0x60,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fa f0 04 sqdecd x0, #20 // CHECK-INST: sqdecd x0, #20 // CHECK-ENCODING: [0x80,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fa f0 04 sqdecd x0, #21 // CHECK-INST: sqdecd x0, #21 // CHECK-ENCODING: [0xa0,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fa f0 04 sqdecd x0, #22 // CHECK-INST: sqdecd x0, #22 // CHECK-ENCODING: [0xc0,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: c0 fa f0 04 sqdecd x0, #23 // CHECK-INST: sqdecd x0, #23 // CHECK-ENCODING: [0xe0,0xfa,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fa f0 04 sqdecd x0, #24 // CHECK-INST: sqdecd x0, #24 // CHECK-ENCODING: [0x00,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fb f0 04 sqdecd x0, #25 // CHECK-INST: sqdecd x0, #25 // CHECK-ENCODING: [0x20,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fb f0 04 sqdecd x0, #26 // CHECK-INST: sqdecd x0, #26 // CHECK-ENCODING: [0x40,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fb f0 04 sqdecd x0, #27 // CHECK-INST: sqdecd x0, #27 // CHECK-ENCODING: [0x60,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fb f0 04 sqdecd x0, #28 // CHECK-INST: sqdecd x0, #28 // CHECK-ENCODING: [0x80,0xfb,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fb f0 04 @@ -304,35 +304,35 @@ sqdecd x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdecd z0.d // CHECK-INST: sqdecd z0.d // CHECK-ENCODING: [0xe0,0xcb,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb e0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdecd z0.d, pow2, mul #16 // CHECK-INST: sqdecd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc8,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 ef 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdecd z0.d, pow2 // CHECK-INST: sqdecd z0.d, pow2 // CHECK-ENCODING: [0x00,0xc8,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 e0 04 diff --git a/llvm/test/MC/AArch64/SVE/sqdech.s b/llvm/test/MC/AArch64/SVE/sqdech.s index b6cb8991c2dbd..44ab11d90da5f 100644 --- a/llvm/test/MC/AArch64/SVE/sqdech.s +++ b/llvm/test/MC/AArch64/SVE/sqdech.s @@ -16,25 +16,25 @@ sqdech x0 // CHECK-INST: sqdech x0 // CHECK-ENCODING: [0xe0,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 70 04 sqdech x0, all // CHECK-INST: sqdech x0 // CHECK-ENCODING: [0xe0,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 70 04 sqdech x0, all, mul #1 // CHECK-INST: sqdech x0 // CHECK-ENCODING: [0xe0,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 70 04 sqdech x0, all, mul #16 // CHECK-INST: sqdech x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 7f 04 @@ -45,37 +45,37 @@ sqdech x0, all, mul #16 sqdech x0, 
w0 // CHECK-INST: sqdech x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 60 04 sqdech x0, w0, all // CHECK-INST: sqdech x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 60 04 sqdech x0, w0, all, mul #1 // CHECK-INST: sqdech x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 60 04 sqdech x0, w0, all, mul #16 // CHECK-INST: sqdech x0, w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb 6f 04 sqdech x0, w0, pow2 // CHECK-INST: sqdech x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf8,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 60 04 sqdech x0, w0, pow2, mul #16 // CHECK-INST: sqdech x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf8,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 6f 04 @@ -85,37 +85,37 @@ sqdech x0, w0, pow2, mul #16 sqdech z0.h // CHECK-INST: sqdech z0.h // CHECK-ENCODING: [0xe0,0xcb,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb 60 04 sqdech z0.h, all // CHECK-INST: sqdech z0.h // CHECK-ENCODING: [0xe0,0xcb,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb 60 04 sqdech z0.h, all, mul #1 // CHECK-INST: sqdech z0.h // CHECK-ENCODING: 
[0xe0,0xcb,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb 60 04 sqdech z0.h, all, mul #16 // CHECK-INST: sqdech z0.h, all, mul #16 // CHECK-ENCODING: [0xe0,0xcb,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb 6f 04 sqdech z0.h, pow2 // CHECK-INST: sqdech z0.h, pow2 // CHECK-ENCODING: [0x00,0xc8,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 60 04 sqdech z0.h, pow2, mul #16 // CHECK-INST: sqdech z0.h, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc8,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 6f 04 @@ -126,175 +126,175 @@ sqdech z0.h, pow2, mul #16 sqdech x0, pow2 // CHECK-INST: sqdech x0, pow2 // CHECK-ENCODING: [0x00,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 70 04 sqdech x0, vl1 // CHECK-INST: sqdech x0, vl1 // CHECK-ENCODING: [0x20,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 70 04 sqdech x0, vl2 // CHECK-INST: sqdech x0, vl2 // CHECK-ENCODING: [0x40,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f8 70 04 sqdech x0, vl3 // CHECK-INST: sqdech x0, vl3 // CHECK-ENCODING: [0x60,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f8 70 04 sqdech x0, vl4 // CHECK-INST: sqdech x0, vl4 // CHECK-ENCODING: [0x80,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f8 70 04 sqdech x0, vl5 // CHECK-INST: sqdech x0, vl5 // CHECK-ENCODING: [0xa0,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f8 70 04 sqdech x0, vl6 // CHECK-INST: sqdech x0, vl6 // CHECK-ENCODING: [0xc0,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f8 70 04 sqdech x0, vl7 // CHECK-INST: sqdech x0, vl7 // CHECK-ENCODING: [0xe0,0xf8,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f8 70 04 sqdech x0, vl8 // CHECK-INST: sqdech x0, vl8 // CHECK-ENCODING: [0x00,0xf9,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f9 70 04 sqdech x0, vl16 // CHECK-INST: sqdech x0, vl16 // CHECK-ENCODING: [0x20,0xf9,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f9 70 04 sqdech x0, vl32 // CHECK-INST: sqdech x0, vl32 // CHECK-ENCODING: [0x40,0xf9,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f9 70 04 sqdech x0, vl64 // CHECK-INST: sqdech x0, vl64 // CHECK-ENCODING: [0x60,0xf9,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f9 70 04 sqdech x0, vl128 // CHECK-INST: sqdech x0, vl128 // CHECK-ENCODING: [0x80,0xf9,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f9 70 04 sqdech x0, vl256 // CHECK-INST: sqdech x0, vl256 // CHECK-ENCODING: [0xa0,0xf9,0x70,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f9 70 04 sqdech x0, #14 // CHECK-INST: sqdech x0, #14 // CHECK-ENCODING: [0xc0,0xf9,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f9 70 04 sqdech x0, #15 // CHECK-INST: sqdech x0, #15 // CHECK-ENCODING: [0xe0,0xf9,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f9 70 04 sqdech x0, #16 // CHECK-INST: sqdech x0, #16 // CHECK-ENCODING: [0x00,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fa 70 04 sqdech x0, #17 // CHECK-INST: sqdech x0, #17 // CHECK-ENCODING: [0x20,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fa 70 04 sqdech x0, #18 // CHECK-INST: sqdech x0, #18 // CHECK-ENCODING: [0x40,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fa 70 04 sqdech x0, #19 // CHECK-INST: sqdech x0, #19 // CHECK-ENCODING: [0x60,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fa 70 04 sqdech x0, #20 // CHECK-INST: sqdech x0, #20 // CHECK-ENCODING: [0x80,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fa 70 04 sqdech x0, #21 // CHECK-INST: sqdech x0, #21 // CHECK-ENCODING: [0xa0,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fa 70 04 sqdech x0, #22 // CHECK-INST: sqdech x0, #22 // 
CHECK-ENCODING: [0xc0,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fa 70 04 sqdech x0, #23 // CHECK-INST: sqdech x0, #23 // CHECK-ENCODING: [0xe0,0xfa,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fa 70 04 sqdech x0, #24 // CHECK-INST: sqdech x0, #24 // CHECK-ENCODING: [0x00,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fb 70 04 sqdech x0, #25 // CHECK-INST: sqdech x0, #25 // CHECK-ENCODING: [0x20,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fb 70 04 sqdech x0, #26 // CHECK-INST: sqdech x0, #26 // CHECK-ENCODING: [0x40,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fb 70 04 sqdech x0, #27 // CHECK-INST: sqdech x0, #27 // CHECK-ENCODING: [0x60,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fb 70 04 sqdech x0, #28 // CHECK-INST: sqdech x0, #28 // CHECK-ENCODING: [0x80,0xfb,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fb 70 04 @@ -304,35 +304,35 @@ sqdech x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdech z0.h // CHECK-INST: sqdech z0.h // CHECK-ENCODING: [0xe0,0xcb,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 cb 60 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdech z0.h, pow2, mul #16 // CHECK-INST: sqdech z0.h, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc8,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 6f 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdech z0.h, pow2 // CHECK-INST: sqdech z0.h, pow2 // CHECK-ENCODING: [0x00,0xc8,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 60 04 diff --git a/llvm/test/MC/AArch64/SVE/sqdecp.s b/llvm/test/MC/AArch64/SVE/sqdecp.s index adbc5968ab79d..16c6e984293ed 100644 --- a/llvm/test/MC/AArch64/SVE/sqdecp.s +++ b/llvm/test/MC/AArch64/SVE/sqdecp.s @@ -12,85 +12,85 @@ sqdecp x0, p0.b // CHECK-INST: sqdecp x0, p0.b // CHECK-ENCODING: [0x00,0x8c,0x2a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 2a 25 sqdecp x0, p0.h // CHECK-INST: sqdecp x0, p0.h // CHECK-ENCODING: [0x00,0x8c,0x6a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 6a 25 sqdecp x0, p0.s // CHECK-INST: sqdecp x0, p0.s // CHECK-ENCODING: [0x00,0x8c,0xaa,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c aa 25 sqdecp x0, p0.d // CHECK-INST: sqdecp x0, p0.d // CHECK-ENCODING: [0x00,0x8c,0xea,0x25] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c ea 25 sqdecp xzr, p15.b, wzr // CHECK-INST: sqdecp xzr, p15.b, wzr // CHECK-ENCODING: [0xff,0x89,0x2a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 2a 25 sqdecp xzr, p15.h, wzr // CHECK-INST: sqdecp xzr, p15.h, wzr // CHECK-ENCODING: [0xff,0x89,0x6a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 6a 25 sqdecp xzr, p15.s, wzr // CHECK-INST: sqdecp xzr, p15.s, wzr // CHECK-ENCODING: [0xff,0x89,0xaa,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 aa 25 sqdecp xzr, p15.d, wzr // CHECK-INST: sqdecp xzr, p15.d, wzr // CHECK-ENCODING: [0xff,0x89,0xea,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 ea 25 sqdecp z0.h, p0 // CHECK-INST: sqdecp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x6a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 6a 25 sqdecp z0.h, p0.h // CHECK-INST: sqdecp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x6a,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 6a 25 sqdecp z0.s, p0 // CHECK-INST: sqdecp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xaa,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 aa 25 sqdecp z0.s, p0.s // CHECK-INST: sqdecp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xaa,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 aa 25 sqdecp 
z0.d, p0 // CHECK-INST: sqdecp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xea,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 ea 25 sqdecp z0.d, p0.d // CHECK-INST: sqdecp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xea,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 ea 25 @@ -100,11 +100,11 @@ sqdecp z0.d, p0.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdecp z0.d, p0.d // CHECK-INST: sqdecp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xea,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 ea 25 diff --git a/llvm/test/MC/AArch64/SVE/sqdecw.s b/llvm/test/MC/AArch64/SVE/sqdecw.s index 110c38eb8f433..9ce9e47498f29 100644 --- a/llvm/test/MC/AArch64/SVE/sqdecw.s +++ b/llvm/test/MC/AArch64/SVE/sqdecw.s @@ -16,25 +16,25 @@ sqdecw x0 // CHECK-INST: sqdecw x0 // CHECK-ENCODING: [0xe0,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb b0 04 sqdecw x0, all // CHECK-INST: sqdecw x0 // CHECK-ENCODING: [0xe0,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb b0 04 sqdecw x0, all, mul #1 // CHECK-INST: sqdecw x0 // CHECK-ENCODING: [0xe0,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb b0 04 sqdecw x0, all, mul #16 // CHECK-INST: sqdecw x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0xbf,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb bf 04 @@ -45,37 +45,37 @@ sqdecw x0, all, mul #16 sqdecw x0, w0 // CHECK-INST: sqdecw x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb a0 04 sqdecw x0, w0, all // CHECK-INST: sqdecw x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb a0 04 sqdecw x0, w0, all, mul #1 // CHECK-INST: sqdecw x0, w0 // CHECK-ENCODING: [0xe0,0xfb,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb a0 04 sqdecw x0, w0, all, mul #16 // CHECK-INST: sqdecw x0, w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xfb,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fb af 04 sqdecw x0, w0, pow2 // CHECK-INST: sqdecw x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf8,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 a0 04 sqdecw x0, w0, pow2, mul #16 // CHECK-INST: sqdecw x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf8,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 af 04 @@ -85,37 +85,37 @@ sqdecw x0, w0, pow2, mul #16 sqdecw z0.s // CHECK-INST: sqdecw z0.s // CHECK-ENCODING: [0xe0,0xcb,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb a0 04 sqdecw z0.s, all // CHECK-INST: sqdecw z0.s // CHECK-ENCODING: [0xe0,0xcb,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb a0 04 sqdecw z0.s, all, mul #1 // CHECK-INST: sqdecw z0.s // CHECK-ENCODING: [0xe0,0xcb,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb a0 04 sqdecw z0.s, all, mul #16 // CHECK-INST: sqdecw z0.s, all, mul #16 // CHECK-ENCODING: [0xe0,0xcb,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb af 04 sqdecw z0.s, pow2 // CHECK-INST: sqdecw z0.s, pow2 // CHECK-ENCODING: [0x00,0xc8,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 a0 04 sqdecw z0.s, pow2, mul #16 // CHECK-INST: sqdecw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc8,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 af 04 @@ -126,175 +126,175 @@ sqdecw z0.s, pow2, mul #16 sqdecw x0, pow2 // CHECK-INST: sqdecw x0, pow2 // CHECK-ENCODING: [0x00,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f8 b0 04 sqdecw x0, vl1 // CHECK-INST: sqdecw x0, vl1 // CHECK-ENCODING: [0x20,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f8 b0 04 sqdecw x0, vl2 // CHECK-INST: sqdecw x0, vl2 // CHECK-ENCODING: [0x40,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f8 b0 04 sqdecw x0, vl3 // CHECK-INST: sqdecw x0, vl3 // CHECK-ENCODING: [0x60,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f8 b0 04 
sqdecw x0, vl4 // CHECK-INST: sqdecw x0, vl4 // CHECK-ENCODING: [0x80,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f8 b0 04 sqdecw x0, vl5 // CHECK-INST: sqdecw x0, vl5 // CHECK-ENCODING: [0xa0,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f8 b0 04 sqdecw x0, vl6 // CHECK-INST: sqdecw x0, vl6 // CHECK-ENCODING: [0xc0,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f8 b0 04 sqdecw x0, vl7 // CHECK-INST: sqdecw x0, vl7 // CHECK-ENCODING: [0xe0,0xf8,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f8 b0 04 sqdecw x0, vl8 // CHECK-INST: sqdecw x0, vl8 // CHECK-ENCODING: [0x00,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f9 b0 04 sqdecw x0, vl16 // CHECK-INST: sqdecw x0, vl16 // CHECK-ENCODING: [0x20,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f9 b0 04 sqdecw x0, vl32 // CHECK-INST: sqdecw x0, vl32 // CHECK-ENCODING: [0x40,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f9 b0 04 sqdecw x0, vl64 // CHECK-INST: sqdecw x0, vl64 // CHECK-ENCODING: [0x60,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f9 b0 04 sqdecw x0, vl128 // CHECK-INST: sqdecw x0, vl128 // CHECK-ENCODING: [0x80,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 80 f9 b0 04 sqdecw x0, vl256 // CHECK-INST: sqdecw x0, vl256 // CHECK-ENCODING: [0xa0,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f9 b0 04 sqdecw x0, #14 // CHECK-INST: sqdecw x0, #14 // CHECK-ENCODING: [0xc0,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f9 b0 04 sqdecw x0, #15 // CHECK-INST: sqdecw x0, #15 // CHECK-ENCODING: [0xe0,0xf9,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f9 b0 04 sqdecw x0, #16 // CHECK-INST: sqdecw x0, #16 // CHECK-ENCODING: [0x00,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fa b0 04 sqdecw x0, #17 // CHECK-INST: sqdecw x0, #17 // CHECK-ENCODING: [0x20,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fa b0 04 sqdecw x0, #18 // CHECK-INST: sqdecw x0, #18 // CHECK-ENCODING: [0x40,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fa b0 04 sqdecw x0, #19 // CHECK-INST: sqdecw x0, #19 // CHECK-ENCODING: [0x60,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fa b0 04 sqdecw x0, #20 // CHECK-INST: sqdecw x0, #20 // CHECK-ENCODING: [0x80,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fa b0 04 sqdecw x0, #21 // CHECK-INST: sqdecw x0, #21 // CHECK-ENCODING: [0xa0,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fa b0 04 sqdecw x0, #22 // CHECK-INST: sqdecw x0, #22 // CHECK-ENCODING: [0xc0,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fa b0 04 sqdecw x0, #23 // CHECK-INST: sqdecw x0, #23 // CHECK-ENCODING: [0xe0,0xfa,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fa b0 04 sqdecw x0, #24 // CHECK-INST: sqdecw x0, #24 // CHECK-ENCODING: [0x00,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fb b0 04 sqdecw x0, #25 // CHECK-INST: sqdecw x0, #25 // CHECK-ENCODING: [0x20,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fb b0 04 sqdecw x0, #26 // CHECK-INST: sqdecw x0, #26 // CHECK-ENCODING: [0x40,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fb b0 04 sqdecw x0, #27 // CHECK-INST: sqdecw x0, #27 // CHECK-ENCODING: [0x60,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fb b0 04 sqdecw x0, #28 // CHECK-INST: sqdecw x0, #28 // CHECK-ENCODING: [0x80,0xfb,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fb b0 04 @@ -304,35 +304,35 @@ sqdecw x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdecw z0.s // CHECK-INST: sqdecw z0.s // 
CHECK-ENCODING: [0xe0,0xcb,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cb a0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdecw z0.s, pow2, mul #16 // CHECK-INST: sqdecw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc8,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 af 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqdecw z0.s, pow2 // CHECK-INST: sqdecw z0.s, pow2 // CHECK-ENCODING: [0x00,0xc8,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c8 a0 04 diff --git a/llvm/test/MC/AArch64/SVE/sqincb.s b/llvm/test/MC/AArch64/SVE/sqincb.s index 19858c518e299..2ad48e3035366 100644 --- a/llvm/test/MC/AArch64/SVE/sqincb.s +++ b/llvm/test/MC/AArch64/SVE/sqincb.s @@ -16,25 +16,25 @@ sqincb x0 // CHECK-INST: sqincb x0 // CHECK-ENCODING: [0xe0,0xf3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 30 04 sqincb x0, all // CHECK-INST: sqincb x0 // CHECK-ENCODING: [0xe0,0xf3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 30 04 sqincb x0, all, mul #1 // CHECK-INST: sqincb x0 // CHECK-ENCODING: [0xe0,0xf3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 30 
04 sqincb x0, all, mul #16 // CHECK-INST: sqincb x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf3,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 3f 04 @@ -45,37 +45,37 @@ sqincb x0, all, mul #16 sqincb x0, w0 // CHECK-INST: sqincb x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 20 04 sqincb x0, w0, all // CHECK-INST: sqincb x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 20 04 sqincb x0, w0, all, mul #1 // CHECK-INST: sqincb x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 20 04 sqincb x0, w0, all, mul #16 // CHECK-INST: sqincb x0, w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf3,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 2f 04 sqincb x0, w0, pow2 // CHECK-INST: sqincb x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf0,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 20 04 sqincb x0, w0, pow2, mul #16 // CHECK-INST: sqincb x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf0,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 2f 04 @@ -86,174 +86,174 @@ sqincb x0, w0, pow2, mul #16 sqincb x0, pow2 // CHECK-INST: sqincb x0, pow2 // CHECK-ENCODING: [0x00,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 30 
04 sqincb x0, vl1 // CHECK-INST: sqincb x0, vl1 // CHECK-ENCODING: [0x20,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f0 30 04 sqincb x0, vl2 // CHECK-INST: sqincb x0, vl2 // CHECK-ENCODING: [0x40,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f0 30 04 sqincb x0, vl3 // CHECK-INST: sqincb x0, vl3 // CHECK-ENCODING: [0x60,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f0 30 04 sqincb x0, vl4 // CHECK-INST: sqincb x0, vl4 // CHECK-ENCODING: [0x80,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f0 30 04 sqincb x0, vl5 // CHECK-INST: sqincb x0, vl5 // CHECK-ENCODING: [0xa0,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f0 30 04 sqincb x0, vl6 // CHECK-INST: sqincb x0, vl6 // CHECK-ENCODING: [0xc0,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f0 30 04 sqincb x0, vl7 // CHECK-INST: sqincb x0, vl7 // CHECK-ENCODING: [0xe0,0xf0,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f0 30 04 sqincb x0, vl8 // CHECK-INST: sqincb x0, vl8 // CHECK-ENCODING: [0x00,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f1 30 04 sqincb x0, vl16 // CHECK-INST: sqincb x0, vl16 // CHECK-ENCODING: [0x20,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: 
sve or sme // CHECK-UNKNOWN: 20 f1 30 04 sqincb x0, vl32 // CHECK-INST: sqincb x0, vl32 // CHECK-ENCODING: [0x40,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f1 30 04 sqincb x0, vl64 // CHECK-INST: sqincb x0, vl64 // CHECK-ENCODING: [0x60,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f1 30 04 sqincb x0, vl128 // CHECK-INST: sqincb x0, vl128 // CHECK-ENCODING: [0x80,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f1 30 04 sqincb x0, vl256 // CHECK-INST: sqincb x0, vl256 // CHECK-ENCODING: [0xa0,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f1 30 04 sqincb x0, #14 // CHECK-INST: sqincb x0, #14 // CHECK-ENCODING: [0xc0,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f1 30 04 sqincb x0, #15 // CHECK-INST: sqincb x0, #15 // CHECK-ENCODING: [0xe0,0xf1,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f1 30 04 sqincb x0, #16 // CHECK-INST: sqincb x0, #16 // CHECK-ENCODING: [0x00,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f2 30 04 sqincb x0, #17 // CHECK-INST: sqincb x0, #17 // CHECK-ENCODING: [0x20,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f2 30 04 sqincb x0, #18 // CHECK-INST: sqincb x0, #18 // CHECK-ENCODING: [0x40,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f2 30 04 sqincb x0, #19 // CHECK-INST: sqincb x0, #19 // CHECK-ENCODING: [0x60,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f2 30 04 sqincb x0, #20 // CHECK-INST: sqincb x0, #20 // CHECK-ENCODING: [0x80,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f2 30 04 sqincb x0, #21 // CHECK-INST: sqincb x0, #21 // CHECK-ENCODING: [0xa0,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f2 30 04 sqincb x0, #22 // CHECK-INST: sqincb x0, #22 // CHECK-ENCODING: [0xc0,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f2 30 04 sqincb x0, #23 // CHECK-INST: sqincb x0, #23 // CHECK-ENCODING: [0xe0,0xf2,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f2 30 04 sqincb x0, #24 // CHECK-INST: sqincb x0, #24 // CHECK-ENCODING: [0x00,0xf3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f3 30 04 sqincb x0, #25 // CHECK-INST: sqincb x0, #25 // CHECK-ENCODING: [0x20,0xf3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f3 30 04 sqincb x0, #26 // CHECK-INST: sqincb x0, #26 // CHECK-ENCODING: [0x40,0xf3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f3 30 04 sqincb x0, #27 // CHECK-INST: sqincb x0, #27 // CHECK-ENCODING: [0x60,0xf3,0x30,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f3 30 04 sqincb x0, #28 // CHECK-INST: sqincb x0, #28 // CHECK-ENCODING: [0x80,0xf3,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f3 30 04 diff --git a/llvm/test/MC/AArch64/SVE/sqincd.s b/llvm/test/MC/AArch64/SVE/sqincd.s index a59879b3800a7..1dd8b62c32c2b 100644 --- a/llvm/test/MC/AArch64/SVE/sqincd.s +++ b/llvm/test/MC/AArch64/SVE/sqincd.s @@ -16,25 +16,25 @@ sqincd x0 // CHECK-INST: sqincd x0 // CHECK-ENCODING: [0xe0,0xf3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 f0 04 sqincd x0, all // CHECK-INST: sqincd x0 // CHECK-ENCODING: [0xe0,0xf3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 f0 04 sqincd x0, all, mul #1 // CHECK-INST: sqincd x0 // CHECK-ENCODING: [0xe0,0xf3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 f0 04 sqincd x0, all, mul #16 // CHECK-INST: sqincd x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf3,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 ff 04 @@ -45,37 +45,37 @@ sqincd x0, all, mul #16 sqincd x0, w0 // CHECK-INST: sqincd x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 e0 04 sqincd x0, w0, all // CHECK-INST: sqincd x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 e0 04 sqincd x0, w0, 
all, mul #1 // CHECK-INST: sqincd x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 e0 04 sqincd x0, w0, all, mul #16 // CHECK-INST: sqincd x0, w0, all // CHECK-ENCODING: [0xe0,0xf3,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 ef 04 sqincd x0, w0, pow2 // CHECK-INST: sqincd x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf0,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 e0 04 sqincd x0, w0, pow2, mul #16 // CHECK-INST: sqincd x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf0,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 ef 04 @@ -85,37 +85,37 @@ sqincd x0, w0, pow2, mul #16 sqincd z0.d // CHECK-INST: sqincd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 e0 04 sqincd z0.d, all // CHECK-INST: sqincd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 e0 04 sqincd z0.d, all, mul #1 // CHECK-INST: sqincd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 e0 04 sqincd z0.d, all, mul #16 // CHECK-INST: sqincd z0.d, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 ef 04 sqincd z0.d, pow2 // CHECK-INST: sqincd z0.d, pow2 // CHECK-ENCODING: 
[0x00,0xc0,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e0 04 sqincd z0.d, pow2, mul #16 // CHECK-INST: sqincd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc0,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 ef 04 @@ -126,175 +126,175 @@ sqincd z0.d, pow2, mul #16 sqincd x0, pow2 // CHECK-INST: sqincd x0, pow2 // CHECK-ENCODING: [0x00,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 f0 04 sqincd x0, vl1 // CHECK-INST: sqincd x0, vl1 // CHECK-ENCODING: [0x20,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f0 f0 04 sqincd x0, vl2 // CHECK-INST: sqincd x0, vl2 // CHECK-ENCODING: [0x40,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f0 f0 04 sqincd x0, vl3 // CHECK-INST: sqincd x0, vl3 // CHECK-ENCODING: [0x60,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f0 f0 04 sqincd x0, vl4 // CHECK-INST: sqincd x0, vl4 // CHECK-ENCODING: [0x80,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f0 f0 04 sqincd x0, vl5 // CHECK-INST: sqincd x0, vl5 // CHECK-ENCODING: [0xa0,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f0 f0 04 sqincd x0, vl6 // CHECK-INST: sqincd x0, vl6 // CHECK-ENCODING: [0xc0,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: c0 f0 f0 04 sqincd x0, vl7 // CHECK-INST: sqincd x0, vl7 // CHECK-ENCODING: [0xe0,0xf0,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f0 f0 04 sqincd x0, vl8 // CHECK-INST: sqincd x0, vl8 // CHECK-ENCODING: [0x00,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f1 f0 04 sqincd x0, vl16 // CHECK-INST: sqincd x0, vl16 // CHECK-ENCODING: [0x20,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f1 f0 04 sqincd x0, vl32 // CHECK-INST: sqincd x0, vl32 // CHECK-ENCODING: [0x40,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f1 f0 04 sqincd x0, vl64 // CHECK-INST: sqincd x0, vl64 // CHECK-ENCODING: [0x60,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f1 f0 04 sqincd x0, vl128 // CHECK-INST: sqincd x0, vl128 // CHECK-ENCODING: [0x80,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f1 f0 04 sqincd x0, vl256 // CHECK-INST: sqincd x0, vl256 // CHECK-ENCODING: [0xa0,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f1 f0 04 sqincd x0, #14 // CHECK-INST: sqincd x0, #14 // CHECK-ENCODING: [0xc0,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f1 f0 04 sqincd x0, #15 // CHECK-INST: sqincd x0, #15 // CHECK-ENCODING: [0xe0,0xf1,0xf0,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f1 f0 04 sqincd x0, #16 // CHECK-INST: sqincd x0, #16 // CHECK-ENCODING: [0x00,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f2 f0 04 sqincd x0, #17 // CHECK-INST: sqincd x0, #17 // CHECK-ENCODING: [0x20,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f2 f0 04 sqincd x0, #18 // CHECK-INST: sqincd x0, #18 // CHECK-ENCODING: [0x40,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f2 f0 04 sqincd x0, #19 // CHECK-INST: sqincd x0, #19 // CHECK-ENCODING: [0x60,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f2 f0 04 sqincd x0, #20 // CHECK-INST: sqincd x0, #20 // CHECK-ENCODING: [0x80,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f2 f0 04 sqincd x0, #21 // CHECK-INST: sqincd x0, #21 // CHECK-ENCODING: [0xa0,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f2 f0 04 sqincd x0, #22 // CHECK-INST: sqincd x0, #22 // CHECK-ENCODING: [0xc0,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f2 f0 04 sqincd x0, #23 // CHECK-INST: sqincd x0, #23 // CHECK-ENCODING: [0xe0,0xf2,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f2 f0 04 sqincd x0, #24 // CHECK-INST: sqincd x0, #24 // CHECK-ENCODING: [0x00,0xf3,0xf0,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f3 f0 04 sqincd x0, #25 // CHECK-INST: sqincd x0, #25 // CHECK-ENCODING: [0x20,0xf3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f3 f0 04 sqincd x0, #26 // CHECK-INST: sqincd x0, #26 // CHECK-ENCODING: [0x40,0xf3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f3 f0 04 sqincd x0, #27 // CHECK-INST: sqincd x0, #27 // CHECK-ENCODING: [0x60,0xf3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f3 f0 04 sqincd x0, #28 // CHECK-INST: sqincd x0, #28 // CHECK-ENCODING: [0x80,0xf3,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f3 f0 04 @@ -304,35 +304,35 @@ sqincd x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqincd z0.d // CHECK-INST: sqincd z0.d // CHECK-ENCODING: [0xe0,0xc3,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 e0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqincd z0.d, pow2, mul #16 // CHECK-INST: sqincd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc0,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 ef 04 
movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqincd z0.d, pow2 // CHECK-INST: sqincd z0.d, pow2 // CHECK-ENCODING: [0x00,0xc0,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e0 04 diff --git a/llvm/test/MC/AArch64/SVE/sqinch.s b/llvm/test/MC/AArch64/SVE/sqinch.s index e96d1c555f694..aeae6791491ba 100644 --- a/llvm/test/MC/AArch64/SVE/sqinch.s +++ b/llvm/test/MC/AArch64/SVE/sqinch.s @@ -16,25 +16,25 @@ sqinch x0 // CHECK-INST: sqinch x0 // CHECK-ENCODING: [0xe0,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 70 04 sqinch x0, all // CHECK-INST: sqinch x0 // CHECK-ENCODING: [0xe0,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 70 04 sqinch x0, all, mul #1 // CHECK-INST: sqinch x0 // CHECK-ENCODING: [0xe0,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 70 04 sqinch x0, all, mul #16 // CHECK-INST: sqinch x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf3,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 7f 04 @@ -45,37 +45,37 @@ sqinch x0, all, mul #16 sqinch x0, w0 // CHECK-INST: sqinch x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 60 04 sqinch x0, w0, all // CHECK-INST: sqinch x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0x60,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 60 04 sqinch x0, w0, all, mul #1 // CHECK-INST: sqinch x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 60 04 sqinch x0, w0, all, mul #16 // CHECK-INST: sqinch x0, w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf3,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 6f 04 sqinch x0, w0, pow2 // CHECK-INST: sqinch x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf0,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 60 04 sqinch x0, w0, pow2, mul #16 // CHECK-INST: sqinch x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf0,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 6f 04 @@ -85,37 +85,37 @@ sqinch x0, w0, pow2, mul #16 sqinch z0.h // CHECK-INST: sqinch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 60 04 sqinch z0.h, all // CHECK-INST: sqinch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 60 04 sqinch z0.h, all, mul #1 // CHECK-INST: sqinch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 60 04 sqinch z0.h, all, mul #16 // CHECK-INST: sqinch z0.h, all, mul #16 // CHECK-ENCODING: [0xe0,0xc3,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 6f 04 sqinch z0.h, pow2 // CHECK-INST: sqinch z0.h, pow2 // CHECK-ENCODING: [0x00,0xc0,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 60 04 sqinch z0.h, pow2, mul #16 // CHECK-INST: sqinch z0.h, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc0,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 6f 04 @@ -126,175 +126,175 @@ sqinch z0.h, pow2, mul #16 sqinch x0, pow2 // CHECK-INST: sqinch x0, pow2 // CHECK-ENCODING: [0x00,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 70 04 sqinch x0, vl1 // CHECK-INST: sqinch x0, vl1 // CHECK-ENCODING: [0x20,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f0 70 04 sqinch x0, vl2 // CHECK-INST: sqinch x0, vl2 // CHECK-ENCODING: [0x40,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f0 70 04 sqinch x0, vl3 // CHECK-INST: sqinch x0, vl3 // CHECK-ENCODING: [0x60,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f0 70 04 sqinch x0, vl4 // CHECK-INST: sqinch x0, vl4 // CHECK-ENCODING: [0x80,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f0 70 04 sqinch x0, vl5 // CHECK-INST: sqinch x0, vl5 // CHECK-ENCODING: [0xa0,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f0 70 04 sqinch x0, vl6 // CHECK-INST: sqinch x0, vl6 
// CHECK-ENCODING: [0xc0,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f0 70 04 sqinch x0, vl7 // CHECK-INST: sqinch x0, vl7 // CHECK-ENCODING: [0xe0,0xf0,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f0 70 04 sqinch x0, vl8 // CHECK-INST: sqinch x0, vl8 // CHECK-ENCODING: [0x00,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f1 70 04 sqinch x0, vl16 // CHECK-INST: sqinch x0, vl16 // CHECK-ENCODING: [0x20,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f1 70 04 sqinch x0, vl32 // CHECK-INST: sqinch x0, vl32 // CHECK-ENCODING: [0x40,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f1 70 04 sqinch x0, vl64 // CHECK-INST: sqinch x0, vl64 // CHECK-ENCODING: [0x60,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f1 70 04 sqinch x0, vl128 // CHECK-INST: sqinch x0, vl128 // CHECK-ENCODING: [0x80,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f1 70 04 sqinch x0, vl256 // CHECK-INST: sqinch x0, vl256 // CHECK-ENCODING: [0xa0,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f1 70 04 sqinch x0, #14 // CHECK-INST: sqinch x0, #14 // CHECK-ENCODING: [0xc0,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f1 70 
04 sqinch x0, #15 // CHECK-INST: sqinch x0, #15 // CHECK-ENCODING: [0xe0,0xf1,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f1 70 04 sqinch x0, #16 // CHECK-INST: sqinch x0, #16 // CHECK-ENCODING: [0x00,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f2 70 04 sqinch x0, #17 // CHECK-INST: sqinch x0, #17 // CHECK-ENCODING: [0x20,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f2 70 04 sqinch x0, #18 // CHECK-INST: sqinch x0, #18 // CHECK-ENCODING: [0x40,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f2 70 04 sqinch x0, #19 // CHECK-INST: sqinch x0, #19 // CHECK-ENCODING: [0x60,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f2 70 04 sqinch x0, #20 // CHECK-INST: sqinch x0, #20 // CHECK-ENCODING: [0x80,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f2 70 04 sqinch x0, #21 // CHECK-INST: sqinch x0, #21 // CHECK-ENCODING: [0xa0,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f2 70 04 sqinch x0, #22 // CHECK-INST: sqinch x0, #22 // CHECK-ENCODING: [0xc0,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f2 70 04 sqinch x0, #23 // CHECK-INST: sqinch x0, #23 // CHECK-ENCODING: [0xe0,0xf2,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: e0 f2 70 04 sqinch x0, #24 // CHECK-INST: sqinch x0, #24 // CHECK-ENCODING: [0x00,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f3 70 04 sqinch x0, #25 // CHECK-INST: sqinch x0, #25 // CHECK-ENCODING: [0x20,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f3 70 04 sqinch x0, #26 // CHECK-INST: sqinch x0, #26 // CHECK-ENCODING: [0x40,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f3 70 04 sqinch x0, #27 // CHECK-INST: sqinch x0, #27 // CHECK-ENCODING: [0x60,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f3 70 04 sqinch x0, #28 // CHECK-INST: sqinch x0, #28 // CHECK-ENCODING: [0x80,0xf3,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f3 70 04 @@ -304,35 +304,35 @@ sqinch x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqinch z0.h // CHECK-INST: sqinch z0.h // CHECK-ENCODING: [0xe0,0xc3,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 60 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqinch z0.h, pow2, mul #16 // CHECK-INST: sqinch z0.h, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc0,0x6f,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 6f 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqinch z0.h, pow2 // CHECK-INST: sqinch z0.h, pow2 // CHECK-ENCODING: [0x00,0xc0,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 60 04 diff --git a/llvm/test/MC/AArch64/SVE/sqincp.s b/llvm/test/MC/AArch64/SVE/sqincp.s index 69ef677bf6ca9..f8f279e40de4c 100644 --- a/llvm/test/MC/AArch64/SVE/sqincp.s +++ b/llvm/test/MC/AArch64/SVE/sqincp.s @@ -12,85 +12,85 @@ sqincp x0, p0.b // CHECK-INST: sqincp x0, p0.b // CHECK-ENCODING: [0x00,0x8c,0x28,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 28 25 sqincp x0, p0.h // CHECK-INST: sqincp x0, p0.h // CHECK-ENCODING: [0x00,0x8c,0x68,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 68 25 sqincp x0, p0.s // CHECK-INST: sqincp x0, p0.s // CHECK-ENCODING: [0x00,0x8c,0xa8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c a8 25 sqincp x0, p0.d // CHECK-INST: sqincp x0, p0.d // CHECK-ENCODING: [0x00,0x8c,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c e8 25 sqincp xzr, p15.b, wzr // CHECK-INST: sqincp xzr, p15.b, wzr // CHECK-ENCODING: [0xff,0x89,0x28,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 28 25 sqincp xzr, p15.h, wzr // 
CHECK-INST: sqincp xzr, p15.h, wzr // CHECK-ENCODING: [0xff,0x89,0x68,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 68 25 sqincp xzr, p15.s, wzr // CHECK-INST: sqincp xzr, p15.s, wzr // CHECK-ENCODING: [0xff,0x89,0xa8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 a8 25 sqincp xzr, p15.d, wzr // CHECK-INST: sqincp xzr, p15.d, wzr // CHECK-ENCODING: [0xff,0x89,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 e8 25 sqincp z0.h, p0 // CHECK-INST: sqincp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x68,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 68 25 sqincp z0.h, p0.h // CHECK-INST: sqincp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x68,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 68 25 sqincp z0.s, p0 // CHECK-INST: sqincp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xa8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 a8 25 sqincp z0.s, p0.s // CHECK-INST: sqincp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xa8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 a8 25 sqincp z0.d, p0 // CHECK-INST: sqincp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e8 25 sqincp z0.d, p0.d // CHECK-INST: sqincp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e8 25 @@ -100,11 +100,11 @@ sqincp z0.d, p0.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqincp z0.d, p0.d // CHECK-INST: sqincp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xe8,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e8 25 diff --git a/llvm/test/MC/AArch64/SVE/sqincw.s b/llvm/test/MC/AArch64/SVE/sqincw.s index b71860c69fffd..c602ba1a28672 100644 --- a/llvm/test/MC/AArch64/SVE/sqincw.s +++ b/llvm/test/MC/AArch64/SVE/sqincw.s @@ -16,25 +16,25 @@ sqincw x0 // CHECK-INST: sqincw x0 // CHECK-ENCODING: [0xe0,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 b0 04 sqincw x0, all // CHECK-INST: sqincw x0 // CHECK-ENCODING: [0xe0,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 b0 04 sqincw x0, all, mul #1 // CHECK-INST: sqincw x0 // CHECK-ENCODING: [0xe0,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 b0 04 sqincw x0, all, mul #16 // CHECK-INST: sqincw x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf3,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 bf 04 @@ -45,37 +45,37 @@ sqincw x0, all, mul #16 sqincw x0, w0 // CHECK-INST: sqincw x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 a0 04 sqincw x0, 
w0, all // CHECK-INST: sqincw x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 a0 04 sqincw x0, w0, all, mul #1 // CHECK-INST: sqincw x0, w0 // CHECK-ENCODING: [0xe0,0xf3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 a0 04 sqincw x0, w0, all, mul #16 // CHECK-INST: sqincw x0, w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf3,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f3 af 04 sqincw x0, w0, pow2 // CHECK-INST: sqincw x0, w0, pow2 // CHECK-ENCODING: [0x00,0xf0,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 a0 04 sqincw x0, w0, pow2, mul #16 // CHECK-INST: sqincw x0, w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf0,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 af 04 @@ -85,37 +85,37 @@ sqincw x0, w0, pow2, mul #16 sqincw z0.s // CHECK-INST: sqincw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 a0 04 sqincw z0.s, all // CHECK-INST: sqincw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 a0 04 sqincw z0.s, all, mul #1 // CHECK-INST: sqincw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 a0 04 sqincw z0.s, all, mul #16 // CHECK-INST: sqincw z0.s, all, mul #16 // 
CHECK-ENCODING: [0xe0,0xc3,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 af 04 sqincw z0.s, pow2 // CHECK-INST: sqincw z0.s, pow2 // CHECK-ENCODING: [0x00,0xc0,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a0 04 sqincw z0.s, pow2, mul #16 // CHECK-INST: sqincw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc0,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 af 04 @@ -126,175 +126,175 @@ sqincw z0.s, pow2, mul #16 sqincw x0, pow2 // CHECK-INST: sqincw x0, pow2 // CHECK-ENCODING: [0x00,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f0 b0 04 sqincw x0, vl1 // CHECK-INST: sqincw x0, vl1 // CHECK-ENCODING: [0x20,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f0 b0 04 sqincw x0, vl2 // CHECK-INST: sqincw x0, vl2 // CHECK-ENCODING: [0x40,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f0 b0 04 sqincw x0, vl3 // CHECK-INST: sqincw x0, vl3 // CHECK-ENCODING: [0x60,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f0 b0 04 sqincw x0, vl4 // CHECK-INST: sqincw x0, vl4 // CHECK-ENCODING: [0x80,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f0 b0 04 sqincw x0, vl5 // CHECK-INST: sqincw x0, vl5 // CHECK-ENCODING: [0xa0,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f0 b0 04 sqincw x0, vl6 // CHECK-INST: sqincw x0, vl6 // CHECK-ENCODING: [0xc0,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f0 b0 04 sqincw x0, vl7 // CHECK-INST: sqincw x0, vl7 // CHECK-ENCODING: [0xe0,0xf0,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f0 b0 04 sqincw x0, vl8 // CHECK-INST: sqincw x0, vl8 // CHECK-ENCODING: [0x00,0xf1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f1 b0 04 sqincw x0, vl16 // CHECK-INST: sqincw x0, vl16 // CHECK-ENCODING: [0x20,0xf1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f1 b0 04 sqincw x0, vl32 // CHECK-INST: sqincw x0, vl32 // CHECK-ENCODING: [0x40,0xf1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f1 b0 04 sqincw x0, vl64 // CHECK-INST: sqincw x0, vl64 // CHECK-ENCODING: [0x60,0xf1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f1 b0 04 sqincw x0, vl128 // CHECK-INST: sqincw x0, vl128 // CHECK-ENCODING: [0x80,0xf1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f1 b0 04 sqincw x0, vl256 // CHECK-INST: sqincw x0, vl256 // CHECK-ENCODING: [0xa0,0xf1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f1 b0 04 sqincw x0, #14 // CHECK-INST: sqincw x0, #14 // CHECK-ENCODING: [0xc0,0xf1,0xb0,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f1 b0 04 sqincw x0, #15 // CHECK-INST: sqincw x0, #15 // CHECK-ENCODING: [0xe0,0xf1,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f1 b0 04 sqincw x0, #16 // CHECK-INST: sqincw x0, #16 // CHECK-ENCODING: [0x00,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f2 b0 04 sqincw x0, #17 // CHECK-INST: sqincw x0, #17 // CHECK-ENCODING: [0x20,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f2 b0 04 sqincw x0, #18 // CHECK-INST: sqincw x0, #18 // CHECK-ENCODING: [0x40,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f2 b0 04 sqincw x0, #19 // CHECK-INST: sqincw x0, #19 // CHECK-ENCODING: [0x60,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f2 b0 04 sqincw x0, #20 // CHECK-INST: sqincw x0, #20 // CHECK-ENCODING: [0x80,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f2 b0 04 sqincw x0, #21 // CHECK-INST: sqincw x0, #21 // CHECK-ENCODING: [0xa0,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f2 b0 04 sqincw x0, #22 // CHECK-INST: sqincw x0, #22 // CHECK-ENCODING: [0xc0,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f2 b0 04 sqincw x0, #23 // CHECK-INST: sqincw x0, #23 // 
CHECK-ENCODING: [0xe0,0xf2,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f2 b0 04 sqincw x0, #24 // CHECK-INST: sqincw x0, #24 // CHECK-ENCODING: [0x00,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f3 b0 04 sqincw x0, #25 // CHECK-INST: sqincw x0, #25 // CHECK-ENCODING: [0x20,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f3 b0 04 sqincw x0, #26 // CHECK-INST: sqincw x0, #26 // CHECK-ENCODING: [0x40,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f3 b0 04 sqincw x0, #27 // CHECK-INST: sqincw x0, #27 // CHECK-ENCODING: [0x60,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f3 b0 04 sqincw x0, #28 // CHECK-INST: sqincw x0, #28 // CHECK-ENCODING: [0x80,0xf3,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f3 b0 04 @@ -304,35 +304,35 @@ sqincw x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqincw z0.s // CHECK-INST: sqincw z0.s // CHECK-ENCODING: [0xe0,0xc3,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c3 a0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 bc 20 04 sqincw z0.s, pow2, mul #16 // CHECK-INST: sqincw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc0,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 af 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqincw z0.s, pow2 // CHECK-INST: sqincw z0.s, pow2 // CHECK-ENCODING: [0x00,0xc0,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a0 04 diff --git a/llvm/test/MC/AArch64/SVE/sqsub.s b/llvm/test/MC/AArch64/SVE/sqsub.s index fa183bc8ab50a..88ade7e1f160b 100644 --- a/llvm/test/MC/AArch64/SVE/sqsub.s +++ b/llvm/test/MC/AArch64/SVE/sqsub.s @@ -13,109 +13,109 @@ sqsub z0.b, z0.b, z0.b // CHECK-INST: sqsub z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x18,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 18 20 04 sqsub z0.h, z0.h, z0.h // CHECK-INST: sqsub z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x18,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 18 60 04 sqsub z0.s, z0.s, z0.s // CHECK-INST: sqsub z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x18,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 18 a0 04 sqsub z0.d, z0.d, z0.d // CHECK-INST: sqsub z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x18,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 18 e0 04 sqsub z0.b, z0.b, #0 // CHECK-INST: sqsub z0.b, z0.b, #0 // 
CHECK-ENCODING: [0x00,0xc0,0x26,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 26 25 sqsub z31.b, z31.b, #255 // CHECK-INST: sqsub z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x26,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 26 25 sqsub z0.h, z0.h, #0 // CHECK-INST: sqsub z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0xc0,0x66,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 66 25 sqsub z0.h, z0.h, #0, lsl #8 // CHECK-INST: sqsub z0.h, z0.h, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0x66,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 66 25 sqsub z31.h, z31.h, #255, lsl #8 // CHECK-INST: sqsub z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x66,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 66 25 sqsub z31.h, z31.h, #65280 // CHECK-INST: sqsub z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x66,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 66 25 sqsub z0.s, z0.s, #0 // CHECK-INST: sqsub z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xa6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a6 25 sqsub z0.s, z0.s, #0, lsl #8 // CHECK-INST: sqsub z0.s, z0.s, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xa6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a6 25 sqsub z31.s, z31.s, #255, lsl #8 // CHECK-INST: sqsub z31.s, z31.s, #65280 // 
CHECK-ENCODING: [0xff,0xff,0xa6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a6 25 sqsub z31.s, z31.s, #65280 // CHECK-INST: sqsub z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a6 25 sqsub z0.d, z0.d, #0 // CHECK-INST: sqsub z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0xc0,0xe6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e6 25 sqsub z0.d, z0.d, #0, lsl #8 // CHECK-INST: sqsub z0.d, z0.d, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xe6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e6 25 sqsub z31.d, z31.d, #255, lsl #8 // CHECK-INST: sqsub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e6 25 sqsub z31.d, z31.d, #65280 // CHECK-INST: sqsub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e6 25 @@ -125,11 +125,11 @@ sqsub z31.d, z31.d, #65280 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqsub z31.d, z31.d, #65280 // CHECK-INST: sqsub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe6,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e6 25 diff --git a/llvm/test/MC/AArch64/SVE/st1b.s 
b/llvm/test/MC/AArch64/SVE/st1b.s index c65979f9900f7..c221652794da2 100644 --- a/llvm/test/MC/AArch64/SVE/st1b.s +++ b/llvm/test/MC/AArch64/SVE/st1b.s @@ -12,119 +12,119 @@ st1b z0.b, p0, [x0] // CHECK-INST: st1b { z0.b }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x00,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 00 e4 st1b z0.h, p0, [x0] // CHECK-INST: st1b { z0.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x20,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 20 e4 st1b z0.s, p0, [x0] // CHECK-INST: st1b { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 e4 st1b z0.d, p0, [x0] // CHECK-INST: st1b { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x60,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 e4 st1b { z0.b }, p0, [x0] // CHECK-INST: st1b { z0.b }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x00,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 00 e4 st1b { z0.h }, p0, [x0] // CHECK-INST: st1b { z0.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x20,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 20 e4 st1b { z0.s }, p0, [x0] // CHECK-INST: st1b { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 e4 st1b { z0.d }, p0, [x0] // CHECK-INST: st1b { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x60,0xe4] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 e4 st1b { z31.b }, p7, [sp, #-1, mul vl] // CHECK-INST: st1b { z31.b }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0x0f,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 0f e4 st1b { z21.b }, p5, [x10, #5, mul vl] // CHECK-INST: st1b { z21.b }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x05,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 05 e4 st1b { z31.h }, p7, [sp, #-1, mul vl] // CHECK-INST: st1b { z31.h }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0x2f,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 2f e4 st1b { z21.h }, p5, [x10, #5, mul vl] // CHECK-INST: st1b { z21.h }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x25,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 25 e4 st1b { z31.s }, p7, [sp, #-1, mul vl] // CHECK-INST: st1b { z31.s }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0x4f,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 4f e4 st1b { z21.s }, p5, [x10, #5, mul vl] // CHECK-INST: st1b { z21.s }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x45,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 45 e4 st1b { z31.d }, p7, [sp, #-1, mul vl] // CHECK-INST: st1b { z31.d }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0x6f,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: ff ff 6f e4 st1b { z21.d }, p5, [x10, #5, mul vl] // CHECK-INST: st1b { z21.d }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x65,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 65 e4 st1b { z0.b }, p0, [x0, x0] // CHECK-INST: st1b { z0.b }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0x00,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 00 e4 st1b { z0.h }, p0, [x0, x0] // CHECK-INST: st1b { z0.h }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0x20,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 20 e4 st1b { z0.s }, p0, [x0, x0] // CHECK-INST: st1b { z0.s }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0x40,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 40 e4 st1b { z0.d }, p0, [x0, x0] // CHECK-INST: st1b { z0.d }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x40,0x60,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 60 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1d.s b/llvm/test/MC/AArch64/SVE/st1d.s index 884a7a56a64f5..2f2911c14afa5 100644 --- a/llvm/test/MC/AArch64/SVE/st1d.s +++ b/llvm/test/MC/AArch64/SVE/st1d.s @@ -12,29 +12,29 @@ st1d z0.d, p0, [x0] // CHECK-INST: st1d { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xe0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e0 e5 st1d { z0.d }, p0, [x0] // CHECK-INST: st1d { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xe0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 
e0 e0 e5 st1d { z31.d }, p7, [sp, #-1, mul vl] // CHECK-INST: st1d { z31.d }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0xef,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff ef e5 st1d { z21.d }, p5, [x10, #5, mul vl] // CHECK-INST: st1d { z21.d }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xe5,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 e5 e5 st1d { z0.d }, p0, [x0, x0, lsl #3] // CHECK-INST: st1d { z0.d }, p0, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0x40,0xe0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 e0 e5 diff --git a/llvm/test/MC/AArch64/SVE/st1h.s b/llvm/test/MC/AArch64/SVE/st1h.s index 035621e35b218..18ca96d5975e7 100644 --- a/llvm/test/MC/AArch64/SVE/st1h.s +++ b/llvm/test/MC/AArch64/SVE/st1h.s @@ -12,89 +12,89 @@ st1h z0.h, p0, [x0] // CHECK-INST: st1h { z0.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xa0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a0 e4 st1h z0.s, p0, [x0] // CHECK-INST: st1h { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xc0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 c0 e4 st1h z0.d, p0, [x0] // CHECK-INST: st1h { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xe0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e0 e4 st1h { z0.h }, p0, [x0] // CHECK-INST: st1h { z0.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xa0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a0 
e4 st1h { z0.s }, p0, [x0] // CHECK-INST: st1h { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xc0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 c0 e4 st1h { z0.d }, p0, [x0] // CHECK-INST: st1h { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xe0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e0 e4 st1h { z31.h }, p7, [sp, #-1, mul vl] // CHECK-INST: st1h { z31.h }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0xaf,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff af e4 st1h { z21.h }, p5, [x10, #5, mul vl] // CHECK-INST: st1h { z21.h }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xa5,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 a5 e4 st1h { z31.s }, p7, [sp, #-1, mul vl] // CHECK-INST: st1h { z31.s }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0xcf,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff cf e4 st1h { z21.s }, p5, [x10, #5, mul vl] // CHECK-INST: st1h { z21.s }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xc5,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 c5 e4 st1h { z21.d }, p5, [x10, #5, mul vl] // CHECK-INST: st1h { z21.d }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xe5,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 e5 e4 st1h { z31.d }, p7, [sp, #-1, mul vl] // CHECK-INST: st1h { z31.d }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0xef,0xe4] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff ef e4 st1h { z0.h }, p0, [x0, x0, lsl #1] // CHECK-INST: st1h { z0.h }, p0, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x40,0xa0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 a0 e4 st1h { z0.s }, p0, [x0, x0, lsl #1] // CHECK-INST: st1h { z0.s }, p0, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x40,0xc0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 c0 e4 st1h { z0.d }, p0, [x0, x0, lsl #1] // CHECK-INST: st1h { z0.d }, p0, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x40,0xe0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 e0 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1w.s b/llvm/test/MC/AArch64/SVE/st1w.s index 1ca2200979b48..503c7fe82db76 100644 --- a/llvm/test/MC/AArch64/SVE/st1w.s +++ b/llvm/test/MC/AArch64/SVE/st1w.s @@ -12,59 +12,59 @@ st1w z0.s, p0, [x0] // CHECK-INST: st1w { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 e5 st1w z0.d, p0, [x0] // CHECK-INST: st1w { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x60,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 e5 st1w { z0.s }, p0, [x0] // CHECK-INST: st1w { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x40,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 40 e5 st1w { z0.d }, p0, [x0] // CHECK-INST: st1w { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x60,0xe5] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 60 e5 st1w { z31.s }, p7, [sp, #-1, mul vl] // CHECK-INST: st1w { z31.s }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0x4f,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 4f e5 st1w { z21.s }, p5, [x10, #5, mul vl] // CHECK-INST: st1w { z21.s }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x45,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 45 e5 st1w { z31.d }, p7, [sp, #-1, mul vl] // CHECK-INST: st1w { z31.d }, p7, [sp, #-1, mul vl] // CHECK-ENCODING: [0xff,0xff,0x6f,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 6f e5 st1w { z21.d }, p5, [x10, #5, mul vl] // CHECK-INST: st1w { z21.d }, p5, [x10, #5, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x65,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 65 e5 st1w { z0.s }, p0, [x0, x0, lsl #2] // CHECK-INST: st1w { z0.s }, p0, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0x40,0x40,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 40 e5 st1w { z0.d }, p0, [x0, x0, lsl #2] // CHECK-INST: st1w { z0.d }, p0, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0x40,0x60,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 60 e5 diff --git a/llvm/test/MC/AArch64/SVE/st2b.s b/llvm/test/MC/AArch64/SVE/st2b.s index 2294c3daf23eb..c8e45eabcf6b0 100644 --- a/llvm/test/MC/AArch64/SVE/st2b.s +++ b/llvm/test/MC/AArch64/SVE/st2b.s @@ -12,29 +12,29 @@ st2b { z0.b, z1.b }, p0, [x0, x0] 
// CHECK-INST: st2b { z0.b, z1.b }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x60,0x20,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 20 e4 st2b { z5.b, z6.b }, p3, [x17, x16] // CHECK-INST: st2b { z5.b, z6.b }, p3, [x17, x16] // CHECK-ENCODING: [0x25,0x6e,0x30,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e 30 e4 st2b { z0.b, z1.b }, p0, [x0] // CHECK-INST: st2b { z0.b, z1.b }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x30,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 30 e4 st2b { z23.b, z24.b }, p3, [x13, #-16, mul vl] // CHECK-INST: st2b { z23.b, z24.b }, p3, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x38,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 38 e4 st2b { z21.b, z22.b }, p5, [x10, #10, mul vl] // CHECK-INST: st2b { z21.b, z22.b }, p5, [x10, #10, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x35,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 35 e4 diff --git a/llvm/test/MC/AArch64/SVE/st2d.s b/llvm/test/MC/AArch64/SVE/st2d.s index e8c973b4bc509..afe72786feacb 100644 --- a/llvm/test/MC/AArch64/SVE/st2d.s +++ b/llvm/test/MC/AArch64/SVE/st2d.s @@ -12,29 +12,29 @@ st2d { z0.d, z1.d }, p0, [x0, x0, lsl #3] // CHECK-INST: st2d { z0.d, z1.d }, p0, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0x60,0xa0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 a0 e5 st2d { z5.d, z6.d }, p3, [x17, x16, lsl #3] // CHECK-INST: st2d { z5.d, z6.d }, p3, [x17, x16, lsl #3] // CHECK-ENCODING: [0x25,0x6e,0xb0,0xe5] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e b0 e5 st2d { z0.d, z1.d }, p0, [x0] // CHECK-INST: st2d { z0.d, z1.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xb0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 b0 e5 st2d { z23.d, z24.d }, p3, [x13, #-16, mul vl] // CHECK-INST: st2d { z23.d, z24.d }, p3, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xb8,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed b8 e5 st2d { z21.d, z22.d }, p5, [x10, #10, mul vl] // CHECK-INST: st2d { z21.d, z22.d }, p5, [x10, #10, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xb5,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 b5 e5 diff --git a/llvm/test/MC/AArch64/SVE/st2h.s b/llvm/test/MC/AArch64/SVE/st2h.s index 8f4034be72438..c60b0acf70c50 100644 --- a/llvm/test/MC/AArch64/SVE/st2h.s +++ b/llvm/test/MC/AArch64/SVE/st2h.s @@ -12,29 +12,29 @@ st2h { z0.h, z1.h }, p0, [x0, x0, lsl #1] // CHECK-INST: st2h { z0.h, z1.h }, p0, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x60,0xa0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 a0 e4 st2h { z5.h, z6.h }, p3, [x17, x16, lsl #1] // CHECK-INST: st2h { z5.h, z6.h }, p3, [x17, x16, lsl #1] // CHECK-ENCODING: [0x25,0x6e,0xb0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e b0 e4 st2h { z0.h, z1.h }, p0, [x0] // CHECK-INST: st2h { z0.h, z1.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xb0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 b0 e4 st2h 
{ z23.h, z24.h }, p3, [x13, #-16, mul vl] // CHECK-INST: st2h { z23.h, z24.h }, p3, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xb8,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed b8 e4 st2h { z21.h, z22.h }, p5, [x10, #10, mul vl] // CHECK-INST: st2h { z21.h, z22.h }, p5, [x10, #10, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xb5,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 b5 e4 diff --git a/llvm/test/MC/AArch64/SVE/st2w.s b/llvm/test/MC/AArch64/SVE/st2w.s index 2e18d440b188e..1407cd2600d97 100644 --- a/llvm/test/MC/AArch64/SVE/st2w.s +++ b/llvm/test/MC/AArch64/SVE/st2w.s @@ -12,29 +12,29 @@ st2w { z0.s, z1.s }, p0, [x0, x0, lsl #2] // CHECK-INST: st2w { z0.s, z1.s }, p0, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0x60,0x20,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 20 e5 st2w { z5.s, z6.s }, p3, [x17, x16, lsl #2] // CHECK-INST: st2w { z5.s, z6.s }, p3, [x17, x16, lsl #2] // CHECK-ENCODING: [0x25,0x6e,0x30,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e 30 e5 st2w { z0.s, z1.s }, p0, [x0] // CHECK-INST: st2w { z0.s, z1.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x30,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 30 e5 st2w { z23.s, z24.s }, p3, [x13, #-16, mul vl] // CHECK-INST: st2w { z23.s, z24.s }, p3, [x13, #-16, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x38,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 38 e5 st2w { z21.s, z22.s }, p5, [x10, #10, mul vl] // CHECK-INST: st2w { z21.s, z22.s }, p5, [x10, #10, 
mul vl] // CHECK-ENCODING: [0x55,0xf5,0x35,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 35 e5 diff --git a/llvm/test/MC/AArch64/SVE/st3b.s b/llvm/test/MC/AArch64/SVE/st3b.s index 3aabf5c298e18..7331e8cb21dd4 100644 --- a/llvm/test/MC/AArch64/SVE/st3b.s +++ b/llvm/test/MC/AArch64/SVE/st3b.s @@ -12,29 +12,29 @@ st3b { z0.b, z1.b, z2.b }, p0, [x0, x0] // CHECK-INST: st3b { z0.b, z1.b, z2.b }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x60,0x40,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 40 e4 st3b { z5.b, z6.b, z7.b }, p3, [x17, x16] // CHECK-INST: st3b { z5.b, z6.b, z7.b }, p3, [x17, x16] // CHECK-ENCODING: [0x25,0x6e,0x50,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e 50 e4 st3b { z0.b, z1.b, z2.b }, p0, [x0] // CHECK-INST: st3b { z0.b, z1.b, z2.b }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x50,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 50 e4 st3b { z23.b, z24.b, z25.b }, p3, [x13, #-24, mul vl] // CHECK-INST: st3b { z23.b, z24.b, z25.b }, p3, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x58,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 58 e4 st3b { z21.b, z22.b, z23.b }, p5, [x10, #15, mul vl] // CHECK-INST: st3b { z21.b, z22.b, z23.b }, p5, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x55,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 55 e4 diff --git a/llvm/test/MC/AArch64/SVE/st3d.s b/llvm/test/MC/AArch64/SVE/st3d.s index 688024508e24f..0ea0fb8ca53c2 100644 --- a/llvm/test/MC/AArch64/SVE/st3d.s +++ 
b/llvm/test/MC/AArch64/SVE/st3d.s @@ -12,29 +12,29 @@ st3d { z0.d, z1.d, z2.d }, p0, [x0, x0, lsl #3] // CHECK-INST: st3d { z0.d, z1.d, z2.d }, p0, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0x60,0xc0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 c0 e5 st3d { z5.d, z6.d, z7.d }, p3, [x17, x16, lsl #3] // CHECK-INST: st3d { z5.d, z6.d, z7.d }, p3, [x17, x16, lsl #3] // CHECK-ENCODING: [0x25,0x6e,0xd0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e d0 e5 st3d { z0.d, z1.d, z2.d }, p0, [x0] // CHECK-INST: st3d { z0.d, z1.d, z2.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xd0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 d0 e5 st3d { z23.d, z24.d, z25.d }, p3, [x13, #-24, mul vl] // CHECK-INST: st3d { z23.d, z24.d, z25.d }, p3, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xd8,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed d8 e5 st3d { z21.d, z22.d, z23.d }, p5, [x10, #15, mul vl] // CHECK-INST: st3d { z21.d, z22.d, z23.d }, p5, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xd5,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 d5 e5 diff --git a/llvm/test/MC/AArch64/SVE/st3h.s b/llvm/test/MC/AArch64/SVE/st3h.s index a9724e83be614..a5edb651c6408 100644 --- a/llvm/test/MC/AArch64/SVE/st3h.s +++ b/llvm/test/MC/AArch64/SVE/st3h.s @@ -12,29 +12,29 @@ st3h { z0.h, z1.h, z2.h }, p0, [x0, x0, lsl #1] // CHECK-INST: st3h { z0.h, z1.h, z2.h }, p0, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x60,0xc0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 00 60 c0 e4 st3h { z5.h, z6.h, z7.h }, p3, [x17, x16, lsl #1] // CHECK-INST: st3h { z5.h, z6.h, z7.h }, p3, [x17, x16, lsl #1] // CHECK-ENCODING: [0x25,0x6e,0xd0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e d0 e4 st3h { z0.h, z1.h, z2.h }, p0, [x0] // CHECK-INST: st3h { z0.h, z1.h, z2.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xd0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 d0 e4 st3h { z23.h, z24.h, z25.h }, p3, [x13, #-24, mul vl] // CHECK-INST: st3h { z23.h, z24.h, z25.h }, p3, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xd8,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed d8 e4 st3h { z21.h, z22.h, z23.h }, p5, [x10, #15, mul vl] // CHECK-INST: st3h { z21.h, z22.h, z23.h }, p5, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xd5,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 d5 e4 diff --git a/llvm/test/MC/AArch64/SVE/st3w.s b/llvm/test/MC/AArch64/SVE/st3w.s index f10336f7f7211..d106184ba135d 100644 --- a/llvm/test/MC/AArch64/SVE/st3w.s +++ b/llvm/test/MC/AArch64/SVE/st3w.s @@ -12,29 +12,29 @@ st3w { z0.s, z1.s, z2.s }, p0, [x0, x0, lsl #2] // CHECK-INST: st3w { z0.s, z1.s, z2.s }, p0, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0x60,0x40,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 40 e5 st3w { z5.s, z6.s, z7.s }, p3, [x17, x16, lsl #2] // CHECK-INST: st3w { z5.s, z6.s, z7.s }, p3, [x17, x16, lsl #2] // CHECK-ENCODING: [0x25,0x6e,0x50,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e 50 
e5 st3w { z0.s, z1.s, z2.s }, p0, [x0] // CHECK-INST: st3w { z0.s, z1.s, z2.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x50,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 50 e5 st3w { z23.s, z24.s, z25.s }, p3, [x13, #-24, mul vl] // CHECK-INST: st3w { z23.s, z24.s, z25.s }, p3, [x13, #-24, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x58,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 58 e5 st3w { z21.s, z22.s, z23.s }, p5, [x10, #15, mul vl] // CHECK-INST: st3w { z21.s, z22.s, z23.s }, p5, [x10, #15, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x55,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 55 e5 diff --git a/llvm/test/MC/AArch64/SVE/st4b.s b/llvm/test/MC/AArch64/SVE/st4b.s index 10c0c86e9270e..ea270f42cce69 100644 --- a/llvm/test/MC/AArch64/SVE/st4b.s +++ b/llvm/test/MC/AArch64/SVE/st4b.s @@ -12,29 +12,29 @@ st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x0] // CHECK-INST: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x60,0x60,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 60 e4 st4b { z5.b, z6.b, z7.b, z8.b }, p3, [x17, x16] // CHECK-INST: st4b { z5.b, z6.b, z7.b, z8.b }, p3, [x17, x16] // CHECK-ENCODING: [0x25,0x6e,0x70,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e 70 e4 st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0] // CHECK-INST: st4b { z0.b, z1.b, z2.b, z3.b }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x70,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 70 e4 st4b { z23.b, z24.b, z25.b, z26.b }, p3, 
[x13, #-32, mul vl] // CHECK-INST: st4b { z23.b, z24.b, z25.b, z26.b }, p3, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x78,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 78 e4 st4b { z21.b, z22.b, z23.b, z24.b }, p5, [x10, #20, mul vl] // CHECK-INST: st4b { z21.b, z22.b, z23.b, z24.b }, p5, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x75,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 75 e4 diff --git a/llvm/test/MC/AArch64/SVE/st4d.s b/llvm/test/MC/AArch64/SVE/st4d.s index c0431c2807430..dab7d40bf0347 100644 --- a/llvm/test/MC/AArch64/SVE/st4d.s +++ b/llvm/test/MC/AArch64/SVE/st4d.s @@ -12,29 +12,29 @@ st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0, x0, lsl #3] // CHECK-INST: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0x60,0xe0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 e0 e5 st4d { z5.d, z6.d, z7.d, z8.d }, p3, [x17, x16, lsl #3] // CHECK-INST: st4d { z5.d, z6.d, z7.d, z8.d }, p3, [x17, x16, lsl #3] // CHECK-ENCODING: [0x25,0x6e,0xf0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e f0 e5 st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0] // CHECK-INST: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xf0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 f0 e5 st4d { z23.d, z24.d, z25.d, z26.d }, p3, [x13, #-32, mul vl] // CHECK-INST: st4d { z23.d, z24.d, z25.d, z26.d }, p3, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xf8,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: b7 ed f8 e5 st4d { z21.d, z22.d, z23.d, z24.d }, p5, [x10, #20, mul vl] // CHECK-INST: st4d { z21.d, z22.d, z23.d, z24.d }, p5, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xf5,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 f5 e5 diff --git a/llvm/test/MC/AArch64/SVE/st4h.s b/llvm/test/MC/AArch64/SVE/st4h.s index 64861380a93e2..cd1c17e6d8990 100644 --- a/llvm/test/MC/AArch64/SVE/st4h.s +++ b/llvm/test/MC/AArch64/SVE/st4h.s @@ -12,29 +12,29 @@ st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0, x0, lsl #1] // CHECK-INST: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x60,0xe0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 e0 e4 st4h { z5.h, z6.h, z7.h, z8.h }, p3, [x17, x16, lsl #1] // CHECK-INST: st4h { z5.h, z6.h, z7.h, z8.h }, p3, [x17, x16, lsl #1] // CHECK-ENCODING: [0x25,0x6e,0xf0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e f0 e4 st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0] // CHECK-INST: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0xf0,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 f0 e4 st4h { z23.h, z24.h, z25.h, z26.h }, p3, [x13, #-32, mul vl] // CHECK-INST: st4h { z23.h, z24.h, z25.h, z26.h }, p3, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0xf8,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed f8 e4 st4h { z21.h, z22.h, z23.h, z24.h }, p5, [x10, #20, mul vl] // CHECK-INST: st4h { z21.h, z22.h, z23.h, z24.h }, p5, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0xf5,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 f5 e4 diff --git a/llvm/test/MC/AArch64/SVE/st4w.s b/llvm/test/MC/AArch64/SVE/st4w.s index b4e1b08585cbe..4a05eb604ac3f 100644 --- a/llvm/test/MC/AArch64/SVE/st4w.s +++ b/llvm/test/MC/AArch64/SVE/st4w.s @@ -12,29 +12,29 @@ st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0, x0, lsl #2] // CHECK-INST: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0x60,0x60,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 60 e5 st4w { z5.s, z6.s, z7.s, z8.s }, p3, [x17, x16, lsl #2] // CHECK-INST: st4w { z5.s, z6.s, z7.s, z8.s }, p3, [x17, x16, lsl #2] // CHECK-ENCODING: [0x25,0x6e,0x70,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 25 6e 70 e5 st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0] // CHECK-INST: st4w { z0.s, z1.s, z2.s, z3.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x70,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 70 e5 st4w { z23.s, z24.s, z25.s, z26.s }, p3, [x13, #-32, mul vl] // CHECK-INST: st4w { z23.s, z24.s, z25.s, z26.s }, p3, [x13, #-32, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x78,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 78 e5 st4w { z21.s, z22.s, z23.s, z24.s }, p5, [x10, #20, mul vl] // CHECK-INST: st4w { z21.s, z22.s, z23.s, z24.s }, p5, [x10, #20, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x75,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 75 e5 diff --git a/llvm/test/MC/AArch64/SVE/stnt1b.s b/llvm/test/MC/AArch64/SVE/stnt1b.s index 044d5ccc538d3..bb75df59f78c7 100644 --- a/llvm/test/MC/AArch64/SVE/stnt1b.s +++ 
b/llvm/test/MC/AArch64/SVE/stnt1b.s @@ -12,29 +12,29 @@ stnt1b z0.b, p0, [x0] // CHECK-INST: stnt1b { z0.b }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x10,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 10 e4 stnt1b { z0.b }, p0, [x0] // CHECK-INST: stnt1b { z0.b }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x10,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 10 e4 stnt1b { z23.b }, p3, [x13, #-8, mul vl] // CHECK-INST: stnt1b { z23.b }, p3, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x18,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 18 e4 stnt1b { z21.b }, p5, [x10, #7, mul vl] // CHECK-INST: stnt1b { z21.b }, p5, [x10, #7, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x17,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 17 e4 stnt1b { z0.b }, p0, [x0, x0] // CHECK-INST: stnt1b { z0.b }, p0, [x0, x0] // CHECK-ENCODING: [0x00,0x60,0x00,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 00 e4 diff --git a/llvm/test/MC/AArch64/SVE/stnt1d.s b/llvm/test/MC/AArch64/SVE/stnt1d.s index 1f85df8c142f0..467d2d753d2ce 100644 --- a/llvm/test/MC/AArch64/SVE/stnt1d.s +++ b/llvm/test/MC/AArch64/SVE/stnt1d.s @@ -12,29 +12,29 @@ stnt1d z0.d, p0, [x0] // CHECK-INST: stnt1d { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x90,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 90 e5 stnt1d { z0.d }, p0, [x0] // CHECK-INST: stnt1d { z0.d }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x90,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 90 e5 stnt1d { z23.d }, p3, [x13, #-8, mul vl] // CHECK-INST: stnt1d { z23.d }, p3, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x98,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 98 e5 stnt1d { z21.d }, p5, [x10, #7, mul vl] // CHECK-INST: stnt1d { z21.d }, p5, [x10, #7, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x97,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 97 e5 stnt1d { z0.d }, p0, [x0, x0, lsl #3] // CHECK-INST: stnt1d { z0.d }, p0, [x0, x0, lsl #3] // CHECK-ENCODING: [0x00,0x60,0x80,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 80 e5 diff --git a/llvm/test/MC/AArch64/SVE/stnt1h.s b/llvm/test/MC/AArch64/SVE/stnt1h.s index 8862b9c5fd75e..c98076e982dc4 100644 --- a/llvm/test/MC/AArch64/SVE/stnt1h.s +++ b/llvm/test/MC/AArch64/SVE/stnt1h.s @@ -12,29 +12,29 @@ stnt1h z0.h, p0, [x0] // CHECK-INST: stnt1h { z0.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x90,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 90 e4 stnt1h { z0.h }, p0, [x0] // CHECK-INST: stnt1h { z0.h }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x90,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 90 e4 stnt1h { z23.h }, p3, [x13, #-8, mul vl] // CHECK-INST: stnt1h { z23.h }, p3, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x98,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 98 e4 stnt1h { z21.h }, p5, [x10, #7, mul vl] // CHECK-INST: stnt1h { z21.h }, p5, [x10, #7, mul vl] // 
CHECK-ENCODING: [0x55,0xf5,0x97,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 97 e4 stnt1h { z0.h }, p0, [x0, x0, lsl #1] // CHECK-INST: stnt1h { z0.h }, p0, [x0, x0, lsl #1] // CHECK-ENCODING: [0x00,0x60,0x80,0xe4] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 80 e4 diff --git a/llvm/test/MC/AArch64/SVE/stnt1w.s b/llvm/test/MC/AArch64/SVE/stnt1w.s index 2bf56c15bfa46..bdde90686822e 100644 --- a/llvm/test/MC/AArch64/SVE/stnt1w.s +++ b/llvm/test/MC/AArch64/SVE/stnt1w.s @@ -12,29 +12,29 @@ stnt1w z0.s, p0, [x0] // CHECK-INST: stnt1w { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x10,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 10 e5 stnt1w { z0.s }, p0, [x0] // CHECK-INST: stnt1w { z0.s }, p0, [x0] // CHECK-ENCODING: [0x00,0xe0,0x10,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 10 e5 stnt1w { z23.s }, p3, [x13, #-8, mul vl] // CHECK-INST: stnt1w { z23.s }, p3, [x13, #-8, mul vl] // CHECK-ENCODING: [0xb7,0xed,0x18,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 ed 18 e5 stnt1w { z21.s }, p5, [x10, #7, mul vl] // CHECK-INST: stnt1w { z21.s }, p5, [x10, #7, mul vl] // CHECK-ENCODING: [0x55,0xf5,0x17,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 f5 17 e5 stnt1w { z0.s }, p0, [x0, x0, lsl #2] // CHECK-INST: stnt1w { z0.s }, p0, [x0, x0, lsl #2] // CHECK-ENCODING: [0x00,0x60,0x00,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 00 e5 diff 
--git a/llvm/test/MC/AArch64/SVE/str.s b/llvm/test/MC/AArch64/SVE/str.s index 476ace3ef6837..0294a52b81aed 100644 --- a/llvm/test/MC/AArch64/SVE/str.s +++ b/llvm/test/MC/AArch64/SVE/str.s @@ -12,35 +12,35 @@ str z0, [x0] // CHECK-INST: str z0, [x0] // CHECK-ENCODING: [0x00,0x40,0x80,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 80 e5 str z21, [x10, #-256, mul vl] // CHECK-INST: str z21, [x10, #-256, mul vl] // CHECK-ENCODING: [0x55,0x41,0xa0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 41 a0 e5 str z31, [sp, #255, mul vl] // CHECK-INST: str z31, [sp, #255, mul vl] // CHECK-ENCODING: [0xff,0x5f,0x9f,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 5f 9f e5 str p0, [x0] // CHECK-INST: str p0, [x0] // CHECK-ENCODING: [0x00,0x00,0x80,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 80 e5 str p15, [sp, #-256, mul vl] // CHECK-INST: str p15, [sp, #-256, mul vl] // CHECK-ENCODING: [0xef,0x03,0xa0,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 03 a0 e5 str p5, [x10, #255, mul vl] // CHECK-INST: str p5, [x10, #255, mul vl] // CHECK-ENCODING: [0x45,0x1d,0x9f,0xe5] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 45 1d 9f e5 diff --git a/llvm/test/MC/AArch64/SVE/sub.s b/llvm/test/MC/AArch64/SVE/sub.s index 67071e6910521..7a63915635787 100644 --- a/llvm/test/MC/AArch64/SVE/sub.s +++ b/llvm/test/MC/AArch64/SVE/sub.s @@ -12,193 +12,193 @@ sub z0.h, z0.h, z0.h // CHECK-INST: sub z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x04,0x60,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 04 60 04 sub z21.b, z10.b, z21.b // CHECK-INST: sub z21.b, z10.b, z21.b // CHECK-ENCODING: [0x55,0x05,0x35,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 05 35 04 sub z31.d, p7/m, z31.d, z31.d // CHECK-INST: sub z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f c1 04 sub z23.h, p3/m, z23.h, z13.h // CHECK-INST: sub z23.h, p3/m, z23.h, z13.h // CHECK-ENCODING: [0xb7,0x0d,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 41 04 sub z31.h, z31.h, z31.h // CHECK-INST: sub z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x07,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 07 7f 04 sub z21.h, z10.h, z21.h // CHECK-INST: sub z21.h, z10.h, z21.h // CHECK-ENCODING: [0x55,0x05,0x75,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 05 75 04 sub z31.b, z31.b, z31.b // CHECK-INST: sub z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x07,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 07 3f 04 sub z0.s, z0.s, z0.s // CHECK-INST: sub z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x04,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 04 a0 04 sub z23.s, p3/m, z23.s, z13.s // CHECK-INST: sub z23.s, p3/m, z23.s, z13.s // CHECK-ENCODING: [0xb7,0x0d,0x81,0x04] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 81 04 sub z23.b, z13.b, z8.b // CHECK-INST: sub z23.b, z13.b, z8.b // CHECK-ENCODING: [0xb7,0x05,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 05 28 04 sub z21.d, z10.d, z21.d // CHECK-INST: sub z21.d, z10.d, z21.d // CHECK-ENCODING: [0x55,0x05,0xf5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 05 f5 04 sub z21.s, z10.s, z21.s // CHECK-INST: sub z21.s, z10.s, z21.s // CHECK-ENCODING: [0x55,0x05,0xb5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 05 b5 04 sub z21.s, p5/m, z21.s, z10.s // CHECK-INST: sub z21.s, p5/m, z21.s, z10.s // CHECK-ENCODING: [0x55,0x15,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 81 04 sub z31.s, p7/m, z31.s, z31.s // CHECK-INST: sub z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 81 04 sub z0.d, p0/m, z0.d, z0.d // CHECK-INST: sub z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x00,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 c1 04 sub z0.b, z0.b, z0.b // CHECK-INST: sub z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x04,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 04 20 04 sub z23.d, z13.d, z8.d // CHECK-INST: sub z23.d, z13.d, z8.d // CHECK-ENCODING: [0xb7,0x05,0xe8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 05 e8 04 sub z23.d, p3/m, z23.d, z13.d // CHECK-INST: sub z23.d, p3/m, z23.d, z13.d // CHECK-ENCODING: [0xb7,0x0d,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d c1 04 sub z23.s, z13.s, z8.s // CHECK-INST: sub z23.s, z13.s, z8.s // CHECK-ENCODING: [0xb7,0x05,0xa8,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 05 a8 04 sub z31.b, p7/m, z31.b, z31.b // CHECK-INST: sub z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 01 04 sub z0.h, p0/m, z0.h, z0.h // CHECK-INST: sub z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x00,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 41 04 sub z31.d, z31.d, z31.d // CHECK-INST: sub z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x07,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 07 ff 04 sub z31.h, p7/m, z31.h, z31.h // CHECK-INST: sub z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 41 04 sub z23.h, z13.h, z8.h // CHECK-INST: sub z23.h, z13.h, z8.h // CHECK-ENCODING: [0xb7,0x05,0x68,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 05 68 04 sub z21.b, p5/m, z21.b, z10.b // CHECK-INST: sub z21.b, p5/m, z21.b, z10.b // CHECK-ENCODING: [0x55,0x15,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 01 04 sub z21.d, p5/m, z21.d, z10.d // CHECK-INST: sub z21.d, p5/m, z21.d, z10.d // CHECK-ENCODING: [0x55,0x15,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 c1 04 sub z0.d, z0.d, z0.d // CHECK-INST: sub z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x04,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 04 e0 04 sub z31.s, z31.s, z31.s // CHECK-INST: sub z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x07,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 07 bf 04 sub z0.b, p0/m, z0.b, z0.b // CHECK-INST: sub z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x00,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 01 04 sub z0.s, p0/m, z0.s, z0.s // CHECK-INST: sub z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x00,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 81 04 sub z21.h, p5/m, z21.h, z10.h // CHECK-INST: sub z21.h, p5/m, z21.h, z10.h // CHECK-ENCODING: [0x55,0x15,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 55 15 41 04 sub z23.b, p3/m, z23.b, z13.b // CHECK-INST: sub z23.b, p3/m, z23.b, z13.b // CHECK-ENCODING: [0xb7,0x0d,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 01 04 // ----------------------- @@ -207,85 +207,85 @@ sub z23.b, p3/m, z23.b, z13.b sub z0.b, z0.b, #0 // CHECK-INST: sub z0.b, z0.b, #0 // CHECK-ENCODING: 
[0x00,0xc0,0x21,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 21 25 sub z31.b, z31.b, #255 // CHECK-INST: sub z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x21,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 21 25 sub z0.h, z0.h, #0 // CHECK-INST: sub z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0xc0,0x61,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 61 25 sub z0.h, z0.h, #0, lsl #8 // CHECK-INST: sub z0.h, z0.h, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0x61,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 61 25 sub z31.h, z31.h, #255, lsl #8 // CHECK-INST: sub z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x61,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 61 25 sub z31.h, z31.h, #65280 // CHECK-INST: sub z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x61,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 61 25 sub z0.s, z0.s, #0 // CHECK-INST: sub z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xa1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a1 25 sub z0.s, z0.s, #0, lsl #8 // CHECK-INST: sub z0.s, z0.s, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xa1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a1 25 sub z31.s, z31.s, #255, lsl #8 // CHECK-INST: sub z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa1,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a1 25 sub z31.s, z31.s, #65280 // CHECK-INST: sub z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a1 25 sub z0.d, z0.d, #0 // CHECK-INST: sub z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0xc0,0xe1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e1 25 sub z0.d, z0.d, #0, lsl #8 // CHECK-INST: sub z0.d, z0.d, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xe1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e1 25 sub z31.d, z31.d, #255, lsl #8 // CHECK-INST: sub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e1 25 sub z31.d, z31.d, #65280 // CHECK-INST: sub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e1 25 @@ -296,35 +296,35 @@ sub z31.d, z31.d, #65280 movprfx z23.b, p3/z, z30.b // CHECK-INST: movprfx z23.b, p3/z, z30.b // CHECK-ENCODING: [0xd7,0x2f,0x10,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: d7 2f 10 04 sub z23.b, p3/m, z23.b, z13.b // CHECK-INST: sub z23.b, p3/m, z23.b, z13.b // CHECK-ENCODING: [0xb7,0x0d,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 01 04 movprfx z23, z30 // CHECK-INST: movprfx z23, z30 // CHECK-ENCODING: [0xd7,0xbf,0x20,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: d7 bf 20 04 sub z23.b, p3/m, z23.b, z13.b // CHECK-INST: sub z23.b, p3/m, z23.b, z13.b // CHECK-ENCODING: [0xb7,0x0d,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: b7 0d 01 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sub z31.d, z31.d, #65280 // CHECK-INST: sub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe1,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e1 25 diff --git a/llvm/test/MC/AArch64/SVE/subr.s b/llvm/test/MC/AArch64/SVE/subr.s index 65201aa414807..b433bd2559dc5 100644 --- a/llvm/test/MC/AArch64/SVE/subr.s +++ b/llvm/test/MC/AArch64/SVE/subr.s @@ -13,109 +13,109 @@ subr z0.b, p0/m, z0.b, z0.b // CHECK-INST: subr z0.b, p0/m, z0.b, z0.b // CHECK-ENCODING: [0x00,0x00,0x03,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 03 04 subr z0.h, p0/m, z0.h, z0.h // CHECK-INST: subr z0.h, p0/m, z0.h, z0.h // CHECK-ENCODING: [0x00,0x00,0x43,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 43 04 subr z0.s, p0/m, z0.s, z0.s // CHECK-INST: subr z0.s, p0/m, z0.s, z0.s // CHECK-ENCODING: [0x00,0x00,0x83,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 83 04 subr z0.d, p0/m, z0.d, z0.d // CHECK-INST: subr z0.d, p0/m, z0.d, z0.d // CHECK-ENCODING: [0x00,0x00,0xc3,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 00 c3 04 subr z0.b, z0.b, #0 // CHECK-INST: subr z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x23,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 23 25 subr z31.b, z31.b, #255 // CHECK-INST: subr z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x23,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 23 25 subr z0.h, z0.h, #0 // CHECK-INST: subr z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0xc0,0x63,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 63 25 subr z0.h, z0.h, #0, lsl #8 // CHECK-INST: subr z0.h, z0.h, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0x63,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 63 25 subr z31.h, z31.h, #255, lsl #8 // CHECK-INST: subr z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x63,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 63 25 subr z31.h, z31.h, #65280 // CHECK-INST: subr z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x63,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 63 25 subr z0.s, z0.s, #0 // CHECK-INST: subr z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xa3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a3 25 subr z0.s, z0.s, #0, lsl #8 // CHECK-INST: subr z0.s, z0.s, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xa3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a3 25 subr z31.s, z31.s, #255, lsl #8 // CHECK-INST: subr z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a3 25 subr z31.s, z31.s, #65280 // CHECK-INST: subr z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a3 25 subr z0.d, z0.d, #0 // CHECK-INST: subr z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0xc0,0xe3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e3 25 subr z0.d, z0.d, #0, lsl #8 // CHECK-INST: subr z0.d, z0.d, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xe3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e3 25 subr z31.d, z31.d, #255, lsl #8 // CHECK-INST: subr z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e3 25 subr z31.d, z31.d, #65280 // CHECK-INST: subr z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e3 25 @@ -125,35 +125,35 @@ subr z31.d, z31.d, #65280 movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 subr z5.d, p0/m, z5.d, z0.d // CHECK-INST: subr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x00,0xc3,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 00 c3 04 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 subr z5.d, p0/m, z5.d, z0.d // CHECK-INST: subr z5.d, p0/m, z5.d, z0.d // CHECK-ENCODING: [0x05,0x00,0xc3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 00 c3 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 subr z31.d, z31.d, #65280 // CHECK-INST: subr z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe3,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e3 25 diff --git a/llvm/test/MC/AArch64/SVE/sunpkhi.s b/llvm/test/MC/AArch64/SVE/sunpkhi.s index fa4f67c5c943a..7d122c07c4eef 100644 --- a/llvm/test/MC/AArch64/SVE/sunpkhi.s +++ b/llvm/test/MC/AArch64/SVE/sunpkhi.s @@ -12,17 +12,17 @@ sunpkhi z31.h, z31.b // CHECK-INST: sunpkhi z31.h, z31.b // CHECK-ENCODING: [0xff,0x3b,0x71,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 71 05 sunpkhi z31.s, z31.h // CHECK-INST: sunpkhi z31.s, z31.h // CHECK-ENCODING: [0xff,0x3b,0xb1,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b b1 05 sunpkhi z31.d, z31.s // CHECK-INST: sunpkhi z31.d, z31.s // CHECK-ENCODING: [0xff,0x3b,0xf1,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b f1 05 
diff --git a/llvm/test/MC/AArch64/SVE/sunpklo.s b/llvm/test/MC/AArch64/SVE/sunpklo.s index 2106753d20c60..3dbfbbf87bd58 100644 --- a/llvm/test/MC/AArch64/SVE/sunpklo.s +++ b/llvm/test/MC/AArch64/SVE/sunpklo.s @@ -12,17 +12,17 @@ sunpklo z31.h, z31.b // CHECK-INST: sunpklo z31.h, z31.b // CHECK-ENCODING: [0xff,0x3b,0x70,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 70 05 sunpklo z31.s, z31.h // CHECK-INST: sunpklo z31.s, z31.h // CHECK-ENCODING: [0xff,0x3b,0xb0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b b0 05 sunpklo z31.d, z31.s // CHECK-INST: sunpklo z31.d, z31.s // CHECK-ENCODING: [0xff,0x3b,0xf0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b f0 05 diff --git a/llvm/test/MC/AArch64/SVE/sxtb.s b/llvm/test/MC/AArch64/SVE/sxtb.s index e4b391a57df24..17694b4b3eae7 100644 --- a/llvm/test/MC/AArch64/SVE/sxtb.s +++ b/llvm/test/MC/AArch64/SVE/sxtb.s @@ -12,37 +12,37 @@ sxtb z0.h, p0/m, z0.h // CHECK-INST: sxtb z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x50,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 50 04 sxtb z0.s, p0/m, z0.s // CHECK-INST: sxtb z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 90 04 sxtb z0.d, p0/m, z0.d // CHECK-INST: sxtb z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d0 04 sxtb z31.h, p7/m, z31.h // CHECK-INST: sxtb z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x50,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 50 04 sxtb z31.s, p7/m, z31.s // CHECK-INST: sxtb z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 90 04 sxtb z31.d, p7/m, z31.d // CHECK-INST: sxtb z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d0 04 @@ -52,23 +52,23 @@ sxtb z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 sxtb z4.d, p7/m, z31.d // CHECK-INST: sxtb z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d0 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sxtb z4.d, p7/m, z31.d // CHECK-INST: sxtb z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d0 04 diff --git a/llvm/test/MC/AArch64/SVE/sxth.s b/llvm/test/MC/AArch64/SVE/sxth.s index 83ce9981097e9..a90f8338835e8 100644 --- a/llvm/test/MC/AArch64/SVE/sxth.s +++ b/llvm/test/MC/AArch64/SVE/sxth.s @@ -12,25 +12,25 @@ sxth z0.s, p0/m, z0.s // CHECK-INST: sxth z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x92,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 92 04 sxth z0.d, p0/m, z0.d // CHECK-INST: sxth z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d2 04 sxth z31.s, p7/m, z31.s // CHECK-INST: sxth z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x92,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 92 04 sxth z31.d, p7/m, z31.d // CHECK-INST: sxth z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d2 04 @@ -40,23 +40,23 @@ sxth z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 sxth z4.d, p7/m, z31.d // CHECK-INST: sxth z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d2 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sxth z4.d, p7/m, z31.d // CHECK-INST: sxth z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd2,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d2 04 diff --git a/llvm/test/MC/AArch64/SVE/sxtw.s b/llvm/test/MC/AArch64/SVE/sxtw.s index bd4c291f67b30..5c40fe3caa725 100644 --- a/llvm/test/MC/AArch64/SVE/sxtw.s +++ 
b/llvm/test/MC/AArch64/SVE/sxtw.s @@ -12,13 +12,13 @@ sxtw z0.d, p0/m, z0.d // CHECK-INST: sxtw z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d4 04 sxtw z31.d, p7/m, z31.d // CHECK-INST: sxtw z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d4 04 @@ -28,23 +28,23 @@ sxtw z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 sxtw z4.d, p7/m, z31.d // CHECK-INST: sxtw z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d4 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sxtw z4.d, p7/m, z31.d // CHECK-INST: sxtw z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd4,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d4 04 diff --git a/llvm/test/MC/AArch64/SVE/tbl.s b/llvm/test/MC/AArch64/SVE/tbl.s index bbd692eab45db..bbfbdceb11be4 100644 --- a/llvm/test/MC/AArch64/SVE/tbl.s +++ b/llvm/test/MC/AArch64/SVE/tbl.s @@ -12,47 +12,47 @@ tbl z31.b, z31.b, z31.b // CHECK-INST: tbl z31.b, { z31.b }, z31.b // CHECK-ENCODING: [0xff,0x33,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 3f 05 
tbl z31.h, z31.h, z31.h // CHECK-INST: tbl z31.h, { z31.h }, z31.h // CHECK-ENCODING: [0xff,0x33,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 7f 05 tbl z31.s, z31.s, z31.s // CHECK-INST: tbl z31.s, { z31.s }, z31.s // CHECK-ENCODING: [0xff,0x33,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 bf 05 tbl z31.d, z31.d, z31.d // CHECK-INST: tbl z31.d, { z31.d }, z31.d // CHECK-ENCODING: [0xff,0x33,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 ff 05 tbl z31.b, { z31.b }, z31.b // CHECK-INST: tbl z31.b, { z31.b }, z31.b // CHECK-ENCODING: [0xff,0x33,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 3f 05 tbl z31.h, { z31.h }, z31.h // CHECK-INST: tbl z31.h, { z31.h }, z31.h // CHECK-ENCODING: [0xff,0x33,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 7f 05 tbl z31.s, { z31.s }, z31.s // CHECK-INST: tbl z31.s, { z31.s }, z31.s // CHECK-ENCODING: [0xff,0x33,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 bf 05 tbl z31.d, { z31.d }, z31.d // CHECK-INST: tbl z31.d, { z31.d }, z31.d // CHECK-ENCODING: [0xff,0x33,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 33 ff 05 diff --git a/llvm/test/MC/AArch64/SVE/trn1.s b/llvm/test/MC/AArch64/SVE/trn1.s index 623cb3e5dd08d..e573ad85901e6 100644 --- a/llvm/test/MC/AArch64/SVE/trn1.s +++ b/llvm/test/MC/AArch64/SVE/trn1.s @@ -12,47 +12,47 @@ trn1 z31.b, z31.b, z31.b // 
CHECK-INST: trn1 z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x73,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 73 3f 05 trn1 z31.h, z31.h, z31.h // CHECK-INST: trn1 z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x73,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 73 7f 05 trn1 z31.s, z31.s, z31.s // CHECK-INST: trn1 z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x73,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 73 bf 05 trn1 z31.d, z31.d, z31.d // CHECK-INST: trn1 z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x73,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 73 ff 05 trn1 p15.b, p15.b, p15.b // CHECK-INST: trn1 p15.b, p15.b, p15.b // CHECK-ENCODING: [0xef,0x51,0x2f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 51 2f 05 trn1 p15.s, p15.s, p15.s // CHECK-INST: trn1 p15.s, p15.s, p15.s // CHECK-ENCODING: [0xef,0x51,0xaf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 51 af 05 trn1 p15.h, p15.h, p15.h // CHECK-INST: trn1 p15.h, p15.h, p15.h // CHECK-ENCODING: [0xef,0x51,0x6f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 51 6f 05 trn1 p15.d, p15.d, p15.d // CHECK-INST: trn1 p15.d, p15.d, p15.d // CHECK-ENCODING: [0xef,0x51,0xef,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 51 ef 05 diff --git a/llvm/test/MC/AArch64/SVE/trn2.s 
b/llvm/test/MC/AArch64/SVE/trn2.s index 4b73b9053e878..9ae60ffb99f96 100644 --- a/llvm/test/MC/AArch64/SVE/trn2.s +++ b/llvm/test/MC/AArch64/SVE/trn2.s @@ -12,47 +12,47 @@ trn2 z31.b, z31.b, z31.b // CHECK-INST: trn2 z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x77,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 77 3f 05 trn2 z31.h, z31.h, z31.h // CHECK-INST: trn2 z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x77,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 77 7f 05 trn2 z31.s, z31.s, z31.s // CHECK-INST: trn2 z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x77,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 77 bf 05 trn2 z31.d, z31.d, z31.d // CHECK-INST: trn2 z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x77,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 77 ff 05 trn2 p15.b, p15.b, p15.b // CHECK-INST: trn2 p15.b, p15.b, p15.b // CHECK-ENCODING: [0xef,0x55,0x2f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 55 2f 05 trn2 p15.s, p15.s, p15.s // CHECK-INST: trn2 p15.s, p15.s, p15.s // CHECK-ENCODING: [0xef,0x55,0xaf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 55 af 05 trn2 p15.h, p15.h, p15.h // CHECK-INST: trn2 p15.h, p15.h, p15.h // CHECK-ENCODING: [0xef,0x55,0x6f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 55 6f 05 trn2 p15.d, p15.d, p15.d // CHECK-INST: trn2 p15.d, p15.d, p15.d // CHECK-ENCODING: [0xef,0x55,0xef,0x05] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 55 ef 05 diff --git a/llvm/test/MC/AArch64/SVE/uabd.s b/llvm/test/MC/AArch64/SVE/uabd.s index b8ac9d8fc4e54..39745f9f09b3c 100644 --- a/llvm/test/MC/AArch64/SVE/uabd.s +++ b/llvm/test/MC/AArch64/SVE/uabd.s @@ -12,25 +12,25 @@ uabd z31.b, p7/m, z31.b, z31.b // CHECK-INST: uabd z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x0d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 0d 04 uabd z31.h, p7/m, z31.h, z31.h // CHECK-INST: uabd z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x4d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 4d 04 uabd z31.s, p7/m, z31.s, z31.s // CHECK-INST: uabd z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x8d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 8d 04 uabd z31.d, p7/m, z31.d, z31.d // CHECK-INST: uabd z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xcd,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f cd 04 @@ -40,23 +40,23 @@ uabd z31.d, p7/m, z31.d, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 uabd z4.d, p7/m, z4.d, z31.d // CHECK-INST: uabd z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xcd,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f cd 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // 
CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 uabd z4.d, p7/m, z4.d, z31.d // CHECK-INST: uabd z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xcd,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f cd 04 diff --git a/llvm/test/MC/AArch64/SVE/uaddv.s b/llvm/test/MC/AArch64/SVE/uaddv.s index 2d9e26f601b33..449c6dc4f85c0 100644 --- a/llvm/test/MC/AArch64/SVE/uaddv.s +++ b/llvm/test/MC/AArch64/SVE/uaddv.s @@ -12,23 +12,23 @@ uaddv d0, p7, z31.b // CHECK-INST: uaddv d0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x01,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 01 04 uaddv d0, p7, z31.h // CHECK-INST: uaddv d0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x41,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 41 04 uaddv d0, p7, z31.s // CHECK-INST: uaddv d0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x81,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 81 04 uaddv d0, p7, z31.d // CHECK-INST: uaddv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c1 04 diff --git a/llvm/test/MC/AArch64/SVE/ucvtf.s b/llvm/test/MC/AArch64/SVE/ucvtf.s index 231e9d900a0bb..7b380eda0997b 100644 --- a/llvm/test/MC/AArch64/SVE/ucvtf.s +++ b/llvm/test/MC/AArch64/SVE/ucvtf.s @@ -12,43 +12,43 @@ ucvtf z0.h, p0/m, z0.h // CHECK-INST: ucvtf z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x53,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 53 65 ucvtf z0.h, p0/m, z0.s // CHECK-INST: ucvtf z0.h, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x55,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 55 65 ucvtf z0.h, p0/m, z0.d // CHECK-INST: ucvtf z0.h, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0x57,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 57 65 ucvtf z0.s, p0/m, z0.s // CHECK-INST: ucvtf z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x95,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 95 65 ucvtf z0.s, p0/m, z0.d // CHECK-INST: ucvtf z0.s, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd5,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d5 65 ucvtf z0.d, p0/m, z0.s // CHECK-INST: ucvtf z0.d, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0xd1,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d1 65 ucvtf z0.d, p0/m, z0.d // CHECK-INST: ucvtf z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d7 65 @@ -58,23 +58,23 @@ ucvtf z0.d, p0/m, z0.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 ucvtf z5.d, p0/m, z0.d // CHECK-INST: ucvtf z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xd7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 05 a0 d7 65 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 ucvtf z5.d, p0/m, z0.d // CHECK-INST: ucvtf z5.d, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0xd7,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 05 a0 d7 65 diff --git a/llvm/test/MC/AArch64/SVE/udiv.s b/llvm/test/MC/AArch64/SVE/udiv.s index 3a5aff16ab510..836fe20add50b 100644 --- a/llvm/test/MC/AArch64/SVE/udiv.s +++ b/llvm/test/MC/AArch64/SVE/udiv.s @@ -12,13 +12,13 @@ udiv z0.s, p7/m, z0.s, z31.s // CHECK-INST: udiv z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x1f,0x95,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 95 04 udiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: udiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d5 04 @@ -28,23 +28,23 @@ udiv z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 udiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: udiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d5 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 bc 20 04 udiv z0.d, p7/m, z0.d, z31.d // CHECK-INST: udiv z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d5 04 diff --git a/llvm/test/MC/AArch64/SVE/udivr.s b/llvm/test/MC/AArch64/SVE/udivr.s index 4341ca8fcba10..c179d992e99b6 100644 --- a/llvm/test/MC/AArch64/SVE/udivr.s +++ b/llvm/test/MC/AArch64/SVE/udivr.s @@ -12,13 +12,13 @@ udivr z0.s, p7/m, z0.s, z31.s // CHECK-INST: udivr z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x1f,0x97,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 97 04 udivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: udivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d7 04 @@ -28,23 +28,23 @@ udivr z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 udivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: udivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d7 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 udivr z0.d, p7/m, z0.d, z31.d // CHECK-INST: udivr z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e0 1f d7 04 diff --git a/llvm/test/MC/AArch64/SVE/udot.s b/llvm/test/MC/AArch64/SVE/udot.s index b5584fdc3f770..405457f54d3df 100644 --- a/llvm/test/MC/AArch64/SVE/udot.s +++ b/llvm/test/MC/AArch64/SVE/udot.s @@ -12,25 +12,25 @@ udot z0.s, z1.b, z31.b // CHECK-INST: udot z0.s, z1.b, z31.b // CHECK-ENCODING: [0x20,0x04,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 9f 44 udot z0.d, z1.h, z31.h // CHECK-INST: udot z0.d, z1.h, z31.h // CHECK-ENCODING: [0x20,0x04,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 df 44 udot z0.s, z1.b, z7.b[3] // CHECK-INST: udot z0.s, z1.b, z7.b[3] // CHECK-ENCODING: [0x20,0x04,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 bf 44 udot z0.d, z1.h, z15.h[1] // CHECK-INST: udot z0.d, z1.h, z15.h[1] // CHECK-ENCODING: [0x20,0x04,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 ff 44 @@ -40,23 +40,23 @@ udot z0.d, z1.h, z15.h[1] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 udot z0.d, z1.h, z31.h // CHECK-INST: udot z0.d, z1.h, z31.h // CHECK-ENCODING: [0x20,0x04,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 df 44 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 udot z0.d, 
z1.h, z15.h[1] // CHECK-INST: udot z0.d, z1.h, z15.h[1] // CHECK-ENCODING: [0x20,0x04,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 04 ff 44 diff --git a/llvm/test/MC/AArch64/SVE/umax.s b/llvm/test/MC/AArch64/SVE/umax.s index f1b0d6d7349d2..facb1f94c5a00 100644 --- a/llvm/test/MC/AArch64/SVE/umax.s +++ b/llvm/test/MC/AArch64/SVE/umax.s @@ -12,73 +12,73 @@ umax z0.b, z0.b, #0 // CHECK-INST: umax z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 29 25 umax z31.b, z31.b, #255 // CHECK-INST: umax z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 29 25 umax z0.b, z0.b, #0 // CHECK-INST: umax z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 29 25 umax z31.b, z31.b, #255 // CHECK-INST: umax z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 29 25 umax z0.b, z0.b, #0 // CHECK-INST: umax z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 29 25 umax z31.b, z31.b, #255 // CHECK-INST: umax z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 29 25 umax z0.b, z0.b, #0 // CHECK-INST: umax z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x29,0x25] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 29 25 umax z31.b, z31.b, #255 // CHECK-INST: umax z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 29 25 umax z31.b, p7/m, z31.b, z31.b // CHECK-INST: umax z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x09,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 09 04 umax z31.h, p7/m, z31.h, z31.h // CHECK-INST: umax z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x49,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 49 04 umax z31.s, p7/m, z31.s, z31.s // CHECK-INST: umax z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x89,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 89 04 umax z31.d, p7/m, z31.d, z31.d // CHECK-INST: umax z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xc9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f c9 04 @@ -88,35 +88,35 @@ umax z31.d, p7/m, z31.d, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 umax z4.d, p7/m, z4.d, z31.d // CHECK-INST: umax z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xc9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f c9 04 movprfx z4, z6 // CHECK-INST: 
movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 umax z4.d, p7/m, z4.d, z31.d // CHECK-INST: umax z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xc9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f c9 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 umax z31.b, z31.b, #255 // CHECK-INST: umax z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 29 25 diff --git a/llvm/test/MC/AArch64/SVE/umaxv.s b/llvm/test/MC/AArch64/SVE/umaxv.s index db02f5e8a2c6e..e9d1d5998512d 100644 --- a/llvm/test/MC/AArch64/SVE/umaxv.s +++ b/llvm/test/MC/AArch64/SVE/umaxv.s @@ -12,23 +12,23 @@ umaxv b0, p7, z31.b // CHECK-INST: umaxv b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x09,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 09 04 umaxv h0, p7, z31.h // CHECK-INST: umaxv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x49,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 49 04 umaxv s0, p7, z31.s // CHECK-INST: umaxv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x89,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 89 04 umaxv d0, p7, z31.d // CHECK-INST: umaxv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xc9,0x04] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f c9 04 diff --git a/llvm/test/MC/AArch64/SVE/umin.s b/llvm/test/MC/AArch64/SVE/umin.s index 8919d803fb7e9..7fbce57e8a3a0 100644 --- a/llvm/test/MC/AArch64/SVE/umin.s +++ b/llvm/test/MC/AArch64/SVE/umin.s @@ -12,73 +12,73 @@ umin z0.b, z0.b, #0 // CHECK-INST: umin z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 2b 25 umin z31.b, z31.b, #255 // CHECK-INST: umin z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 2b 25 umin z0.b, z0.b, #0 // CHECK-INST: umin z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 2b 25 umin z31.b, z31.b, #255 // CHECK-INST: umin z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 2b 25 umin z0.b, z0.b, #0 // CHECK-INST: umin z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 2b 25 umin z31.b, z31.b, #255 // CHECK-INST: umin z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 2b 25 umin z0.b, z0.b, #0 // CHECK-INST: umin z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 2b 25 umin z31.b, z31.b, 
#255 // CHECK-INST: umin z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 2b 25 umin z31.b, p7/m, z31.b, z31.b // CHECK-INST: umin z31.b, p7/m, z31.b, z31.b // CHECK-ENCODING: [0xff,0x1f,0x0b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 0b 04 umin z31.h, p7/m, z31.h, z31.h // CHECK-INST: umin z31.h, p7/m, z31.h, z31.h // CHECK-ENCODING: [0xff,0x1f,0x4b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 4b 04 umin z31.s, p7/m, z31.s, z31.s // CHECK-INST: umin z31.s, p7/m, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0x8b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 8b 04 umin z31.d, p7/m, z31.d, z31.d // CHECK-INST: umin z31.d, p7/m, z31.d, z31.d // CHECK-ENCODING: [0xff,0x1f,0xcb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f cb 04 @@ -88,35 +88,35 @@ umin z31.d, p7/m, z31.d, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 umin z4.d, p7/m, z4.d, z31.d // CHECK-INST: umin z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xcb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f cb 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: c4 bc 20 04 umin z4.d, p7/m, z4.d, z31.d // CHECK-INST: umin z4.d, p7/m, z4.d, z31.d // CHECK-ENCODING: [0xe4,0x1f,0xcb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 1f cb 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 umin z31.b, z31.b, #255 // CHECK-INST: umin z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 2b 25 diff --git a/llvm/test/MC/AArch64/SVE/uminv.s b/llvm/test/MC/AArch64/SVE/uminv.s index bca8878cb749d..c43833300f58b 100644 --- a/llvm/test/MC/AArch64/SVE/uminv.s +++ b/llvm/test/MC/AArch64/SVE/uminv.s @@ -12,23 +12,23 @@ uminv b0, p7, z31.b // CHECK-INST: uminv b0, p7, z31.b // CHECK-ENCODING: [0xe0,0x3f,0x0b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 0b 04 uminv h0, p7, z31.h // CHECK-INST: uminv h0, p7, z31.h // CHECK-ENCODING: [0xe0,0x3f,0x4b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 4b 04 uminv s0, p7, z31.s // CHECK-INST: uminv s0, p7, z31.s // CHECK-ENCODING: [0xe0,0x3f,0x8b,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f 8b 04 uminv d0, p7, z31.d // CHECK-INST: uminv d0, p7, z31.d // CHECK-ENCODING: [0xe0,0x3f,0xcb,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3f cb 04 diff --git a/llvm/test/MC/AArch64/SVE/umulh.s 
b/llvm/test/MC/AArch64/SVE/umulh.s index db27c8046e88e..2e9d2a89459da 100644 --- a/llvm/test/MC/AArch64/SVE/umulh.s +++ b/llvm/test/MC/AArch64/SVE/umulh.s @@ -12,25 +12,25 @@ umulh z0.b, p7/m, z0.b, z31.b // CHECK-INST: umulh z0.b, p7/m, z0.b, z31.b // CHECK-ENCODING: [0xe0,0x1f,0x13,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 13 04 umulh z0.h, p7/m, z0.h, z31.h // CHECK-INST: umulh z0.h, p7/m, z0.h, z31.h // CHECK-ENCODING: [0xe0,0x1f,0x53,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 53 04 umulh z0.s, p7/m, z0.s, z31.s // CHECK-INST: umulh z0.s, p7/m, z0.s, z31.s // CHECK-ENCODING: [0xe0,0x1f,0x93,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f 93 04 umulh z0.d, p7/m, z0.d, z31.d // CHECK-INST: umulh z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d3 04 @@ -40,23 +40,23 @@ umulh z0.d, p7/m, z0.d, z31.d movprfx z0.d, p7/z, z7.d // CHECK-INST: movprfx z0.d, p7/z, z7.d // CHECK-ENCODING: [0xe0,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 3c d0 04 umulh z0.d, p7/m, z0.d, z31.d // CHECK-INST: umulh z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d3 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 umulh z0.d, p7/m, 
z0.d, z31.d // CHECK-INST: umulh z0.d, p7/m, z0.d, z31.d // CHECK-ENCODING: [0xe0,0x1f,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 1f d3 04 diff --git a/llvm/test/MC/AArch64/SVE/uqadd.s b/llvm/test/MC/AArch64/SVE/uqadd.s index 447aca0409733..047c0699dec5a 100644 --- a/llvm/test/MC/AArch64/SVE/uqadd.s +++ b/llvm/test/MC/AArch64/SVE/uqadd.s @@ -13,109 +13,109 @@ uqadd z0.b, z0.b, z0.b // CHECK-INST: uqadd z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x14,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 14 20 04 uqadd z0.h, z0.h, z0.h // CHECK-INST: uqadd z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x14,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 14 60 04 uqadd z0.s, z0.s, z0.s // CHECK-INST: uqadd z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x14,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 14 a0 04 uqadd z0.d, z0.d, z0.d // CHECK-INST: uqadd z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x14,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 14 e0 04 uqadd z0.b, z0.b, #0 // CHECK-INST: uqadd z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x25,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 25 25 uqadd z31.b, z31.b, #255 // CHECK-INST: uqadd z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x25,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 25 25 uqadd z0.h, z0.h, #0 // CHECK-INST: uqadd z0.h, z0.h, #0 // CHECK-ENCODING: 
[0x00,0xc0,0x65,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 65 25 uqadd z0.h, z0.h, #0, lsl #8 // CHECK-INST: uqadd z0.h, z0.h, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0x65,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 65 25 uqadd z31.h, z31.h, #255, lsl #8 // CHECK-INST: uqadd z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x65,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 65 25 uqadd z31.h, z31.h, #65280 // CHECK-INST: uqadd z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x65,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 65 25 uqadd z0.s, z0.s, #0 // CHECK-INST: uqadd z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xa5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a5 25 uqadd z0.s, z0.s, #0, lsl #8 // CHECK-INST: uqadd z0.s, z0.s, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xa5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a5 25 uqadd z31.s, z31.s, #255, lsl #8 // CHECK-INST: uqadd z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a5 25 uqadd z31.s, z31.s, #65280 // CHECK-INST: uqadd z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a5 25 uqadd z0.d, z0.d, #0 // CHECK-INST: uqadd z0.d, z0.d, #0 // CHECK-ENCODING: 
[0x00,0xc0,0xe5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e5 25 uqadd z0.d, z0.d, #0, lsl #8 // CHECK-INST: uqadd z0.d, z0.d, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xe5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e5 25 uqadd z31.d, z31.d, #255, lsl #8 // CHECK-INST: uqadd z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e5 25 uqadd z31.d, z31.d, #65280 // CHECK-INST: uqadd z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e5 25 @@ -125,11 +125,11 @@ uqadd z31.d, z31.d, #65280 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqadd z31.d, z31.d, #65280 // CHECK-INST: uqadd z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe5,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e5 25 diff --git a/llvm/test/MC/AArch64/SVE/uqdecb.s b/llvm/test/MC/AArch64/SVE/uqdecb.s index dc2c09710ba23..10f7f893b5835 100644 --- a/llvm/test/MC/AArch64/SVE/uqdecb.s +++ b/llvm/test/MC/AArch64/SVE/uqdecb.s @@ -16,25 +16,25 @@ uqdecb x0 // CHECK-INST: uqdecb x0 // CHECK-ENCODING: [0xe0,0xff,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 30 04 uqdecb x0, all // CHECK-INST: uqdecb x0 // CHECK-ENCODING: [0xe0,0xff,0x30,0x04] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 30 04 uqdecb x0, all, mul #1 // CHECK-INST: uqdecb x0 // CHECK-ENCODING: [0xe0,0xff,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 30 04 uqdecb x0, all, mul #16 // CHECK-INST: uqdecb x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 3f 04 @@ -45,37 +45,37 @@ uqdecb x0, all, mul #16 uqdecb w0 // CHECK-INST: uqdecb w0 // CHECK-ENCODING: [0xe0,0xff,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 20 04 uqdecb w0, all // CHECK-INST: uqdecb w0 // CHECK-ENCODING: [0xe0,0xff,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 20 04 uqdecb w0, all, mul #1 // CHECK-INST: uqdecb w0 // CHECK-ENCODING: [0xe0,0xff,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 20 04 uqdecb w0, all, mul #16 // CHECK-INST: uqdecb w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 2f 04 uqdecb w0, pow2 // CHECK-INST: uqdecb w0, pow2 // CHECK-ENCODING: [0x00,0xfc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc 20 04 uqdecb w0, pow2, mul #16 // CHECK-INST: uqdecb w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xfc,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 
00 fc 2f 04 @@ -86,173 +86,173 @@ uqdecb w0, pow2, mul #16 uqdecb x0, pow2 // CHECK-INST: uqdecb x0, pow2 // CHECK-ENCODING: [0x00,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc 30 04 uqdecb x0, vl1 // CHECK-INST: uqdecb x0, vl1 // CHECK-ENCODING: [0x20,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc 30 04 uqdecb x0, vl2 // CHECK-INST: uqdecb x0, vl2 // CHECK-ENCODING: [0x40,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fc 30 04 uqdecb x0, vl3 // CHECK-INST: uqdecb x0, vl3 // CHECK-ENCODING: [0x60,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fc 30 04 uqdecb x0, vl4 // CHECK-INST: uqdecb x0, vl4 // CHECK-ENCODING: [0x80,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fc 30 04 uqdecb x0, vl5 // CHECK-INST: uqdecb x0, vl5 // CHECK-ENCODING: [0xa0,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fc 30 04 uqdecb x0, vl6 // CHECK-INST: uqdecb x0, vl6 // CHECK-ENCODING: [0xc0,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fc 30 04 uqdecb x0, vl7 // CHECK-INST: uqdecb x0, vl7 // CHECK-ENCODING: [0xe0,0xfc,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fc 30 04 uqdecb x0, vl8 // CHECK-INST: uqdecb x0, vl8 // CHECK-ENCODING: [0x00,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fd 30 04 uqdecb x0, vl16 // CHECK-INST: uqdecb x0, vl16 // CHECK-ENCODING: [0x20,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fd 30 04 uqdecb x0, vl32 // CHECK-INST: uqdecb x0, vl32 // CHECK-ENCODING: [0x40,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fd 30 04 uqdecb x0, vl64 // CHECK-INST: uqdecb x0, vl64 // CHECK-ENCODING: [0x60,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fd 30 04 uqdecb x0, vl128 // CHECK-INST: uqdecb x0, vl128 // CHECK-ENCODING: [0x80,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fd 30 04 uqdecb x0, vl256 // CHECK-INST: uqdecb x0, vl256 // CHECK-ENCODING: [0xa0,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fd 30 04 uqdecb x0, #14 // CHECK-INST: uqdecb x0, #14 // CHECK-ENCODING: [0xc0,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fd 30 04 uqdecb x0, #15 // CHECK-INST: uqdecb x0, #15 // CHECK-ENCODING: [0xe0,0xfd,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fd 30 04 uqdecb x0, #16 // CHECK-INST: uqdecb x0, #16 // CHECK-ENCODING: [0x00,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fe 30 04 uqdecb x0, #17 // CHECK-INST: uqdecb x0, #17 // CHECK-ENCODING: 
[0x20,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fe 30 04 uqdecb x0, #18 // CHECK-INST: uqdecb x0, #18 // CHECK-ENCODING: [0x40,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fe 30 04 uqdecb x0, #19 // CHECK-INST: uqdecb x0, #19 // CHECK-ENCODING: [0x60,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fe 30 04 uqdecb x0, #20 // CHECK-INST: uqdecb x0, #20 // CHECK-ENCODING: [0x80,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fe 30 04 uqdecb x0, #21 // CHECK-INST: uqdecb x0, #21 // CHECK-ENCODING: [0xa0,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fe 30 04 uqdecb x0, #22 // CHECK-INST: uqdecb x0, #22 // CHECK-ENCODING: [0xc0,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fe 30 04 uqdecb x0, #23 // CHECK-INST: uqdecb x0, #23 // CHECK-ENCODING: [0xe0,0xfe,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fe 30 04 uqdecb x0, #24 // CHECK-INST: uqdecb x0, #24 // CHECK-ENCODING: [0x00,0xff,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ff 30 04 uqdecb x0, #25 // CHECK-INST: uqdecb x0, #25 // CHECK-ENCODING: [0x20,0xff,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ff 30 04 uqdecb x0, #26 // CHECK-INST: 
uqdecb x0, #26 // CHECK-ENCODING: [0x40,0xff,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ff 30 04 uqdecb x0, #27 // CHECK-INST: uqdecb x0, #27 // CHECK-ENCODING: [0x60,0xff,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ff 30 04 uqdecb x0, #28 // CHECK-INST: uqdecb x0, #28 // CHECK-ENCODING: [0x80,0xff,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ff 30 04 diff --git a/llvm/test/MC/AArch64/SVE/uqdecd.s b/llvm/test/MC/AArch64/SVE/uqdecd.s index 05341f8ff7ca6..0c13af33f2338 100644 --- a/llvm/test/MC/AArch64/SVE/uqdecd.s +++ b/llvm/test/MC/AArch64/SVE/uqdecd.s @@ -16,25 +16,25 @@ uqdecd x0 // CHECK-INST: uqdecd x0 // CHECK-ENCODING: [0xe0,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff f0 04 uqdecd x0, all // CHECK-INST: uqdecd x0 // CHECK-ENCODING: [0xe0,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff f0 04 uqdecd x0, all, mul #1 // CHECK-INST: uqdecd x0 // CHECK-ENCODING: [0xe0,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff f0 04 uqdecd x0, all, mul #16 // CHECK-INST: uqdecd x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff ff 04 @@ -45,37 +45,37 @@ uqdecd x0, all, mul #16 uqdecd w0 // CHECK-INST: uqdecd w0 // CHECK-ENCODING: [0xe0,0xff,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e0 ff e0 04 uqdecd w0, all // CHECK-INST: uqdecd w0 // CHECK-ENCODING: [0xe0,0xff,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff e0 04 uqdecd w0, all, mul #1 // CHECK-INST: uqdecd w0 // CHECK-ENCODING: [0xe0,0xff,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff e0 04 uqdecd w0, all, mul #16 // CHECK-INST: uqdecd w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff ef 04 uqdecd w0, pow2 // CHECK-INST: uqdecd w0, pow2 // CHECK-ENCODING: [0x00,0xfc,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc e0 04 uqdecd w0, pow2, mul #16 // CHECK-INST: uqdecd w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xfc,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc ef 04 @@ -85,37 +85,37 @@ uqdecd w0, pow2, mul #16 uqdecd z0.d // CHECK-INST: uqdecd z0.d // CHECK-ENCODING: [0xe0,0xcf,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf e0 04 uqdecd z0.d, all // CHECK-INST: uqdecd z0.d // CHECK-ENCODING: [0xe0,0xcf,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf e0 04 uqdecd z0.d, all, mul #1 // CHECK-INST: uqdecd z0.d // CHECK-ENCODING: [0xe0,0xcf,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf e0 04 uqdecd z0.d, all, mul #16 // CHECK-INST: uqdecd z0.d, 
all, mul #16 // CHECK-ENCODING: [0xe0,0xcf,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf ef 04 uqdecd z0.d, pow2 // CHECK-INST: uqdecd z0.d, pow2 // CHECK-ENCODING: [0x00,0xcc,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc e0 04 uqdecd z0.d, pow2, mul #16 // CHECK-INST: uqdecd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xcc,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc ef 04 @@ -126,175 +126,175 @@ uqdecd z0.d, pow2, mul #16 uqdecd x0, pow2 // CHECK-INST: uqdecd x0, pow2 // CHECK-ENCODING: [0x00,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc f0 04 uqdecd x0, vl1 // CHECK-INST: uqdecd x0, vl1 // CHECK-ENCODING: [0x20,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc f0 04 uqdecd x0, vl2 // CHECK-INST: uqdecd x0, vl2 // CHECK-ENCODING: [0x40,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fc f0 04 uqdecd x0, vl3 // CHECK-INST: uqdecd x0, vl3 // CHECK-ENCODING: [0x60,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fc f0 04 uqdecd x0, vl4 // CHECK-INST: uqdecd x0, vl4 // CHECK-ENCODING: [0x80,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fc f0 04 uqdecd x0, vl5 // CHECK-INST: uqdecd x0, vl5 // CHECK-ENCODING: [0xa0,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve 
or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fc f0 04 uqdecd x0, vl6 // CHECK-INST: uqdecd x0, vl6 // CHECK-ENCODING: [0xc0,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fc f0 04 uqdecd x0, vl7 // CHECK-INST: uqdecd x0, vl7 // CHECK-ENCODING: [0xe0,0xfc,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fc f0 04 uqdecd x0, vl8 // CHECK-INST: uqdecd x0, vl8 // CHECK-ENCODING: [0x00,0xfd,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fd f0 04 uqdecd x0, vl16 // CHECK-INST: uqdecd x0, vl16 // CHECK-ENCODING: [0x20,0xfd,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fd f0 04 uqdecd x0, vl32 // CHECK-INST: uqdecd x0, vl32 // CHECK-ENCODING: [0x40,0xfd,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fd f0 04 uqdecd x0, vl64 // CHECK-INST: uqdecd x0, vl64 // CHECK-ENCODING: [0x60,0xfd,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fd f0 04 uqdecd x0, vl128 // CHECK-INST: uqdecd x0, vl128 // CHECK-ENCODING: [0x80,0xfd,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fd f0 04 uqdecd x0, vl256 // CHECK-INST: uqdecd x0, vl256 // CHECK-ENCODING: [0xa0,0xfd,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fd f0 04 uqdecd x0, #14 // CHECK-INST: uqdecd x0, #14 // CHECK-ENCODING: [0xc0,0xfd,0xf0,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fd f0 04 uqdecd x0, #15 // CHECK-INST: uqdecd x0, #15 // CHECK-ENCODING: [0xe0,0xfd,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fd f0 04 uqdecd x0, #16 // CHECK-INST: uqdecd x0, #16 // CHECK-ENCODING: [0x00,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fe f0 04 uqdecd x0, #17 // CHECK-INST: uqdecd x0, #17 // CHECK-ENCODING: [0x20,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fe f0 04 uqdecd x0, #18 // CHECK-INST: uqdecd x0, #18 // CHECK-ENCODING: [0x40,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fe f0 04 uqdecd x0, #19 // CHECK-INST: uqdecd x0, #19 // CHECK-ENCODING: [0x60,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fe f0 04 uqdecd x0, #20 // CHECK-INST: uqdecd x0, #20 // CHECK-ENCODING: [0x80,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fe f0 04 uqdecd x0, #21 // CHECK-INST: uqdecd x0, #21 // CHECK-ENCODING: [0xa0,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fe f0 04 uqdecd x0, #22 // CHECK-INST: uqdecd x0, #22 // CHECK-ENCODING: [0xc0,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fe f0 04 uqdecd x0, #23 // CHECK-INST: uqdecd x0, #23 // 
CHECK-ENCODING: [0xe0,0xfe,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fe f0 04 uqdecd x0, #24 // CHECK-INST: uqdecd x0, #24 // CHECK-ENCODING: [0x00,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ff f0 04 uqdecd x0, #25 // CHECK-INST: uqdecd x0, #25 // CHECK-ENCODING: [0x20,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ff f0 04 uqdecd x0, #26 // CHECK-INST: uqdecd x0, #26 // CHECK-ENCODING: [0x40,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ff f0 04 uqdecd x0, #27 // CHECK-INST: uqdecd x0, #27 // CHECK-ENCODING: [0x60,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ff f0 04 uqdecd x0, #28 // CHECK-INST: uqdecd x0, #28 // CHECK-ENCODING: [0x80,0xff,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ff f0 04 @@ -304,35 +304,35 @@ uqdecd x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdecd z0.d // CHECK-INST: uqdecd z0.d // CHECK-ENCODING: [0xe0,0xcf,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf e0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 bc 20 04 uqdecd z0.d, pow2, mul #16 // CHECK-INST: uqdecd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xcc,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc ef 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdecd z0.d, pow2 // CHECK-INST: uqdecd z0.d, pow2 // CHECK-ENCODING: [0x00,0xcc,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc e0 04 diff --git a/llvm/test/MC/AArch64/SVE/uqdech.s b/llvm/test/MC/AArch64/SVE/uqdech.s index 6e77f1d7a22d5..c5c4a40490fed 100644 --- a/llvm/test/MC/AArch64/SVE/uqdech.s +++ b/llvm/test/MC/AArch64/SVE/uqdech.s @@ -16,25 +16,25 @@ uqdech x0 // CHECK-INST: uqdech x0 // CHECK-ENCODING: [0xe0,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 70 04 uqdech x0, all // CHECK-INST: uqdech x0 // CHECK-ENCODING: [0xe0,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 70 04 uqdech x0, all, mul #1 // CHECK-INST: uqdech x0 // CHECK-ENCODING: [0xe0,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 70 04 uqdech x0, all, mul #16 // CHECK-INST: uqdech x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 7f 04 @@ -45,37 +45,37 @@ uqdech x0, all, mul #16 uqdech w0 // CHECK-INST: uqdech w0 // CHECK-ENCODING: [0xe0,0xff,0x60,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 60 04 uqdech w0, all // CHECK-INST: uqdech w0 // CHECK-ENCODING: [0xe0,0xff,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 60 04 uqdech w0, all, mul #1 // CHECK-INST: uqdech w0 // CHECK-ENCODING: [0xe0,0xff,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 60 04 uqdech w0, all, mul #16 // CHECK-INST: uqdech w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff 6f 04 uqdech w0, pow2 // CHECK-INST: uqdech w0, pow2 // CHECK-ENCODING: [0x00,0xfc,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc 60 04 uqdech w0, pow2, mul #16 // CHECK-INST: uqdech w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xfc,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc 6f 04 @@ -85,37 +85,37 @@ uqdech w0, pow2, mul #16 uqdech z0.h // CHECK-INST: uqdech z0.h // CHECK-ENCODING: [0xe0,0xcf,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf 60 04 uqdech z0.h, all // CHECK-INST: uqdech z0.h // CHECK-ENCODING: [0xe0,0xcf,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf 60 04 uqdech z0.h, all, mul #1 // CHECK-INST: uqdech z0.h // CHECK-ENCODING: [0xe0,0xcf,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: e0 cf 60 04 uqdech z0.h, all, mul #16 // CHECK-INST: uqdech z0.h, all, mul #16 // CHECK-ENCODING: [0xe0,0xcf,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf 6f 04 uqdech z0.h, pow2 // CHECK-INST: uqdech z0.h, pow2 // CHECK-ENCODING: [0x00,0xcc,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc 60 04 uqdech z0.h, pow2, mul #16 // CHECK-INST: uqdech z0.h, pow2, mul #16 // CHECK-ENCODING: [0x00,0xcc,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc 6f 04 @@ -126,175 +126,175 @@ uqdech z0.h, pow2, mul #16 uqdech x0, pow2 // CHECK-INST: uqdech x0, pow2 // CHECK-ENCODING: [0x00,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc 70 04 uqdech x0, vl1 // CHECK-INST: uqdech x0, vl1 // CHECK-ENCODING: [0x20,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc 70 04 uqdech x0, vl2 // CHECK-INST: uqdech x0, vl2 // CHECK-ENCODING: [0x40,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fc 70 04 uqdech x0, vl3 // CHECK-INST: uqdech x0, vl3 // CHECK-ENCODING: [0x60,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fc 70 04 uqdech x0, vl4 // CHECK-INST: uqdech x0, vl4 // CHECK-ENCODING: [0x80,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fc 70 04 uqdech x0, vl5 // CHECK-INST: uqdech x0, vl5 // 
CHECK-ENCODING: [0xa0,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fc 70 04 uqdech x0, vl6 // CHECK-INST: uqdech x0, vl6 // CHECK-ENCODING: [0xc0,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fc 70 04 uqdech x0, vl7 // CHECK-INST: uqdech x0, vl7 // CHECK-ENCODING: [0xe0,0xfc,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fc 70 04 uqdech x0, vl8 // CHECK-INST: uqdech x0, vl8 // CHECK-ENCODING: [0x00,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fd 70 04 uqdech x0, vl16 // CHECK-INST: uqdech x0, vl16 // CHECK-ENCODING: [0x20,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fd 70 04 uqdech x0, vl32 // CHECK-INST: uqdech x0, vl32 // CHECK-ENCODING: [0x40,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fd 70 04 uqdech x0, vl64 // CHECK-INST: uqdech x0, vl64 // CHECK-ENCODING: [0x60,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fd 70 04 uqdech x0, vl128 // CHECK-INST: uqdech x0, vl128 // CHECK-ENCODING: [0x80,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fd 70 04 uqdech x0, vl256 // CHECK-INST: uqdech x0, vl256 // CHECK-ENCODING: [0xa0,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fd 70 04 
uqdech x0, #14 // CHECK-INST: uqdech x0, #14 // CHECK-ENCODING: [0xc0,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fd 70 04 uqdech x0, #15 // CHECK-INST: uqdech x0, #15 // CHECK-ENCODING: [0xe0,0xfd,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fd 70 04 uqdech x0, #16 // CHECK-INST: uqdech x0, #16 // CHECK-ENCODING: [0x00,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fe 70 04 uqdech x0, #17 // CHECK-INST: uqdech x0, #17 // CHECK-ENCODING: [0x20,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fe 70 04 uqdech x0, #18 // CHECK-INST: uqdech x0, #18 // CHECK-ENCODING: [0x40,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fe 70 04 uqdech x0, #19 // CHECK-INST: uqdech x0, #19 // CHECK-ENCODING: [0x60,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fe 70 04 uqdech x0, #20 // CHECK-INST: uqdech x0, #20 // CHECK-ENCODING: [0x80,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fe 70 04 uqdech x0, #21 // CHECK-INST: uqdech x0, #21 // CHECK-ENCODING: [0xa0,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fe 70 04 uqdech x0, #22 // CHECK-INST: uqdech x0, #22 // CHECK-ENCODING: [0xc0,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or 
sme // CHECK-UNKNOWN: c0 fe 70 04 uqdech x0, #23 // CHECK-INST: uqdech x0, #23 // CHECK-ENCODING: [0xe0,0xfe,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fe 70 04 uqdech x0, #24 // CHECK-INST: uqdech x0, #24 // CHECK-ENCODING: [0x00,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ff 70 04 uqdech x0, #25 // CHECK-INST: uqdech x0, #25 // CHECK-ENCODING: [0x20,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ff 70 04 uqdech x0, #26 // CHECK-INST: uqdech x0, #26 // CHECK-ENCODING: [0x40,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ff 70 04 uqdech x0, #27 // CHECK-INST: uqdech x0, #27 // CHECK-ENCODING: [0x60,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ff 70 04 uqdech x0, #28 // CHECK-INST: uqdech x0, #28 // CHECK-ENCODING: [0x80,0xff,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ff 70 04 @@ -304,35 +304,35 @@ uqdech x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdech z0.h // CHECK-INST: uqdech z0.h // CHECK-ENCODING: [0xe0,0xcf,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf 60 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdech z0.h, pow2, mul #16 // CHECK-INST: uqdech z0.h, pow2, mul #16 // CHECK-ENCODING: [0x00,0xcc,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc 6f 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdech z0.h, pow2 // CHECK-INST: uqdech z0.h, pow2 // CHECK-ENCODING: [0x00,0xcc,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc 60 04 diff --git a/llvm/test/MC/AArch64/SVE/uqdecp.s b/llvm/test/MC/AArch64/SVE/uqdecp.s index e5ca1c5fd7e3a..2deec983a3ddb 100644 --- a/llvm/test/MC/AArch64/SVE/uqdecp.s +++ b/llvm/test/MC/AArch64/SVE/uqdecp.s @@ -12,85 +12,85 @@ uqdecp x0, p0.b // CHECK-INST: uqdecp x0, p0.b // CHECK-ENCODING: [0x00,0x8c,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 2b 25 uqdecp x0, p0.h // CHECK-INST: uqdecp x0, p0.h // CHECK-ENCODING: [0x00,0x8c,0x6b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 6b 25 uqdecp x0, p0.s // CHECK-INST: uqdecp x0, p0.s // CHECK-ENCODING: [0x00,0x8c,0xab,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c ab 25 uqdecp x0, p0.d // CHECK-INST: uqdecp x0, p0.d // CHECK-ENCODING: [0x00,0x8c,0xeb,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c eb 25 uqdecp wzr, p15.b // CHECK-INST: uqdecp wzr, p15.b // 
CHECK-ENCODING: [0xff,0x89,0x2b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 2b 25 uqdecp wzr, p15.h // CHECK-INST: uqdecp wzr, p15.h // CHECK-ENCODING: [0xff,0x89,0x6b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 6b 25 uqdecp wzr, p15.s // CHECK-INST: uqdecp wzr, p15.s // CHECK-ENCODING: [0xff,0x89,0xab,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 ab 25 uqdecp wzr, p15.d // CHECK-INST: uqdecp wzr, p15.d // CHECK-ENCODING: [0xff,0x89,0xeb,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 eb 25 uqdecp z0.h, p0 // CHECK-INST: uqdecp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x6b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 6b 25 uqdecp z0.h, p0.h // CHECK-INST: uqdecp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x6b,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 6b 25 uqdecp z0.s, p0 // CHECK-INST: uqdecp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xab,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 ab 25 uqdecp z0.s, p0.s // CHECK-INST: uqdecp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xab,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 ab 25 uqdecp z0.d, p0 // CHECK-INST: uqdecp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xeb,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme 
// CHECK-UNKNOWN: 00 80 eb 25 uqdecp z0.d, p0.d // CHECK-INST: uqdecp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xeb,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 eb 25 @@ -100,11 +100,11 @@ uqdecp z0.d, p0.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdecp z0.d, p0.d // CHECK-INST: uqdecp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xeb,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 eb 25 diff --git a/llvm/test/MC/AArch64/SVE/uqdecw.s b/llvm/test/MC/AArch64/SVE/uqdecw.s index 887219ee4d5d7..f137aaea5da45 100644 --- a/llvm/test/MC/AArch64/SVE/uqdecw.s +++ b/llvm/test/MC/AArch64/SVE/uqdecw.s @@ -16,25 +16,25 @@ uqdecw x0 // CHECK-INST: uqdecw x0 // CHECK-ENCODING: [0xe0,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff b0 04 uqdecw x0, all // CHECK-INST: uqdecw x0 // CHECK-ENCODING: [0xe0,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff b0 04 uqdecw x0, all, mul #1 // CHECK-INST: uqdecw x0 // CHECK-ENCODING: [0xe0,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff b0 04 uqdecw x0, all, mul #16 // CHECK-INST: uqdecw x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff bf 04 @@ -45,37 +45,37 @@ uqdecw x0, all, mul #16 uqdecw w0 // CHECK-INST: uqdecw w0 // CHECK-ENCODING: 
[0xe0,0xff,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff a0 04 uqdecw w0, all // CHECK-INST: uqdecw w0 // CHECK-ENCODING: [0xe0,0xff,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff a0 04 uqdecw w0, all, mul #1 // CHECK-INST: uqdecw w0 // CHECK-ENCODING: [0xe0,0xff,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff a0 04 uqdecw w0, all, mul #16 // CHECK-INST: uqdecw w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xff,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 ff af 04 uqdecw w0, pow2 // CHECK-INST: uqdecw w0, pow2 // CHECK-ENCODING: [0x00,0xfc,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc a0 04 uqdecw w0, pow2, mul #16 // CHECK-INST: uqdecw w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xfc,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc af 04 @@ -85,37 +85,37 @@ uqdecw w0, pow2, mul #16 uqdecw z0.s // CHECK-INST: uqdecw z0.s // CHECK-ENCODING: [0xe0,0xcf,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf a0 04 uqdecw z0.s, all // CHECK-INST: uqdecw z0.s // CHECK-ENCODING: [0xe0,0xcf,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf a0 04 uqdecw z0.s, all, mul #1 // CHECK-INST: uqdecw z0.s // CHECK-ENCODING: [0xe0,0xcf,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf a0 04 uqdecw z0.s, all, mul #16 // CHECK-INST: uqdecw z0.s, all, mul #16 // CHECK-ENCODING: [0xe0,0xcf,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf af 04 uqdecw z0.s, pow2 // CHECK-INST: uqdecw z0.s, pow2 // CHECK-ENCODING: [0x00,0xcc,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc a0 04 uqdecw z0.s, pow2, mul #16 // CHECK-INST: uqdecw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xcc,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc af 04 @@ -126,175 +126,175 @@ uqdecw z0.s, pow2, mul #16 uqdecw x0, pow2 // CHECK-INST: uqdecw x0, pow2 // CHECK-ENCODING: [0x00,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fc b0 04 uqdecw x0, vl1 // CHECK-INST: uqdecw x0, vl1 // CHECK-ENCODING: [0x20,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fc b0 04 uqdecw x0, vl2 // CHECK-INST: uqdecw x0, vl2 // CHECK-ENCODING: [0x40,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fc b0 04 uqdecw x0, vl3 // CHECK-INST: uqdecw x0, vl3 // CHECK-ENCODING: [0x60,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fc b0 04 uqdecw x0, vl4 // CHECK-INST: uqdecw x0, vl4 // CHECK-ENCODING: [0x80,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fc b0 04 uqdecw x0, vl5 // 
CHECK-INST: uqdecw x0, vl5 // CHECK-ENCODING: [0xa0,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fc b0 04 uqdecw x0, vl6 // CHECK-INST: uqdecw x0, vl6 // CHECK-ENCODING: [0xc0,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fc b0 04 uqdecw x0, vl7 // CHECK-INST: uqdecw x0, vl7 // CHECK-ENCODING: [0xe0,0xfc,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fc b0 04 uqdecw x0, vl8 // CHECK-INST: uqdecw x0, vl8 // CHECK-ENCODING: [0x00,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fd b0 04 uqdecw x0, vl16 // CHECK-INST: uqdecw x0, vl16 // CHECK-ENCODING: [0x20,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fd b0 04 uqdecw x0, vl32 // CHECK-INST: uqdecw x0, vl32 // CHECK-ENCODING: [0x40,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fd b0 04 uqdecw x0, vl64 // CHECK-INST: uqdecw x0, vl64 // CHECK-ENCODING: [0x60,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fd b0 04 uqdecw x0, vl128 // CHECK-INST: uqdecw x0, vl128 // CHECK-ENCODING: [0x80,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fd b0 04 uqdecw x0, vl256 // CHECK-INST: uqdecw x0, vl256 // CHECK-ENCODING: [0xa0,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme 
// CHECK-UNKNOWN: a0 fd b0 04 uqdecw x0, #14 // CHECK-INST: uqdecw x0, #14 // CHECK-ENCODING: [0xc0,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fd b0 04 uqdecw x0, #15 // CHECK-INST: uqdecw x0, #15 // CHECK-ENCODING: [0xe0,0xfd,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fd b0 04 uqdecw x0, #16 // CHECK-INST: uqdecw x0, #16 // CHECK-ENCODING: [0x00,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 fe b0 04 uqdecw x0, #17 // CHECK-INST: uqdecw x0, #17 // CHECK-ENCODING: [0x20,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 fe b0 04 uqdecw x0, #18 // CHECK-INST: uqdecw x0, #18 // CHECK-ENCODING: [0x40,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 fe b0 04 uqdecw x0, #19 // CHECK-INST: uqdecw x0, #19 // CHECK-ENCODING: [0x60,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 fe b0 04 uqdecw x0, #20 // CHECK-INST: uqdecw x0, #20 // CHECK-ENCODING: [0x80,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 fe b0 04 uqdecw x0, #21 // CHECK-INST: uqdecw x0, #21 // CHECK-ENCODING: [0xa0,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 fe b0 04 uqdecw x0, #22 // CHECK-INST: uqdecw x0, #22 // CHECK-ENCODING: [0xc0,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 fe b0 04 uqdecw x0, #23 // CHECK-INST: uqdecw x0, #23 // CHECK-ENCODING: [0xe0,0xfe,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 fe b0 04 uqdecw x0, #24 // CHECK-INST: uqdecw x0, #24 // CHECK-ENCODING: [0x00,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 ff b0 04 uqdecw x0, #25 // CHECK-INST: uqdecw x0, #25 // CHECK-ENCODING: [0x20,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 ff b0 04 uqdecw x0, #26 // CHECK-INST: uqdecw x0, #26 // CHECK-ENCODING: [0x40,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 ff b0 04 uqdecw x0, #27 // CHECK-INST: uqdecw x0, #27 // CHECK-ENCODING: [0x60,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 ff b0 04 uqdecw x0, #28 // CHECK-INST: uqdecw x0, #28 // CHECK-ENCODING: [0x80,0xff,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 ff b0 04 @@ -304,35 +304,35 @@ uqdecw x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdecw z0.s // CHECK-INST: uqdecw z0.s // CHECK-ENCODING: [0xe0,0xcf,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 cf a0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: 
[0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdecw z0.s, pow2, mul #16 // CHECK-INST: uqdecw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xcc,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc af 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqdecw z0.s, pow2 // CHECK-INST: uqdecw z0.s, pow2 // CHECK-ENCODING: [0x00,0xcc,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 cc a0 04 diff --git a/llvm/test/MC/AArch64/SVE/uqincb.s b/llvm/test/MC/AArch64/SVE/uqincb.s index 1895ba138a986..b728cb75340d9 100644 --- a/llvm/test/MC/AArch64/SVE/uqincb.s +++ b/llvm/test/MC/AArch64/SVE/uqincb.s @@ -16,25 +16,25 @@ uqincb x0 // CHECK-INST: uqincb x0 // CHECK-ENCODING: [0xe0,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 30 04 uqincb x0, all // CHECK-INST: uqincb x0 // CHECK-ENCODING: [0xe0,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 30 04 uqincb x0, all, mul #1 // CHECK-INST: uqincb x0 // CHECK-ENCODING: [0xe0,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 30 04 uqincb x0, all, mul #16 // CHECK-INST: uqincb x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf7,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 
3f 04 @@ -45,37 +45,37 @@ uqincb x0, all, mul #16 uqincb w0 // CHECK-INST: uqincb w0 // CHECK-ENCODING: [0xe0,0xf7,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 20 04 uqincb w0, all // CHECK-INST: uqincb w0 // CHECK-ENCODING: [0xe0,0xf7,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 20 04 uqincb w0, all, mul #1 // CHECK-INST: uqincb w0 // CHECK-ENCODING: [0xe0,0xf7,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 20 04 uqincb w0, all, mul #16 // CHECK-INST: uqincb w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf7,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 2f 04 uqincb w0, pow2 // CHECK-INST: uqincb w0, pow2 // CHECK-ENCODING: [0x00,0xf4,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 20 04 uqincb w0, pow2, mul #16 // CHECK-INST: uqincb w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf4,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 2f 04 @@ -86,173 +86,173 @@ uqincb w0, pow2, mul #16 uqincb x0, pow2 // CHECK-INST: uqincb x0, pow2 // CHECK-ENCODING: [0x00,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 30 04 uqincb x0, vl1 // CHECK-INST: uqincb x0, vl1 // CHECK-ENCODING: [0x20,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f4 30 04 uqincb x0, vl2 // CHECK-INST: uqincb x0, vl2 // CHECK-ENCODING: 
[0x40,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f4 30 04 uqincb x0, vl3 // CHECK-INST: uqincb x0, vl3 // CHECK-ENCODING: [0x60,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f4 30 04 uqincb x0, vl4 // CHECK-INST: uqincb x0, vl4 // CHECK-ENCODING: [0x80,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f4 30 04 uqincb x0, vl5 // CHECK-INST: uqincb x0, vl5 // CHECK-ENCODING: [0xa0,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f4 30 04 uqincb x0, vl6 // CHECK-INST: uqincb x0, vl6 // CHECK-ENCODING: [0xc0,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f4 30 04 uqincb x0, vl7 // CHECK-INST: uqincb x0, vl7 // CHECK-ENCODING: [0xe0,0xf4,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f4 30 04 uqincb x0, vl8 // CHECK-INST: uqincb x0, vl8 // CHECK-ENCODING: [0x00,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f5 30 04 uqincb x0, vl16 // CHECK-INST: uqincb x0, vl16 // CHECK-ENCODING: [0x20,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f5 30 04 uqincb x0, vl32 // CHECK-INST: uqincb x0, vl32 // CHECK-ENCODING: [0x40,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f5 30 04 uqincb x0, vl64 // 
CHECK-INST: uqincb x0, vl64 // CHECK-ENCODING: [0x60,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f5 30 04 uqincb x0, vl128 // CHECK-INST: uqincb x0, vl128 // CHECK-ENCODING: [0x80,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f5 30 04 uqincb x0, vl256 // CHECK-INST: uqincb x0, vl256 // CHECK-ENCODING: [0xa0,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f5 30 04 uqincb x0, #14 // CHECK-INST: uqincb x0, #14 // CHECK-ENCODING: [0xc0,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f5 30 04 uqincb x0, #15 // CHECK-INST: uqincb x0, #15 // CHECK-ENCODING: [0xe0,0xf5,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f5 30 04 uqincb x0, #16 // CHECK-INST: uqincb x0, #16 // CHECK-ENCODING: [0x00,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f6 30 04 uqincb x0, #17 // CHECK-INST: uqincb x0, #17 // CHECK-ENCODING: [0x20,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f6 30 04 uqincb x0, #18 // CHECK-INST: uqincb x0, #18 // CHECK-ENCODING: [0x40,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f6 30 04 uqincb x0, #19 // CHECK-INST: uqincb x0, #19 // CHECK-ENCODING: [0x60,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: 60 f6 30 04 uqincb x0, #20 // CHECK-INST: uqincb x0, #20 // CHECK-ENCODING: [0x80,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f6 30 04 uqincb x0, #21 // CHECK-INST: uqincb x0, #21 // CHECK-ENCODING: [0xa0,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f6 30 04 uqincb x0, #22 // CHECK-INST: uqincb x0, #22 // CHECK-ENCODING: [0xc0,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f6 30 04 uqincb x0, #23 // CHECK-INST: uqincb x0, #23 // CHECK-ENCODING: [0xe0,0xf6,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f6 30 04 uqincb x0, #24 // CHECK-INST: uqincb x0, #24 // CHECK-ENCODING: [0x00,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f7 30 04 uqincb x0, #25 // CHECK-INST: uqincb x0, #25 // CHECK-ENCODING: [0x20,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f7 30 04 uqincb x0, #26 // CHECK-INST: uqincb x0, #26 // CHECK-ENCODING: [0x40,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f7 30 04 uqincb x0, #27 // CHECK-INST: uqincb x0, #27 // CHECK-ENCODING: [0x60,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f7 30 04 uqincb x0, #28 // CHECK-INST: uqincb x0, #28 // CHECK-ENCODING: [0x80,0xf7,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 80 f7 30 04 diff --git a/llvm/test/MC/AArch64/SVE/uqincd.s b/llvm/test/MC/AArch64/SVE/uqincd.s index b585a7c58fe8e..cebea9327984d 100644 --- a/llvm/test/MC/AArch64/SVE/uqincd.s +++ b/llvm/test/MC/AArch64/SVE/uqincd.s @@ -16,25 +16,25 @@ uqincd x0 // CHECK-INST: uqincd x0 // CHECK-ENCODING: [0xe0,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 f0 04 uqincd x0, all // CHECK-INST: uqincd x0 // CHECK-ENCODING: [0xe0,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 f0 04 uqincd x0, all, mul #1 // CHECK-INST: uqincd x0 // CHECK-ENCODING: [0xe0,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 f0 04 uqincd x0, all, mul #16 // CHECK-INST: uqincd x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf7,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 ff 04 @@ -45,37 +45,37 @@ uqincd x0, all, mul #16 uqincd w0 // CHECK-INST: uqincd w0 // CHECK-ENCODING: [0xe0,0xf7,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 e0 04 uqincd w0, all // CHECK-INST: uqincd w0 // CHECK-ENCODING: [0xe0,0xf7,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 e0 04 uqincd w0, all, mul #1 // CHECK-INST: uqincd w0 // CHECK-ENCODING: [0xe0,0xf7,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 e0 04 uqincd w0, all, mul #16 // CHECK-INST: uqincd w0, all, mul #16 // CHECK-ENCODING: 
[0xe0,0xf7,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 ef 04 uqincd w0, pow2 // CHECK-INST: uqincd w0, pow2 // CHECK-ENCODING: [0x00,0xf4,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 e0 04 uqincd w0, pow2, mul #16 // CHECK-INST: uqincd w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf4,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 ef 04 @@ -85,37 +85,37 @@ uqincd w0, pow2, mul #16 uqincd z0.d // CHECK-INST: uqincd z0.d // CHECK-ENCODING: [0xe0,0xc7,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 e0 04 uqincd z0.d, all // CHECK-INST: uqincd z0.d // CHECK-ENCODING: [0xe0,0xc7,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 e0 04 uqincd z0.d, all, mul #1 // CHECK-INST: uqincd z0.d // CHECK-ENCODING: [0xe0,0xc7,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 e0 04 uqincd z0.d, all, mul #16 // CHECK-INST: uqincd z0.d, all, mul #16 // CHECK-ENCODING: [0xe0,0xc7,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 ef 04 uqincd z0.d, pow2 // CHECK-INST: uqincd z0.d, pow2 // CHECK-ENCODING: [0x00,0xc4,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 e0 04 uqincd z0.d, pow2, mul #16 // CHECK-INST: uqincd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc4,0xef,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 ef 04 @@ -126,175 +126,175 @@ uqincd z0.d, pow2, mul #16 uqincd x0, pow2 // CHECK-INST: uqincd x0, pow2 // CHECK-ENCODING: [0x00,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 f0 04 uqincd x0, vl1 // CHECK-INST: uqincd x0, vl1 // CHECK-ENCODING: [0x20,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f4 f0 04 uqincd x0, vl2 // CHECK-INST: uqincd x0, vl2 // CHECK-ENCODING: [0x40,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f4 f0 04 uqincd x0, vl3 // CHECK-INST: uqincd x0, vl3 // CHECK-ENCODING: [0x60,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f4 f0 04 uqincd x0, vl4 // CHECK-INST: uqincd x0, vl4 // CHECK-ENCODING: [0x80,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f4 f0 04 uqincd x0, vl5 // CHECK-INST: uqincd x0, vl5 // CHECK-ENCODING: [0xa0,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f4 f0 04 uqincd x0, vl6 // CHECK-INST: uqincd x0, vl6 // CHECK-ENCODING: [0xc0,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f4 f0 04 uqincd x0, vl7 // CHECK-INST: uqincd x0, vl7 // CHECK-ENCODING: [0xe0,0xf4,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f4 f0 04 uqincd x0, vl8 // CHECK-INST: uqincd 
x0, vl8 // CHECK-ENCODING: [0x00,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f5 f0 04 uqincd x0, vl16 // CHECK-INST: uqincd x0, vl16 // CHECK-ENCODING: [0x20,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f5 f0 04 uqincd x0, vl32 // CHECK-INST: uqincd x0, vl32 // CHECK-ENCODING: [0x40,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f5 f0 04 uqincd x0, vl64 // CHECK-INST: uqincd x0, vl64 // CHECK-ENCODING: [0x60,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f5 f0 04 uqincd x0, vl128 // CHECK-INST: uqincd x0, vl128 // CHECK-ENCODING: [0x80,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f5 f0 04 uqincd x0, vl256 // CHECK-INST: uqincd x0, vl256 // CHECK-ENCODING: [0xa0,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f5 f0 04 uqincd x0, #14 // CHECK-INST: uqincd x0, #14 // CHECK-ENCODING: [0xc0,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f5 f0 04 uqincd x0, #15 // CHECK-INST: uqincd x0, #15 // CHECK-ENCODING: [0xe0,0xf5,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f5 f0 04 uqincd x0, #16 // CHECK-INST: uqincd x0, #16 // CHECK-ENCODING: [0x00,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 
00 f6 f0 04 uqincd x0, #17 // CHECK-INST: uqincd x0, #17 // CHECK-ENCODING: [0x20,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f6 f0 04 uqincd x0, #18 // CHECK-INST: uqincd x0, #18 // CHECK-ENCODING: [0x40,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f6 f0 04 uqincd x0, #19 // CHECK-INST: uqincd x0, #19 // CHECK-ENCODING: [0x60,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f6 f0 04 uqincd x0, #20 // CHECK-INST: uqincd x0, #20 // CHECK-ENCODING: [0x80,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f6 f0 04 uqincd x0, #21 // CHECK-INST: uqincd x0, #21 // CHECK-ENCODING: [0xa0,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f6 f0 04 uqincd x0, #22 // CHECK-INST: uqincd x0, #22 // CHECK-ENCODING: [0xc0,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f6 f0 04 uqincd x0, #23 // CHECK-INST: uqincd x0, #23 // CHECK-ENCODING: [0xe0,0xf6,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f6 f0 04 uqincd x0, #24 // CHECK-INST: uqincd x0, #24 // CHECK-ENCODING: [0x00,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f7 f0 04 uqincd x0, #25 // CHECK-INST: uqincd x0, #25 // CHECK-ENCODING: [0x20,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: 20 f7 f0 04 uqincd x0, #26 // CHECK-INST: uqincd x0, #26 // CHECK-ENCODING: [0x40,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f7 f0 04 uqincd x0, #27 // CHECK-INST: uqincd x0, #27 // CHECK-ENCODING: [0x60,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f7 f0 04 uqincd x0, #28 // CHECK-INST: uqincd x0, #28 // CHECK-ENCODING: [0x80,0xf7,0xf0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f7 f0 04 @@ -304,35 +304,35 @@ uqincd x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqincd z0.d // CHECK-INST: uqincd z0.d // CHECK-ENCODING: [0xe0,0xc7,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 e0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqincd z0.d, pow2, mul #16 // CHECK-INST: uqincd z0.d, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc4,0xef,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 ef 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqincd z0.d, pow2 // CHECK-INST: uqincd z0.d, pow2 // CHECK-ENCODING: 
[0x00,0xc4,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 e0 04 diff --git a/llvm/test/MC/AArch64/SVE/uqinch.s b/llvm/test/MC/AArch64/SVE/uqinch.s index c38656ba7edab..a98cf2bd0083b 100644 --- a/llvm/test/MC/AArch64/SVE/uqinch.s +++ b/llvm/test/MC/AArch64/SVE/uqinch.s @@ -17,25 +17,25 @@ uqinch x0 // CHECK-INST: uqinch x0 // CHECK-ENCODING: [0xe0,0xf7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 70 04 uqinch x0, all // CHECK-INST: uqinch x0 // CHECK-ENCODING: [0xe0,0xf7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 70 04 uqinch x0, all, mul #1 // CHECK-INST: uqinch x0 // CHECK-ENCODING: [0xe0,0xf7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 70 04 uqinch x0, all, mul #16 // CHECK-INST: uqinch x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf7,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 7f 04 @@ -46,37 +46,37 @@ uqinch x0, all, mul #16 uqinch w0 // CHECK-INST: uqinch w0 // CHECK-ENCODING: [0xe0,0xf7,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 60 04 uqinch w0, all // CHECK-INST: uqinch w0 // CHECK-ENCODING: [0xe0,0xf7,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 60 04 uqinch w0, all, mul #1 // CHECK-INST: uqinch w0 // CHECK-ENCODING: [0xe0,0xf7,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 
f7 60 04 uqinch w0, all, mul #16 // CHECK-INST: uqinch w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf7,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 6f 04 uqinch w0, pow2 // CHECK-INST: uqinch w0, pow2 // CHECK-ENCODING: [0x00,0xf4,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 60 04 uqinch w0, pow2, mul #16 // CHECK-INST: uqinch w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf4,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 6f 04 @@ -87,37 +87,37 @@ uqinch w0, pow2, mul #16 uqinch z0.h // CHECK-INST: uqinch z0.h // CHECK-ENCODING: [0xe0,0xc7,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 60 04 uqinch z0.h, all // CHECK-INST: uqinch z0.h // CHECK-ENCODING: [0xe0,0xc7,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 60 04 uqinch z0.h, all, mul #1 // CHECK-INST: uqinch z0.h // CHECK-ENCODING: [0xe0,0xc7,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 60 04 uqinch z0.h, all, mul #16 // CHECK-INST: uqinch z0.h, all, mul #16 // CHECK-ENCODING: [0xe0,0xc7,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 6f 04 uqinch z0.h, pow2 // CHECK-INST: uqinch z0.h, pow2 // CHECK-ENCODING: [0x00,0xc4,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 60 04 uqinch z0.h, pow2, mul #16 // CHECK-INST: uqinch z0.h, pow2, mul #16 
// CHECK-ENCODING: [0x00,0xc4,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 6f 04 @@ -128,175 +128,175 @@ uqinch z0.h, pow2, mul #16 uqinch x0, pow2 // CHECK-INST: uqinch x0, pow2 // CHECK-ENCODING: [0x00,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 70 04 uqinch x0, vl1 // CHECK-INST: uqinch x0, vl1 // CHECK-ENCODING: [0x20,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f4 70 04 uqinch x0, vl2 // CHECK-INST: uqinch x0, vl2 // CHECK-ENCODING: [0x40,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f4 70 04 uqinch x0, vl3 // CHECK-INST: uqinch x0, vl3 // CHECK-ENCODING: [0x60,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f4 70 04 uqinch x0, vl4 // CHECK-INST: uqinch x0, vl4 // CHECK-ENCODING: [0x80,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f4 70 04 uqinch x0, vl5 // CHECK-INST: uqinch x0, vl5 // CHECK-ENCODING: [0xa0,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f4 70 04 uqinch x0, vl6 // CHECK-INST: uqinch x0, vl6 // CHECK-ENCODING: [0xc0,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f4 70 04 uqinch x0, vl7 // CHECK-INST: uqinch x0, vl7 // CHECK-ENCODING: [0xe0,0xf4,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e0 f4 70 04 uqinch x0, vl8 // CHECK-INST: uqinch x0, vl8 // CHECK-ENCODING: [0x00,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f5 70 04 uqinch x0, vl16 // CHECK-INST: uqinch x0, vl16 // CHECK-ENCODING: [0x20,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f5 70 04 uqinch x0, vl32 // CHECK-INST: uqinch x0, vl32 // CHECK-ENCODING: [0x40,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f5 70 04 uqinch x0, vl64 // CHECK-INST: uqinch x0, vl64 // CHECK-ENCODING: [0x60,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f5 70 04 uqinch x0, vl128 // CHECK-INST: uqinch x0, vl128 // CHECK-ENCODING: [0x80,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f5 70 04 uqinch x0, vl256 // CHECK-INST: uqinch x0, vl256 // CHECK-ENCODING: [0xa0,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f5 70 04 uqinch x0, #14 // CHECK-INST: uqinch x0, #14 // CHECK-ENCODING: [0xc0,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f5 70 04 uqinch x0, #15 // CHECK-INST: uqinch x0, #15 // CHECK-ENCODING: [0xe0,0xf5,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f5 70 04 uqinch x0, #16 // CHECK-INST: uqinch x0, #16 // CHECK-ENCODING: [0x00,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f6 70 04 uqinch x0, #17 // CHECK-INST: uqinch x0, #17 // CHECK-ENCODING: [0x20,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f6 70 04 uqinch x0, #18 // CHECK-INST: uqinch x0, #18 // CHECK-ENCODING: [0x40,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f6 70 04 uqinch x0, #19 // CHECK-INST: uqinch x0, #19 // CHECK-ENCODING: [0x60,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f6 70 04 uqinch x0, #20 // CHECK-INST: uqinch x0, #20 // CHECK-ENCODING: [0x80,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f6 70 04 uqinch x0, #21 // CHECK-INST: uqinch x0, #21 // CHECK-ENCODING: [0xa0,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f6 70 04 uqinch x0, #22 // CHECK-INST: uqinch x0, #22 // CHECK-ENCODING: [0xc0,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f6 70 04 uqinch x0, #23 // CHECK-INST: uqinch x0, #23 // CHECK-ENCODING: [0xe0,0xf6,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f6 70 04 uqinch x0, #24 // CHECK-INST: uqinch x0, #24 // CHECK-ENCODING: [0x00,0xf7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f7 70 04 uqinch x0, #25 // CHECK-INST: uqinch x0, #25 // CHECK-ENCODING: [0x20,0xf7,0x70,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f7 70 04 uqinch x0, #26 // CHECK-INST: uqinch x0, #26 // CHECK-ENCODING: [0x40,0xf7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f7 70 04 uqinch x0, #27 // CHECK-INST: uqinch x0, #27 // CHECK-ENCODING: [0x60,0xf7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f7 70 04 uqinch x0, #28 // CHECK-INST: uqinch x0, #28 // CHECK-ENCODING: [0x80,0xf7,0x70,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f7 70 04 @@ -306,35 +306,35 @@ uqinch x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqinch z0.h // CHECK-INST: uqinch z0.h // CHECK-ENCODING: [0xe0,0xc7,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 60 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqinch z0.h, pow2, mul #16 // CHECK-INST: uqinch z0.h, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc4,0x6f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 6f 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 
uqinch z0.h, pow2 // CHECK-INST: uqinch z0.h, pow2 // CHECK-ENCODING: [0x00,0xc4,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 60 04 diff --git a/llvm/test/MC/AArch64/SVE/uqincp.s b/llvm/test/MC/AArch64/SVE/uqincp.s index 28e076f86b4a5..07c4143abc3c4 100644 --- a/llvm/test/MC/AArch64/SVE/uqincp.s +++ b/llvm/test/MC/AArch64/SVE/uqincp.s @@ -12,85 +12,85 @@ uqincp x0, p0.b // CHECK-INST: uqincp x0, p0.b // CHECK-ENCODING: [0x00,0x8c,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 29 25 uqincp x0, p0.h // CHECK-INST: uqincp x0, p0.h // CHECK-ENCODING: [0x00,0x8c,0x69,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c 69 25 uqincp x0, p0.s // CHECK-INST: uqincp x0, p0.s // CHECK-ENCODING: [0x00,0x8c,0xa9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c a9 25 uqincp x0, p0.d // CHECK-INST: uqincp x0, p0.d // CHECK-ENCODING: [0x00,0x8c,0xe9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 8c e9 25 uqincp wzr, p15.b // CHECK-INST: uqincp wzr, p15.b // CHECK-ENCODING: [0xff,0x89,0x29,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 29 25 uqincp wzr, p15.h // CHECK-INST: uqincp wzr, p15.h // CHECK-ENCODING: [0xff,0x89,0x69,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 69 25 uqincp wzr, p15.s // CHECK-INST: uqincp wzr, p15.s // CHECK-ENCODING: [0xff,0x89,0xa9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 a9 25 uqincp wzr, p15.d // CHECK-INST: uqincp wzr, p15.d // CHECK-ENCODING: [0xff,0x89,0xe9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 89 e9 25 uqincp z0.h, p0 // CHECK-INST: uqincp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x69,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 69 25 uqincp z0.h, p0.h // CHECK-INST: uqincp z0.h, p0.h // CHECK-ENCODING: [0x00,0x80,0x69,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 69 25 uqincp z0.s, p0 // CHECK-INST: uqincp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xa9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 a9 25 uqincp z0.s, p0.s // CHECK-INST: uqincp z0.s, p0.s // CHECK-ENCODING: [0x00,0x80,0xa9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 a9 25 uqincp z0.d, p0 // CHECK-INST: uqincp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xe9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e9 25 uqincp z0.d, p0.d // CHECK-INST: uqincp z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xe9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e9 25 @@ -100,11 +100,11 @@ uqincp z0.d, p0.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqincp z0.d, p0.d // CHECK-INST: uqincp 
z0.d, p0.d // CHECK-ENCODING: [0x00,0x80,0xe9,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 80 e9 25 diff --git a/llvm/test/MC/AArch64/SVE/uqincw.s b/llvm/test/MC/AArch64/SVE/uqincw.s index c644db8c7f3bb..5120894257047 100644 --- a/llvm/test/MC/AArch64/SVE/uqincw.s +++ b/llvm/test/MC/AArch64/SVE/uqincw.s @@ -16,25 +16,25 @@ uqincw x0 // CHECK-INST: uqincw x0 // CHECK-ENCODING: [0xe0,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 b0 04 uqincw x0, all // CHECK-INST: uqincw x0 // CHECK-ENCODING: [0xe0,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 b0 04 uqincw x0, all, mul #1 // CHECK-INST: uqincw x0 // CHECK-ENCODING: [0xe0,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 b0 04 uqincw x0, all, mul #16 // CHECK-INST: uqincw x0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf7,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 bf 04 @@ -45,37 +45,37 @@ uqincw x0, all, mul #16 uqincw w0 // CHECK-INST: uqincw w0 // CHECK-ENCODING: [0xe0,0xf7,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 a0 04 uqincw w0, all // CHECK-INST: uqincw w0 // CHECK-ENCODING: [0xe0,0xf7,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 a0 04 uqincw w0, all, mul #1 // CHECK-INST: uqincw w0 // CHECK-ENCODING: [0xe0,0xf7,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: e0 f7 a0 04 uqincw w0, all, mul #16 // CHECK-INST: uqincw w0, all, mul #16 // CHECK-ENCODING: [0xe0,0xf7,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f7 af 04 uqincw w0, pow2 // CHECK-INST: uqincw w0, pow2 // CHECK-ENCODING: [0x00,0xf4,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 a0 04 uqincw w0, pow2, mul #16 // CHECK-INST: uqincw w0, pow2, mul #16 // CHECK-ENCODING: [0x00,0xf4,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 af 04 @@ -85,37 +85,37 @@ uqincw w0, pow2, mul #16 uqincw z0.s // CHECK-INST: uqincw z0.s // CHECK-ENCODING: [0xe0,0xc7,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 a0 04 uqincw z0.s, all // CHECK-INST: uqincw z0.s // CHECK-ENCODING: [0xe0,0xc7,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 a0 04 uqincw z0.s, all, mul #1 // CHECK-INST: uqincw z0.s // CHECK-ENCODING: [0xe0,0xc7,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 a0 04 uqincw z0.s, all, mul #16 // CHECK-INST: uqincw z0.s, all, mul #16 // CHECK-ENCODING: [0xe0,0xc7,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 af 04 uqincw z0.s, pow2 // CHECK-INST: uqincw z0.s, pow2 // CHECK-ENCODING: [0x00,0xc4,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 a0 04 uqincw z0.s, pow2, mul #16 // CHECK-INST: 
uqincw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc4,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 af 04 @@ -126,175 +126,175 @@ uqincw z0.s, pow2, mul #16 uqincw x0, pow2 // CHECK-INST: uqincw x0, pow2 // CHECK-ENCODING: [0x00,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f4 b0 04 uqincw x0, vl1 // CHECK-INST: uqincw x0, vl1 // CHECK-ENCODING: [0x20,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f4 b0 04 uqincw x0, vl2 // CHECK-INST: uqincw x0, vl2 // CHECK-ENCODING: [0x40,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f4 b0 04 uqincw x0, vl3 // CHECK-INST: uqincw x0, vl3 // CHECK-ENCODING: [0x60,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f4 b0 04 uqincw x0, vl4 // CHECK-INST: uqincw x0, vl4 // CHECK-ENCODING: [0x80,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f4 b0 04 uqincw x0, vl5 // CHECK-INST: uqincw x0, vl5 // CHECK-ENCODING: [0xa0,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f4 b0 04 uqincw x0, vl6 // CHECK-INST: uqincw x0, vl6 // CHECK-ENCODING: [0xc0,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f4 b0 04 uqincw x0, vl7 // CHECK-INST: uqincw x0, vl7 // CHECK-ENCODING: [0xe0,0xf4,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f4 b0 04 uqincw x0, vl8 // CHECK-INST: uqincw x0, vl8 // CHECK-ENCODING: [0x00,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f5 b0 04 uqincw x0, vl16 // CHECK-INST: uqincw x0, vl16 // CHECK-ENCODING: [0x20,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f5 b0 04 uqincw x0, vl32 // CHECK-INST: uqincw x0, vl32 // CHECK-ENCODING: [0x40,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f5 b0 04 uqincw x0, vl64 // CHECK-INST: uqincw x0, vl64 // CHECK-ENCODING: [0x60,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f5 b0 04 uqincw x0, vl128 // CHECK-INST: uqincw x0, vl128 // CHECK-ENCODING: [0x80,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f5 b0 04 uqincw x0, vl256 // CHECK-INST: uqincw x0, vl256 // CHECK-ENCODING: [0xa0,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f5 b0 04 uqincw x0, #14 // CHECK-INST: uqincw x0, #14 // CHECK-ENCODING: [0xc0,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f5 b0 04 uqincw x0, #15 // CHECK-INST: uqincw x0, #15 // CHECK-ENCODING: [0xe0,0xf5,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f5 b0 04 uqincw x0, #16 // CHECK-INST: uqincw x0, #16 // CHECK-ENCODING: [0x00,0xf6,0xb0,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f6 b0 04 uqincw x0, #17 // CHECK-INST: uqincw x0, #17 // CHECK-ENCODING: [0x20,0xf6,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f6 b0 04 uqincw x0, #18 // CHECK-INST: uqincw x0, #18 // CHECK-ENCODING: [0x40,0xf6,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f6 b0 04 uqincw x0, #19 // CHECK-INST: uqincw x0, #19 // CHECK-ENCODING: [0x60,0xf6,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f6 b0 04 uqincw x0, #20 // CHECK-INST: uqincw x0, #20 // CHECK-ENCODING: [0x80,0xf6,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f6 b0 04 uqincw x0, #21 // CHECK-INST: uqincw x0, #21 // CHECK-ENCODING: [0xa0,0xf6,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: a0 f6 b0 04 uqincw x0, #22 // CHECK-INST: uqincw x0, #22 // CHECK-ENCODING: [0xc0,0xf6,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c0 f6 b0 04 uqincw x0, #23 // CHECK-INST: uqincw x0, #23 // CHECK-ENCODING: [0xe0,0xf6,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 f6 b0 04 uqincw x0, #24 // CHECK-INST: uqincw x0, #24 // CHECK-ENCODING: [0x00,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 f7 b0 04 uqincw x0, #25 // CHECK-INST: uqincw x0, #25 // 
CHECK-ENCODING: [0x20,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 20 f7 b0 04 uqincw x0, #26 // CHECK-INST: uqincw x0, #26 // CHECK-ENCODING: [0x40,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 40 f7 b0 04 uqincw x0, #27 // CHECK-INST: uqincw x0, #27 // CHECK-ENCODING: [0x60,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 60 f7 b0 04 uqincw x0, #28 // CHECK-INST: uqincw x0, #28 // CHECK-ENCODING: [0x80,0xf7,0xb0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 80 f7 b0 04 @@ -304,35 +304,35 @@ uqincw x0, #28 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqincw z0.s // CHECK-INST: uqincw z0.s // CHECK-ENCODING: [0xe0,0xc7,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 c7 a0 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqincw z0.s, pow2, mul #16 // CHECK-INST: uqincw z0.s, pow2, mul #16 // CHECK-ENCODING: [0x00,0xc4,0xaf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 af 04 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: 
sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uqincw z0.s, pow2 // CHECK-INST: uqincw z0.s, pow2 // CHECK-ENCODING: [0x00,0xc4,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c4 a0 04 diff --git a/llvm/test/MC/AArch64/SVE/uqsub.s b/llvm/test/MC/AArch64/SVE/uqsub.s index af44866001758..b1e219662d108 100644 --- a/llvm/test/MC/AArch64/SVE/uqsub.s +++ b/llvm/test/MC/AArch64/SVE/uqsub.s @@ -13,109 +13,109 @@ uqsub z0.b, z0.b, z0.b // CHECK-INST: uqsub z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x1c,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 1c 20 04 uqsub z0.h, z0.h, z0.h // CHECK-INST: uqsub z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x1c,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 1c 60 04 uqsub z0.s, z0.s, z0.s // CHECK-INST: uqsub z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x1c,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 1c a0 04 uqsub z0.d, z0.d, z0.d // CHECK-INST: uqsub z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x1c,0xe0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 1c e0 04 uqsub z0.b, z0.b, #0 // CHECK-INST: uqsub z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xc0,0x27,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 27 25 uqsub z31.b, z31.b, #255 // CHECK-INST: uqsub z31.b, z31.b, #255 // CHECK-ENCODING: [0xff,0xdf,0x27,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff df 27 25 uqsub z0.h, z0.h, #0 // CHECK-INST: uqsub z0.h, 
z0.h, #0 // CHECK-ENCODING: [0x00,0xc0,0x67,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 67 25 uqsub z0.h, z0.h, #0, lsl #8 // CHECK-INST: uqsub z0.h, z0.h, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0x67,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 67 25 uqsub z31.h, z31.h, #255, lsl #8 // CHECK-INST: uqsub z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x67,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 67 25 uqsub z31.h, z31.h, #65280 // CHECK-INST: uqsub z31.h, z31.h, #65280 // CHECK-ENCODING: [0xff,0xff,0x67,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff 67 25 uqsub z0.s, z0.s, #0 // CHECK-INST: uqsub z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xc0,0xa7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 a7 25 uqsub z0.s, z0.s, #0, lsl #8 // CHECK-INST: uqsub z0.s, z0.s, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xa7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 a7 25 uqsub z31.s, z31.s, #255, lsl #8 // CHECK-INST: uqsub z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a7 25 uqsub z31.s, z31.s, #65280 // CHECK-INST: uqsub z31.s, z31.s, #65280 // CHECK-ENCODING: [0xff,0xff,0xa7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff a7 25 uqsub z0.d, z0.d, #0 // CHECK-INST: uqsub z0.d, 
z0.d, #0 // CHECK-ENCODING: [0x00,0xc0,0xe7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 c0 e7 25 uqsub z0.d, z0.d, #0, lsl #8 // CHECK-INST: uqsub z0.d, z0.d, #0, lsl #8 // CHECK-ENCODING: [0x00,0xe0,0xe7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 e0 e7 25 uqsub z31.d, z31.d, #255, lsl #8 // CHECK-INST: uqsub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e7 25 uqsub z31.d, z31.d, #65280 // CHECK-INST: uqsub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e7 25 @@ -125,11 +125,11 @@ uqsub z31.d, z31.d, #65280 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqsub z31.d, z31.d, #65280 // CHECK-INST: uqsub z31.d, z31.d, #65280 // CHECK-ENCODING: [0xff,0xff,0xe7,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff ff e7 25 diff --git a/llvm/test/MC/AArch64/SVE/uunpkhi.s b/llvm/test/MC/AArch64/SVE/uunpkhi.s index d1680973aabb0..32ea44637d231 100644 --- a/llvm/test/MC/AArch64/SVE/uunpkhi.s +++ b/llvm/test/MC/AArch64/SVE/uunpkhi.s @@ -12,17 +12,17 @@ uunpkhi z31.h, z31.b // CHECK-INST: uunpkhi z31.h, z31.b // CHECK-ENCODING: [0xff,0x3b,0x73,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 73 05 uunpkhi z31.s, z31.h // CHECK-INST: uunpkhi z31.s, 
z31.h // CHECK-ENCODING: [0xff,0x3b,0xb3,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b b3 05 uunpkhi z31.d, z31.s // CHECK-INST: uunpkhi z31.d, z31.s // CHECK-ENCODING: [0xff,0x3b,0xf3,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b f3 05 diff --git a/llvm/test/MC/AArch64/SVE/uunpklo.s b/llvm/test/MC/AArch64/SVE/uunpklo.s index bdd69a35cdf8d..e9fa49ce315ec 100644 --- a/llvm/test/MC/AArch64/SVE/uunpklo.s +++ b/llvm/test/MC/AArch64/SVE/uunpklo.s @@ -12,17 +12,17 @@ uunpklo z31.h, z31.b // CHECK-INST: uunpklo z31.h, z31.b // CHECK-ENCODING: [0xff,0x3b,0x72,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b 72 05 uunpklo z31.s, z31.h // CHECK-INST: uunpklo z31.s, z31.h // CHECK-ENCODING: [0xff,0x3b,0xb2,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b b2 05 uunpklo z31.d, z31.s // CHECK-INST: uunpklo z31.d, z31.s // CHECK-ENCODING: [0xff,0x3b,0xf2,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 3b f2 05 diff --git a/llvm/test/MC/AArch64/SVE/uxtb.s b/llvm/test/MC/AArch64/SVE/uxtb.s index f4fdd6635673e..23548252a4273 100644 --- a/llvm/test/MC/AArch64/SVE/uxtb.s +++ b/llvm/test/MC/AArch64/SVE/uxtb.s @@ -12,37 +12,37 @@ uxtb z0.h, p0/m, z0.h // CHECK-INST: uxtb z0.h, p0/m, z0.h // CHECK-ENCODING: [0x00,0xa0,0x51,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 51 04 uxtb z0.s, p0/m, z0.s // CHECK-INST: uxtb z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x91,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 91 04 uxtb z0.d, p0/m, z0.d // CHECK-INST: uxtb z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d1 04 uxtb z31.h, p7/m, z31.h // CHECK-INST: uxtb z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x51,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 51 04 uxtb z31.s, p7/m, z31.s // CHECK-INST: uxtb z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x91,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 91 04 uxtb z31.d, p7/m, z31.d // CHECK-INST: uxtb z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d1 04 @@ -52,23 +52,23 @@ uxtb z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 uxtb z4.d, p7/m, z31.d // CHECK-INST: uxtb z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d1 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 uxtb z4.d, p7/m, z31.d // CHECK-INST: uxtb z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e4 bf d1 04 diff --git a/llvm/test/MC/AArch64/SVE/uxth.s b/llvm/test/MC/AArch64/SVE/uxth.s index 8ad1c4ce93ace..43bfa54f708c2 100644 --- a/llvm/test/MC/AArch64/SVE/uxth.s +++ b/llvm/test/MC/AArch64/SVE/uxth.s @@ -12,25 +12,25 @@ uxth z0.s, p0/m, z0.s // CHECK-INST: uxth z0.s, p0/m, z0.s // CHECK-ENCODING: [0x00,0xa0,0x93,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 93 04 uxth z0.d, p0/m, z0.d // CHECK-INST: uxth z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d3 04 uxth z31.s, p7/m, z31.s // CHECK-INST: uxth z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x93,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf 93 04 uxth z31.d, p7/m, z31.d // CHECK-INST: uxth z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d3 04 @@ -40,23 +40,23 @@ uxth z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 uxth z4.d, p7/m, z31.d // CHECK-INST: uxth z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d3 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 uxth 
z4.d, p7/m, z31.d // CHECK-INST: uxth z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd3,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d3 04 diff --git a/llvm/test/MC/AArch64/SVE/uxtw.s b/llvm/test/MC/AArch64/SVE/uxtw.s index 63b30f237d907..435f733c9d10b 100644 --- a/llvm/test/MC/AArch64/SVE/uxtw.s +++ b/llvm/test/MC/AArch64/SVE/uxtw.s @@ -12,13 +12,13 @@ uxtw z0.d, p0/m, z0.d // CHECK-INST: uxtw z0.d, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 a0 d5 04 uxtw z31.d, p7/m, z31.d // CHECK-INST: uxtw z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bf d5 04 @@ -28,23 +28,23 @@ uxtw z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 uxtw z4.d, p7/m, z31.d // CHECK-INST: uxtw z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d5 04 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 uxtw z4.d, p7/m, z31.d // CHECK-INST: uxtw z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0xd5,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e4 bf d5 04 diff --git a/llvm/test/MC/AArch64/SVE/uzp1.s 
b/llvm/test/MC/AArch64/SVE/uzp1.s index f94f0629614d7..08f6300fcbae3 100644 --- a/llvm/test/MC/AArch64/SVE/uzp1.s +++ b/llvm/test/MC/AArch64/SVE/uzp1.s @@ -12,47 +12,47 @@ uzp1 z31.b, z31.b, z31.b // CHECK-INST: uzp1 z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x6b,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6b 3f 05 uzp1 z31.h, z31.h, z31.h // CHECK-INST: uzp1 z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x6b,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6b 7f 05 uzp1 z31.s, z31.s, z31.s // CHECK-INST: uzp1 z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x6b,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6b bf 05 uzp1 z31.d, z31.d, z31.d // CHECK-INST: uzp1 z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x6b,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6b ff 05 uzp1 p15.b, p15.b, p15.b // CHECK-INST: uzp1 p15.b, p15.b, p15.b // CHECK-ENCODING: [0xef,0x49,0x2f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 49 2f 05 uzp1 p15.s, p15.s, p15.s // CHECK-INST: uzp1 p15.s, p15.s, p15.s // CHECK-ENCODING: [0xef,0x49,0xaf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 49 af 05 uzp1 p15.h, p15.h, p15.h // CHECK-INST: uzp1 p15.h, p15.h, p15.h // CHECK-ENCODING: [0xef,0x49,0x6f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 49 6f 05 uzp1 p15.d, p15.d, p15.d // CHECK-INST: uzp1 p15.d, p15.d, p15.d // CHECK-ENCODING: [0xef,0x49,0xef,0x05] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 49 ef 05 diff --git a/llvm/test/MC/AArch64/SVE/uzp2.s b/llvm/test/MC/AArch64/SVE/uzp2.s index c8ccacc5b7779..5a9285042811a 100644 --- a/llvm/test/MC/AArch64/SVE/uzp2.s +++ b/llvm/test/MC/AArch64/SVE/uzp2.s @@ -12,47 +12,47 @@ uzp2 z31.b, z31.b, z31.b // CHECK-INST: uzp2 z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x6f,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6f 3f 05 uzp2 z31.h, z31.h, z31.h // CHECK-INST: uzp2 z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x6f,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6f 7f 05 uzp2 z31.s, z31.s, z31.s // CHECK-INST: uzp2 z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x6f,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6f bf 05 uzp2 z31.d, z31.d, z31.d // CHECK-INST: uzp2 z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x6f,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 6f ff 05 uzp2 p15.b, p15.b, p15.b // CHECK-INST: uzp2 p15.b, p15.b, p15.b // CHECK-ENCODING: [0xef,0x4d,0x2f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 4d 2f 05 uzp2 p15.s, p15.s, p15.s // CHECK-INST: uzp2 p15.s, p15.s, p15.s // CHECK-ENCODING: [0xef,0x4d,0xaf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 4d af 05 uzp2 p15.h, p15.h, p15.h // CHECK-INST: uzp2 p15.h, p15.h, p15.h // CHECK-ENCODING: [0xef,0x4d,0x6f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 4d 6f 05 uzp2 p15.d, p15.d, p15.d // CHECK-INST: uzp2 p15.d, p15.d, p15.d // CHECK-ENCODING: [0xef,0x4d,0xef,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 4d ef 05 diff --git a/llvm/test/MC/AArch64/SVE/whilele.s b/llvm/test/MC/AArch64/SVE/whilele.s index 6f1d519aa1bcd..a6cd350e5a4f1 100644 --- a/llvm/test/MC/AArch64/SVE/whilele.s +++ b/llvm/test/MC/AArch64/SVE/whilele.s @@ -12,59 +12,59 @@ whilele p15.b, xzr, x0 // CHECK-INST: whilele p15.b, xzr, x0 // CHECK-ENCODING: [0xff,0x17,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 17 20 25 whilele p15.b, x0, xzr // CHECK-INST: whilele p15.b, x0, xzr // CHECK-ENCODING: [0x1f,0x14,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 14 3f 25 whilele p15.b, wzr, w0 // CHECK-INST: whilele p15.b, wzr, w0 // CHECK-ENCODING: [0xff,0x07,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 07 20 25 whilele p15.b, w0, wzr // CHECK-INST: whilele p15.b, w0, wzr // CHECK-ENCODING: [0x1f,0x04,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 04 3f 25 whilele p15.h, x0, xzr // CHECK-INST: whilele p15.h, x0, xzr // CHECK-ENCODING: [0x1f,0x14,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 14 7f 25 whilele p15.h, w0, wzr // CHECK-INST: whilele p15.h, w0, wzr // CHECK-ENCODING: [0x1f,0x04,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 
04 7f 25 whilele p15.s, x0, xzr // CHECK-INST: whilele p15.s, x0, xzr // CHECK-ENCODING: [0x1f,0x14,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 14 bf 25 whilele p15.s, w0, wzr // CHECK-INST: whilele p15.s, w0, wzr // CHECK-ENCODING: [0x1f,0x04,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 04 bf 25 whilele p15.d, w0, wzr // CHECK-INST: whilele p15.d, w0, wzr // CHECK-ENCODING: [0x1f,0x04,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 04 ff 25 whilele p15.d, x0, xzr // CHECK-INST: whilele p15.d, x0, xzr // CHECK-ENCODING: [0x1f,0x14,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 14 ff 25 diff --git a/llvm/test/MC/AArch64/SVE/whilelo.s b/llvm/test/MC/AArch64/SVE/whilelo.s index 0246b0f4c3917..4d083d24aec58 100644 --- a/llvm/test/MC/AArch64/SVE/whilelo.s +++ b/llvm/test/MC/AArch64/SVE/whilelo.s @@ -12,59 +12,59 @@ whilelo p15.b, xzr, x0 // CHECK-INST: whilelo p15.b, xzr, x0 // CHECK-ENCODING: [0xef,0x1f,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 1f 20 25 whilelo p15.b, x0, xzr // CHECK-INST: whilelo p15.b, x0, xzr // CHECK-ENCODING: [0x0f,0x1c,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 1c 3f 25 whilelo p15.b, wzr, w0 // CHECK-INST: whilelo p15.b, wzr, w0 // CHECK-ENCODING: [0xef,0x0f,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 0f 20 25 whilelo p15.b, w0, wzr // CHECK-INST: whilelo p15.b, w0, wzr // 
CHECK-ENCODING: [0x0f,0x0c,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 0c 3f 25 whilelo p15.h, x0, xzr // CHECK-INST: whilelo p15.h, x0, xzr // CHECK-ENCODING: [0x0f,0x1c,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 1c 7f 25 whilelo p15.h, w0, wzr // CHECK-INST: whilelo p15.h, w0, wzr // CHECK-ENCODING: [0x0f,0x0c,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 0c 7f 25 whilelo p15.s, x0, xzr // CHECK-INST: whilelo p15.s, x0, xzr // CHECK-ENCODING: [0x0f,0x1c,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 1c bf 25 whilelo p15.s, w0, wzr // CHECK-INST: whilelo p15.s, w0, wzr // CHECK-ENCODING: [0x0f,0x0c,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 0c bf 25 whilelo p15.d, w0, wzr // CHECK-INST: whilelo p15.d, w0, wzr // CHECK-ENCODING: [0x0f,0x0c,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 0c ff 25 whilelo p15.d, x0, xzr // CHECK-INST: whilelo p15.d, x0, xzr // CHECK-ENCODING: [0x0f,0x1c,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 1c ff 25 diff --git a/llvm/test/MC/AArch64/SVE/whilels.s b/llvm/test/MC/AArch64/SVE/whilels.s index 39342d72d3d5a..4ad0e3ef99f7b 100644 --- a/llvm/test/MC/AArch64/SVE/whilels.s +++ b/llvm/test/MC/AArch64/SVE/whilels.s @@ -12,59 +12,59 @@ whilels p15.b, xzr, x0 // CHECK-INST: whilels p15.b, xzr, x0 // CHECK-ENCODING: [0xff,0x1f,0x20,0x25] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 1f 20 25 whilels p15.b, x0, xzr // CHECK-INST: whilels p15.b, x0, xzr // CHECK-ENCODING: [0x1f,0x1c,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 1c 3f 25 whilels p15.b, wzr, w0 // CHECK-INST: whilels p15.b, wzr, w0 // CHECK-ENCODING: [0xff,0x0f,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 0f 20 25 whilels p15.b, w0, wzr // CHECK-INST: whilels p15.b, w0, wzr // CHECK-ENCODING: [0x1f,0x0c,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 0c 3f 25 whilels p15.h, x0, xzr // CHECK-INST: whilels p15.h, x0, xzr // CHECK-ENCODING: [0x1f,0x1c,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 1c 7f 25 whilels p15.h, w0, wzr // CHECK-INST: whilels p15.h, w0, wzr // CHECK-ENCODING: [0x1f,0x0c,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 0c 7f 25 whilels p15.s, x0, xzr // CHECK-INST: whilels p15.s, x0, xzr // CHECK-ENCODING: [0x1f,0x1c,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 1c bf 25 whilels p15.s, w0, wzr // CHECK-INST: whilels p15.s, w0, wzr // CHECK-ENCODING: [0x1f,0x0c,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 0c bf 25 whilels p15.d, w0, wzr // CHECK-INST: whilels p15.d, w0, wzr // CHECK-ENCODING: [0x1f,0x0c,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve 
or sme // CHECK-UNKNOWN: 1f 0c ff 25 whilels p15.d, x0, xzr // CHECK-INST: whilels p15.d, x0, xzr // CHECK-ENCODING: [0x1f,0x1c,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 1f 1c ff 25 diff --git a/llvm/test/MC/AArch64/SVE/whilelt.s b/llvm/test/MC/AArch64/SVE/whilelt.s index 4c915305b1d3b..5b7f4f16efc00 100644 --- a/llvm/test/MC/AArch64/SVE/whilelt.s +++ b/llvm/test/MC/AArch64/SVE/whilelt.s @@ -12,59 +12,59 @@ whilelt p15.b, xzr, x0 // CHECK-INST: whilelt p15.b, xzr, x0 // CHECK-ENCODING: [0xef,0x17,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 17 20 25 whilelt p15.b, x0, xzr // CHECK-INST: whilelt p15.b, x0, xzr // CHECK-ENCODING: [0x0f,0x14,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 14 3f 25 whilelt p15.b, wzr, w0 // CHECK-INST: whilelt p15.b, wzr, w0 // CHECK-ENCODING: [0xef,0x07,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 07 20 25 whilelt p15.b, w0, wzr // CHECK-INST: whilelt p15.b, w0, wzr // CHECK-ENCODING: [0x0f,0x04,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 04 3f 25 whilelt p15.h, x0, xzr // CHECK-INST: whilelt p15.h, x0, xzr // CHECK-ENCODING: [0x0f,0x14,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 14 7f 25 whilelt p15.h, w0, wzr // CHECK-INST: whilelt p15.h, w0, wzr // CHECK-ENCODING: [0x0f,0x04,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 04 7f 25 whilelt p15.s, x0, xzr // CHECK-INST: 
whilelt p15.s, x0, xzr // CHECK-ENCODING: [0x0f,0x14,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 14 bf 25 whilelt p15.s, w0, wzr // CHECK-INST: whilelt p15.s, w0, wzr // CHECK-ENCODING: [0x0f,0x04,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 04 bf 25 whilelt p15.d, w0, wzr // CHECK-INST: whilelt p15.d, w0, wzr // CHECK-ENCODING: [0x0f,0x04,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 04 ff 25 whilelt p15.d, x0, xzr // CHECK-INST: whilelt p15.d, x0, xzr // CHECK-ENCODING: [0x0f,0x14,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 0f 14 ff 25 diff --git a/llvm/test/MC/AArch64/SVE/zip1.s b/llvm/test/MC/AArch64/SVE/zip1.s index bacef95cdf81f..c416bbc563a21 100644 --- a/llvm/test/MC/AArch64/SVE/zip1.s +++ b/llvm/test/MC/AArch64/SVE/zip1.s @@ -12,95 +12,95 @@ zip1 z0.b, z0.b, z0.b // CHECK-INST: zip1 z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x60,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 20 05 zip1 z0.h, z0.h, z0.h // CHECK-INST: zip1 z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x60,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 60 05 zip1 z0.s, z0.s, z0.s // CHECK-INST: zip1 z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x60,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 a0 05 zip1 z0.d, z0.d, z0.d // CHECK-INST: zip1 z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x60,0xe0,0x05] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 60 e0 05 zip1 z31.b, z31.b, z31.b // CHECK-INST: zip1 z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x63,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 63 3f 05 zip1 z31.h, z31.h, z31.h // CHECK-INST: zip1 z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x63,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 63 7f 05 zip1 z31.s, z31.s, z31.s // CHECK-INST: zip1 z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x63,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 63 bf 05 zip1 z31.d, z31.d, z31.d // CHECK-INST: zip1 z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x63,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 63 ff 05 zip1 p0.b, p0.b, p0.b // CHECK-INST: zip1 p0.b, p0.b, p0.b // CHECK-ENCODING: [0x00,0x40,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 20 05 zip1 p0.h, p0.h, p0.h // CHECK-INST: zip1 p0.h, p0.h, p0.h // CHECK-ENCODING: [0x00,0x40,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 60 05 zip1 p0.s, p0.s, p0.s // CHECK-INST: zip1 p0.s, p0.s, p0.s // CHECK-ENCODING: [0x00,0x40,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 a0 05 zip1 p0.d, p0.d, p0.d // CHECK-INST: zip1 p0.d, p0.d, p0.d // CHECK-ENCODING: [0x00,0x40,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: 00 40 e0 05 zip1 p15.b, p15.b, p15.b // CHECK-INST: zip1 p15.b, p15.b, p15.b // CHECK-ENCODING: [0xef,0x41,0x2f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 41 2f 05 zip1 p15.s, p15.s, p15.s // CHECK-INST: zip1 p15.s, p15.s, p15.s // CHECK-ENCODING: [0xef,0x41,0xaf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 41 af 05 zip1 p15.h, p15.h, p15.h // CHECK-INST: zip1 p15.h, p15.h, p15.h // CHECK-ENCODING: [0xef,0x41,0x6f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 41 6f 05 zip1 p15.d, p15.d, p15.d // CHECK-INST: zip1 p15.d, p15.d, p15.d // CHECK-ENCODING: [0xef,0x41,0xef,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 41 ef 05 diff --git a/llvm/test/MC/AArch64/SVE/zip2.s b/llvm/test/MC/AArch64/SVE/zip2.s index 56eb686079659..7944b3804b8b8 100644 --- a/llvm/test/MC/AArch64/SVE/zip2.s +++ b/llvm/test/MC/AArch64/SVE/zip2.s @@ -12,95 +12,95 @@ zip2 z0.b, z0.b, z0.b // CHECK-INST: zip2 z0.b, z0.b, z0.b // CHECK-ENCODING: [0x00,0x64,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 64 20 05 zip2 z0.h, z0.h, z0.h // CHECK-INST: zip2 z0.h, z0.h, z0.h // CHECK-ENCODING: [0x00,0x64,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 64 60 05 zip2 z0.s, z0.s, z0.s // CHECK-INST: zip2 z0.s, z0.s, z0.s // CHECK-ENCODING: [0x00,0x64,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 64 a0 05 zip2 z0.d, 
z0.d, z0.d // CHECK-INST: zip2 z0.d, z0.d, z0.d // CHECK-ENCODING: [0x00,0x64,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 64 e0 05 zip2 z31.b, z31.b, z31.b // CHECK-INST: zip2 z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x67,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 67 3f 05 zip2 z31.h, z31.h, z31.h // CHECK-INST: zip2 z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x67,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 67 7f 05 zip2 z31.s, z31.s, z31.s // CHECK-INST: zip2 z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x67,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 67 bf 05 zip2 z31.d, z31.d, z31.d // CHECK-INST: zip2 z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x67,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff 67 ff 05 zip2 p0.b, p0.b, p0.b // CHECK-INST: zip2 p0.b, p0.b, p0.b // CHECK-ENCODING: [0x00,0x44,0x20,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 44 20 05 zip2 p0.h, p0.h, p0.h // CHECK-INST: zip2 p0.h, p0.h, p0.h // CHECK-ENCODING: [0x00,0x44,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 44 60 05 zip2 p0.s, p0.s, p0.s // CHECK-INST: zip2 p0.s, p0.s, p0.s // CHECK-ENCODING: [0x00,0x44,0xa0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 44 a0 05 zip2 p0.d, p0.d, p0.d // CHECK-INST: zip2 p0.d, p0.d, p0.d // 
CHECK-ENCODING: [0x00,0x44,0xe0,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 00 44 e0 05 zip2 p15.b, p15.b, p15.b // CHECK-INST: zip2 p15.b, p15.b, p15.b // CHECK-ENCODING: [0xef,0x45,0x2f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 45 2f 05 zip2 p15.h, p15.h, p15.h // CHECK-INST: zip2 p15.h, p15.h, p15.h // CHECK-ENCODING: [0xef,0x45,0x6f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 45 6f 05 zip2 p15.s, p15.s, p15.s // CHECK-INST: zip2 p15.s, p15.s, p15.s // CHECK-ENCODING: [0xef,0x45,0xaf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 45 af 05 zip2 p15.d, p15.d, p15.d // CHECK-INST: zip2 p15.d, p15.d, p15.d // CHECK-ENCODING: [0xef,0x45,0xef,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ef 45 ef 05 diff --git a/llvm/test/MC/AArch64/SVE2/adclb.s b/llvm/test/MC/AArch64/SVE2/adclb.s index 4903e4fbe966e..e3150a458a927 100644 --- a/llvm/test/MC/AArch64/SVE2/adclb.s +++ b/llvm/test/MC/AArch64/SVE2/adclb.s @@ -12,13 +12,13 @@ adclb z0.s, z1.s, z31.s // CHECK-INST: adclb z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xd0,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d0 1f 45 adclb z0.d, z1.d, z31.d // CHECK-INST: adclb z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd0,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d0 5f 45 @@ -28,11 +28,11 @@ adclb z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: 
[0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 adclb z0.d, z1.d, z31.d // CHECK-INST: adclb z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd0,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d0 5f 45 diff --git a/llvm/test/MC/AArch64/SVE2/adclt.s b/llvm/test/MC/AArch64/SVE2/adclt.s index 97f812875a0bf..57cfbd888e372 100644 --- a/llvm/test/MC/AArch64/SVE2/adclt.s +++ b/llvm/test/MC/AArch64/SVE2/adclt.s @@ -12,13 +12,13 @@ adclt z0.s, z1.s, z31.s // CHECK-INST: adclt z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xd4,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d4 1f 45 adclt z0.d, z1.d, z31.d // CHECK-INST: adclt z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd4,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d4 5f 45 @@ -28,11 +28,11 @@ adclt z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 adclt z0.d, z1.d, z31.d // CHECK-INST: adclt z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd4,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d4 5f 45 diff --git a/llvm/test/MC/AArch64/SVE2/addhnb.s b/llvm/test/MC/AArch64/SVE2/addhnb.s index 734da7ac3a039..5071e3bff0977 100644 --- a/llvm/test/MC/AArch64/SVE2/addhnb.s +++ b/llvm/test/MC/AArch64/SVE2/addhnb.s @@ -13,17 +13,17 @@ addhnb z0.b, z1.h, z31.h // CHECK-INST: addhnb z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x60,0x7f,0x45] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 7f 45 addhnb z0.h, z1.s, z31.s // CHECK-INST: addhnb z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x60,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 bf 45 addhnb z0.s, z1.d, z31.d // CHECK-INST: addhnb z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x60,0xff,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/addhnt.s b/llvm/test/MC/AArch64/SVE2/addhnt.s index 120a218a41168..9b69b71fe8bc6 100644 --- a/llvm/test/MC/AArch64/SVE2/addhnt.s +++ b/llvm/test/MC/AArch64/SVE2/addhnt.s @@ -13,17 +13,17 @@ addhnt z0.b, z1.h, z31.h // CHECK-INST: addhnt z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x64,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 7f 45 addhnt z0.h, z1.s, z31.s // CHECK-INST: addhnt z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x64,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 bf 45 addhnt z0.s, z1.d, z31.d // CHECK-INST: addhnt z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x64,0xff,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/addp.s b/llvm/test/MC/AArch64/SVE2/addp.s index 4147f9c259194..9e90f8997e3b2 100644 --- a/llvm/test/MC/AArch64/SVE2/addp.s +++ b/llvm/test/MC/AArch64/SVE2/addp.s @@ -12,25 +12,25 @@ addp z0.b, p0/m, z0.b, z1.b // CHECK-INST: addp z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0xa0,0x11,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 11 44 addp z0.h, p0/m, z0.h, z1.h // CHECK-INST: addp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0xa0,0x51,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 51 44 addp z29.s, p7/m, z29.s, z30.s // CHECK-INST: addp z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0xbf,0x91,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd bf 91 44 addp z31.d, p7/m, z31.d, z30.d // CHECK-INST: addp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d1 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ addp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 addp z31.d, p0/m, z31.d, z30.d // CHECK-INST: addp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xa3,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 d1 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 addp z31.d, p7/m, z31.d, z30.d // CHECK-INST: addp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d1 44 diff --git 
a/llvm/test/MC/AArch64/SVE2/bcax.s b/llvm/test/MC/AArch64/SVE2/bcax.s index 5c23d7fbe3c98..36752595c822d 100644 --- a/llvm/test/MC/AArch64/SVE2/bcax.s +++ b/llvm/test/MC/AArch64/SVE2/bcax.s @@ -12,7 +12,7 @@ bcax z29.d, z29.d, z30.d, z31.d // CHECK-INST: bcax z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x7e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 7e 04 @@ -22,19 +22,19 @@ bcax z29.d, z29.d, z30.d, z31.d bcax z29.b, z29.b, z30.b, z31.b // CHECK-INST: bcax z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x7e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 7e 04 bcax z29.h, z29.h, z30.h, z31.h // CHECK-INST: bcax z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x7e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 7e 04 bcax z29.s, z29.s, z30.s, z31.s // CHECK-INST: bcax z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x7e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 7e 04 @@ -44,11 +44,11 @@ bcax z29.s, z29.s, z30.s, z31.s movprfx z31, z7 // CHECK-INST: movprfx z31, z7 // CHECK-ENCODING: [0xff,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bc 20 04 bcax z31.d, z31.d, z30.d, z29.d // CHECK-INST: bcax z31.d, z31.d, z30.d, z29.d // CHECK-ENCODING: [0xbf,0x3b,0x7e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bf 3b 7e 04 diff --git a/llvm/test/MC/AArch64/SVE2/bsl.s b/llvm/test/MC/AArch64/SVE2/bsl.s index 7a7172a885223..a3ec691c4fdfb 100644 --- 
a/llvm/test/MC/AArch64/SVE2/bsl.s +++ b/llvm/test/MC/AArch64/SVE2/bsl.s @@ -12,7 +12,7 @@ bsl z0.d, z0.d, z1.d, z2.d // CHECK-INST: bsl z0.d, z0.d, z1.d, z2.d // CHECK-ENCODING: [0x40,0x3c,0x21,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 40 3c 21 04 @@ -22,11 +22,11 @@ bsl z0.d, z0.d, z1.d, z2.d movprfx z31, z7 // CHECK-INST: movprfx z31, z7 // CHECK-ENCODING: [0xff,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bc 20 04 bsl z31.d, z31.d, z30.d, z29.d // CHECK-INST: bsl z31.d, z31.d, z30.d, z29.d // CHECK-ENCODING: [0xbf,0x3f,0x3e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bf 3f 3e 04 diff --git a/llvm/test/MC/AArch64/SVE2/bsl1n.s b/llvm/test/MC/AArch64/SVE2/bsl1n.s index 49d1a7e5afbbb..15438341b3aff 100644 --- a/llvm/test/MC/AArch64/SVE2/bsl1n.s +++ b/llvm/test/MC/AArch64/SVE2/bsl1n.s @@ -12,7 +12,7 @@ bsl1n z0.d, z0.d, z1.d, z2.d // CHECK-INST: bsl1n z0.d, z0.d, z1.d, z2.d // CHECK-ENCODING: [0x40,0x3c,0x61,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 40 3c 61 04 @@ -22,11 +22,11 @@ bsl1n z0.d, z0.d, z1.d, z2.d movprfx z31, z7 // CHECK-INST: movprfx z31, z7 // CHECK-ENCODING: [0xff,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bc 20 04 bsl1n z31.d, z31.d, z30.d, z29.d // CHECK-INST: bsl1n z31.d, z31.d, z30.d, z29.d // CHECK-ENCODING: [0xbf,0x3f,0x7e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bf 3f 7e 04 diff --git a/llvm/test/MC/AArch64/SVE2/bsl2n.s b/llvm/test/MC/AArch64/SVE2/bsl2n.s index 
77d5221053672..623f6e3605a2f 100644 --- a/llvm/test/MC/AArch64/SVE2/bsl2n.s +++ b/llvm/test/MC/AArch64/SVE2/bsl2n.s @@ -12,7 +12,7 @@ bsl2n z0.d, z0.d, z1.d, z2.d // CHECK-INST: bsl2n z0.d, z0.d, z1.d, z2.d // CHECK-ENCODING: [0x40,0x3c,0xa1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 40 3c a1 04 @@ -22,11 +22,11 @@ bsl2n z0.d, z0.d, z1.d, z2.d movprfx z31, z7 // CHECK-INST: movprfx z31, z7 // CHECK-ENCODING: [0xff,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bc 20 04 bsl2n z31.d, z31.d, z30.d, z29.d // CHECK-INST: bsl2n z31.d, z31.d, z30.d, z29.d // CHECK-ENCODING: [0xbf,0x3f,0xbe,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bf 3f be 04 diff --git a/llvm/test/MC/AArch64/SVE2/cadd.s b/llvm/test/MC/AArch64/SVE2/cadd.s index c4e4b9f0a4b2d..a27fc6c623bb7 100644 --- a/llvm/test/MC/AArch64/SVE2/cadd.s +++ b/llvm/test/MC/AArch64/SVE2/cadd.s @@ -12,49 +12,49 @@ cadd z0.b, z0.b, z0.b, #90 // CHECK-INST: cadd z0.b, z0.b, z0.b, #90 // CHECK-ENCODING: [0x00,0xd8,0x00,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 00 45 cadd z0.h, z0.h, z0.h, #90 // CHECK-INST: cadd z0.h, z0.h, z0.h, #90 // CHECK-ENCODING: [0x00,0xd8,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 40 45 cadd z0.s, z0.s, z0.s, #90 // CHECK-INST: cadd z0.s, z0.s, z0.s, #90 // CHECK-ENCODING: [0x00,0xd8,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 80 45 cadd z0.d, z0.d, z0.d, #90 // CHECK-INST: cadd z0.d, z0.d, z0.d, #90 // CHECK-ENCODING: 
[0x00,0xd8,0xc0,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 c0 45 cadd z31.b, z31.b, z31.b, #270 // CHECK-INST: cadd z31.b, z31.b, z31.b, #270 // CHECK-ENCODING: [0xff,0xdf,0x00,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff df 00 45 cadd z31.h, z31.h, z31.h, #270 // CHECK-INST: cadd z31.h, z31.h, z31.h, #270 // CHECK-ENCODING: [0xff,0xdf,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff df 40 45 cadd z31.s, z31.s, z31.s, #270 // CHECK-INST: cadd z31.s, z31.s, z31.s, #270 // CHECK-ENCODING: [0xff,0xdf,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff df 80 45 cadd z31.d, z31.d, z31.d, #270 // CHECK-INST: cadd z31.d, z31.d, z31.d, #270 // CHECK-ENCODING: [0xff,0xdf,0xc0,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff df c0 45 @@ -64,11 +64,11 @@ cadd z31.d, z31.d, z31.d, #270 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 cadd z4.d, z4.d, z31.d, #270 // CHECK-INST: cadd z4.d, z4.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0xdf,0xc0,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 df c0 45 diff --git a/llvm/test/MC/AArch64/SVE2/cdot.s b/llvm/test/MC/AArch64/SVE2/cdot.s index 8e83f2f8ba435..1a9386fa486f8 100644 --- a/llvm/test/MC/AArch64/SVE2/cdot.s +++ b/llvm/test/MC/AArch64/SVE2/cdot.s @@ -12,61 +12,61 @@ cdot z0.s, z1.b, z31.b, #0 // CHECK-INST: 
cdot z0.s, z1.b, z31.b, #0 // CHECK-ENCODING: [0x20,0x10,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 9f 44 cdot z0.d, z1.h, z31.h, #0 // CHECK-INST: cdot z0.d, z1.h, z31.h, #0 // CHECK-ENCODING: [0x20,0x10,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 df 44 cdot z0.d, z1.h, z31.h, #90 // CHECK-INST: cdot z0.d, z1.h, z31.h, #90 // CHECK-ENCODING: [0x20,0x14,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 14 df 44 cdot z0.d, z1.h, z31.h, #180 // CHECK-INST: cdot z0.d, z1.h, z31.h, #180 // CHECK-ENCODING: [0x20,0x18,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 18 df 44 cdot z0.d, z1.h, z31.h, #270 // CHECK-INST: cdot z0.d, z1.h, z31.h, #270 // CHECK-ENCODING: [0x20,0x1c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 1c df 44 cdot z0.s, z1.b, z7.b[3], #0 // CHECK-INST: cdot z0.s, z1.b, z7.b[3], #0 // CHECK-ENCODING: [0x20,0x40,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 bf 44 cdot z0.d, z1.h, z15.h[1], #0 // CHECK-INST: cdot z0.d, z1.h, z15.h[1], #0 // CHECK-ENCODING: [0x20,0x40,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 ff 44 cdot z5.d, z6.h, z3.h[0], #90 // CHECK-INST: cdot z5.d, z6.h, z3.h[0], #90 // CHECK-ENCODING: [0xc5,0x44,0xe3,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: c5 44 e3 44 
cdot z29.d, z30.h, z0.h[0], #180 // CHECK-INST: cdot z29.d, z30.h, z0.h[0], #180 // CHECK-ENCODING: [0xdd,0x4b,0xe0,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 4b e0 44 cdot z31.d, z30.h, z7.h[1], #270 // CHECK-INST: cdot z31.d, z30.h, z7.h[1], #270 // CHECK-ENCODING: [0xdf,0x4f,0xf7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 4f f7 44 @@ -76,23 +76,23 @@ cdot z31.d, z30.h, z7.h[1], #270 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 cdot z0.d, z1.h, z31.h, #0 // CHECK-INST: cdot z0.d, z1.h, z31.h, #0 // CHECK-ENCODING: [0x20,0x10,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 df 44 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 cdot z0.d, z1.h, z15.h[1], #0 // CHECK-INST: cdot z0.d, z1.h, z15.h[1], #0 // CHECK-ENCODING: [0x20,0x40,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 ff 44 diff --git a/llvm/test/MC/AArch64/SVE2/cmla.s b/llvm/test/MC/AArch64/SVE2/cmla.s index b65555bb7fa19..4e93d1b384ca5 100644 --- a/llvm/test/MC/AArch64/SVE2/cmla.s +++ b/llvm/test/MC/AArch64/SVE2/cmla.s @@ -12,121 +12,121 @@ cmla z0.b, z1.b, z2.b, #0 // CHECK-INST: cmla z0.b, z1.b, z2.b, #0 // CHECK-ENCODING: [0x20,0x20,0x02,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: 20 20 02 44 cmla z0.h, z1.h, z2.h, #0 // CHECK-INST: cmla z0.h, z1.h, z2.h, #0 // CHECK-ENCODING: [0x20,0x20,0x42,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 20 42 44 cmla z0.s, z1.s, z2.s, #0 // CHECK-INST: cmla z0.s, z1.s, z2.s, #0 // CHECK-ENCODING: [0x20,0x20,0x82,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 20 82 44 cmla z0.d, z1.d, z2.d, #0 // CHECK-INST: cmla z0.d, z1.d, z2.d, #0 // CHECK-ENCODING: [0x20,0x20,0xc2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 20 c2 44 cmla z29.b, z30.b, z31.b, #90 // CHECK-INST: cmla z29.b, z30.b, z31.b, #90 // CHECK-ENCODING: [0xdd,0x27,0x1f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 27 1f 44 cmla z29.h, z30.h, z31.h, #90 // CHECK-INST: cmla z29.h, z30.h, z31.h, #90 // CHECK-ENCODING: [0xdd,0x27,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 27 5f 44 cmla z29.s, z30.s, z31.s, #90 // CHECK-INST: cmla z29.s, z30.s, z31.s, #90 // CHECK-ENCODING: [0xdd,0x27,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 27 9f 44 cmla z29.d, z30.d, z31.d, #90 // CHECK-INST: cmla z29.d, z30.d, z31.d, #90 // CHECK-ENCODING: [0xdd,0x27,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 27 df 44 cmla z31.b, z31.b, z31.b, #180 // CHECK-INST: cmla z31.b, z31.b, z31.b, #180 // CHECK-ENCODING: [0xff,0x2b,0x1f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2b 1f 44 cmla z31.h, z31.h, z31.h, #180 // CHECK-INST: cmla z31.h, z31.h, z31.h, #180 // CHECK-ENCODING: [0xff,0x2b,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2b 5f 44 cmla z31.s, z31.s, z31.s, #180 // CHECK-INST: cmla z31.s, z31.s, z31.s, #180 // CHECK-ENCODING: [0xff,0x2b,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2b 9f 44 cmla z31.d, z31.d, z31.d, #180 // CHECK-INST: cmla z31.d, z31.d, z31.d, #180 // CHECK-ENCODING: [0xff,0x2b,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2b df 44 cmla z15.b, z16.b, z17.b, #270 // CHECK-INST: cmla z15.b, z16.b, z17.b, #270 // CHECK-ENCODING: [0x0f,0x2e,0x11,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 2e 11 44 cmla z15.h, z16.h, z17.h, #270 // CHECK-INST: cmla z15.h, z16.h, z17.h, #270 // CHECK-ENCODING: [0x0f,0x2e,0x51,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 2e 51 44 cmla z15.s, z16.s, z17.s, #270 // CHECK-INST: cmla z15.s, z16.s, z17.s, #270 // CHECK-ENCODING: [0x0f,0x2e,0x91,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 2e 91 44 cmla z15.d, z16.d, z17.d, #270 // CHECK-INST: cmla z15.d, z16.d, z17.d, #270 // CHECK-ENCODING: [0x0f,0x2e,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 2e d1 44 cmla z0.h, z1.h, z2.h[0], #0 // CHECK-INST: cmla z0.h, z1.h, z2.h[0], #0 // CHECK-ENCODING: 
[0x20,0x60,0xa2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 a2 44 cmla z0.s, z1.s, z2.s[0], #0 // CHECK-INST: cmla z0.s, z1.s, z2.s[0], #0 // CHECK-ENCODING: [0x20,0x60,0xe2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 e2 44 cmla z31.h, z30.h, z7.h[0], #180 // CHECK-INST: cmla z31.h, z30.h, z7.h[0], #180 // CHECK-ENCODING: [0xdf,0x6b,0xa7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 6b a7 44 cmla z31.s, z30.s, z7.s[0], #180 // CHECK-INST: cmla z31.s, z30.s, z7.s[0], #180 // CHECK-ENCODING: [0xdf,0x6b,0xe7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 6b e7 44 @@ -136,23 +136,23 @@ cmla z31.s, z30.s, z7.s[0], #180 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 cmla z4.d, z31.d, z31.d, #270 // CHECK-INST: cmla z4.d, z31.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0x2f,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 2f df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 cmla z21.s, z10.s, z5.s[1], #90 // CHECK-INST: cmla z21.s, z10.s, z5.s[1], #90 // CHECK-ENCODING: [0x55,0x65,0xf5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 65 f5 44 diff --git 
a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s index 6042d5692f047..767e5dc5a1513 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s @@ -3,7 +3,7 @@ .arch armv9-a+sve2 .arch armv9-a+nosve2 tbx z0.b, z1.b, z2.b -// CHECK: error: instruction requires: streaming-sve or sve2 +// CHECK: error: instruction requires: sve2 or sme // CHECK-NEXT: tbx z0.b, z1.b, z2.b .arch armv9-a+sve2-aes diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s index 764531714e43f..6d90f7f057490 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s @@ -3,7 +3,7 @@ .arch_extension sve2 .arch_extension nosve2 tbx z0.b, z1.b, z2.b -// CHECK: error: instruction requires: streaming-sve or sve2 +// CHECK: error: instruction requires: sve2 or sme // CHECK-NEXT: tbx z0.b, z1.b, z2.b .arch_extension sve2-aes diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s index 79a28fc6e814e..ed99aa7f00786 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s @@ -3,7 +3,7 @@ .cpu generic+sve2 .cpu generic+nosve2 tbx z0.b, z1.b, z2.b -// CHECK: error: instruction requires: streaming-sve or sve2 +// CHECK: error: instruction requires: sve2 or sme // CHECK-NEXT: tbx z0.b, z1.b, z2.b .cpu generic+sve2-aes diff --git a/llvm/test/MC/AArch64/SVE2/eor3.s b/llvm/test/MC/AArch64/SVE2/eor3.s index 757ebd7c6a7c4..f5ac2dba9ba05 100644 --- a/llvm/test/MC/AArch64/SVE2/eor3.s +++ b/llvm/test/MC/AArch64/SVE2/eor3.s @@ -12,7 +12,7 @@ eor3 z29.d, z29.d, z30.d, z31.d // CHECK-INST: eor3 z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x3e,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 3e 04 @@ -22,19 +22,19 @@ eor3 z29.d, z29.d, z30.d, z31.d eor3 z29.b, z29.b, z30.b, z31.b // CHECK-INST: eor3 z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x3e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 3e 04 eor3 z29.h, z29.h, z30.h, z31.h // CHECK-INST: eor3 z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x3e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 3e 04 eor3 z29.s, z29.s, z30.s, z31.s // CHECK-INST: eor3 z29.d, z29.d, z30.d, z31.d // CHECK-ENCODING: [0xfd,0x3b,0x3e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fd 3b 3e 04 @@ -44,11 +44,11 @@ eor3 z29.s, z29.s, z30.s, z31.s movprfx z31, z7 // CHECK-INST: movprfx z31, z7 // CHECK-ENCODING: [0xff,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bc 20 04 eor3 z31.d, z31.d, z30.d, z29.d // CHECK-INST: eor3 z31.d, z31.d, z30.d, z29.d // CHECK-ENCODING: [0xbf,0x3b,0x3e,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bf 3b 3e 04 diff --git a/llvm/test/MC/AArch64/SVE2/eorbt.s b/llvm/test/MC/AArch64/SVE2/eorbt.s index a949e022095b8..8f71c71a37f23 100644 --- a/llvm/test/MC/AArch64/SVE2/eorbt.s +++ b/llvm/test/MC/AArch64/SVE2/eorbt.s @@ -12,25 +12,25 @@ eorbt z0.b, z1.b, z31.b // CHECK-INST: eorbt z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0x90,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 90 1f 45 eorbt z0.h, z1.h, z31.h // CHECK-INST: eorbt 
z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x90,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 90 5f 45 eorbt z0.s, z1.s, z31.s // CHECK-INST: eorbt z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0x90,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 90 9f 45 eorbt z0.d, z1.d, z31.d // CHECK-INST: eorbt z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x90,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 90 df 45 @@ -40,11 +40,11 @@ eorbt z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 eorbt z0.d, z1.d, z31.d // CHECK-INST: eorbt z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x90,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 90 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/eortb.s b/llvm/test/MC/AArch64/SVE2/eortb.s index 304ac1302f4b6..1d3016e90c7bc 100644 --- a/llvm/test/MC/AArch64/SVE2/eortb.s +++ b/llvm/test/MC/AArch64/SVE2/eortb.s @@ -12,25 +12,25 @@ eortb z0.b, z1.b, z31.b // CHECK-INST: eortb z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0x94,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 94 1f 45 eortb z0.h, z1.h, z31.h // CHECK-INST: eortb z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x94,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 94 5f 45 eortb z0.s, z1.s, z31.s // CHECK-INST: eortb z0.s, z1.s, z31.s // 
CHECK-ENCODING: [0x20,0x94,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 94 9f 45 eortb z0.d, z1.d, z31.d // CHECK-INST: eortb z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x94,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 94 df 45 @@ -40,11 +40,11 @@ eortb z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 eortb z0.d, z1.d, z31.d // CHECK-INST: eortb z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x94,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 94 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ext.s b/llvm/test/MC/AArch64/SVE2/ext.s index 3f1537f124bc4..262b5ad0def27 100644 --- a/llvm/test/MC/AArch64/SVE2/ext.s +++ b/llvm/test/MC/AArch64/SVE2/ext.s @@ -12,11 +12,11 @@ ext z0.b, { z1.b, z2.b }, #0 // CHECK-INST: ext z0.b, { z1.b, z2.b }, #0 // CHECK-ENCODING: [0x20,0x00,0x60,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 00 60 05 ext z31.b, { z30.b, z31.b }, #255 // CHECK-INST: ext z31.b, { z30.b, z31.b }, #255 // CHECK-ENCODING: [0xdf,0x1f,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 1f 7f 05 diff --git a/llvm/test/MC/AArch64/SVE2/faddp.s b/llvm/test/MC/AArch64/SVE2/faddp.s index 53a6510d1a45e..77b6dea6ff456 100644 --- a/llvm/test/MC/AArch64/SVE2/faddp.s +++ b/llvm/test/MC/AArch64/SVE2/faddp.s @@ -12,19 +12,19 @@ faddp z0.h, p0/m, z0.h, z1.h // CHECK-INST: faddp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: 
[0x20,0x80,0x50,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 50 64 faddp z29.s, p3/m, z29.s, z30.s // CHECK-INST: faddp z29.s, p3/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x8f,0x90,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 8f 90 64 faddp z31.d, p7/m, z31.d, z30.d // CHECK-INST: faddp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d0 64 // --------------------------------------------------------------------------// @@ -33,23 +33,23 @@ faddp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 faddp z31.d, p0/m, z31.d, z30.d // CHECK-INST: faddp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d0 64 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 faddp z31.d, p7/m, z31.d, z30.d // CHECK-INST: faddp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd0,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d0 64 diff --git a/llvm/test/MC/AArch64/SVE2/fcvtlt.s b/llvm/test/MC/AArch64/SVE2/fcvtlt.s index 521de9ff80e56..5120a74b19712 100644 --- a/llvm/test/MC/AArch64/SVE2/fcvtlt.s +++ 
b/llvm/test/MC/AArch64/SVE2/fcvtlt.s @@ -13,11 +13,11 @@ fcvtlt z0.s, p0/m, z1.h // CHECK-INST: fcvtlt z0.s, p0/m, z1.h // CHECK-ENCODING: [0x20,0xa0,0x89,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 89 64 fcvtlt z30.d, p7/m, z31.s // CHECK-INST: fcvtlt z30.d, p7/m, z31.s // CHECK-ENCODING: [0xfe,0xbf,0xcb,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe bf cb 64 diff --git a/llvm/test/MC/AArch64/SVE2/fcvtnt.s b/llvm/test/MC/AArch64/SVE2/fcvtnt.s index 04a2b1db7c0c3..1d314ef65ab1b 100644 --- a/llvm/test/MC/AArch64/SVE2/fcvtnt.s +++ b/llvm/test/MC/AArch64/SVE2/fcvtnt.s @@ -13,11 +13,11 @@ fcvtnt z0.h, p0/m, z1.s // CHECK-INST: fcvtnt z0.h, p0/m, z1.s // CHECK-ENCODING: [0x20,0xa0,0x88,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 88 64 fcvtnt z30.s, p7/m, z31.d // CHECK-INST: fcvtnt z30.s, p7/m, z31.d // CHECK-ENCODING: [0xfe,0xbf,0xca,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe bf ca 64 diff --git a/llvm/test/MC/AArch64/SVE2/fcvtx.s b/llvm/test/MC/AArch64/SVE2/fcvtx.s index c1a10a9227429..f22ba0f2a0bc1 100644 --- a/llvm/test/MC/AArch64/SVE2/fcvtx.s +++ b/llvm/test/MC/AArch64/SVE2/fcvtx.s @@ -13,13 +13,13 @@ fcvtx z0.s, p0/m, z0.d // CHECK-INST: fcvtx z0.s, p0/m, z0.d // CHECK-ENCODING: [0x00,0xa0,0x0a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a0 0a 65 fcvtx z30.s, p7/m, z31.d // CHECK-INST: fcvtx z30.s, p7/m, z31.d // CHECK-ENCODING: [0xfe,0xbf,0x0a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe bf 0a 65 @@ 
-30,23 +30,23 @@ fcvtx z30.s, p7/m, z31.d movprfx z5.d, p0/z, z7.d // CHECK-INST: movprfx z5.d, p0/z, z7.d // CHECK-ENCODING: [0xe5,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 20 d0 04 fcvtx z5.s, p0/m, z0.d // CHECK-INST: fcvtx z5.s, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0x0a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 05 a0 0a 65 movprfx z5, z7 // CHECK-INST: movprfx z5, z7 // CHECK-ENCODING: [0xe5,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e5 bc 20 04 fcvtx z5.s, p0/m, z0.d // CHECK-INST: fcvtx z5.s, p0/m, z0.d // CHECK-ENCODING: [0x05,0xa0,0x0a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 05 a0 0a 65 diff --git a/llvm/test/MC/AArch64/SVE2/fcvtxnt.s b/llvm/test/MC/AArch64/SVE2/fcvtxnt.s index 9c77c11757535..66317d2e6f58f 100644 --- a/llvm/test/MC/AArch64/SVE2/fcvtxnt.s +++ b/llvm/test/MC/AArch64/SVE2/fcvtxnt.s @@ -13,11 +13,11 @@ fcvtxnt z0.s, p0/m, z1.d // CHECK-INST: fcvtxnt z0.s, p0/m, z1.d // CHECK-ENCODING: [0x20,0xa0,0x0a,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 0a 64 fcvtxnt z30.s, p7/m, z31.d // CHECK-INST: fcvtxnt z30.s, p7/m, z31.d // CHECK-ENCODING: [0xfe,0xbf,0x0a,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe bf 0a 64 diff --git a/llvm/test/MC/AArch64/SVE2/flogb.s b/llvm/test/MC/AArch64/SVE2/flogb.s index 0da0f63112030..4e7e6a6668002 100644 --- a/llvm/test/MC/AArch64/SVE2/flogb.s +++ b/llvm/test/MC/AArch64/SVE2/flogb.s @@ -12,19 +12,19 @@ flogb z31.h, p7/m, z31.h // CHECK-INST: 
flogb z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x1a,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 1a 65 flogb z31.s, p7/m, z31.s // CHECK-INST: flogb z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x1c,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 1c 65 flogb z31.d, p7/m, z31.d // CHECK-INST: flogb z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0x1e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 1e 65 @@ -34,23 +34,23 @@ flogb z31.d, p7/m, z31.d movprfx z4.d, p7/z, z6.d // CHECK-INST: movprfx z4.d, p7/z, z6.d // CHECK-ENCODING: [0xc4,0x3c,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c d0 04 flogb z4.d, p7/m, z31.d // CHECK-INST: flogb z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0x1e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 1e 65 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 flogb z4.d, p7/m, z31.d // CHECK-INST: flogb z4.d, p7/m, z31.d // CHECK-ENCODING: [0xe4,0xbf,0x1e,0x65] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 1e 65 diff --git a/llvm/test/MC/AArch64/SVE2/fmaxnmp.s b/llvm/test/MC/AArch64/SVE2/fmaxnmp.s index e79fa1d782a88..a006ffbcb6bd4 100644 --- a/llvm/test/MC/AArch64/SVE2/fmaxnmp.s +++ b/llvm/test/MC/AArch64/SVE2/fmaxnmp.s @@ -12,19 +12,19 @@ fmaxnmp z0.h, p0/m, z0.h, z1.h // CHECK-INST: fmaxnmp 
z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x54,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 54 64 fmaxnmp z29.s, p3/m, z29.s, z30.s // CHECK-INST: fmaxnmp z29.s, p3/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x8f,0x94,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 8f 94 64 fmaxnmp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fmaxnmp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd4,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d4 64 // --------------------------------------------------------------------------// @@ -33,23 +33,23 @@ fmaxnmp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 fmaxnmp z31.d, p0/m, z31.d, z30.d // CHECK-INST: fmaxnmp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd4,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d4 64 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fmaxnmp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fmaxnmp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd4,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d4 64 diff --git a/llvm/test/MC/AArch64/SVE2/fmaxp.s b/llvm/test/MC/AArch64/SVE2/fmaxp.s index c3a2286a32184..0e507191445bc 
100644 --- a/llvm/test/MC/AArch64/SVE2/fmaxp.s +++ b/llvm/test/MC/AArch64/SVE2/fmaxp.s @@ -12,19 +12,19 @@ fmaxp z0.h, p0/m, z0.h, z1.h // CHECK-INST: fmaxp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x56,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 56 64 fmaxp z29.s, p3/m, z29.s, z30.s // CHECK-INST: fmaxp z29.s, p3/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x8f,0x96,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 8f 96 64 fmaxp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fmaxp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd6,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d6 64 // --------------------------------------------------------------------------// @@ -33,23 +33,23 @@ fmaxp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 fmaxp z31.d, p0/m, z31.d, z30.d // CHECK-INST: fmaxp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd6,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d6 64 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fmaxp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fmaxp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd6,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: df 9f d6 64 diff --git a/llvm/test/MC/AArch64/SVE2/fminnmp.s b/llvm/test/MC/AArch64/SVE2/fminnmp.s index e4d2d12a7a15d..5a48e6dce466d 100644 --- a/llvm/test/MC/AArch64/SVE2/fminnmp.s +++ b/llvm/test/MC/AArch64/SVE2/fminnmp.s @@ -12,19 +12,19 @@ fminnmp z0.h, p0/m, z0.h, z1.h // CHECK-INST: fminnmp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x55,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 55 64 fminnmp z29.s, p3/m, z29.s, z30.s // CHECK-INST: fminnmp z29.s, p3/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x8f,0x95,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 8f 95 64 fminnmp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fminnmp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd5,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d5 64 // --------------------------------------------------------------------------// @@ -33,23 +33,23 @@ fminnmp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 fminnmp z31.d, p0/m, z31.d, z30.d // CHECK-INST: fminnmp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd5,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d5 64 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fminnmp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fminnmp z31.d, p7/m, 
z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd5,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d5 64 diff --git a/llvm/test/MC/AArch64/SVE2/fminp.s b/llvm/test/MC/AArch64/SVE2/fminp.s index 2d8712ded0377..a7e5b3a3af15e 100644 --- a/llvm/test/MC/AArch64/SVE2/fminp.s +++ b/llvm/test/MC/AArch64/SVE2/fminp.s @@ -12,19 +12,19 @@ fminp z0.h, p0/m, z0.h, z1.h // CHECK-INST: fminp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x57,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 57 64 fminp z29.s, p3/m, z29.s, z30.s // CHECK-INST: fminp z29.s, p3/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x8f,0x97,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 8f 97 64 fminp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fminp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d7 64 // --------------------------------------------------------------------------// @@ -33,23 +33,23 @@ fminp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 fminp z31.d, p0/m, z31.d, z30.d // CHECK-INST: fminp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d7 64 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 fminp z31.d, p7/m, z31.d, z30.d // CHECK-INST: fminp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d7 64 diff --git a/llvm/test/MC/AArch64/SVE2/fmlalb.s b/llvm/test/MC/AArch64/SVE2/fmlalb.s index e926ce0c8d487..02edbcf9e1b04 100644 --- a/llvm/test/MC/AArch64/SVE2/fmlalb.s +++ b/llvm/test/MC/AArch64/SVE2/fmlalb.s @@ -13,19 +13,19 @@ fmlalb z29.s, z30.h, z31.h // CHECK-INST: fmlalb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x83,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 83 bf 64 fmlalb z0.s, z1.h, z7.h[0] // CHECK-INST: fmlalb z0.s, z1.h, z7.h[0] // CHECK-ENCODING: [0x20,0x40,0xa7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 a7 64 fmlalb z30.s, z31.h, z7.h[7] // CHECK-INST: fmlalb z30.s, z31.h, z7.h[7] // CHECK-ENCODING: [0xfe,0x4b,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe 4b bf 64 // --------------------------------------------------------------------------// @@ -34,23 +34,23 @@ fmlalb z30.s, z31.h, z7.h[7] movprfx z29, z28 // CHECK-INST: movprfx z29, z28 // CHECK-ENCODING: [0x9d,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 9d bf 20 04 fmlalb z29.s, z30.h, z31.h // CHECK-INST: fmlalb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x83,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 83 bf 64 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // 
CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 fmlalb z21.s, z1.h, z7.h[7] // CHECK-INST: fmlalb z21.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x35,0x48,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 48 bf 64 diff --git a/llvm/test/MC/AArch64/SVE2/fmlalt.s b/llvm/test/MC/AArch64/SVE2/fmlalt.s index e13dcf13e8410..953b05a2042b7 100644 --- a/llvm/test/MC/AArch64/SVE2/fmlalt.s +++ b/llvm/test/MC/AArch64/SVE2/fmlalt.s @@ -13,19 +13,19 @@ fmlalt z29.s, z30.h, z31.h // CHECK-INST: fmlalt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x87,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 87 bf 64 fmlalt z0.s, z1.h, z7.h[0] // CHECK-INST: fmlalt z0.s, z1.h, z7.h[0] // CHECK-ENCODING: [0x20,0x44,0xa7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 44 a7 64 fmlalt z30.s, z31.h, z7.h[7] // CHECK-INST: fmlalt z30.s, z31.h, z7.h[7] // CHECK-ENCODING: [0xfe,0x4f,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe 4f bf 64 // --------------------------------------------------------------------------// @@ -34,23 +34,23 @@ fmlalt z30.s, z31.h, z7.h[7] movprfx z29, z28 // CHECK-INST: movprfx z29, z28 // CHECK-ENCODING: [0x9d,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 9d bf 20 04 fmlalt z29.s, z30.h, z31.h // CHECK-INST: fmlalt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x87,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or 
sme // CHECK-UNKNOWN: dd 87 bf 64 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 fmlalt z21.s, z1.h, z7.h[7] // CHECK-INST: fmlalt z21.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x35,0x4c,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 4c bf 64 diff --git a/llvm/test/MC/AArch64/SVE2/fmlslb.s b/llvm/test/MC/AArch64/SVE2/fmlslb.s index 38f8db0b81b9a..e6db85233d176 100644 --- a/llvm/test/MC/AArch64/SVE2/fmlslb.s +++ b/llvm/test/MC/AArch64/SVE2/fmlslb.s @@ -13,19 +13,19 @@ fmlslb z29.s, z30.h, z31.h // CHECK-INST: fmlslb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0xa3,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd a3 bf 64 fmlslb z0.s, z1.h, z7.h[0] // CHECK-INST: fmlslb z0.s, z1.h, z7.h[0] // CHECK-ENCODING: [0x20,0x60,0xa7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 a7 64 fmlslb z30.s, z31.h, z7.h[7] // CHECK-INST: fmlslb z30.s, z31.h, z7.h[7] // CHECK-ENCODING: [0xfe,0x6b,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe 6b bf 64 // --------------------------------------------------------------------------// @@ -34,23 +34,23 @@ fmlslb z30.s, z31.h, z7.h[7] movprfx z29, z28 // CHECK-INST: movprfx z29, z28 // CHECK-ENCODING: [0x9d,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 9d bf 20 04 fmlslb z29.s, z30.h, z31.h // CHECK-INST: fmlslb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0xa3,0xbf,0x64] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd a3 bf 64 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 fmlslb z21.s, z1.h, z7.h[7] // CHECK-INST: fmlslb z21.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x35,0x68,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 68 bf 64 diff --git a/llvm/test/MC/AArch64/SVE2/fmlslt.s b/llvm/test/MC/AArch64/SVE2/fmlslt.s index 3f994c3a80998..96b1901d138f7 100644 --- a/llvm/test/MC/AArch64/SVE2/fmlslt.s +++ b/llvm/test/MC/AArch64/SVE2/fmlslt.s @@ -13,19 +13,19 @@ fmlslt z29.s, z30.h, z31.h // CHECK-INST: fmlslt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0xa7,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd a7 bf 64 fmlslt z0.s, z1.h, z7.h[0] // CHECK-INST: fmlslt z0.s, z1.h, z7.h[0] // CHECK-ENCODING: [0x20,0x64,0xa7,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 a7 64 fmlslt z30.s, z31.h, z7.h[7] // CHECK-INST: fmlslt z30.s, z31.h, z7.h[7] // CHECK-ENCODING: [0xfe,0x6f,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe 6f bf 64 // --------------------------------------------------------------------------// @@ -34,23 +34,23 @@ fmlslt z30.s, z31.h, z7.h[7] movprfx z29, z28 // CHECK-INST: movprfx z29, z28 // CHECK-ENCODING: [0x9d,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 9d bf 20 04 fmlslt z29.s, z30.h, z31.h // CHECK-INST: 
fmlslt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0xa7,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd a7 bf 64 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 fmlslt z21.s, z1.h, z7.h[7] // CHECK-INST: fmlslt z21.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x35,0x6c,0xbf,0x64] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 6c bf 64 diff --git a/llvm/test/MC/AArch64/SVE2/mla.s b/llvm/test/MC/AArch64/SVE2/mla.s index 3224f20cb4207..ff5df0b9f4ece 100644 --- a/llvm/test/MC/AArch64/SVE2/mla.s +++ b/llvm/test/MC/AArch64/SVE2/mla.s @@ -12,19 +12,19 @@ mla z0.h, z1.h, z7.h[7] // CHECK-INST: mla z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x08,0x7f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 7f 44 mla z0.s, z1.s, z7.s[3] // CHECK-INST: mla z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0x08,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 bf 44 mla z0.d, z1.d, z7.d[1] // CHECK-INST: mla z0.d, z1.d, z7.d[1] // CHECK-ENCODING: [0x20,0x08,0xf7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 f7 44 @@ -34,11 +34,11 @@ mla z0.d, z1.d, z7.d[1] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 mla z0.d, z1.d, z7.d[1] // CHECK-INST: mla z0.d, z1.d, z7.d[1] // 
CHECK-ENCODING: [0x20,0x08,0xf7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 f7 44 diff --git a/llvm/test/MC/AArch64/SVE2/mls.s b/llvm/test/MC/AArch64/SVE2/mls.s index 15bd01cdfb843..c4ef42da09d19 100644 --- a/llvm/test/MC/AArch64/SVE2/mls.s +++ b/llvm/test/MC/AArch64/SVE2/mls.s @@ -12,19 +12,19 @@ mls z0.h, z1.h, z7.h[7] // CHECK-INST: mls z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x0c,0x7f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c 7f 44 mls z0.s, z1.s, z7.s[3] // CHECK-INST: mls z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0x0c,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c bf 44 mls z0.d, z1.d, z7.d[1] // CHECK-INST: mls z0.d, z1.d, z7.d[1] // CHECK-ENCODING: [0x20,0x0c,0xf7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c f7 44 @@ -34,11 +34,11 @@ mls z0.d, z1.d, z7.d[1] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 mls z0.d, z1.d, z7.d[1] // CHECK-INST: mls z0.d, z1.d, z7.d[1] // CHECK-ENCODING: [0x20,0x0c,0xf7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c f7 44 diff --git a/llvm/test/MC/AArch64/SVE2/mul.s b/llvm/test/MC/AArch64/SVE2/mul.s index ec9b0c9b9b1f3..120ece160176a 100644 --- a/llvm/test/MC/AArch64/SVE2/mul.s +++ b/llvm/test/MC/AArch64/SVE2/mul.s @@ -12,41 +12,41 @@ mul z0.b, z1.b, z2.b // CHECK-INST: mul z0.b, z1.b, z2.b // CHECK-ENCODING: [0x20,0x60,0x22,0x04] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 22 04 mul z0.h, z1.h, z2.h // CHECK-INST: mul z0.h, z1.h, z2.h // CHECK-ENCODING: [0x20,0x60,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 62 04 mul z29.s, z30.s, z31.s // CHECK-INST: mul z29.s, z30.s, z31.s // CHECK-ENCODING: [0xdd,0x63,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 63 bf 04 mul z31.d, z31.d, z31.d // CHECK-INST: mul z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x63,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 63 ff 04 mul z0.h, z1.h, z7.h[7] // CHECK-INST: mul z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xf8,0x7f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 7f 44 mul z0.s, z1.s, z7.s[3] // CHECK-INST: mul z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0xf8,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 bf 44 mul z0.d, z1.d, z15.d[1] // CHECK-INST: mul z0.d, z1.d, z15.d[1] // CHECK-ENCODING: [0x20,0xf8,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 ff 44 diff --git a/llvm/test/MC/AArch64/SVE2/nbsl.s b/llvm/test/MC/AArch64/SVE2/nbsl.s index 937390ce35654..af6ad419f0239 100644 --- a/llvm/test/MC/AArch64/SVE2/nbsl.s +++ b/llvm/test/MC/AArch64/SVE2/nbsl.s @@ -12,7 +12,7 @@ nbsl z0.d, z0.d, z1.d, z2.d // CHECK-INST: nbsl z0.d, z0.d, z1.d, z2.d // CHECK-ENCODING: [0x40,0x3c,0xe1,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: 
instruction requires: sve2 or sme // CHECK-UNKNOWN: 40 3c e1 04 @@ -22,11 +22,11 @@ nbsl z0.d, z0.d, z1.d, z2.d movprfx z31, z7 // CHECK-INST: movprfx z31, z7 // CHECK-ENCODING: [0xff,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bc 20 04 nbsl z31.d, z31.d, z30.d, z29.d // CHECK-INST: nbsl z31.d, z31.d, z30.d, z29.d // CHECK-ENCODING: [0xbf,0x3f,0xfe,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bf 3f fe 04 diff --git a/llvm/test/MC/AArch64/SVE2/pmul.s b/llvm/test/MC/AArch64/SVE2/pmul.s index 69d736ac44f86..20599a9902de8 100644 --- a/llvm/test/MC/AArch64/SVE2/pmul.s +++ b/llvm/test/MC/AArch64/SVE2/pmul.s @@ -12,11 +12,11 @@ pmul z0.b, z1.b, z2.b // CHECK-INST: pmul z0.b, z1.b, z2.b // CHECK-ENCODING: [0x20,0x64,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 22 04 pmul z29.b, z30.b, z31.b // CHECK-INST: pmul z29.b, z30.b, z31.b // CHECK-ENCODING: [0xdd,0x67,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 67 3f 04 diff --git a/llvm/test/MC/AArch64/SVE2/pmullb.s b/llvm/test/MC/AArch64/SVE2/pmullb.s index 846e6851f2fea..edd2a0907d37f 100644 --- a/llvm/test/MC/AArch64/SVE2/pmullb.s +++ b/llvm/test/MC/AArch64/SVE2/pmullb.s @@ -13,11 +13,11 @@ pmullb z0.h, z1.b, z2.b // CHECK-INST: pmullb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x68,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 42 45 pmullb z31.d, z31.s, z31.s // CHECK-INST: pmullb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x6b,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: 
sve2 or sme // CHECK-UNKNOWN: ff 6b df 45 diff --git a/llvm/test/MC/AArch64/SVE2/pmullt.s b/llvm/test/MC/AArch64/SVE2/pmullt.s index 0b3758f0ca1dc..4a2328edf4e1a 100644 --- a/llvm/test/MC/AArch64/SVE2/pmullt.s +++ b/llvm/test/MC/AArch64/SVE2/pmullt.s @@ -13,11 +13,11 @@ pmullt z0.h, z1.b, z2.b // CHECK-INST: pmullt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x6c,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c 42 45 pmullt z31.d, z31.s, z31.s // CHECK-INST: pmullt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x6f,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 6f df 45 diff --git a/llvm/test/MC/AArch64/SVE2/raddhnb.s b/llvm/test/MC/AArch64/SVE2/raddhnb.s index 871ba909d755f..8f7678593a686 100644 --- a/llvm/test/MC/AArch64/SVE2/raddhnb.s +++ b/llvm/test/MC/AArch64/SVE2/raddhnb.s @@ -13,17 +13,17 @@ raddhnb z0.b, z1.h, z31.h // CHECK-INST: raddhnb z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x68,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 7f 45 raddhnb z0.h, z1.s, z31.s // CHECK-INST: raddhnb z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x68,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 bf 45 raddhnb z0.s, z1.d, z31.d // CHECK-INST: raddhnb z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x68,0xff,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/raddhnt.s b/llvm/test/MC/AArch64/SVE2/raddhnt.s index d5648a6ff39c3..2c8d7fa2c9592 100644 --- a/llvm/test/MC/AArch64/SVE2/raddhnt.s +++ b/llvm/test/MC/AArch64/SVE2/raddhnt.s @@ -13,17 +13,17 @@ raddhnt z0.b, 
z1.h, z31.h // CHECK-INST: raddhnt z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x6c,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c 7f 45 raddhnt z0.h, z1.s, z31.s // CHECK-INST: raddhnt z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x6c,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c bf 45 raddhnt z0.s, z1.d, z31.d // CHECK-INST: raddhnt z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x6c,0xff,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/rshrnb.s b/llvm/test/MC/AArch64/SVE2/rshrnb.s index 203e6e7ece149..aa54e46d9b5f9 100644 --- a/llvm/test/MC/AArch64/SVE2/rshrnb.s +++ b/llvm/test/MC/AArch64/SVE2/rshrnb.s @@ -12,35 +12,35 @@ rshrnb z0.b, z0.h, #1 // CHECK-INST: rshrnb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x18,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 18 2f 45 rshrnb z31.b, z31.h, #8 // CHECK-INST: rshrnb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x1b,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1b 28 45 rshrnb z0.h, z0.s, #1 // CHECK-INST: rshrnb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x18,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 18 3f 45 rshrnb z31.h, z31.s, #16 // CHECK-INST: rshrnb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x1b,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1b 30 45 rshrnb z0.s, z0.d, #1 // CHECK-INST: rshrnb z0.s, z0.d, 
#1 // CHECK-ENCODING: [0x00,0x18,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 18 7f 45 rshrnb z31.s, z31.d, #32 // CHECK-INST: rshrnb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x1b,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1b 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/rshrnt.s b/llvm/test/MC/AArch64/SVE2/rshrnt.s index 5a52e6be22678..f5ad3df778e4d 100644 --- a/llvm/test/MC/AArch64/SVE2/rshrnt.s +++ b/llvm/test/MC/AArch64/SVE2/rshrnt.s @@ -12,35 +12,35 @@ rshrnt z0.b, z0.h, #1 // CHECK-INST: rshrnt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x1c,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 1c 2f 45 rshrnt z31.b, z31.h, #8 // CHECK-INST: rshrnt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x1f,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1f 28 45 rshrnt z0.h, z0.s, #1 // CHECK-INST: rshrnt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x1c,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 1c 3f 45 rshrnt z31.h, z31.s, #16 // CHECK-INST: rshrnt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x1f,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1f 30 45 rshrnt z0.s, z0.d, #1 // CHECK-INST: rshrnt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x1c,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 1c 7f 45 rshrnt z31.s, z31.d, #32 // CHECK-INST: rshrnt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x1f,0x60,0x45] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1f 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/rsubhnb.s b/llvm/test/MC/AArch64/SVE2/rsubhnb.s index c9f4c3fdcdd0b..5804679c55fea 100644 --- a/llvm/test/MC/AArch64/SVE2/rsubhnb.s +++ b/llvm/test/MC/AArch64/SVE2/rsubhnb.s @@ -13,17 +13,17 @@ rsubhnb z0.b, z1.h, z31.h // CHECK-INST: rsubhnb z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x78,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 78 7f 45 rsubhnb z0.h, z1.s, z31.s // CHECK-INST: rsubhnb z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x78,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 78 bf 45 rsubhnb z0.s, z1.d, z31.d // CHECK-INST: rsubhnb z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x78,0xff,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 78 ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/rsubhnt.s b/llvm/test/MC/AArch64/SVE2/rsubhnt.s index 7c1c546c3a38c..7aa7dcd15e92b 100644 --- a/llvm/test/MC/AArch64/SVE2/rsubhnt.s +++ b/llvm/test/MC/AArch64/SVE2/rsubhnt.s @@ -13,17 +13,17 @@ rsubhnt z0.b, z1.h, z31.h // CHECK-INST: rsubhnt z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x7c,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 7c 7f 45 rsubhnt z0.h, z1.s, z31.s // CHECK-INST: rsubhnt z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x7c,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 7c bf 45 rsubhnt z0.s, z1.d, z31.d // CHECK-INST: rsubhnt z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x7c,0xff,0x45] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 7c ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/saba.s b/llvm/test/MC/AArch64/SVE2/saba.s index b03087b0660f2..c07649fdcd25e 100644 --- a/llvm/test/MC/AArch64/SVE2/saba.s +++ b/llvm/test/MC/AArch64/SVE2/saba.s @@ -12,25 +12,25 @@ saba z0.b, z1.b, z31.b // CHECK-INST: saba z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xf8,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 1f 45 saba z0.h, z1.h, z31.h // CHECK-INST: saba z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xf8,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 5f 45 saba z0.s, z1.s, z31.s // CHECK-INST: saba z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xf8,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 9f 45 saba z0.d, z1.d, z31.d // CHECK-INST: saba z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xf8,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 df 45 @@ -40,11 +40,11 @@ saba z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 saba z0.d, z1.d, z31.d // CHECK-INST: saba z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xf8,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f8 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sabalb.s b/llvm/test/MC/AArch64/SVE2/sabalb.s index e8cf4cec3f610..18ccaa0626c1d 100644 --- a/llvm/test/MC/AArch64/SVE2/sabalb.s +++ 
b/llvm/test/MC/AArch64/SVE2/sabalb.s @@ -13,19 +13,19 @@ sabalb z0.h, z1.b, z31.b // CHECK-INST: sabalb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0xc0,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c0 5f 45 sabalb z0.s, z1.h, z31.h // CHECK-INST: sabalb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0xc0,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c0 9f 45 sabalb z0.d, z1.s, z31.s // CHECK-INST: sabalb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0xc0,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c0 df 45 @@ -35,11 +35,11 @@ sabalb z0.d, z1.s, z31.s movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sabalb z21.d, z1.s, z31.s // CHECK-INST: sabalb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0xc0,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 c0 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sabalt.s b/llvm/test/MC/AArch64/SVE2/sabalt.s index 042b66a4311f6..59597f67d7031 100644 --- a/llvm/test/MC/AArch64/SVE2/sabalt.s +++ b/llvm/test/MC/AArch64/SVE2/sabalt.s @@ -13,19 +13,19 @@ sabalt z0.h, z1.b, z31.b // CHECK-INST: sabalt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0xc4,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c4 5f 45 sabalt z0.s, z1.h, z31.h // CHECK-INST: sabalt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0xc4,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction 
requires: sve2 or sme // CHECK-UNKNOWN: 20 c4 9f 45 sabalt z0.d, z1.s, z31.s // CHECK-INST: sabalt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0xc4,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c4 df 45 @@ -35,11 +35,11 @@ sabalt z0.d, z1.s, z31.s movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sabalt z21.d, z1.s, z31.s // CHECK-INST: sabalt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0xc4,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 c4 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sabdlb.s b/llvm/test/MC/AArch64/SVE2/sabdlb.s index 5750e11017bba..e651430aed364 100644 --- a/llvm/test/MC/AArch64/SVE2/sabdlb.s +++ b/llvm/test/MC/AArch64/SVE2/sabdlb.s @@ -13,17 +13,17 @@ sabdlb z0.h, z1.b, z2.b // CHECK-INST: sabdlb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x30,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 30 42 45 sabdlb z29.s, z30.h, z31.h // CHECK-INST: sabdlb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x33,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 33 9f 45 sabdlb z31.d, z31.s, z31.s // CHECK-INST: sabdlb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x33,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 33 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sabdlt.s b/llvm/test/MC/AArch64/SVE2/sabdlt.s index e802a4692b16d..6682d1b6fb3f8 100644 --- a/llvm/test/MC/AArch64/SVE2/sabdlt.s +++ 
b/llvm/test/MC/AArch64/SVE2/sabdlt.s @@ -13,17 +13,17 @@ sabdlt z0.h, z1.b, z2.b // CHECK-INST: sabdlt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x34,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 34 42 45 sabdlt z29.s, z30.h, z31.h // CHECK-INST: sabdlt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x37,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 37 9f 45 sabdlt z31.d, z31.s, z31.s // CHECK-INST: sabdlt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x37,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 37 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sadalp.s b/llvm/test/MC/AArch64/SVE2/sadalp.s index cd6e7325cd8f4..a7e4f6a6ae8fe 100644 --- a/llvm/test/MC/AArch64/SVE2/sadalp.s +++ b/llvm/test/MC/AArch64/SVE2/sadalp.s @@ -12,19 +12,19 @@ sadalp z0.h, p0/m, z1.b // CHECK-INST: sadalp z0.h, p0/m, z1.b // CHECK-ENCODING: [0x20,0xa0,0x44,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 44 44 sadalp z29.s, p0/m, z30.h // CHECK-INST: sadalp z29.s, p0/m, z30.h // CHECK-ENCODING: [0xdd,0xa3,0x84,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd a3 84 44 sadalp z30.d, p7/m, z31.s // CHECK-INST: sadalp z30.d, p7/m, z31.s // CHECK-ENCODING: [0xfe,0xbf,0xc4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe bf c4 44 // --------------------------------------------------------------------------// @@ -33,23 +33,23 @@ sadalp z30.d, p7/m, z31.s movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: 
[0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sadalp z31.d, p0/m, z30.s // CHECK-INST: sadalp z31.d, p0/m, z30.s // CHECK-ENCODING: [0xdf,0xa3,0xc4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 c4 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sadalp z31.d, p0/m, z30.s // CHECK-INST: sadalp z31.d, p0/m, z30.s // CHECK-ENCODING: [0xdf,0xa3,0xc4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 c4 44 diff --git a/llvm/test/MC/AArch64/SVE2/saddlb.s b/llvm/test/MC/AArch64/SVE2/saddlb.s index e21be54cee0b3..b425afe8bf0ce 100644 --- a/llvm/test/MC/AArch64/SVE2/saddlb.s +++ b/llvm/test/MC/AArch64/SVE2/saddlb.s @@ -13,17 +13,17 @@ saddlb z0.h, z1.b, z2.b // CHECK-INST: saddlb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x00,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 00 42 45 saddlb z29.s, z30.h, z31.h // CHECK-INST: saddlb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x03,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 03 9f 45 saddlb z31.d, z31.s, z31.s // CHECK-INST: saddlb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x03,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 03 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/saddlbt.s b/llvm/test/MC/AArch64/SVE2/saddlbt.s index 5734fb9ed189d..1f285f70567ee 100644 
--- a/llvm/test/MC/AArch64/SVE2/saddlbt.s +++ b/llvm/test/MC/AArch64/SVE2/saddlbt.s @@ -13,17 +13,17 @@ saddlbt z0.h, z1.b, z31.b // CHECK-INST: saddlbt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x80,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 5f 45 saddlbt z0.s, z1.h, z31.h // CHECK-INST: saddlbt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x80,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 9f 45 saddlbt z0.d, z1.s, z31.s // CHECK-INST: saddlbt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x80,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/saddlt.s b/llvm/test/MC/AArch64/SVE2/saddlt.s index fc4981b640d39..ec481e873a0bd 100644 --- a/llvm/test/MC/AArch64/SVE2/saddlt.s +++ b/llvm/test/MC/AArch64/SVE2/saddlt.s @@ -13,17 +13,17 @@ saddlt z0.h, z1.b, z2.b // CHECK-INST: saddlt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x04,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 04 42 45 saddlt z29.s, z30.h, z31.h // CHECK-INST: saddlt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x07,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 07 9f 45 saddlt z31.d, z31.s, z31.s // CHECK-INST: saddlt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x07,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 07 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/saddwb.s b/llvm/test/MC/AArch64/SVE2/saddwb.s index 8484c93d0e687..a8a7f643af312 100644 --- 
a/llvm/test/MC/AArch64/SVE2/saddwb.s +++ b/llvm/test/MC/AArch64/SVE2/saddwb.s @@ -13,17 +13,17 @@ saddwb z0.h, z1.h, z2.b // CHECK-INST: saddwb z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x40,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 42 45 saddwb z29.s, z30.s, z31.h // CHECK-INST: saddwb z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x43,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 43 9f 45 saddwb z31.d, z31.d, z31.s // CHECK-INST: saddwb z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x43,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 43 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/saddwt.s b/llvm/test/MC/AArch64/SVE2/saddwt.s index 393aa006945aa..f1eeacc36e2ad 100644 --- a/llvm/test/MC/AArch64/SVE2/saddwt.s +++ b/llvm/test/MC/AArch64/SVE2/saddwt.s @@ -13,17 +13,17 @@ saddwt z0.h, z1.h, z2.b // CHECK-INST: saddwt z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x44,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 44 42 45 saddwt z29.s, z30.s, z31.h // CHECK-INST: saddwt z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x47,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 47 9f 45 saddwt z31.d, z31.d, z31.s // CHECK-INST: saddwt z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x47,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 47 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sbclb.s b/llvm/test/MC/AArch64/SVE2/sbclb.s index 1cc3729f4d4ad..36c9f61b7d086 100644 --- a/llvm/test/MC/AArch64/SVE2/sbclb.s +++ 
b/llvm/test/MC/AArch64/SVE2/sbclb.s @@ -12,13 +12,13 @@ sbclb z0.s, z1.s, z31.s // CHECK-INST: sbclb z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xd0,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d0 9f 45 sbclb z0.d, z1.d, z31.d // CHECK-INST: sbclb z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd0,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d0 df 45 @@ -28,11 +28,11 @@ sbclb z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sbclb z0.d, z1.d, z31.d // CHECK-INST: sbclb z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd0,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d0 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sbclt.s b/llvm/test/MC/AArch64/SVE2/sbclt.s index 7acc604f699aa..056a6a94e7d12 100644 --- a/llvm/test/MC/AArch64/SVE2/sbclt.s +++ b/llvm/test/MC/AArch64/SVE2/sbclt.s @@ -12,13 +12,13 @@ sbclt z0.s, z1.s, z31.s // CHECK-INST: sbclt z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xd4,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d4 9f 45 sbclt z0.d, z1.d, z31.d // CHECK-INST: sbclt z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd4,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d4 df 45 @@ -28,11 +28,11 @@ sbclt z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sbclt z0.d, z1.d, z31.d // CHECK-INST: sbclt z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xd4,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d4 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/shadd.s b/llvm/test/MC/AArch64/SVE2/shadd.s index edbd9275302e0..d54cbe6ebc70e 100644 --- a/llvm/test/MC/AArch64/SVE2/shadd.s +++ b/llvm/test/MC/AArch64/SVE2/shadd.s @@ -12,25 +12,25 @@ shadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: shadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x10,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 10 44 shadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: shadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x50,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 50 44 shadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: shadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x90,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 90 44 shadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: shadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd0,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d0 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ shadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 shadd z31.d, p0/m, z31.d, z30.d // CHECK-INST: shadd z31.d, 
p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd0,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d0 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 shadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: shadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd0,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d0 44 diff --git a/llvm/test/MC/AArch64/SVE2/shrnb.s b/llvm/test/MC/AArch64/SVE2/shrnb.s index 87057bd76df8f..61386522d63af 100644 --- a/llvm/test/MC/AArch64/SVE2/shrnb.s +++ b/llvm/test/MC/AArch64/SVE2/shrnb.s @@ -12,35 +12,35 @@ shrnb z0.b, z0.h, #1 // CHECK-INST: shrnb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x10,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 10 2f 45 shrnb z31.b, z31.h, #8 // CHECK-INST: shrnb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x13,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 13 28 45 shrnb z0.h, z0.s, #1 // CHECK-INST: shrnb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x10,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 10 3f 45 shrnb z31.h, z31.s, #16 // CHECK-INST: shrnb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x13,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 13 30 45 shrnb z0.s, z0.d, #1 // CHECK-INST: shrnb z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x10,0x7f,0x45] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 10 7f 45 shrnb z31.s, z31.d, #32 // CHECK-INST: shrnb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x13,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 13 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/shrnt.s b/llvm/test/MC/AArch64/SVE2/shrnt.s index e5b5331e5276b..070646d51a69c 100644 --- a/llvm/test/MC/AArch64/SVE2/shrnt.s +++ b/llvm/test/MC/AArch64/SVE2/shrnt.s @@ -12,35 +12,35 @@ shrnt z0.b, z0.h, #1 // CHECK-INST: shrnt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x14,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 14 2f 45 shrnt z31.b, z31.h, #8 // CHECK-INST: shrnt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x17,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 17 28 45 shrnt z0.h, z0.s, #1 // CHECK-INST: shrnt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x14,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 14 3f 45 shrnt z31.h, z31.s, #16 // CHECK-INST: shrnt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x17,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 17 30 45 shrnt z0.s, z0.d, #1 // CHECK-INST: shrnt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x14,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 14 7f 45 shrnt z31.s, z31.d, #32 // CHECK-INST: shrnt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x17,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 17 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/shsub.s b/llvm/test/MC/AArch64/SVE2/shsub.s index ed6b7e927a41b..1b9f8740e7856 100644 --- a/llvm/test/MC/AArch64/SVE2/shsub.s +++ b/llvm/test/MC/AArch64/SVE2/shsub.s @@ -12,25 +12,25 @@ shsub z0.b, p0/m, z0.b, z1.b // CHECK-INST: shsub z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x12,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 12 44 shsub z0.h, p0/m, z0.h, z1.h // CHECK-INST: shsub z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x52,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 52 44 shsub z29.s, p7/m, z29.s, z30.s // CHECK-INST: shsub z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x92,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 92 44 shsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: shsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d2 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ shsub z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 shsub z31.d, p0/m, z31.d, z30.d // CHECK-INST: shsub z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d2 44 movprfx z31, z6 // 
CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 shsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: shsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d2 44 diff --git a/llvm/test/MC/AArch64/SVE2/shsubr.s b/llvm/test/MC/AArch64/SVE2/shsubr.s index 3d4b5b612133f..61251738686c0 100644 --- a/llvm/test/MC/AArch64/SVE2/shsubr.s +++ b/llvm/test/MC/AArch64/SVE2/shsubr.s @@ -12,25 +12,25 @@ shsubr z0.b, p0/m, z0.b, z1.b // CHECK-INST: shsubr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x16,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 16 44 shsubr z0.h, p0/m, z0.h, z1.h // CHECK-INST: shsubr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x56,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 56 44 shsubr z29.s, p7/m, z29.s, z30.s // CHECK-INST: shsubr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x96,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 96 44 shsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: shsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d6 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ shsubr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 shsubr z31.d, p0/m, z31.d, z30.d // CHECK-INST: shsubr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d6 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 shsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: shsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d6 44 diff --git a/llvm/test/MC/AArch64/SVE2/sli.s b/llvm/test/MC/AArch64/SVE2/sli.s index 7a90116d8f992..a835eadfd6516 100644 --- a/llvm/test/MC/AArch64/SVE2/sli.s +++ b/llvm/test/MC/AArch64/SVE2/sli.s @@ -12,47 +12,47 @@ sli z0.b, z0.b, #0 // CHECK-INST: sli z0.b, z0.b, #0 // CHECK-ENCODING: [0x00,0xf4,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f4 08 45 sli z31.b, z31.b, #7 // CHECK-INST: sli z31.b, z31.b, #7 // CHECK-ENCODING: [0xff,0xf7,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff f7 0f 45 sli z0.h, z0.h, #0 // CHECK-INST: sli z0.h, z0.h, #0 // CHECK-ENCODING: [0x00,0xf4,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f4 10 45 sli z31.h, z31.h, #15 // CHECK-INST: sli z31.h, z31.h, #15 // CHECK-ENCODING: [0xff,0xf7,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: 
instruction requires: sve2 or sme // CHECK-UNKNOWN: ff f7 1f 45 sli z0.s, z0.s, #0 // CHECK-INST: sli z0.s, z0.s, #0 // CHECK-ENCODING: [0x00,0xf4,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f4 40 45 sli z31.s, z31.s, #31 // CHECK-INST: sli z31.s, z31.s, #31 // CHECK-ENCODING: [0xff,0xf7,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff f7 5f 45 sli z0.d, z0.d, #0 // CHECK-INST: sli z0.d, z0.d, #0 // CHECK-ENCODING: [0x00,0xf4,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f4 80 45 sli z31.d, z31.d, #63 // CHECK-INST: sli z31.d, z31.d, #63 // CHECK-ENCODING: [0xff,0xf7,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff f7 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/smaxp.s b/llvm/test/MC/AArch64/SVE2/smaxp.s index 80f7e2bd91a40..a28af2e33c6a6 100644 --- a/llvm/test/MC/AArch64/SVE2/smaxp.s +++ b/llvm/test/MC/AArch64/SVE2/smaxp.s @@ -12,25 +12,25 @@ smaxp z0.b, p0/m, z0.b, z1.b // CHECK-INST: smaxp z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0xa0,0x14,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 14 44 smaxp z0.h, p0/m, z0.h, z1.h // CHECK-INST: smaxp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0xa0,0x54,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 54 44 smaxp z29.s, p7/m, z29.s, z30.s // CHECK-INST: smaxp z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0xbf,0x94,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: dd bf 94 44 smaxp z31.d, p7/m, z31.d, z30.d // CHECK-INST: smaxp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d4 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ smaxp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 smaxp z31.d, p0/m, z31.d, z30.d // CHECK-INST: smaxp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xa3,0xd4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 d4 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 smaxp z31.d, p7/m, z31.d, z30.d // CHECK-INST: smaxp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d4 44 diff --git a/llvm/test/MC/AArch64/SVE2/sminp.s b/llvm/test/MC/AArch64/SVE2/sminp.s index ba837b94c2f5b..60f9eebe751b2 100644 --- a/llvm/test/MC/AArch64/SVE2/sminp.s +++ b/llvm/test/MC/AArch64/SVE2/sminp.s @@ -12,25 +12,25 @@ sminp z0.b, p0/m, z0.b, z1.b // CHECK-INST: sminp z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0xa0,0x16,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 16 44 sminp z0.h, p0/m, z0.h, z1.h // CHECK-INST: sminp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: 
[0x20,0xa0,0x56,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 56 44 sminp z29.s, p7/m, z29.s, z30.s // CHECK-INST: sminp z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0xbf,0x96,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd bf 96 44 sminp z31.d, p7/m, z31.d, z30.d // CHECK-INST: sminp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d6 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ sminp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sminp z31.d, p0/m, z31.d, z30.d // CHECK-INST: sminp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xa3,0xd6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 d6 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sminp z31.d, p7/m, z31.d, z30.d // CHECK-INST: sminp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d6 44 diff --git a/llvm/test/MC/AArch64/SVE2/smlalb.s b/llvm/test/MC/AArch64/SVE2/smlalb.s index 142f104849609..d0d173185f48f 100644 --- a/llvm/test/MC/AArch64/SVE2/smlalb.s +++ 
b/llvm/test/MC/AArch64/SVE2/smlalb.s @@ -13,31 +13,31 @@ smlalb z0.h, z1.b, z31.b // CHECK-INST: smlalb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x40,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 5f 44 smlalb z0.s, z1.h, z31.h // CHECK-INST: smlalb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x40,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 9f 44 smlalb z0.d, z1.s, z31.s // CHECK-INST: smlalb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x40,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 40 df 44 smlalb z0.s, z1.h, z7.h[7] // CHECK-INST: smlalb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x88,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 88 bf 44 smlalb z0.d, z1.s, z15.s[1] // CHECK-INST: smlalb z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0x88,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 88 ef 44 @@ -47,23 +47,23 @@ smlalb z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlalb z21.d, z1.s, z31.s // CHECK-INST: smlalb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x40,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 40 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlalb z21.d, z10.s, z5.s[1] // CHECK-INST: smlalb z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x89,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 89 e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/smlalt.s b/llvm/test/MC/AArch64/SVE2/smlalt.s index 767c357d61aac..6358271bcf342 100644 --- a/llvm/test/MC/AArch64/SVE2/smlalt.s +++ b/llvm/test/MC/AArch64/SVE2/smlalt.s @@ -13,31 +13,31 @@ smlalt z0.h, z1.b, z31.b // CHECK-INST: smlalt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x44,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 44 5f 44 smlalt z0.s, z1.h, z31.h // CHECK-INST: smlalt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x44,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 44 9f 44 smlalt z0.d, z1.s, z31.s // CHECK-INST: smlalt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x44,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 44 df 44 smlalt z0.s, z1.h, z7.h[7] // CHECK-INST: smlalt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x8c,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 8c bf 44 smlalt z0.d, z1.s, z15.s[1] // CHECK-INST: smlalt z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0x8c,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 8c ef 44 @@ -47,23 +47,23 @@ smlalt z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlalt z21.d, z1.s, z31.s // CHECK-INST: smlalt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x44,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 44 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlalt z21.d, z10.s, z5.s[1] // CHECK-INST: smlalt z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x8d,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 8d e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/smlslb.s b/llvm/test/MC/AArch64/SVE2/smlslb.s index befd21b879c55..b73fe155bd67c 100644 --- a/llvm/test/MC/AArch64/SVE2/smlslb.s +++ b/llvm/test/MC/AArch64/SVE2/smlslb.s @@ -13,31 +13,31 @@ smlslb z0.h, z1.b, z31.b // CHECK-INST: smlslb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x50,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 50 5f 44 smlslb z0.s, z1.h, z31.h // CHECK-INST: smlslb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x50,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 50 9f 44 smlslb z0.d, z1.s, z31.s // CHECK-INST: smlslb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x50,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 50 df 44 smlslb z0.s, z1.h, z7.h[7] // CHECK-INST: smlslb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xa8,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: 
instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a8 bf 44 smlslb z0.d, z1.s, z15.s[1] // CHECK-INST: smlslb z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xa8,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a8 ef 44 @@ -47,23 +47,23 @@ smlslb z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlslb z21.d, z1.s, z31.s // CHECK-INST: smlslb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x50,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 50 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlslb z21.d, z10.s, z5.s[1] // CHECK-INST: smlslb z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0xa9,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 a9 e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/smlslt.s b/llvm/test/MC/AArch64/SVE2/smlslt.s index e66b1b574fbb3..689708d99ebd3 100644 --- a/llvm/test/MC/AArch64/SVE2/smlslt.s +++ b/llvm/test/MC/AArch64/SVE2/smlslt.s @@ -13,31 +13,31 @@ smlslt z0.h, z1.b, z31.b // CHECK-INST: smlslt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x54,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 54 5f 44 smlslt z0.s, z1.h, z31.h // CHECK-INST: smlslt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x54,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: 
instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 54 9f 44 smlslt z0.d, z1.s, z31.s // CHECK-INST: smlslt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x54,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 54 df 44 smlslt z0.s, z1.h, z7.h[7] // CHECK-INST: smlslt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xac,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 ac bf 44 smlslt z0.d, z1.s, z15.s[1] // CHECK-INST: smlslt z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xac,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 ac ef 44 @@ -47,23 +47,23 @@ smlslt z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlslt z21.d, z1.s, z31.s // CHECK-INST: smlslt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x54,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 54 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 smlslt z21.d, z10.s, z5.s[1] // CHECK-INST: smlslt z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0xad,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 ad e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/smulh.s b/llvm/test/MC/AArch64/SVE2/smulh.s index 2b08ba73f1b13..dcb3696234438 100644 --- a/llvm/test/MC/AArch64/SVE2/smulh.s 
+++ b/llvm/test/MC/AArch64/SVE2/smulh.s @@ -12,23 +12,23 @@ smulh z0.b, z1.b, z2.b // CHECK-INST: smulh z0.b, z1.b, z2.b // CHECK-ENCODING: [0x20,0x68,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 22 04 smulh z0.h, z1.h, z2.h // CHECK-INST: smulh z0.h, z1.h, z2.h // CHECK-ENCODING: [0x20,0x68,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 62 04 smulh z29.s, z30.s, z31.s // CHECK-INST: smulh z29.s, z30.s, z31.s // CHECK-ENCODING: [0xdd,0x6b,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 6b bf 04 smulh z31.d, z31.d, z31.d // CHECK-INST: smulh z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x6b,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 6b ff 04 diff --git a/llvm/test/MC/AArch64/SVE2/smullb.s b/llvm/test/MC/AArch64/SVE2/smullb.s index f134660350c0b..78299c9e156ca 100644 --- a/llvm/test/MC/AArch64/SVE2/smullb.s +++ b/llvm/test/MC/AArch64/SVE2/smullb.s @@ -13,29 +13,29 @@ smullb z0.h, z1.b, z2.b // CHECK-INST: smullb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x70,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 42 45 smullb z29.s, z30.h, z31.h // CHECK-INST: smullb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x73,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 73 9f 45 smullb z31.d, z31.s, z31.s // CHECK-INST: smullb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x73,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: ff 73 df 45 smullb z0.s, z1.h, z7.h[7] // CHECK-INST: smullb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xc8,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c8 bf 44 smullb z0.d, z1.s, z15.s[1] // CHECK-INST: smullb z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xc8,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c8 ef 44 diff --git a/llvm/test/MC/AArch64/SVE2/smullt.s b/llvm/test/MC/AArch64/SVE2/smullt.s index 8e03697a40269..f25a510d978f7 100644 --- a/llvm/test/MC/AArch64/SVE2/smullt.s +++ b/llvm/test/MC/AArch64/SVE2/smullt.s @@ -13,29 +13,29 @@ smullt z0.h, z1.b, z2.b // CHECK-INST: smullt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x74,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 42 45 smullt z29.s, z30.h, z31.h // CHECK-INST: smullt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x77,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 77 9f 45 smullt z31.d, z31.s, z31.s // CHECK-INST: smullt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x77,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 77 df 45 smullt z0.s, z1.h, z7.h[7] // CHECK-INST: smullt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xcc,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 cc bf 44 smullt z0.d, z1.s, z15.s[1] // CHECK-INST: smullt z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xcc,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: 20 cc ef 44 diff --git a/llvm/test/MC/AArch64/SVE2/splice.s b/llvm/test/MC/AArch64/SVE2/splice.s index 3ce958276f871..d23071ad600e1 100644 --- a/llvm/test/MC/AArch64/SVE2/splice.s +++ b/llvm/test/MC/AArch64/SVE2/splice.s @@ -12,23 +12,23 @@ splice z29.b, p7, { z30.b, z31.b } // CHECK-INST: splice z29.b, p7, { z30.b, z31.b } // CHECK-ENCODING: [0xdd,0x9f,0x2d,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 2d 05 splice z29.h, p7, { z30.h, z31.h } // CHECK-INST: splice z29.h, p7, { z30.h, z31.h } // CHECK-ENCODING: [0xdd,0x9f,0x6d,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 6d 05 splice z29.s, p7, { z30.s, z31.s } // CHECK-INST: splice z29.s, p7, { z30.s, z31.s } // CHECK-ENCODING: [0xdd,0x9f,0xad,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f ad 05 splice z29.d, p7, { z30.d, z31.d } // CHECK-INST: splice z29.d, p7, { z30.d, z31.d } // CHECK-ENCODING: [0xdd,0x9f,0xed,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f ed 05 diff --git a/llvm/test/MC/AArch64/SVE2/sqabs.s b/llvm/test/MC/AArch64/SVE2/sqabs.s index d3a004407bfa3..437819b79031c 100644 --- a/llvm/test/MC/AArch64/SVE2/sqabs.s +++ b/llvm/test/MC/AArch64/SVE2/sqabs.s @@ -12,25 +12,25 @@ sqabs z31.b, p7/m, z31.b // CHECK-INST: sqabs z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x08,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 08 44 sqabs z31.h, p7/m, z31.h // CHECK-INST: sqabs z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x48,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction 
requires: sve2 or sme // CHECK-UNKNOWN: ff bf 48 44 sqabs z31.s, p7/m, z31.s // CHECK-INST: sqabs z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x88,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 88 44 sqabs z31.d, p7/m, z31.d // CHECK-INST: sqabs z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc8,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf c8 44 @@ -40,23 +40,23 @@ sqabs z31.d, p7/m, z31.d movprfx z4.s, p7/z, z6.s // CHECK-INST: movprfx z4.s, p7/z, z6.s // CHECK-ENCODING: [0xc4,0x3c,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c 90 04 sqabs z4.s, p7/m, z31.s // CHECK-INST: sqabs z4.s, p7/m, z31.s // CHECK-ENCODING: [0xe4,0xbf,0x88,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 88 44 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sqabs z4.s, p7/m, z31.s // CHECK-INST: sqabs z4.s, p7/m, z31.s // CHECK-ENCODING: [0xe4,0xbf,0x88,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 88 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqadd.s b/llvm/test/MC/AArch64/SVE2/sqadd.s index 32db82948ad6e..8e76ec6c78b5e 100644 --- a/llvm/test/MC/AArch64/SVE2/sqadd.s +++ b/llvm/test/MC/AArch64/SVE2/sqadd.s @@ -12,25 +12,25 @@ sqadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: sqadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x18,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: 
sve2 or sme // CHECK-UNKNOWN: 20 80 18 44 sqadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: sqadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x58,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 58 44 sqadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: sqadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x98,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 98 44 sqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd8,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d8 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ sqadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqadd z31.d, p0/m, z31.d, z30.d // CHECK-INST: sqadd z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd8,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d8 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd8,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d8 44 diff --git 
a/llvm/test/MC/AArch64/SVE2/sqcadd.s b/llvm/test/MC/AArch64/SVE2/sqcadd.s index ee1e3b4c77d8f..0349ddda6b124 100644 --- a/llvm/test/MC/AArch64/SVE2/sqcadd.s +++ b/llvm/test/MC/AArch64/SVE2/sqcadd.s @@ -12,49 +12,49 @@ sqcadd z0.b, z0.b, z0.b, #90 // CHECK-INST: sqcadd z0.b, z0.b, z0.b, #90 // CHECK-ENCODING: [0x00,0xd8,0x01,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 01 45 sqcadd z0.h, z0.h, z0.h, #90 // CHECK-INST: sqcadd z0.h, z0.h, z0.h, #90 // CHECK-ENCODING: [0x00,0xd8,0x41,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 41 45 sqcadd z0.s, z0.s, z0.s, #90 // CHECK-INST: sqcadd z0.s, z0.s, z0.s, #90 // CHECK-ENCODING: [0x00,0xd8,0x81,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 81 45 sqcadd z0.d, z0.d, z0.d, #90 // CHECK-INST: sqcadd z0.d, z0.d, z0.d, #90 // CHECK-ENCODING: [0x00,0xd8,0xc1,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 d8 c1 45 sqcadd z31.b, z31.b, z31.b, #270 // CHECK-INST: sqcadd z31.b, z31.b, z31.b, #270 // CHECK-ENCODING: [0xff,0xdf,0x01,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff df 01 45 sqcadd z31.h, z31.h, z31.h, #270 // CHECK-INST: sqcadd z31.h, z31.h, z31.h, #270 // CHECK-ENCODING: [0xff,0xdf,0x41,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff df 41 45 sqcadd z31.s, z31.s, z31.s, #270 // CHECK-INST: sqcadd z31.s, z31.s, z31.s, #270 // CHECK-ENCODING: [0xff,0xdf,0x81,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or 
sme // CHECK-UNKNOWN: ff df 81 45 sqcadd z31.d, z31.d, z31.d, #270 // CHECK-INST: sqcadd z31.d, z31.d, z31.d, #270 // CHECK-ENCODING: [0xff,0xdf,0xc1,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff df c1 45 @@ -64,11 +64,11 @@ sqcadd z31.d, z31.d, z31.d, #270 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sqcadd z4.d, z4.d, z31.d, #270 // CHECK-INST: sqcadd z4.d, z4.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0xdf,0xc1,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 df c1 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmlalb.s b/llvm/test/MC/AArch64/SVE2/sqdmlalb.s index b0523ee29dfe6..7d05dcaf4cf00 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmlalb.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmlalb.s @@ -13,31 +13,31 @@ sqdmlalb z0.h, z1.b, z31.b // CHECK-INST: sqdmlalb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x60,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 5f 44 sqdmlalb z0.s, z1.h, z31.h // CHECK-INST: sqdmlalb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x60,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 9f 44 sqdmlalb z0.d, z1.s, z31.s // CHECK-INST: sqdmlalb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x60,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 df 44 sqdmlalb z0.s, z1.h, z7.h[7] // CHECK-INST: sqdmlalb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x28,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 28 bf 44 sqdmlalb z0.d, z1.s, z15.s[3] // CHECK-INST: sqdmlalb z0.d, z1.s, z15.s[3] // CHECK-ENCODING: [0x20,0x28,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 28 ff 44 @@ -47,23 +47,23 @@ sqdmlalb z0.d, z1.s, z15.s[3] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlalb z21.d, z1.s, z31.s // CHECK-INST: sqdmlalb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x60,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 60 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlalb z21.d, z10.s, z5.s[1] // CHECK-INST: sqdmlalb z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x29,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 29 e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmlalbt.s b/llvm/test/MC/AArch64/SVE2/sqdmlalbt.s index d25dfc8b9e838..1fabef483cb1d 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmlalbt.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmlalbt.s @@ -13,19 +13,19 @@ sqdmlalbt z0.h, z1.b, z31.b // CHECK-INST: sqdmlalbt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x08,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 5f 44 sqdmlalbt z0.s, z1.h, z31.h // CHECK-INST: sqdmlalbt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x08,0x9f,0x44] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 9f 44 sqdmlalbt z0.d, z1.s, z31.s // CHECK-INST: sqdmlalbt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x08,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 df 44 // --------------------------------------------------------------------------// @@ -34,11 +34,11 @@ sqdmlalbt z0.d, z1.s, z31.s movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlalbt z21.d, z1.s, z31.s // CHECK-INST: sqdmlalbt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x08,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 08 df 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmlalt.s b/llvm/test/MC/AArch64/SVE2/sqdmlalt.s index 552361b659e23..c88da9931a744 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmlalt.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmlalt.s @@ -13,31 +13,31 @@ sqdmlalt z0.h, z1.b, z31.b // CHECK-INST: sqdmlalt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x64,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 5f 44 sqdmlalt z0.s, z1.h, z31.h // CHECK-INST: sqdmlalt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x64,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 9f 44 sqdmlalt z0.d, z1.s, z31.s // CHECK-INST: sqdmlalt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x64,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 df 44 sqdmlalt 
z0.s, z1.h, z7.h[7] // CHECK-INST: sqdmlalt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x2c,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 2c bf 44 sqdmlalt z0.d, z1.s, z15.s[3] // CHECK-INST: sqdmlalt z0.d, z1.s, z15.s[3] // CHECK-ENCODING: [0x20,0x2c,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 2c ff 44 @@ -47,23 +47,23 @@ sqdmlalt z0.d, z1.s, z15.s[3] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlalt z21.d, z1.s, z31.s // CHECK-INST: sqdmlalt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x64,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 64 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlalt z21.d, z10.s, z5.s[1] // CHECK-INST: sqdmlalt z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x2d,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 2d e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmlslb.s b/llvm/test/MC/AArch64/SVE2/sqdmlslb.s index eabc8ac1a41fc..69937c8ef2444 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmlslb.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmlslb.s @@ -13,31 +13,31 @@ sqdmlslb z0.h, z1.b, z31.b // CHECK-INST: sqdmlslb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x68,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: 20 68 5f 44 sqdmlslb z0.s, z1.h, z31.h // CHECK-INST: sqdmlslb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x68,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 9f 44 sqdmlslb z0.d, z1.s, z31.s // CHECK-INST: sqdmlslb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x68,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 68 df 44 sqdmlslb z0.s, z1.h, z7.h[7] // CHECK-INST: sqdmlslb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x38,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 38 bf 44 sqdmlslb z0.d, z1.s, z15.s[3] // CHECK-INST: sqdmlslb z0.d, z1.s, z15.s[3] // CHECK-ENCODING: [0x20,0x38,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 38 ff 44 @@ -47,23 +47,23 @@ sqdmlslb z0.d, z1.s, z15.s[3] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlslb z21.d, z1.s, z31.s // CHECK-INST: sqdmlslb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x68,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 68 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlslb z21.d, z10.s, z5.s[1] // CHECK-INST: sqdmlslb z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x39,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 39 e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmlslbt.s b/llvm/test/MC/AArch64/SVE2/sqdmlslbt.s index 1203375b958e6..a166523ae67fa 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmlslbt.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmlslbt.s @@ -13,19 +13,19 @@ sqdmlslbt z0.h, z1.b, z31.b // CHECK-INST: sqdmlslbt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x0c,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c 5f 44 sqdmlslbt z0.s, z1.h, z31.h // CHECK-INST: sqdmlslbt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x0c,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c 9f 44 sqdmlslbt z0.d, z1.s, z31.s // CHECK-INST: sqdmlslbt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x0c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c df 44 // --------------------------------------------------------------------------// @@ -34,11 +34,11 @@ sqdmlslbt z0.d, z1.s, z31.s movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlslbt z21.d, z1.s, z31.s // CHECK-INST: sqdmlslbt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x0c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 0c df 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmlslt.s b/llvm/test/MC/AArch64/SVE2/sqdmlslt.s index c4e4fc1f6a00b..1eb7d15dd5c98 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmlslt.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmlslt.s @@ -13,31 +13,31 @@ sqdmlslt z0.h, z1.b, z31.b // CHECK-INST: sqdmlslt z0.h, z1.b, z31.b 
// CHECK-ENCODING: [0x20,0x6c,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c 5f 44 sqdmlslt z0.s, z1.h, z31.h // CHECK-INST: sqdmlslt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x6c,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c 9f 44 sqdmlslt z0.d, z1.s, z31.s // CHECK-INST: sqdmlslt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x6c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c df 44 sqdmlslt z0.s, z1.h, z7.h[7] // CHECK-INST: sqdmlslt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x3c,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 3c bf 44 sqdmlslt z0.d, z1.s, z15.s[3] // CHECK-INST: sqdmlslt z0.d, z1.s, z15.s[3] // CHECK-ENCODING: [0x20,0x3c,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 3c ff 44 @@ -47,23 +47,23 @@ sqdmlslt z0.d, z1.s, z15.s[3] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlslt z21.d, z1.s, z31.s // CHECK-INST: sqdmlslt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x6c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 6c df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqdmlslt z21.d, z10.s, z5.s[1] // 
CHECK-INST: sqdmlslt z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x3d,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 3d e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmulh.s b/llvm/test/MC/AArch64/SVE2/sqdmulh.s index a1115cff6b80a..e833f4c3bcfd9 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmulh.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmulh.s @@ -12,41 +12,41 @@ sqdmulh z0.b, z1.b, z2.b // CHECK-INST: sqdmulh z0.b, z1.b, z2.b // CHECK-ENCODING: [0x20,0x70,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 22 04 sqdmulh z0.h, z1.h, z2.h // CHECK-INST: sqdmulh z0.h, z1.h, z2.h // CHECK-ENCODING: [0x20,0x70,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 62 04 sqdmulh z29.s, z30.s, z31.s // CHECK-INST: sqdmulh z29.s, z30.s, z31.s // CHECK-ENCODING: [0xdd,0x73,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 73 bf 04 sqdmulh z31.d, z31.d, z31.d // CHECK-INST: sqdmulh z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x73,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 73 ff 04 sqdmulh z0.h, z1.h, z7.h[7] // CHECK-INST: sqdmulh z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xf0,0x7f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f0 7f 44 sqdmulh z0.s, z1.s, z7.s[3] // CHECK-INST: sqdmulh z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0xf0,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f0 bf 44 sqdmulh z0.d, z1.d, z15.d[1] 
// CHECK-INST: sqdmulh z0.d, z1.d, z15.d[1] // CHECK-ENCODING: [0x20,0xf0,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f0 ff 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmullb.s b/llvm/test/MC/AArch64/SVE2/sqdmullb.s index 33525f793b635..6b06b2515186b 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmullb.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmullb.s @@ -13,29 +13,29 @@ sqdmullb z0.h, z1.b, z2.b // CHECK-INST: sqdmullb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x60,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 60 42 45 sqdmullb z29.s, z30.h, z31.h // CHECK-INST: sqdmullb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x63,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 63 9f 45 sqdmullb z31.d, z31.s, z31.s // CHECK-INST: sqdmullb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x63,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 63 df 45 sqdmullb z0.s, z1.h, z7.h[7] // CHECK-INST: sqdmullb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xe8,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 e8 bf 44 sqdmullb z0.d, z1.s, z15.s[1] // CHECK-INST: sqdmullb z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xe8,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 e8 ef 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqdmullt.s b/llvm/test/MC/AArch64/SVE2/sqdmullt.s index 6c9e6a654ea7d..a051d3fefd2b8 100644 --- a/llvm/test/MC/AArch64/SVE2/sqdmullt.s +++ b/llvm/test/MC/AArch64/SVE2/sqdmullt.s @@ -13,29 +13,29 @@ sqdmullt z0.h, z1.b, 
z2.b // CHECK-INST: sqdmullt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x64,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 64 42 45 sqdmullt z29.s, z30.h, z31.h // CHECK-INST: sqdmullt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x67,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 67 9f 45 sqdmullt z31.d, z31.s, z31.s // CHECK-INST: sqdmullt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x67,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 67 df 45 sqdmullt z0.s, z1.h, z7.h[7] // CHECK-INST: sqdmullt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xec,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 ec bf 44 sqdmullt z0.d, z1.s, z15.s[1] // CHECK-INST: sqdmullt z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xec,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 ec ef 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqneg.s b/llvm/test/MC/AArch64/SVE2/sqneg.s index 39417242a30b1..42d84836425a4 100644 --- a/llvm/test/MC/AArch64/SVE2/sqneg.s +++ b/llvm/test/MC/AArch64/SVE2/sqneg.s @@ -12,25 +12,25 @@ sqneg z31.b, p7/m, z31.b // CHECK-INST: sqneg z31.b, p7/m, z31.b // CHECK-ENCODING: [0xff,0xbf,0x09,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 09 44 sqneg z31.h, p7/m, z31.h // CHECK-INST: sqneg z31.h, p7/m, z31.h // CHECK-ENCODING: [0xff,0xbf,0x49,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 49 44 sqneg z31.s, p7/m, 
z31.s // CHECK-INST: sqneg z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x89,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 89 44 sqneg z31.d, p7/m, z31.d // CHECK-INST: sqneg z31.d, p7/m, z31.d // CHECK-ENCODING: [0xff,0xbf,0xc9,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf c9 44 @@ -40,23 +40,23 @@ sqneg z31.d, p7/m, z31.d movprfx z4.s, p7/z, z6.s // CHECK-INST: movprfx z4.s, p7/z, z6.s // CHECK-ENCODING: [0xc4,0x3c,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c 90 04 sqneg z4.s, p7/m, z31.s // CHECK-INST: sqneg z4.s, p7/m, z31.s // CHECK-ENCODING: [0xe4,0xbf,0x89,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 89 44 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sqneg z4.s, p7/m, z31.s // CHECK-INST: sqneg z4.s, p7/m, z31.s // CHECK-ENCODING: [0xe4,0xbf,0x89,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 89 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqrdcmlah.s b/llvm/test/MC/AArch64/SVE2/sqrdcmlah.s index 5e8942842bd17..267ac0b4cadbc 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrdcmlah.s +++ b/llvm/test/MC/AArch64/SVE2/sqrdcmlah.s @@ -12,121 +12,121 @@ sqrdcmlah z0.b, z1.b, z2.b, #0 // CHECK-INST: sqrdcmlah z0.b, z1.b, z2.b, #0 // CHECK-ENCODING: [0x20,0x30,0x02,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 30 02 44 sqrdcmlah 
z0.h, z1.h, z2.h, #0 // CHECK-INST: sqrdcmlah z0.h, z1.h, z2.h, #0 // CHECK-ENCODING: [0x20,0x30,0x42,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 30 42 44 sqrdcmlah z0.s, z1.s, z2.s, #0 // CHECK-INST: sqrdcmlah z0.s, z1.s, z2.s, #0 // CHECK-ENCODING: [0x20,0x30,0x82,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 30 82 44 sqrdcmlah z0.d, z1.d, z2.d, #0 // CHECK-INST: sqrdcmlah z0.d, z1.d, z2.d, #0 // CHECK-ENCODING: [0x20,0x30,0xc2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 30 c2 44 sqrdcmlah z29.b, z30.b, z31.b, #90 // CHECK-INST: sqrdcmlah z29.b, z30.b, z31.b, #90 // CHECK-ENCODING: [0xdd,0x37,0x1f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 37 1f 44 sqrdcmlah z29.h, z30.h, z31.h, #90 // CHECK-INST: sqrdcmlah z29.h, z30.h, z31.h, #90 // CHECK-ENCODING: [0xdd,0x37,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 37 5f 44 sqrdcmlah z29.s, z30.s, z31.s, #90 // CHECK-INST: sqrdcmlah z29.s, z30.s, z31.s, #90 // CHECK-ENCODING: [0xdd,0x37,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 37 9f 44 sqrdcmlah z29.d, z30.d, z31.d, #90 // CHECK-INST: sqrdcmlah z29.d, z30.d, z31.d, #90 // CHECK-ENCODING: [0xdd,0x37,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 37 df 44 sqrdcmlah z31.b, z31.b, z31.b, #180 // CHECK-INST: sqrdcmlah z31.b, z31.b, z31.b, #180 // CHECK-ENCODING: [0xff,0x3b,0x1f,0x44] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b 1f 44 sqrdcmlah z31.h, z31.h, z31.h, #180 // CHECK-INST: sqrdcmlah z31.h, z31.h, z31.h, #180 // CHECK-ENCODING: [0xff,0x3b,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b 5f 44 sqrdcmlah z31.s, z31.s, z31.s, #180 // CHECK-INST: sqrdcmlah z31.s, z31.s, z31.s, #180 // CHECK-ENCODING: [0xff,0x3b,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b 9f 44 sqrdcmlah z31.d, z31.d, z31.d, #180 // CHECK-INST: sqrdcmlah z31.d, z31.d, z31.d, #180 // CHECK-ENCODING: [0xff,0x3b,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b df 44 sqrdcmlah z15.b, z16.b, z17.b, #270 // CHECK-INST: sqrdcmlah z15.b, z16.b, z17.b, #270 // CHECK-ENCODING: [0x0f,0x3e,0x11,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 3e 11 44 sqrdcmlah z15.h, z16.h, z17.h, #270 // CHECK-INST: sqrdcmlah z15.h, z16.h, z17.h, #270 // CHECK-ENCODING: [0x0f,0x3e,0x51,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 3e 51 44 sqrdcmlah z15.s, z16.s, z17.s, #270 // CHECK-INST: sqrdcmlah z15.s, z16.s, z17.s, #270 // CHECK-ENCODING: [0x0f,0x3e,0x91,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 3e 91 44 sqrdcmlah z15.d, z16.d, z17.d, #270 // CHECK-INST: sqrdcmlah z15.d, z16.d, z17.d, #270 // CHECK-ENCODING: [0x0f,0x3e,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 3e d1 
44 sqrdcmlah z0.h, z1.h, z2.h[0], #0 // CHECK-INST: sqrdcmlah z0.h, z1.h, z2.h[0], #0 // CHECK-ENCODING: [0x20,0x70,0xa2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 a2 44 sqrdcmlah z0.s, z1.s, z2.s[0], #0 // CHECK-INST: sqrdcmlah z0.s, z1.s, z2.s[0], #0 // CHECK-ENCODING: [0x20,0x70,0xe2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 e2 44 sqrdcmlah z31.h, z30.h, z7.h[0], #180 // CHECK-INST: sqrdcmlah z31.h, z30.h, z7.h[0], #180 // CHECK-ENCODING: [0xdf,0x7b,0xa7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 7b a7 44 sqrdcmlah z31.s, z30.s, z7.s[0], #180 // CHECK-INST: sqrdcmlah z31.s, z30.s, z7.s[0], #180 // CHECK-ENCODING: [0xdf,0x7b,0xe7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 7b e7 44 @@ -136,23 +136,23 @@ sqrdcmlah z31.s, z30.s, z7.s[0], #180 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 sqrdcmlah z4.d, z31.d, z31.d, #270 // CHECK-INST: sqrdcmlah z4.d, z31.d, z31.d, #270 // CHECK-ENCODING: [0xe4,0x3f,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 3f df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 sqrdcmlah z21.s, z10.s, z5.s[1], #90 // CHECK-INST: sqrdcmlah z21.s, z10.s, z5.s[1], #90 // CHECK-ENCODING: [0x55,0x75,0xf5,0x44] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 75 f5 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqrdmlah.s b/llvm/test/MC/AArch64/SVE2/sqrdmlah.s index 7c7c6117f2bfb..af31b9b9b64be 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrdmlah.s +++ b/llvm/test/MC/AArch64/SVE2/sqrdmlah.s @@ -13,43 +13,43 @@ sqrdmlah z0.b, z1.b, z31.b // CHECK-INST: sqrdmlah z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0x70,0x1f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 1f 44 sqrdmlah z0.h, z1.h, z31.h // CHECK-INST: sqrdmlah z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x70,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 5f 44 sqrdmlah z0.s, z1.s, z31.s // CHECK-INST: sqrdmlah z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0x70,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 9f 44 sqrdmlah z0.d, z1.d, z31.d // CHECK-INST: sqrdmlah z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x70,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 df 44 sqrdmlah z0.h, z1.h, z7.h[7] // CHECK-INST: sqrdmlah z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x10,0x7f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 7f 44 sqrdmlah z0.s, z1.s, z7.s[3] // CHECK-INST: sqrdmlah z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0x10,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 bf 44 sqrdmlah z0.d, z1.d, z15.d[1] // CHECK-INST: sqrdmlah z0.d, z1.d, z15.d[1] // CHECK-ENCODING: 
[0x20,0x10,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 ff 44 @@ -59,23 +59,23 @@ sqrdmlah z0.d, z1.d, z15.d[1] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqrdmlah z0.d, z1.d, z31.d // CHECK-INST: sqrdmlah z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x70,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 df 44 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqrdmlah z0.d, z1.d, z15.d[1] // CHECK-INST: sqrdmlah z0.d, z1.d, z15.d[1] // CHECK-ENCODING: [0x20,0x10,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 ff 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqrdmlsh.s b/llvm/test/MC/AArch64/SVE2/sqrdmlsh.s index 1f79608ed99bc..25d7b2efd8527 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrdmlsh.s +++ b/llvm/test/MC/AArch64/SVE2/sqrdmlsh.s @@ -12,43 +12,43 @@ sqrdmlsh z0.b, z1.b, z31.b // CHECK-INST: sqrdmlsh z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0x74,0x1f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 1f 44 sqrdmlsh z0.h, z1.h, z31.h // CHECK-INST: sqrdmlsh z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0x74,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 5f 44 sqrdmlsh z0.s, z1.s, z31.s // CHECK-INST: sqrdmlsh z0.s, z1.s, z31.s // 
CHECK-ENCODING: [0x20,0x74,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 9f 44 sqrdmlsh z0.d, z1.d, z31.d // CHECK-INST: sqrdmlsh z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x74,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 df 44 sqrdmlsh z0.h, z1.h, z7.h[7] // CHECK-INST: sqrdmlsh z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x14,0x7f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 14 7f 44 sqrdmlsh z0.s, z1.s, z7.s[3] // CHECK-INST: sqrdmlsh z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0x14,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 14 bf 44 sqrdmlsh z0.d, z1.d, z15.d[1] // CHECK-INST: sqrdmlsh z0.d, z1.d, z15.d[1] // CHECK-ENCODING: [0x20,0x14,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 14 ff 44 @@ -58,23 +58,23 @@ sqrdmlsh z0.d, z1.d, z15.d[1] movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqrdmlsh z0.d, z1.d, z31.d // CHECK-INST: sqrdmlsh z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0x74,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 df 44 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 sqrdmlsh z0.d, z1.d, z15.d[1] // 
CHECK-INST: sqrdmlsh z0.d, z1.d, z15.d[1] // CHECK-ENCODING: [0x20,0x14,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 14 ff 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqrdmulh.s b/llvm/test/MC/AArch64/SVE2/sqrdmulh.s index 249edeb326f98..3791327d40abe 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrdmulh.s +++ b/llvm/test/MC/AArch64/SVE2/sqrdmulh.s @@ -12,41 +12,41 @@ sqrdmulh z0.b, z1.b, z2.b // CHECK-INST: sqrdmulh z0.b, z1.b, z2.b // CHECK-ENCODING: [0x20,0x74,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 22 04 sqrdmulh z0.h, z1.h, z2.h // CHECK-INST: sqrdmulh z0.h, z1.h, z2.h // CHECK-ENCODING: [0x20,0x74,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 62 04 sqrdmulh z29.s, z30.s, z31.s // CHECK-INST: sqrdmulh z29.s, z30.s, z31.s // CHECK-ENCODING: [0xdd,0x77,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 77 bf 04 sqrdmulh z31.d, z31.d, z31.d // CHECK-INST: sqrdmulh z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x77,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 77 ff 04 sqrdmulh z0.h, z1.h, z7.h[7] // CHECK-INST: sqrdmulh z0.h, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xf4,0x7f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f4 7f 44 sqrdmulh z0.s, z1.s, z7.s[3] // CHECK-INST: sqrdmulh z0.s, z1.s, z7.s[3] // CHECK-ENCODING: [0x20,0xf4,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f4 bf 44 sqrdmulh 
z0.d, z1.d, z15.d[1] // CHECK-INST: sqrdmulh z0.d, z1.d, z15.d[1] // CHECK-ENCODING: [0x20,0xf4,0xff,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 f4 ff 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqrshl.s b/llvm/test/MC/AArch64/SVE2/sqrshl.s index 59e2904191bb8..0a8c9514a6a66 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrshl.s +++ b/llvm/test/MC/AArch64/SVE2/sqrshl.s @@ -12,25 +12,25 @@ sqrshl z0.b, p0/m, z0.b, z1.b // CHECK-INST: sqrshl z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x0a,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 0a 44 sqrshl z0.h, p0/m, z0.h, z1.h // CHECK-INST: sqrshl z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x4a,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 4a 44 sqrshl z29.s, p7/m, z29.s, z30.s // CHECK-INST: sqrshl z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x8a,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 8a 44 sqrshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqrshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xca,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f ca 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ sqrshl z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqrshl z31.d, p0/m, z31.d, z30.d // CHECK-INST: sqrshl z31.d, p0/m, z31.d, z30.d // 
CHECK-ENCODING: [0xdf,0x83,0xca,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 ca 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqrshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqrshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xca,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f ca 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqrshlr.s b/llvm/test/MC/AArch64/SVE2/sqrshlr.s index 29bd8e5b99c63..70644cfea42f6 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrshlr.s +++ b/llvm/test/MC/AArch64/SVE2/sqrshlr.s @@ -12,25 +12,25 @@ sqrshlr z0.b, p0/m, z0.b, z1.b // CHECK-INST: sqrshlr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x0e,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 0e 44 sqrshlr z0.h, p0/m, z0.h, z1.h // CHECK-INST: sqrshlr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x4e,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 4e 44 sqrshlr z29.s, p7/m, z29.s, z30.s // CHECK-INST: sqrshlr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x8e,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 8e 44 sqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xce,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f ce 44 // 
--------------------------------------------------------------------------// @@ -39,23 +39,23 @@ sqrshlr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqrshlr z31.d, p0/m, z31.d, z30.d // CHECK-INST: sqrshlr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xce,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 ce 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xce,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f ce 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqrshrnb.s b/llvm/test/MC/AArch64/SVE2/sqrshrnb.s index bb7d482bf9507..9dde993312155 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrshrnb.s +++ b/llvm/test/MC/AArch64/SVE2/sqrshrnb.s @@ -12,35 +12,35 @@ sqrshrnb z0.b, z0.h, #1 // CHECK-INST: sqrshrnb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x28,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 28 2f 45 sqrshrnb z31.b, z31.h, #8 // CHECK-INST: sqrshrnb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x2b,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2b 28 45 sqrshrnb z0.h, z0.s, #1 // CHECK-INST: sqrshrnb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x28,0x3f,0x45] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 28 3f 45 sqrshrnb z31.h, z31.s, #16 // CHECK-INST: sqrshrnb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x2b,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2b 30 45 sqrshrnb z0.s, z0.d, #1 // CHECK-INST: sqrshrnb z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x28,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 28 7f 45 sqrshrnb z31.s, z31.d, #32 // CHECK-INST: sqrshrnb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x2b,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2b 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqrshrnt.s b/llvm/test/MC/AArch64/SVE2/sqrshrnt.s index b003cb80dfb19..afd518ea41c02 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrshrnt.s +++ b/llvm/test/MC/AArch64/SVE2/sqrshrnt.s @@ -12,35 +12,35 @@ sqrshrnt z0.b, z0.h, #1 // CHECK-INST: sqrshrnt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x2c,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 2c 2f 45 sqrshrnt z31.b, z31.h, #8 // CHECK-INST: sqrshrnt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x2f,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2f 28 45 sqrshrnt z0.h, z0.s, #1 // CHECK-INST: sqrshrnt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x2c,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 2c 3f 45 sqrshrnt z31.h, z31.s, #16 // CHECK-INST: sqrshrnt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x2f,0x30,0x45] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2f 30 45 sqrshrnt z0.s, z0.d, #1 // CHECK-INST: sqrshrnt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x2c,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 2c 7f 45 sqrshrnt z31.s, z31.d, #32 // CHECK-INST: sqrshrnt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x2f,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2f 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqrshrunb.s b/llvm/test/MC/AArch64/SVE2/sqrshrunb.s index 89b1a05aa635b..471271cb36def 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrshrunb.s +++ b/llvm/test/MC/AArch64/SVE2/sqrshrunb.s @@ -12,35 +12,35 @@ sqrshrunb z0.b, z0.h, #1 // CHECK-INST: sqrshrunb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x08,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 08 2f 45 sqrshrunb z31.b, z31.h, #8 // CHECK-INST: sqrshrunb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x0b,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0b 28 45 sqrshrunb z0.h, z0.s, #1 // CHECK-INST: sqrshrunb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x08,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 08 3f 45 sqrshrunb z31.h, z31.s, #16 // CHECK-INST: sqrshrunb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x0b,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0b 30 45 sqrshrunb z0.s, z0.d, #1 // CHECK-INST: sqrshrunb z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x08,0x7f,0x45] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 08 7f 45 sqrshrunb z31.s, z31.d, #32 // CHECK-INST: sqrshrunb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x0b,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0b 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqrshrunt.s b/llvm/test/MC/AArch64/SVE2/sqrshrunt.s index a6884ba0aaa47..1c71e5240ce8b 100644 --- a/llvm/test/MC/AArch64/SVE2/sqrshrunt.s +++ b/llvm/test/MC/AArch64/SVE2/sqrshrunt.s @@ -12,35 +12,35 @@ sqrshrunt z0.b, z0.h, #1 // CHECK-INST: sqrshrunt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x0c,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 0c 2f 45 sqrshrunt z31.b, z31.h, #8 // CHECK-INST: sqrshrunt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x0f,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0f 28 45 sqrshrunt z0.h, z0.s, #1 // CHECK-INST: sqrshrunt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x0c,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 0c 3f 45 sqrshrunt z31.h, z31.s, #16 // CHECK-INST: sqrshrunt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x0f,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0f 30 45 sqrshrunt z0.s, z0.d, #1 // CHECK-INST: sqrshrunt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x0c,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 0c 7f 45 sqrshrunt z31.s, z31.d, #32 // CHECK-INST: sqrshrunt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x0f,0x60,0x45] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0f 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqshl.s b/llvm/test/MC/AArch64/SVE2/sqshl.s index 994cdbf287afe..9548d99ba704b 100644 --- a/llvm/test/MC/AArch64/SVE2/sqshl.s +++ b/llvm/test/MC/AArch64/SVE2/sqshl.s @@ -12,73 +12,73 @@ sqshl z0.b, p0/m, z0.b, z1.b // CHECK-INST: sqshl z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x08,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 08 44 sqshl z0.h, p0/m, z0.h, z1.h // CHECK-INST: sqshl z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x48,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 48 44 sqshl z29.s, p7/m, z29.s, z30.s // CHECK-INST: sqshl z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x88,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 88 44 sqshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc8,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c8 44 sqshl z0.b, p0/m, z0.b, #0 // CHECK-INST: sqshl z0.b, p0/m, z0.b, #0 // CHECK-ENCODING: [0x00,0x81,0x06,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 81 06 04 sqshl z31.b, p0/m, z31.b, #7 // CHECK-INST: sqshl z31.b, p0/m, z31.b, #7 // CHECK-ENCODING: [0xff,0x81,0x06,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 81 06 04 sqshl z0.h, p0/m, z0.h, #0 // CHECK-INST: sqshl z0.h, p0/m, z0.h, #0 // CHECK-ENCODING: [0x00,0x82,0x06,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 82 06 04 sqshl z31.h, p0/m, z31.h, #15 // CHECK-INST: sqshl z31.h, p0/m, z31.h, #15 // CHECK-ENCODING: [0xff,0x83,0x06,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 06 04 sqshl z0.s, p0/m, z0.s, #0 // CHECK-INST: sqshl z0.s, p0/m, z0.s, #0 // CHECK-ENCODING: [0x00,0x80,0x46,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 80 46 04 sqshl z31.s, p0/m, z31.s, #31 // CHECK-INST: sqshl z31.s, p0/m, z31.s, #31 // CHECK-ENCODING: [0xff,0x83,0x46,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 46 04 sqshl z0.d, p0/m, z0.d, #0 // CHECK-INST: sqshl z0.d, p0/m, z0.d, #0 // CHECK-ENCODING: [0x00,0x80,0x86,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 80 86 04 sqshl z31.d, p0/m, z31.d, #63 // CHECK-INST: sqshl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 c6 04 // --------------------------------------------------------------------------// @@ -87,47 +87,47 @@ sqshl z31.d, p0/m, z31.d, #63 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqshl z31.d, p0/m, z31.d, z30.d // CHECK-INST: sqshl z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xc8,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction 
requires: sve2 or sme // CHECK-UNKNOWN: df 83 c8 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc8,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c8 44 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqshl z31.d, p0/m, z31.d, #63 // CHECK-INST: sqshl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 c6 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqshl z31.d, p0/m, z31.d, #63 // CHECK-INST: sqshl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc6,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 c6 04 diff --git a/llvm/test/MC/AArch64/SVE2/sqshlr.s b/llvm/test/MC/AArch64/SVE2/sqshlr.s index fcf7461d88c10..819ea1757237d 100644 --- a/llvm/test/MC/AArch64/SVE2/sqshlr.s +++ b/llvm/test/MC/AArch64/SVE2/sqshlr.s @@ -12,25 +12,25 @@ sqshlr z0.b, p0/m, z0.b, z1.b // CHECK-INST: sqshlr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x0c,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: 20 80 0c 44 sqshlr z0.h, p0/m, z0.h, z1.h // CHECK-INST: sqshlr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x4c,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 4c 44 sqshlr z29.s, p7/m, z29.s, z30.s // CHECK-INST: sqshlr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x8c,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 8c 44 sqshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcc,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cc 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ sqshlr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqshlr z31.d, p0/m, z31.d, z30.d // CHECK-INST: sqshlr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xcc,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 cc 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcc,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cc 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqshlu.s 
b/llvm/test/MC/AArch64/SVE2/sqshlu.s index e70d1d89de0f8..cf4138027929c 100644 --- a/llvm/test/MC/AArch64/SVE2/sqshlu.s +++ b/llvm/test/MC/AArch64/SVE2/sqshlu.s @@ -12,49 +12,49 @@ sqshlu z0.b, p0/m, z0.b, #0 // CHECK-INST: sqshlu z0.b, p0/m, z0.b, #0 // CHECK-ENCODING: [0x00,0x81,0x0f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 81 0f 04 sqshlu z31.b, p0/m, z31.b, #7 // CHECK-INST: sqshlu z31.b, p0/m, z31.b, #7 // CHECK-ENCODING: [0xff,0x81,0x0f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 81 0f 04 sqshlu z0.h, p0/m, z0.h, #0 // CHECK-INST: sqshlu z0.h, p0/m, z0.h, #0 // CHECK-ENCODING: [0x00,0x82,0x0f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 82 0f 04 sqshlu z31.h, p0/m, z31.h, #15 // CHECK-INST: sqshlu z31.h, p0/m, z31.h, #15 // CHECK-ENCODING: [0xff,0x83,0x0f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 0f 04 sqshlu z0.s, p0/m, z0.s, #0 // CHECK-INST: sqshlu z0.s, p0/m, z0.s, #0 // CHECK-ENCODING: [0x00,0x80,0x4f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 80 4f 04 sqshlu z31.s, p0/m, z31.s, #31 // CHECK-INST: sqshlu z31.s, p0/m, z31.s, #31 // CHECK-ENCODING: [0xff,0x83,0x4f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 4f 04 sqshlu z0.d, p0/m, z0.d, #0 // CHECK-INST: sqshlu z0.d, p0/m, z0.d, #0 // CHECK-ENCODING: [0x00,0x80,0x8f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 80 8f 04 sqshlu z31.d, p0/m, 
z31.d, #63 // CHECK-INST: sqshlu z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xcf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 cf 04 // --------------------------------------------------------------------------// @@ -63,23 +63,23 @@ sqshlu z31.d, p0/m, z31.d, #63 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqshlu z31.d, p0/m, z31.d, #63 // CHECK-INST: sqshlu z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xcf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 cf 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqshlu z31.d, p0/m, z31.d, #63 // CHECK-INST: sqshlu z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xcf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 cf 04 diff --git a/llvm/test/MC/AArch64/SVE2/sqshrnb.s b/llvm/test/MC/AArch64/SVE2/sqshrnb.s index 0a1343d90de7c..ffe672501fb56 100644 --- a/llvm/test/MC/AArch64/SVE2/sqshrnb.s +++ b/llvm/test/MC/AArch64/SVE2/sqshrnb.s @@ -12,35 +12,35 @@ sqshrnb z0.b, z0.h, #1 // CHECK-INST: sqshrnb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x20,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 20 2f 45 sqshrnb z31.b, z31.h, #8 // CHECK-INST: sqshrnb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x23,0x28,0x45] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 23 28 45 sqshrnb z0.h, z0.s, #1 // CHECK-INST: sqshrnb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x20,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 20 3f 45 sqshrnb z31.h, z31.s, #16 // CHECK-INST: sqshrnb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x23,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 23 30 45 sqshrnb z0.s, z0.d, #1 // CHECK-INST: sqshrnb z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x20,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 20 7f 45 sqshrnb z31.s, z31.d, #32 // CHECK-INST: sqshrnb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x23,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 23 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqshrnt.s b/llvm/test/MC/AArch64/SVE2/sqshrnt.s index 18fea9e797f6e..24bfcf4f829cc 100644 --- a/llvm/test/MC/AArch64/SVE2/sqshrnt.s +++ b/llvm/test/MC/AArch64/SVE2/sqshrnt.s @@ -12,35 +12,35 @@ sqshrnt z0.b, z0.h, #1 // CHECK-INST: sqshrnt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x24,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 24 2f 45 sqshrnt z31.b, z31.h, #8 // CHECK-INST: sqshrnt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x27,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 27 28 45 sqshrnt z0.h, z0.s, #1 // CHECK-INST: sqshrnt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x24,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: 
instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 24 3f 45 sqshrnt z31.h, z31.s, #16 // CHECK-INST: sqshrnt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x27,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 27 30 45 sqshrnt z0.s, z0.d, #1 // CHECK-INST: sqshrnt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x24,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 24 7f 45 sqshrnt z31.s, z31.d, #32 // CHECK-INST: sqshrnt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x27,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 27 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqshrunb.s b/llvm/test/MC/AArch64/SVE2/sqshrunb.s index 58672f00ba2b5..8811900c5ed27 100644 --- a/llvm/test/MC/AArch64/SVE2/sqshrunb.s +++ b/llvm/test/MC/AArch64/SVE2/sqshrunb.s @@ -12,35 +12,35 @@ sqshrunb z0.b, z0.h, #1 // CHECK-INST: sqshrunb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x00,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 00 2f 45 sqshrunb z31.b, z31.h, #8 // CHECK-INST: sqshrunb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x03,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 03 28 45 sqshrunb z0.h, z0.s, #1 // CHECK-INST: sqshrunb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x00,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 00 3f 45 sqshrunb z31.h, z31.s, #16 // CHECK-INST: sqshrunb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x03,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: 
sve2 or sme // CHECK-UNKNOWN: ff 03 30 45 sqshrunb z0.s, z0.d, #1 // CHECK-INST: sqshrunb z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x00,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 00 7f 45 sqshrunb z31.s, z31.d, #32 // CHECK-INST: sqshrunb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x03,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 03 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqshrunt.s b/llvm/test/MC/AArch64/SVE2/sqshrunt.s index 15efa20cf7b74..50005c2badc7d 100644 --- a/llvm/test/MC/AArch64/SVE2/sqshrunt.s +++ b/llvm/test/MC/AArch64/SVE2/sqshrunt.s @@ -12,35 +12,35 @@ sqshrunt z0.b, z0.h, #1 // CHECK-INST: sqshrunt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x04,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 04 2f 45 sqshrunt z31.b, z31.h, #8 // CHECK-INST: sqshrunt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x07,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 07 28 45 sqshrunt z0.h, z0.s, #1 // CHECK-INST: sqshrunt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x04,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 04 3f 45 sqshrunt z31.h, z31.s, #16 // CHECK-INST: sqshrunt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x07,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 07 30 45 sqshrunt z0.s, z0.d, #1 // CHECK-INST: sqshrunt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x04,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: 00 04 7f 45 sqshrunt z31.s, z31.d, #32 // CHECK-INST: sqshrunt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x07,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 07 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqsub.s b/llvm/test/MC/AArch64/SVE2/sqsub.s index c9473cce0586e..4717bb3fc2d3b 100644 --- a/llvm/test/MC/AArch64/SVE2/sqsub.s +++ b/llvm/test/MC/AArch64/SVE2/sqsub.s @@ -12,25 +12,25 @@ sqsub z0.b, p0/m, z0.b, z1.b // CHECK-INST: sqsub z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x1a,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 1a 44 sqsub z0.h, p0/m, z0.h, z1.h // CHECK-INST: sqsub z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x5a,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 5a 44 sqsub z29.s, p7/m, z29.s, z30.s // CHECK-INST: sqsub z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x9a,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 9a 44 sqsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xda,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f da 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ sqsub z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqsub z31.d, p0/m, z31.d, z30.d // CHECK-INST: sqsub z31.d, p0/m, z31.d, z30.d 
// CHECK-ENCODING: [0xdf,0x83,0xda,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 da 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xda,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f da 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqsubr.s b/llvm/test/MC/AArch64/SVE2/sqsubr.s index 13198a5322690..6b911b6b58448 100644 --- a/llvm/test/MC/AArch64/SVE2/sqsubr.s +++ b/llvm/test/MC/AArch64/SVE2/sqsubr.s @@ -12,25 +12,25 @@ sqsubr z0.b, p0/m, z0.b, z1.b // CHECK-INST: sqsubr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x1e,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 1e 44 sqsubr z0.h, p0/m, z0.h, z1.h // CHECK-INST: sqsubr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x5e,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 5e 44 sqsubr z29.s, p7/m, z29.s, z30.s // CHECK-INST: sqsubr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x9e,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 9e 44 sqsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xde,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f de 44 // 
--------------------------------------------------------------------------// @@ -39,23 +39,23 @@ sqsubr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 sqsubr z31.d, p0/m, z31.d, z30.d // CHECK-INST: sqsubr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xde,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 de 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 sqsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: sqsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xde,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f de 44 diff --git a/llvm/test/MC/AArch64/SVE2/sqxtnb.s b/llvm/test/MC/AArch64/SVE2/sqxtnb.s index 8e3b14c0a7d60..95a7eb6ce167f 100644 --- a/llvm/test/MC/AArch64/SVE2/sqxtnb.s +++ b/llvm/test/MC/AArch64/SVE2/sqxtnb.s @@ -13,17 +13,17 @@ sqxtnb z0.b, z31.h // CHECK-INST: sqxtnb z0.b, z31.h // CHECK-ENCODING: [0xe0,0x43,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 43 28 45 sqxtnb z0.h, z31.s // CHECK-INST: sqxtnb z0.h, z31.s // CHECK-ENCODING: [0xe0,0x43,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 43 30 45 sqxtnb z0.s, z31.d // CHECK-INST: sqxtnb z0.s, z31.d // CHECK-ENCODING: [0xe0,0x43,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 43 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqxtnt.s b/llvm/test/MC/AArch64/SVE2/sqxtnt.s index 74c0ec8165bec..7db4905d71c31 100644 --- a/llvm/test/MC/AArch64/SVE2/sqxtnt.s +++ b/llvm/test/MC/AArch64/SVE2/sqxtnt.s @@ -13,17 +13,17 @@ sqxtnt z0.b, z31.h // CHECK-INST: sqxtnt z0.b, z31.h // CHECK-ENCODING: [0xe0,0x47,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 47 28 45 sqxtnt z0.h, z31.s // CHECK-INST: sqxtnt z0.h, z31.s // CHECK-ENCODING: [0xe0,0x47,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 47 30 45 sqxtnt z0.s, z31.d // CHECK-INST: sqxtnt z0.s, z31.d // CHECK-ENCODING: [0xe0,0x47,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 47 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/sqxtunb.s b/llvm/test/MC/AArch64/SVE2/sqxtunb.s index 5d6d9574cd0f6..6dcecc8646c70 100644 --- a/llvm/test/MC/AArch64/SVE2/sqxtunb.s +++ b/llvm/test/MC/AArch64/SVE2/sqxtunb.s @@ -13,17 +13,17 @@ sqxtunb z0.b, z31.h // CHECK-INST: sqxtunb z0.b, z31.h // CHECK-ENCODING: [0xe0,0x53,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 53 28 45 sqxtunb z0.h, z31.s // CHECK-INST: sqxtunb z0.h, z31.s // CHECK-ENCODING: [0xe0,0x53,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 53 30 45 sqxtunb z0.s, z31.d // CHECK-INST: sqxtunb z0.s, z31.d // CHECK-ENCODING: [0xe0,0x53,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 53 60 45 diff --git 
a/llvm/test/MC/AArch64/SVE2/sqxtunt.s b/llvm/test/MC/AArch64/SVE2/sqxtunt.s index a75f54d70f311..2ba103a66fa28 100644 --- a/llvm/test/MC/AArch64/SVE2/sqxtunt.s +++ b/llvm/test/MC/AArch64/SVE2/sqxtunt.s @@ -13,17 +13,17 @@ sqxtunt z0.b, z31.h // CHECK-INST: sqxtunt z0.b, z31.h // CHECK-ENCODING: [0xe0,0x57,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 57 28 45 sqxtunt z0.h, z31.s // CHECK-INST: sqxtunt z0.h, z31.s // CHECK-ENCODING: [0xe0,0x57,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 57 30 45 sqxtunt z0.s, z31.d // CHECK-INST: sqxtunt z0.s, z31.d // CHECK-ENCODING: [0xe0,0x57,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 57 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/srhadd.s b/llvm/test/MC/AArch64/SVE2/srhadd.s index 0f57ab88a6319..5d0375a7f8a21 100644 --- a/llvm/test/MC/AArch64/SVE2/srhadd.s +++ b/llvm/test/MC/AArch64/SVE2/srhadd.s @@ -12,25 +12,25 @@ srhadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: srhadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x14,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 14 44 srhadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: srhadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x54,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 54 44 srhadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: srhadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x94,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 94 44 srhadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: srhadd 
z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d4 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ srhadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 srhadd z31.d, p0/m, z31.d, z30.d // CHECK-INST: srhadd z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d4 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 srhadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: srhadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd4,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d4 44 diff --git a/llvm/test/MC/AArch64/SVE2/sri.s b/llvm/test/MC/AArch64/SVE2/sri.s index d264bb1fafe74..107bae5267502 100644 --- a/llvm/test/MC/AArch64/SVE2/sri.s +++ b/llvm/test/MC/AArch64/SVE2/sri.s @@ -12,47 +12,47 @@ sri z0.b, z0.b, #1 // CHECK-INST: sri z0.b, z0.b, #1 // CHECK-ENCODING: [0x00,0xf0,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f0 0f 45 sri z31.b, z31.b, #8 // CHECK-INST: sri z31.b, z31.b, #8 // CHECK-ENCODING: [0xff,0xf3,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: 
sve2 or sme // CHECK-UNKNOWN: ff f3 08 45 sri z0.h, z0.h, #1 // CHECK-INST: sri z0.h, z0.h, #1 // CHECK-ENCODING: [0x00,0xf0,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f0 1f 45 sri z31.h, z31.h, #16 // CHECK-INST: sri z31.h, z31.h, #16 // CHECK-ENCODING: [0xff,0xf3,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff f3 10 45 sri z0.s, z0.s, #1 // CHECK-INST: sri z0.s, z0.s, #1 // CHECK-ENCODING: [0x00,0xf0,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f0 5f 45 sri z31.s, z31.s, #32 // CHECK-INST: sri z31.s, z31.s, #32 // CHECK-ENCODING: [0xff,0xf3,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff f3 40 45 sri z0.d, z0.d, #1 // CHECK-INST: sri z0.d, z0.d, #1 // CHECK-ENCODING: [0x00,0xf0,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 f0 df 45 sri z31.d, z31.d, #64 // CHECK-INST: sri z31.d, z31.d, #64 // CHECK-ENCODING: [0xff,0xf3,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff f3 80 45 diff --git a/llvm/test/MC/AArch64/SVE2/srshl.s b/llvm/test/MC/AArch64/SVE2/srshl.s index b9a98c79dc613..323ce653beee7 100644 --- a/llvm/test/MC/AArch64/SVE2/srshl.s +++ b/llvm/test/MC/AArch64/SVE2/srshl.s @@ -12,25 +12,25 @@ srshl z0.b, p0/m, z0.b, z1.b // CHECK-INST: srshl z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x02,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 02 44 srshl z0.h, p0/m, z0.h, z1.h // 
CHECK-INST: srshl z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x42,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 42 44 srshl z29.s, p7/m, z29.s, z30.s // CHECK-INST: srshl z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x82,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 82 44 srshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: srshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c2 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ srshl z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 srshl z31.d, p0/m, z31.d, z30.d // CHECK-INST: srshl z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xc2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 c2 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 srshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: srshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc2,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c2 44 diff --git a/llvm/test/MC/AArch64/SVE2/srshlr.s b/llvm/test/MC/AArch64/SVE2/srshlr.s index 7c25316334e7d..e2f488f1d262c 
100644 --- a/llvm/test/MC/AArch64/SVE2/srshlr.s +++ b/llvm/test/MC/AArch64/SVE2/srshlr.s @@ -12,25 +12,25 @@ srshlr z0.b, p0/m, z0.b, z1.b // CHECK-INST: srshlr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x06,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 06 44 srshlr z0.h, p0/m, z0.h, z1.h // CHECK-INST: srshlr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x46,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 46 44 srshlr z29.s, p7/m, z29.s, z30.s // CHECK-INST: srshlr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x86,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 86 44 srshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: srshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c6 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ srshlr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 srshlr z31.d, p0/m, z31.d, z30.d // CHECK-INST: srshlr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xc6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 c6 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // 
CHECK-UNKNOWN: df bc 20 04 srshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: srshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc6,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c6 44 diff --git a/llvm/test/MC/AArch64/SVE2/srshr.s b/llvm/test/MC/AArch64/SVE2/srshr.s index a8b71141bcba5..6a5bab3c88292 100644 --- a/llvm/test/MC/AArch64/SVE2/srshr.s +++ b/llvm/test/MC/AArch64/SVE2/srshr.s @@ -12,49 +12,49 @@ srshr z0.b, p0/m, z0.b, #1 // CHECK-INST: srshr z0.b, p0/m, z0.b, #1 // CHECK-ENCODING: [0xe0,0x81,0x0c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 81 0c 04 srshr z31.b, p0/m, z31.b, #8 // CHECK-INST: srshr z31.b, p0/m, z31.b, #8 // CHECK-ENCODING: [0x1f,0x81,0x0c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 81 0c 04 srshr z0.h, p0/m, z0.h, #1 // CHECK-INST: srshr z0.h, p0/m, z0.h, #1 // CHECK-ENCODING: [0xe0,0x83,0x0c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 83 0c 04 srshr z31.h, p0/m, z31.h, #16 // CHECK-INST: srshr z31.h, p0/m, z31.h, #16 // CHECK-ENCODING: [0x1f,0x82,0x0c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 82 0c 04 srshr z0.s, p0/m, z0.s, #1 // CHECK-INST: srshr z0.s, p0/m, z0.s, #1 // CHECK-ENCODING: [0xe0,0x83,0x4c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 83 4c 04 srshr z31.s, p0/m, z31.s, #32 // CHECK-INST: srshr z31.s, p0/m, z31.s, #32 // CHECK-ENCODING: [0x1f,0x80,0x4c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction 
requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 4c 04 srshr z0.d, p0/m, z0.d, #1 // CHECK-INST: srshr z0.d, p0/m, z0.d, #1 // CHECK-ENCODING: [0xe0,0x83,0xcc,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 83 cc 04 srshr z31.d, p0/m, z31.d, #64 // CHECK-INST: srshr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x8c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 8c 04 @@ -64,23 +64,23 @@ srshr z31.d, p0/m, z31.d, #64 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 srshr z31.d, p0/m, z31.d, #64 // CHECK-INST: srshr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x8c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 8c 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 srshr z31.d, p0/m, z31.d, #64 // CHECK-INST: srshr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x8c,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 8c 04 diff --git a/llvm/test/MC/AArch64/SVE2/srsra.s b/llvm/test/MC/AArch64/SVE2/srsra.s index 3deecc3805d70..ef6f95efa0ef3 100644 --- a/llvm/test/MC/AArch64/SVE2/srsra.s +++ b/llvm/test/MC/AArch64/SVE2/srsra.s @@ -12,49 +12,49 @@ srsra z0.b, z0.b, #1 // CHECK-INST: srsra z0.b, z0.b, #1 // CHECK-ENCODING: [0x00,0xe8,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e8 0f 45 srsra z31.b, z31.b, #8 // CHECK-INST: srsra z31.b, z31.b, #8 // CHECK-ENCODING: [0xff,0xeb,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff eb 08 45 srsra z0.h, z0.h, #1 // CHECK-INST: srsra z0.h, z0.h, #1 // CHECK-ENCODING: [0x00,0xe8,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e8 1f 45 srsra z31.h, z31.h, #16 // CHECK-INST: srsra z31.h, z31.h, #16 // CHECK-ENCODING: [0xff,0xeb,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff eb 10 45 srsra z0.s, z0.s, #1 // CHECK-INST: srsra z0.s, z0.s, #1 // CHECK-ENCODING: [0x00,0xe8,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e8 5f 45 srsra z31.s, z31.s, #32 // CHECK-INST: srsra z31.s, z31.s, #32 // CHECK-ENCODING: [0xff,0xeb,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff eb 40 45 srsra z0.d, z0.d, #1 // CHECK-INST: srsra z0.d, z0.d, #1 // CHECK-ENCODING: [0x00,0xe8,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e8 df 45 srsra z31.d, z31.d, #64 // CHECK-INST: srsra z31.d, z31.d, #64 // CHECK-ENCODING: [0xff,0xeb,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff eb 80 45 @@ -64,11 +64,11 @@ srsra z31.d, z31.d, #64 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction 
requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 srsra z0.d, z1.d, #1 // CHECK-INST: srsra z0.d, z1.d, #1 // CHECK-ENCODING: [0x20,0xe8,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 e8 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/sshllb.s b/llvm/test/MC/AArch64/SVE2/sshllb.s index 11d40ed616670..8b486050b9f08 100644 --- a/llvm/test/MC/AArch64/SVE2/sshllb.s +++ b/llvm/test/MC/AArch64/SVE2/sshllb.s @@ -12,35 +12,35 @@ sshllb z0.h, z0.b, #0 // CHECK-INST: sshllb z0.h, z0.b, #0 // CHECK-ENCODING: [0x00,0xa0,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a0 08 45 sshllb z31.h, z31.b, #7 // CHECK-INST: sshllb z31.h, z31.b, #7 // CHECK-ENCODING: [0xff,0xa3,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff a3 0f 45 sshllb z0.s, z0.h, #0 // CHECK-INST: sshllb z0.s, z0.h, #0 // CHECK-ENCODING: [0x00,0xa0,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a0 10 45 sshllb z31.s, z31.h, #15 // CHECK-INST: sshllb z31.s, z31.h, #15 // CHECK-ENCODING: [0xff,0xa3,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff a3 1f 45 sshllb z0.d, z0.s, #0 // CHECK-INST: sshllb z0.d, z0.s, #0 // CHECK-ENCODING: [0x00,0xa0,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a0 40 45 sshllb z31.d, z31.s, #31 // CHECK-INST: sshllb z31.d, z31.s, #31 // CHECK-ENCODING: [0xff,0xa3,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff a3 5f 45 diff --git 
a/llvm/test/MC/AArch64/SVE2/sshllt.s b/llvm/test/MC/AArch64/SVE2/sshllt.s index 600d27c543720..70e766ff46680 100644 --- a/llvm/test/MC/AArch64/SVE2/sshllt.s +++ b/llvm/test/MC/AArch64/SVE2/sshllt.s @@ -12,35 +12,35 @@ sshllt z0.h, z0.b, #0 // CHECK-INST: sshllt z0.h, z0.b, #0 // CHECK-ENCODING: [0x00,0xa4,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a4 08 45 sshllt z31.h, z31.b, #7 // CHECK-INST: sshllt z31.h, z31.b, #7 // CHECK-ENCODING: [0xff,0xa7,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff a7 0f 45 sshllt z0.s, z0.h, #0 // CHECK-INST: sshllt z0.s, z0.h, #0 // CHECK-ENCODING: [0x00,0xa4,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a4 10 45 sshllt z31.s, z31.h, #15 // CHECK-INST: sshllt z31.s, z31.h, #15 // CHECK-ENCODING: [0xff,0xa7,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff a7 1f 45 sshllt z0.d, z0.s, #0 // CHECK-INST: sshllt z0.d, z0.s, #0 // CHECK-ENCODING: [0x00,0xa4,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a4 40 45 sshllt z31.d, z31.s, #31 // CHECK-INST: sshllt z31.d, z31.s, #31 // CHECK-ENCODING: [0xff,0xa7,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff a7 5f 45 diff --git a/llvm/test/MC/AArch64/SVE2/ssra.s b/llvm/test/MC/AArch64/SVE2/ssra.s index b894595238d01..ab836a1b8a629 100644 --- a/llvm/test/MC/AArch64/SVE2/ssra.s +++ b/llvm/test/MC/AArch64/SVE2/ssra.s @@ -12,49 +12,49 @@ ssra z0.b, z0.b, #1 // CHECK-INST: ssra z0.b, z0.b, #1 // CHECK-ENCODING: 
[0x00,0xe0,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e0 0f 45 ssra z31.b, z31.b, #8 // CHECK-INST: ssra z31.b, z31.b, #8 // CHECK-ENCODING: [0xff,0xe3,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e3 08 45 ssra z0.h, z0.h, #1 // CHECK-INST: ssra z0.h, z0.h, #1 // CHECK-ENCODING: [0x00,0xe0,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e0 1f 45 ssra z31.h, z31.h, #16 // CHECK-INST: ssra z31.h, z31.h, #16 // CHECK-ENCODING: [0xff,0xe3,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e3 10 45 ssra z0.s, z0.s, #1 // CHECK-INST: ssra z0.s, z0.s, #1 // CHECK-ENCODING: [0x00,0xe0,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e0 5f 45 ssra z31.s, z31.s, #32 // CHECK-INST: ssra z31.s, z31.s, #32 // CHECK-ENCODING: [0xff,0xe3,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e3 40 45 ssra z0.d, z0.d, #1 // CHECK-INST: ssra z0.d, z0.d, #1 // CHECK-ENCODING: [0x00,0xe0,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e0 df 45 ssra z31.d, z31.d, #64 // CHECK-INST: ssra z31.d, z31.d, #64 // CHECK-ENCODING: [0xff,0xe3,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e3 80 45 @@ -64,11 +64,11 @@ ssra z31.d, z31.d, #64 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 ssra z0.d, z1.d, #1 // CHECK-INST: ssra z0.d, z1.d, #1 // CHECK-ENCODING: [0x20,0xe0,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 e0 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ssublb.s b/llvm/test/MC/AArch64/SVE2/ssublb.s index d1ce3743ee567..c006b0b2758a6 100644 --- a/llvm/test/MC/AArch64/SVE2/ssublb.s +++ b/llvm/test/MC/AArch64/SVE2/ssublb.s @@ -13,17 +13,17 @@ ssublb z0.h, z1.b, z2.b // CHECK-INST: ssublb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x10,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 10 42 45 ssublb z29.s, z30.h, z31.h // CHECK-INST: ssublb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x13,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 13 9f 45 ssublb z31.d, z31.s, z31.s // CHECK-INST: ssublb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x13,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 13 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ssublbt.s b/llvm/test/MC/AArch64/SVE2/ssublbt.s index 214a201f9bf4f..c9fd11f35f932 100644 --- a/llvm/test/MC/AArch64/SVE2/ssublbt.s +++ b/llvm/test/MC/AArch64/SVE2/ssublbt.s @@ -13,17 +13,17 @@ ssublbt z0.h, z1.b, z31.b // CHECK-INST: ssublbt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x88,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 88 5f 45 ssublbt z0.s, z1.h, z31.h // CHECK-INST: ssublbt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x88,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 88 9f 45 ssublbt z0.d, z1.s, z31.s // CHECK-INST: ssublbt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x88,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 88 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ssublt.s b/llvm/test/MC/AArch64/SVE2/ssublt.s index 52e532ff802ae..d92135ea3abc3 100644 --- a/llvm/test/MC/AArch64/SVE2/ssublt.s +++ b/llvm/test/MC/AArch64/SVE2/ssublt.s @@ -13,17 +13,17 @@ ssublt z0.h, z1.b, z2.b // CHECK-INST: ssublt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x14,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 14 42 45 ssublt z29.s, z30.h, z31.h // CHECK-INST: ssublt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x17,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 17 9f 45 ssublt z31.d, z31.s, z31.s // CHECK-INST: ssublt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x17,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 17 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ssubltb.s b/llvm/test/MC/AArch64/SVE2/ssubltb.s index 47d6a701fc559..889ea63a9b6e5 100644 --- a/llvm/test/MC/AArch64/SVE2/ssubltb.s +++ b/llvm/test/MC/AArch64/SVE2/ssubltb.s @@ -13,17 +13,17 @@ ssubltb z0.h, z1.b, z31.b // CHECK-INST: ssubltb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x8c,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 8c 5f 45 ssubltb z0.s, z1.h, z31.h // CHECK-INST: ssubltb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x8c,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: 
sve2 or sme // CHECK-UNKNOWN: 20 8c 9f 45 ssubltb z0.d, z1.s, z31.s // CHECK-INST: ssubltb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x8c,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 8c df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ssubwb.s b/llvm/test/MC/AArch64/SVE2/ssubwb.s index e8dce2aaaff32..ae625fc83d8ab 100644 --- a/llvm/test/MC/AArch64/SVE2/ssubwb.s +++ b/llvm/test/MC/AArch64/SVE2/ssubwb.s @@ -13,17 +13,17 @@ ssubwb z0.h, z1.h, z2.b // CHECK-INST: ssubwb z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x50,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 50 42 45 ssubwb z29.s, z30.s, z31.h // CHECK-INST: ssubwb z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x53,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 53 9f 45 ssubwb z31.d, z31.d, z31.s // CHECK-INST: ssubwb z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x53,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 53 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ssubwt.s b/llvm/test/MC/AArch64/SVE2/ssubwt.s index 08ea2d3fdc288..13bad073db887 100644 --- a/llvm/test/MC/AArch64/SVE2/ssubwt.s +++ b/llvm/test/MC/AArch64/SVE2/ssubwt.s @@ -13,17 +13,17 @@ ssubwt z0.h, z1.h, z2.b // CHECK-INST: ssubwt z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x54,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 54 42 45 ssubwt z29.s, z30.s, z31.h // CHECK-INST: ssubwt z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x57,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 57 9f 45 
ssubwt z31.d, z31.d, z31.s // CHECK-INST: ssubwt z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x57,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 57 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/subhnb.s b/llvm/test/MC/AArch64/SVE2/subhnb.s index d58e944123992..8a3a434a158dd 100644 --- a/llvm/test/MC/AArch64/SVE2/subhnb.s +++ b/llvm/test/MC/AArch64/SVE2/subhnb.s @@ -13,17 +13,17 @@ subhnb z0.b, z1.h, z31.h // CHECK-INST: subhnb z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x70,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 7f 45 subhnb z0.h, z1.s, z31.s // CHECK-INST: subhnb z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x70,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 bf 45 subhnb z0.s, z1.d, z31.d // CHECK-INST: subhnb z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x70,0xff,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 70 ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/subhnt.s b/llvm/test/MC/AArch64/SVE2/subhnt.s index 01c46e7d2705a..ec3599dffb459 100644 --- a/llvm/test/MC/AArch64/SVE2/subhnt.s +++ b/llvm/test/MC/AArch64/SVE2/subhnt.s @@ -13,17 +13,17 @@ subhnt z0.b, z1.h, z31.h // CHECK-INST: subhnt z0.b, z1.h, z31.h // CHECK-ENCODING: [0x20,0x74,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 7f 45 subhnt z0.h, z1.s, z31.s // CHECK-INST: subhnt z0.h, z1.s, z31.s // CHECK-ENCODING: [0x20,0x74,0xbf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 bf 45 subhnt z0.s, z1.d, z31.d // CHECK-INST: subhnt 
z0.s, z1.d, z31.d // CHECK-ENCODING: [0x20,0x74,0xff,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 74 ff 45 diff --git a/llvm/test/MC/AArch64/SVE2/suqadd.s b/llvm/test/MC/AArch64/SVE2/suqadd.s index d69c3dfd2df08..e450fa7e20179 100644 --- a/llvm/test/MC/AArch64/SVE2/suqadd.s +++ b/llvm/test/MC/AArch64/SVE2/suqadd.s @@ -12,25 +12,25 @@ suqadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: suqadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x1c,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 1c 44 suqadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: suqadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x5c,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 5c 44 suqadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: suqadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x9c,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 9c 44 suqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: suqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdc,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f dc 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ suqadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 suqadd z31.d, p0/m, z31.d, z30.d // CHECK-INST: suqadd z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xdc,0x44] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 dc 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 suqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: suqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdc,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f dc 44 diff --git a/llvm/test/MC/AArch64/SVE2/tbl.s b/llvm/test/MC/AArch64/SVE2/tbl.s index c2bb20ac21a27..e3f764a78acd7 100644 --- a/llvm/test/MC/AArch64/SVE2/tbl.s +++ b/llvm/test/MC/AArch64/SVE2/tbl.s @@ -12,23 +12,23 @@ tbl z28.b, { z29.b, z30.b }, z31.b // CHECK-INST: tbl z28.b, { z29.b, z30.b }, z31.b // CHECK-ENCODING: [0xbc,0x2b,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bc 2b 3f 05 tbl z28.h, { z29.h, z30.h }, z31.h // CHECK-INST: tbl z28.h, { z29.h, z30.h }, z31.h // CHECK-ENCODING: [0xbc,0x2b,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bc 2b 7f 05 tbl z28.s, { z29.s, z30.s }, z31.s // CHECK-INST: tbl z28.s, { z29.s, z30.s }, z31.s // CHECK-ENCODING: [0xbc,0x2b,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bc 2b bf 05 tbl z28.d, { z29.d, z30.d }, z31.d // CHECK-INST: tbl z28.d, { z29.d, z30.d }, z31.d // CHECK-ENCODING: [0xbc,0x2b,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: bc 2b ff 05 diff --git a/llvm/test/MC/AArch64/SVE2/tbx.s b/llvm/test/MC/AArch64/SVE2/tbx.s index 
e2a6c827a7471..45f3c5d0ce2a6 100644 --- a/llvm/test/MC/AArch64/SVE2/tbx.s +++ b/llvm/test/MC/AArch64/SVE2/tbx.s @@ -12,23 +12,23 @@ tbx z31.b, z31.b, z31.b // CHECK-INST: tbx z31.b, z31.b, z31.b // CHECK-ENCODING: [0xff,0x2f,0x3f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2f 3f 05 tbx z31.h, z31.h, z31.h // CHECK-INST: tbx z31.h, z31.h, z31.h // CHECK-ENCODING: [0xff,0x2f,0x7f,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2f 7f 05 tbx z31.s, z31.s, z31.s // CHECK-INST: tbx z31.s, z31.s, z31.s // CHECK-ENCODING: [0xff,0x2f,0xbf,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2f bf 05 tbx z31.d, z31.d, z31.d // CHECK-INST: tbx z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x2f,0xff,0x05] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 2f ff 05 diff --git a/llvm/test/MC/AArch64/SVE2/uaba.s b/llvm/test/MC/AArch64/SVE2/uaba.s index e18a598872d9f..ff241366f7879 100644 --- a/llvm/test/MC/AArch64/SVE2/uaba.s +++ b/llvm/test/MC/AArch64/SVE2/uaba.s @@ -12,25 +12,25 @@ uaba z0.b, z1.b, z31.b // CHECK-INST: uaba z0.b, z1.b, z31.b // CHECK-ENCODING: [0x20,0xfc,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 fc 1f 45 uaba z0.h, z1.h, z31.h // CHECK-INST: uaba z0.h, z1.h, z31.h // CHECK-ENCODING: [0x20,0xfc,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 fc 5f 45 uaba z0.s, z1.s, z31.s // CHECK-INST: uaba z0.s, z1.s, z31.s // CHECK-ENCODING: [0x20,0xfc,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 fc 9f 45 uaba z0.d, z1.d, z31.d // CHECK-INST: uaba z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xfc,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 fc df 45 @@ -40,11 +40,11 @@ uaba z0.d, z1.d, z31.d movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 uaba z0.d, z1.d, z31.d // CHECK-INST: uaba z0.d, z1.d, z31.d // CHECK-ENCODING: [0x20,0xfc,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 fc df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uabalb.s b/llvm/test/MC/AArch64/SVE2/uabalb.s index 1e2e5111f9102..964b9f79edd78 100644 --- a/llvm/test/MC/AArch64/SVE2/uabalb.s +++ b/llvm/test/MC/AArch64/SVE2/uabalb.s @@ -13,19 +13,19 @@ uabalb z0.h, z1.b, z31.b // CHECK-INST: uabalb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0xc8,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c8 5f 45 uabalb z0.s, z1.h, z31.h // CHECK-INST: uabalb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0xc8,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c8 9f 45 uabalb z0.d, z1.s, z31.s // CHECK-INST: uabalb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0xc8,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 c8 df 45 @@ -35,11 +35,11 @@ uabalb z0.d, z1.s, z31.s movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 uabalb z21.d, z1.s, z31.s // CHECK-INST: uabalb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0xc8,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 c8 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uabalt.s b/llvm/test/MC/AArch64/SVE2/uabalt.s index 6ed5465237770..db529fed0f0a2 100644 --- a/llvm/test/MC/AArch64/SVE2/uabalt.s +++ b/llvm/test/MC/AArch64/SVE2/uabalt.s @@ -13,19 +13,19 @@ uabalt z0.h, z1.b, z31.b // CHECK-INST: uabalt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0xcc,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 cc 5f 45 uabalt z0.s, z1.h, z31.h // CHECK-INST: uabalt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0xcc,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 cc 9f 45 uabalt z0.d, z1.s, z31.s // CHECK-INST: uabalt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0xcc,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 cc df 45 @@ -35,11 +35,11 @@ uabalt z0.d, z1.s, z31.s movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 uabalt z21.d, z1.s, z31.s // CHECK-INST: uabalt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0xcc,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 cc df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uabdlb.s b/llvm/test/MC/AArch64/SVE2/uabdlb.s index 17205ec560d4d..d38b1b0db4303 100644 --- a/llvm/test/MC/AArch64/SVE2/uabdlb.s 
+++ b/llvm/test/MC/AArch64/SVE2/uabdlb.s @@ -13,17 +13,17 @@ uabdlb z0.h, z1.b, z2.b // CHECK-INST: uabdlb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x38,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 38 42 45 uabdlb z29.s, z30.h, z31.h // CHECK-INST: uabdlb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x3b,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 3b 9f 45 uabdlb z31.d, z31.s, z31.s // CHECK-INST: uabdlb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x3b,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uabdlt.s b/llvm/test/MC/AArch64/SVE2/uabdlt.s index 9a2ddddcd0230..70b746e88bb99 100644 --- a/llvm/test/MC/AArch64/SVE2/uabdlt.s +++ b/llvm/test/MC/AArch64/SVE2/uabdlt.s @@ -13,17 +13,17 @@ uabdlt z0.h, z1.b, z2.b // CHECK-INST: uabdlt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x3c,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 3c 42 45 uabdlt z29.s, z30.h, z31.h // CHECK-INST: uabdlt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x3f,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 3f 9f 45 uabdlt z31.d, z31.s, z31.s // CHECK-INST: uabdlt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x3f,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3f df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uadalp.s b/llvm/test/MC/AArch64/SVE2/uadalp.s index 8887f132db16a..31d2d8ace3373 100644 --- a/llvm/test/MC/AArch64/SVE2/uadalp.s +++ b/llvm/test/MC/AArch64/SVE2/uadalp.s 
@@ -12,19 +12,19 @@ uadalp z0.h, p0/m, z1.b // CHECK-INST: uadalp z0.h, p0/m, z1.b // CHECK-ENCODING: [0x20,0xa0,0x45,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 45 44 uadalp z29.s, p0/m, z30.h // CHECK-INST: uadalp z29.s, p0/m, z30.h // CHECK-ENCODING: [0xdd,0xa3,0x85,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd a3 85 44 uadalp z30.d, p7/m, z31.s // CHECK-INST: uadalp z30.d, p7/m, z31.s // CHECK-ENCODING: [0xfe,0xbf,0xc5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: fe bf c5 44 // --------------------------------------------------------------------------// @@ -33,23 +33,23 @@ uadalp z30.d, p7/m, z31.s movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uadalp z31.d, p0/m, z30.s // CHECK-INST: uadalp z31.d, p0/m, z30.s // CHECK-ENCODING: [0xdf,0xa3,0xc5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 c5 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uadalp z31.d, p0/m, z30.s // CHECK-INST: uadalp z31.d, p0/m, z30.s // CHECK-ENCODING: [0xdf,0xa3,0xc5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 c5 44 diff --git a/llvm/test/MC/AArch64/SVE2/uaddlb.s b/llvm/test/MC/AArch64/SVE2/uaddlb.s index aba06d215b55d..9f44ab38e99cc 100644 --- 
a/llvm/test/MC/AArch64/SVE2/uaddlb.s +++ b/llvm/test/MC/AArch64/SVE2/uaddlb.s @@ -13,17 +13,17 @@ uaddlb z0.h, z1.b, z2.b // CHECK-INST: uaddlb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x08,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 08 42 45 uaddlb z29.s, z30.h, z31.h // CHECK-INST: uaddlb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x0b,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 0b 9f 45 uaddlb z31.d, z31.s, z31.s // CHECK-INST: uaddlb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x0b,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0b df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uaddlt.s b/llvm/test/MC/AArch64/SVE2/uaddlt.s index 458c33d1564c2..19626f08aae64 100644 --- a/llvm/test/MC/AArch64/SVE2/uaddlt.s +++ b/llvm/test/MC/AArch64/SVE2/uaddlt.s @@ -13,17 +13,17 @@ uaddlt z0.h, z1.b, z2.b // CHECK-INST: uaddlt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x0c,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 0c 42 45 uaddlt z29.s, z30.h, z31.h // CHECK-INST: uaddlt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x0f,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 0f 9f 45 uaddlt z31.d, z31.s, z31.s // CHECK-INST: uaddlt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x0f,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0f df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uaddwb.s b/llvm/test/MC/AArch64/SVE2/uaddwb.s index 84a8b876245e1..ce7395152ddd3 100644 --- a/llvm/test/MC/AArch64/SVE2/uaddwb.s +++ 
b/llvm/test/MC/AArch64/SVE2/uaddwb.s @@ -13,17 +13,17 @@ uaddwb z0.h, z1.h, z2.b // CHECK-INST: uaddwb z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x48,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 48 42 45 uaddwb z29.s, z30.s, z31.h // CHECK-INST: uaddwb z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x4b,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 4b 9f 45 uaddwb z31.d, z31.d, z31.s // CHECK-INST: uaddwb z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x4b,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 4b df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uaddwt.s b/llvm/test/MC/AArch64/SVE2/uaddwt.s index 83421f2e3fea7..7adab15d8a14d 100644 --- a/llvm/test/MC/AArch64/SVE2/uaddwt.s +++ b/llvm/test/MC/AArch64/SVE2/uaddwt.s @@ -13,17 +13,17 @@ uaddwt z0.h, z1.h, z2.b // CHECK-INST: uaddwt z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x4c,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 4c 42 45 uaddwt z29.s, z30.s, z31.h // CHECK-INST: uaddwt z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x4f,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 4f 9f 45 uaddwt z31.d, z31.d, z31.s // CHECK-INST: uaddwt z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x4f,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 4f df 45 diff --git a/llvm/test/MC/AArch64/SVE2/uhadd.s b/llvm/test/MC/AArch64/SVE2/uhadd.s index 3642a704b247f..5fbffc13f4cd7 100644 --- a/llvm/test/MC/AArch64/SVE2/uhadd.s +++ b/llvm/test/MC/AArch64/SVE2/uhadd.s @@ 
-12,25 +12,25 @@ uhadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: uhadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x11,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 11 44 uhadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: uhadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x51,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 51 44 uhadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: uhadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x91,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 91 44 uhadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: uhadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d1 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uhadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uhadd z31.d, p0/m, z31.d, z30.d // CHECK-INST: uhadd z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d1 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uhadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: uhadd z31.d, p7/m, z31.d, 
z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd1,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d1 44 diff --git a/llvm/test/MC/AArch64/SVE2/uhsub.s b/llvm/test/MC/AArch64/SVE2/uhsub.s index c26ba6a31b6ec..1a12620c940f8 100644 --- a/llvm/test/MC/AArch64/SVE2/uhsub.s +++ b/llvm/test/MC/AArch64/SVE2/uhsub.s @@ -12,25 +12,25 @@ uhsub z0.b, p0/m, z0.b, z1.b // CHECK-INST: uhsub z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x13,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 13 44 uhsub z0.h, p0/m, z0.h, z1.h // CHECK-INST: uhsub z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x53,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 53 44 uhsub z29.s, p7/m, z29.s, z30.s // CHECK-INST: uhsub z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x93,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 93 44 uhsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: uhsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd3,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d3 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uhsub z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uhsub z31.d, p0/m, z31.d, z30.d // CHECK-INST: uhsub z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd3,0x44] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d3 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uhsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: uhsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd3,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d3 44 diff --git a/llvm/test/MC/AArch64/SVE2/uhsubr.s b/llvm/test/MC/AArch64/SVE2/uhsubr.s index 227e8bed66aa6..d8eeb4124527b 100644 --- a/llvm/test/MC/AArch64/SVE2/uhsubr.s +++ b/llvm/test/MC/AArch64/SVE2/uhsubr.s @@ -12,25 +12,25 @@ uhsubr z0.b, p0/m, z0.b, z1.b // CHECK-INST: uhsubr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x17,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 17 44 uhsubr z0.h, p0/m, z0.h, z1.h // CHECK-INST: uhsubr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x57,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 57 44 uhsubr z29.s, p7/m, z29.s, z30.s // CHECK-INST: uhsubr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x97,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 97 44 uhsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uhsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d7 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uhsubr z31.d, p7/m, z31.d, z30.d 
movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uhsubr z31.d, p0/m, z31.d, z30.d // CHECK-INST: uhsubr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d7 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uhsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uhsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d7 44 diff --git a/llvm/test/MC/AArch64/SVE2/umaxp.s b/llvm/test/MC/AArch64/SVE2/umaxp.s index d88801f87fd80..65e9555a6e02a 100644 --- a/llvm/test/MC/AArch64/SVE2/umaxp.s +++ b/llvm/test/MC/AArch64/SVE2/umaxp.s @@ -12,25 +12,25 @@ umaxp z0.b, p0/m, z0.b, z1.b // CHECK-INST: umaxp z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0xa0,0x15,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 15 44 umaxp z0.h, p0/m, z0.h, z1.h // CHECK-INST: umaxp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0xa0,0x55,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 55 44 umaxp z29.s, p7/m, z29.s, z30.s // CHECK-INST: umaxp z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0xbf,0x95,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd bf 95 
44 umaxp z31.d, p7/m, z31.d, z30.d // CHECK-INST: umaxp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d5 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ umaxp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 umaxp z31.d, p0/m, z31.d, z30.d // CHECK-INST: umaxp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xa3,0xd5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 d5 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 umaxp z31.d, p7/m, z31.d, z30.d // CHECK-INST: umaxp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d5 44 diff --git a/llvm/test/MC/AArch64/SVE2/uminp.s b/llvm/test/MC/AArch64/SVE2/uminp.s index 056338cef3ccb..3b9033f4758cc 100644 --- a/llvm/test/MC/AArch64/SVE2/uminp.s +++ b/llvm/test/MC/AArch64/SVE2/uminp.s @@ -12,25 +12,25 @@ uminp z0.b, p0/m, z0.b, z1.b // CHECK-INST: uminp z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0xa0,0x17,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 17 44 uminp z0.h, p0/m, z0.h, z1.h // CHECK-INST: uminp z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0xa0,0x57,0x44] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 a0 57 44 uminp z29.s, p7/m, z29.s, z30.s // CHECK-INST: uminp z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0xbf,0x97,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd bf 97 44 uminp z31.d, p7/m, z31.d, z30.d // CHECK-INST: uminp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d7 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uminp z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uminp z31.d, p0/m, z31.d, z30.d // CHECK-INST: uminp z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xa3,0xd7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df a3 d7 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uminp z31.d, p7/m, z31.d, z30.d // CHECK-INST: uminp z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0xbf,0xd7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df bf d7 44 diff --git a/llvm/test/MC/AArch64/SVE2/umlalb.s b/llvm/test/MC/AArch64/SVE2/umlalb.s index a302be198d2b3..82660736b385e 100644 --- a/llvm/test/MC/AArch64/SVE2/umlalb.s +++ 
b/llvm/test/MC/AArch64/SVE2/umlalb.s @@ -13,31 +13,31 @@ umlalb z0.h, z1.b, z31.b // CHECK-INST: umlalb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x48,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 48 5f 44 umlalb z0.s, z1.h, z31.h // CHECK-INST: umlalb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x48,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 48 9f 44 umlalb z0.d, z1.s, z31.s // CHECK-INST: umlalb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x48,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 48 df 44 umlalb z0.s, z1.h, z7.h[7] // CHECK-INST: umlalb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x98,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 98 bf 44 umlalb z0.d, z1.s, z15.s[1] // CHECK-INST: umlalb z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0x98,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 98 ef 44 @@ -47,23 +47,23 @@ umlalb z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlalb z21.d, z1.s, z31.s // CHECK-INST: umlalb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x48,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 48 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// 
CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlalb z21.d, z10.s, z5.s[1] // CHECK-INST: umlalb z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x99,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 99 e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/umlalt.s b/llvm/test/MC/AArch64/SVE2/umlalt.s index 3aa8183fc975e..4077df70545f5 100644 --- a/llvm/test/MC/AArch64/SVE2/umlalt.s +++ b/llvm/test/MC/AArch64/SVE2/umlalt.s @@ -13,31 +13,31 @@ umlalt z0.h, z1.b, z31.b // CHECK-INST: umlalt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x4c,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 4c 5f 44 umlalt z0.s, z1.h, z31.h // CHECK-INST: umlalt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x4c,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 4c 9f 44 umlalt z0.d, z1.s, z31.s // CHECK-INST: umlalt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x4c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 4c df 44 umlalt z0.s, z1.h, z7.h[7] // CHECK-INST: umlalt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0x9c,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 9c bf 44 umlalt z0.d, z1.s, z15.s[1] // CHECK-INST: umlalt z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0x9c,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 9c ef 44 @@ -47,23 +47,23 @@ umlalt z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: 
streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlalt z21.d, z1.s, z31.s // CHECK-INST: umlalt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x4c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 4c df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlalt z21.d, z10.s, z5.s[1] // CHECK-INST: umlalt z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0x9d,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 9d e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/umlslb.s b/llvm/test/MC/AArch64/SVE2/umlslb.s index 05b03ed461f2d..9dd1f977bb50d 100644 --- a/llvm/test/MC/AArch64/SVE2/umlslb.s +++ b/llvm/test/MC/AArch64/SVE2/umlslb.s @@ -13,31 +13,31 @@ umlslb z0.h, z1.b, z31.b // CHECK-INST: umlslb z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x58,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 58 5f 44 umlslb z0.s, z1.h, z31.h // CHECK-INST: umlslb z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x58,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 58 9f 44 umlslb z0.d, z1.s, z31.s // CHECK-INST: umlslb z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x58,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 58 df 44 umlslb z0.s, z1.h, z7.h[7] // CHECK-INST: umlslb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xb8,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: 
instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 b8 bf 44 umlslb z0.d, z1.s, z15.s[1] // CHECK-INST: umlslb z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xb8,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 b8 ef 44 @@ -47,23 +47,23 @@ umlslb z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlslb z21.d, z1.s, z31.s // CHECK-INST: umlslb z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x58,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 58 df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlslb z21.d, z10.s, z5.s[1] // CHECK-INST: umlslb z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0xb9,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 b9 e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/umlslt.s b/llvm/test/MC/AArch64/SVE2/umlslt.s index f0e50a0691c28..3f8de7dff2cad 100644 --- a/llvm/test/MC/AArch64/SVE2/umlslt.s +++ b/llvm/test/MC/AArch64/SVE2/umlslt.s @@ -13,31 +13,31 @@ umlslt z0.h, z1.b, z31.b // CHECK-INST: umlslt z0.h, z1.b, z31.b // CHECK-ENCODING: [0x20,0x5c,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 5c 5f 44 umlslt z0.s, z1.h, z31.h // CHECK-INST: umlslt z0.s, z1.h, z31.h // CHECK-ENCODING: [0x20,0x5c,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: 
instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 5c 9f 44 umlslt z0.d, z1.s, z31.s // CHECK-INST: umlslt z0.d, z1.s, z31.s // CHECK-ENCODING: [0x20,0x5c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 5c df 44 umlslt z0.s, z1.h, z7.h[7] // CHECK-INST: umlslt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xbc,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 bc bf 44 umlslt z0.d, z1.s, z15.s[1] // CHECK-INST: umlslt z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xbc,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 bc ef 44 @@ -47,23 +47,23 @@ umlslt z0.d, z1.s, z15.s[1] movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlslt z21.d, z1.s, z31.s // CHECK-INST: umlslt z21.d, z1.s, z31.s // CHECK-ENCODING: [0x35,0x5c,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 35 5c df 44 movprfx z21, z28 // CHECK-INST: movprfx z21, z28 // CHECK-ENCODING: [0x95,0xbf,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: 95 bf 20 04 umlslt z21.d, z10.s, z5.s[1] // CHECK-INST: umlslt z21.d, z10.s, z5.s[1] // CHECK-ENCODING: [0x55,0xbd,0xe5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 55 bd e5 44 diff --git a/llvm/test/MC/AArch64/SVE2/umulh.s b/llvm/test/MC/AArch64/SVE2/umulh.s index 58dde1dff0f0c..749c27571efa1 100644 --- a/llvm/test/MC/AArch64/SVE2/umulh.s 
+++ b/llvm/test/MC/AArch64/SVE2/umulh.s @@ -12,23 +12,23 @@ umulh z0.b, z1.b, z2.b // CHECK-INST: umulh z0.b, z1.b, z2.b // CHECK-ENCODING: [0x20,0x6c,0x22,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c 22 04 umulh z0.h, z1.h, z2.h // CHECK-INST: umulh z0.h, z1.h, z2.h // CHECK-ENCODING: [0x20,0x6c,0x62,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 6c 62 04 umulh z29.s, z30.s, z31.s // CHECK-INST: umulh z29.s, z30.s, z31.s // CHECK-ENCODING: [0xdd,0x6f,0xbf,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 6f bf 04 umulh z31.d, z31.d, z31.d // CHECK-INST: umulh z31.d, z31.d, z31.d // CHECK-ENCODING: [0xff,0x6f,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 6f ff 04 diff --git a/llvm/test/MC/AArch64/SVE2/umullb.s b/llvm/test/MC/AArch64/SVE2/umullb.s index d8a98e2c1bfbb..37cd499cd2e1b 100644 --- a/llvm/test/MC/AArch64/SVE2/umullb.s +++ b/llvm/test/MC/AArch64/SVE2/umullb.s @@ -13,29 +13,29 @@ umullb z0.h, z1.b, z2.b // CHECK-INST: umullb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x78,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 78 42 45 umullb z29.s, z30.h, z31.h // CHECK-INST: umullb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x7b,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 7b 9f 45 umullb z31.d, z31.s, z31.s // CHECK-INST: umullb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x7b,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: ff 7b df 45 umullb z0.s, z1.h, z7.h[7] // CHECK-INST: umullb z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xd8,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d8 bf 44 umullb z0.d, z1.s, z15.s[1] // CHECK-INST: umullb z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xd8,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 d8 ef 44 diff --git a/llvm/test/MC/AArch64/SVE2/umullt.s b/llvm/test/MC/AArch64/SVE2/umullt.s index 2bd8d6f80c44c..3acaaae293e54 100644 --- a/llvm/test/MC/AArch64/SVE2/umullt.s +++ b/llvm/test/MC/AArch64/SVE2/umullt.s @@ -13,29 +13,29 @@ umullt z0.h, z1.b, z2.b // CHECK-INST: umullt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x7c,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 7c 42 45 umullt z29.s, z30.h, z31.h // CHECK-INST: umullt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x7f,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 7f 9f 45 umullt z31.d, z31.s, z31.s // CHECK-INST: umullt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x7f,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 7f df 45 umullt z0.s, z1.h, z7.h[7] // CHECK-INST: umullt z0.s, z1.h, z7.h[7] // CHECK-ENCODING: [0x20,0xdc,0xbf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 dc bf 44 umullt z0.d, z1.s, z15.s[1] // CHECK-INST: umullt z0.d, z1.s, z15.s[1] // CHECK-ENCODING: [0x20,0xdc,0xef,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: 20 dc ef 44 diff --git a/llvm/test/MC/AArch64/SVE2/uqadd.s b/llvm/test/MC/AArch64/SVE2/uqadd.s index 19a935611487b..be127c4333395 100644 --- a/llvm/test/MC/AArch64/SVE2/uqadd.s +++ b/llvm/test/MC/AArch64/SVE2/uqadd.s @@ -12,25 +12,25 @@ uqadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: uqadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x19,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 19 44 uqadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: uqadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x59,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 59 44 uqadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: uqadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x99,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 99 44 uqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd9,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d9 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uqadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqadd z31.d, p0/m, z31.d, z30.d // CHECK-INST: uqadd z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd9,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d9 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: 
[0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd9,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d9 44 diff --git a/llvm/test/MC/AArch64/SVE2/uqrshl.s b/llvm/test/MC/AArch64/SVE2/uqrshl.s index e424a354ce324..d25e6d80779a4 100644 --- a/llvm/test/MC/AArch64/SVE2/uqrshl.s +++ b/llvm/test/MC/AArch64/SVE2/uqrshl.s @@ -12,25 +12,25 @@ uqrshl z0.b, p0/m, z0.b, z1.b // CHECK-INST: uqrshl z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x0b,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 0b 44 uqrshl z0.h, p0/m, z0.h, z1.h // CHECK-INST: uqrshl z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x4b,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 4b 44 uqrshl z29.s, p7/m, z29.s, z30.s // CHECK-INST: uqrshl z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x8b,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 8b 44 uqrshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqrshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcb,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cb 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uqrshl z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve 
+// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqrshl z31.d, p0/m, z31.d, z30.d // CHECK-INST: uqrshl z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xcb,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 cb 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqrshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqrshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcb,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cb 44 diff --git a/llvm/test/MC/AArch64/SVE2/uqrshlr.s b/llvm/test/MC/AArch64/SVE2/uqrshlr.s index 0f9159cc4abac..68aee0cafdb6d 100644 --- a/llvm/test/MC/AArch64/SVE2/uqrshlr.s +++ b/llvm/test/MC/AArch64/SVE2/uqrshlr.s @@ -12,25 +12,25 @@ uqrshlr z0.b, p0/m, z0.b, z1.b // CHECK-INST: uqrshlr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x0f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 0f 44 uqrshlr z0.h, p0/m, z0.h, z1.h // CHECK-INST: uqrshlr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x4f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 4f 44 uqrshlr z29.s, p7/m, z29.s, z30.s // CHECK-INST: uqrshlr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x8f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 8f 44 uqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcf,0x44] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cf 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uqrshlr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqrshlr z31.d, p0/m, z31.d, z30.d // CHECK-INST: uqrshlr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xcf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 cf 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqrshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cf 44 diff --git a/llvm/test/MC/AArch64/SVE2/uqrshrnb.s b/llvm/test/MC/AArch64/SVE2/uqrshrnb.s index d6935f9d8d0a7..7da923d55b0af 100644 --- a/llvm/test/MC/AArch64/SVE2/uqrshrnb.s +++ b/llvm/test/MC/AArch64/SVE2/uqrshrnb.s @@ -12,35 +12,35 @@ uqrshrnb z0.b, z0.h, #1 // CHECK-INST: uqrshrnb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x38,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 38 2f 45 uqrshrnb z31.b, z31.h, #8 // CHECK-INST: uqrshrnb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x3b,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b 28 45 
uqrshrnb z0.h, z0.s, #1 // CHECK-INST: uqrshrnb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x38,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 38 3f 45 uqrshrnb z31.h, z31.s, #16 // CHECK-INST: uqrshrnb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x3b,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b 30 45 uqrshrnb z0.s, z0.d, #1 // CHECK-INST: uqrshrnb z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x38,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 38 7f 45 uqrshrnb z31.s, z31.d, #32 // CHECK-INST: uqrshrnb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x3b,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3b 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/uqrshrnt.s b/llvm/test/MC/AArch64/SVE2/uqrshrnt.s index 2232881bb9918..ce831f5f26cce 100644 --- a/llvm/test/MC/AArch64/SVE2/uqrshrnt.s +++ b/llvm/test/MC/AArch64/SVE2/uqrshrnt.s @@ -12,35 +12,35 @@ uqrshrnt z0.b, z0.h, #1 // CHECK-INST: uqrshrnt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x3c,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 3c 2f 45 uqrshrnt z31.b, z31.h, #8 // CHECK-INST: uqrshrnt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x3f,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3f 28 45 uqrshrnt z0.h, z0.s, #1 // CHECK-INST: uqrshrnt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x3c,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 3c 3f 45 uqrshrnt z31.h, z31.s, 
#16 // CHECK-INST: uqrshrnt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x3f,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3f 30 45 uqrshrnt z0.s, z0.d, #1 // CHECK-INST: uqrshrnt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x3c,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 3c 7f 45 uqrshrnt z31.s, z31.d, #32 // CHECK-INST: uqrshrnt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x3f,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 3f 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/uqshl.s b/llvm/test/MC/AArch64/SVE2/uqshl.s index 93d93abb67618..fbe8501651d15 100644 --- a/llvm/test/MC/AArch64/SVE2/uqshl.s +++ b/llvm/test/MC/AArch64/SVE2/uqshl.s @@ -12,73 +12,73 @@ uqshl z0.b, p0/m, z0.b, z1.b // CHECK-INST: uqshl z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x09,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 09 44 uqshl z0.h, p0/m, z0.h, z1.h // CHECK-INST: uqshl z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x49,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 49 44 uqshl z29.s, p7/m, z29.s, z30.s // CHECK-INST: uqshl z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x89,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 89 44 uqshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc9,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c9 44 uqshl z0.b, 
p0/m, z0.b, #0 // CHECK-INST: uqshl z0.b, p0/m, z0.b, #0 // CHECK-ENCODING: [0x00,0x81,0x07,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 81 07 04 uqshl z31.b, p0/m, z31.b, #7 // CHECK-INST: uqshl z31.b, p0/m, z31.b, #7 // CHECK-ENCODING: [0xff,0x81,0x07,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 81 07 04 uqshl z0.h, p0/m, z0.h, #0 // CHECK-INST: uqshl z0.h, p0/m, z0.h, #0 // CHECK-ENCODING: [0x00,0x82,0x07,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 82 07 04 uqshl z31.h, p0/m, z31.h, #15 // CHECK-INST: uqshl z31.h, p0/m, z31.h, #15 // CHECK-ENCODING: [0xff,0x83,0x07,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 07 04 uqshl z0.s, p0/m, z0.s, #0 // CHECK-INST: uqshl z0.s, p0/m, z0.s, #0 // CHECK-ENCODING: [0x00,0x80,0x47,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 80 47 04 uqshl z31.s, p0/m, z31.s, #31 // CHECK-INST: uqshl z31.s, p0/m, z31.s, #31 // CHECK-ENCODING: [0xff,0x83,0x47,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 47 04 uqshl z0.d, p0/m, z0.d, #0 // CHECK-INST: uqshl z0.d, p0/m, z0.d, #0 // CHECK-ENCODING: [0x00,0x80,0x87,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 80 87 04 uqshl z31.d, p0/m, z31.d, #63 // CHECK-INST: uqshl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // 
CHECK-UNKNOWN: ff 83 c7 04 // --------------------------------------------------------------------------// @@ -87,47 +87,47 @@ uqshl z31.d, p0/m, z31.d, #63 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqshl z31.d, p0/m, z31.d, z30.d // CHECK-INST: uqshl z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xc9,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 c9 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc9,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c9 44 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqshl z31.d, p0/m, z31.d, #63 // CHECK-INST: uqshl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: [0xff,0x83,0xc7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 c7 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqshl z31.d, p0/m, z31.d, #63 // CHECK-INST: uqshl z31.d, p0/m, z31.d, #63 // CHECK-ENCODING: 
[0xff,0x83,0xc7,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 83 c7 04 diff --git a/llvm/test/MC/AArch64/SVE2/uqshlr.s b/llvm/test/MC/AArch64/SVE2/uqshlr.s index 13c374b5a6ff1..1d41a4beb51d3 100644 --- a/llvm/test/MC/AArch64/SVE2/uqshlr.s +++ b/llvm/test/MC/AArch64/SVE2/uqshlr.s @@ -12,25 +12,25 @@ uqshlr z0.b, p0/m, z0.b, z1.b // CHECK-INST: uqshlr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x0d,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 0d 44 uqshlr z0.h, p0/m, z0.h, z1.h // CHECK-INST: uqshlr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x4d,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 4d 44 uqshlr z29.s, p7/m, z29.s, z30.s // CHECK-INST: uqshlr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x8d,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 8d 44 uqshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcd,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cd 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uqshlr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqshlr z31.d, p0/m, z31.d, z30.d // CHECK-INST: uqshlr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xcd,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 cd 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xcd,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f cd 44 diff --git a/llvm/test/MC/AArch64/SVE2/uqshrnb.s b/llvm/test/MC/AArch64/SVE2/uqshrnb.s index 35e13254eb5f8..23a0e30803f36 100644 --- a/llvm/test/MC/AArch64/SVE2/uqshrnb.s +++ b/llvm/test/MC/AArch64/SVE2/uqshrnb.s @@ -12,35 +12,35 @@ uqshrnb z0.b, z0.h, #1 // CHECK-INST: uqshrnb z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x30,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 30 2f 45 uqshrnb z31.b, z31.h, #8 // CHECK-INST: uqshrnb z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x33,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 33 28 45 uqshrnb z0.h, z0.s, #1 // CHECK-INST: uqshrnb z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x30,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 30 3f 45 uqshrnb z31.h, z31.s, #16 // CHECK-INST: uqshrnb z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x33,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 33 30 45 uqshrnb z0.s, z0.d, #1 // CHECK-INST: uqshrnb z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x30,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: 
sve2 or sme // CHECK-UNKNOWN: 00 30 7f 45 uqshrnb z31.s, z31.d, #32 // CHECK-INST: uqshrnb z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x33,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 33 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/uqshrnt.s b/llvm/test/MC/AArch64/SVE2/uqshrnt.s index d8bfd2bebe43d..d3a0ffab3cfe3 100644 --- a/llvm/test/MC/AArch64/SVE2/uqshrnt.s +++ b/llvm/test/MC/AArch64/SVE2/uqshrnt.s @@ -12,35 +12,35 @@ uqshrnt z0.b, z0.h, #1 // CHECK-INST: uqshrnt z0.b, z0.h, #1 // CHECK-ENCODING: [0x00,0x34,0x2f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 34 2f 45 uqshrnt z31.b, z31.h, #8 // CHECK-INST: uqshrnt z31.b, z31.h, #8 // CHECK-ENCODING: [0xff,0x37,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 37 28 45 uqshrnt z0.h, z0.s, #1 // CHECK-INST: uqshrnt z0.h, z0.s, #1 // CHECK-ENCODING: [0x00,0x34,0x3f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 34 3f 45 uqshrnt z31.h, z31.s, #16 // CHECK-INST: uqshrnt z31.h, z31.s, #16 // CHECK-ENCODING: [0xff,0x37,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 37 30 45 uqshrnt z0.s, z0.d, #1 // CHECK-INST: uqshrnt z0.s, z0.d, #1 // CHECK-ENCODING: [0x00,0x34,0x7f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 34 7f 45 uqshrnt z31.s, z31.d, #32 // CHECK-INST: uqshrnt z31.s, z31.d, #32 // CHECK-ENCODING: [0xff,0x37,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 37 
60 45 diff --git a/llvm/test/MC/AArch64/SVE2/uqsub.s b/llvm/test/MC/AArch64/SVE2/uqsub.s index f9ed9e4c3ff1e..ecb108acb69f8 100644 --- a/llvm/test/MC/AArch64/SVE2/uqsub.s +++ b/llvm/test/MC/AArch64/SVE2/uqsub.s @@ -12,25 +12,25 @@ uqsub z0.b, p0/m, z0.b, z1.b // CHECK-INST: uqsub z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x1b,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 1b 44 uqsub z0.h, p0/m, z0.h, z1.h // CHECK-INST: uqsub z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x5b,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 5b 44 uqsub z29.s, p7/m, z29.s, z30.s // CHECK-INST: uqsub z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x9b,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 9b 44 uqsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdb,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f db 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uqsub z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqsub z31.d, p0/m, z31.d, z30.d // CHECK-INST: uqsub z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xdb,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 db 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] 
-// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqsub z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqsub z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdb,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f db 44 diff --git a/llvm/test/MC/AArch64/SVE2/uqsubr.s b/llvm/test/MC/AArch64/SVE2/uqsubr.s index 85597559c9862..ede46a645ad57 100644 --- a/llvm/test/MC/AArch64/SVE2/uqsubr.s +++ b/llvm/test/MC/AArch64/SVE2/uqsubr.s @@ -12,25 +12,25 @@ uqsubr z0.b, p0/m, z0.b, z1.b // CHECK-INST: uqsubr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x1f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 1f 44 uqsubr z0.h, p0/m, z0.h, z1.h // CHECK-INST: uqsubr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x5f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 5f 44 uqsubr z29.s, p7/m, z29.s, z30.s // CHECK-INST: uqsubr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x9f,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 9f 44 uqsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f df 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ uqsubr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: 
instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 uqsubr z31.d, p0/m, z31.d, z30.d // CHECK-INST: uqsubr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 df 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 uqsubr z31.d, p7/m, z31.d, z30.d // CHECK-INST: uqsubr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdf,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f df 44 diff --git a/llvm/test/MC/AArch64/SVE2/uqxtnb.s b/llvm/test/MC/AArch64/SVE2/uqxtnb.s index 4e1f68d1c2348..9dea65c2d2e6c 100644 --- a/llvm/test/MC/AArch64/SVE2/uqxtnb.s +++ b/llvm/test/MC/AArch64/SVE2/uqxtnb.s @@ -13,17 +13,17 @@ uqxtnb z0.b, z31.h // CHECK-INST: uqxtnb z0.b, z31.h // CHECK-ENCODING: [0xe0,0x4b,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 4b 28 45 uqxtnb z0.h, z31.s // CHECK-INST: uqxtnb z0.h, z31.s // CHECK-ENCODING: [0xe0,0x4b,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 4b 30 45 uqxtnb z0.s, z31.d // CHECK-INST: uqxtnb z0.s, z31.d // CHECK-ENCODING: [0xe0,0x4b,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 4b 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/uqxtnt.s b/llvm/test/MC/AArch64/SVE2/uqxtnt.s index cde4f590f5d99..c12405805f888 100644 --- a/llvm/test/MC/AArch64/SVE2/uqxtnt.s +++ b/llvm/test/MC/AArch64/SVE2/uqxtnt.s @@ -13,17 +13,17 @@ uqxtnt z0.b, 
z31.h // CHECK-INST: uqxtnt z0.b, z31.h // CHECK-ENCODING: [0xe0,0x4f,0x28,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 4f 28 45 uqxtnt z0.h, z31.s // CHECK-INST: uqxtnt z0.h, z31.s // CHECK-ENCODING: [0xe0,0x4f,0x30,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 4f 30 45 uqxtnt z0.s, z31.d // CHECK-INST: uqxtnt z0.s, z31.d // CHECK-ENCODING: [0xe0,0x4f,0x60,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 4f 60 45 diff --git a/llvm/test/MC/AArch64/SVE2/urecpe.s b/llvm/test/MC/AArch64/SVE2/urecpe.s index 4063f78c7e024..ff91435f60d75 100644 --- a/llvm/test/MC/AArch64/SVE2/urecpe.s +++ b/llvm/test/MC/AArch64/SVE2/urecpe.s @@ -12,7 +12,7 @@ urecpe z31.s, p7/m, z31.s // CHECK-INST: urecpe z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x80,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 80 44 @@ -22,23 +22,23 @@ urecpe z31.s, p7/m, z31.s movprfx z4.s, p7/z, z6.s // CHECK-INST: movprfx z4.s, p7/z, z6.s // CHECK-ENCODING: [0xc4,0x3c,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c 90 04 urecpe z4.s, p7/m, z31.s // CHECK-INST: urecpe z4.s, p7/m, z31.s // CHECK-ENCODING: [0xe4,0xbf,0x80,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 80 44 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 urecpe z4.s, p7/m, z31.s // CHECK-INST: urecpe z4.s, p7/m, 
z31.s // CHECK-ENCODING: [0xe4,0xbf,0x80,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 80 44 diff --git a/llvm/test/MC/AArch64/SVE2/urhadd.s b/llvm/test/MC/AArch64/SVE2/urhadd.s index 776717f16f19b..7fc35f598de57 100644 --- a/llvm/test/MC/AArch64/SVE2/urhadd.s +++ b/llvm/test/MC/AArch64/SVE2/urhadd.s @@ -12,25 +12,25 @@ urhadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: urhadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x15,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 15 44 urhadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: urhadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x55,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 55 44 urhadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: urhadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x95,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 95 44 urhadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: urhadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d5 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ urhadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 urhadd z31.d, p0/m, z31.d, z30.d // CHECK-INST: urhadd z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xd5,0x44] -// CHECK-ERROR: instruction 
requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 d5 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 urhadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: urhadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xd5,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f d5 44 diff --git a/llvm/test/MC/AArch64/SVE2/urshl.s b/llvm/test/MC/AArch64/SVE2/urshl.s index 068534e924383..fd425df9533bd 100644 --- a/llvm/test/MC/AArch64/SVE2/urshl.s +++ b/llvm/test/MC/AArch64/SVE2/urshl.s @@ -12,25 +12,25 @@ urshl z0.b, p0/m, z0.b, z1.b // CHECK-INST: urshl z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x03,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 03 44 urshl z0.h, p0/m, z0.h, z1.h // CHECK-INST: urshl z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x43,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 43 44 urshl z29.s, p7/m, z29.s, z30.s // CHECK-INST: urshl z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x83,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 83 44 urshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: urshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc3,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c3 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ urshl z31.d, p7/m, z31.d, z30.d 
movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 urshl z31.d, p0/m, z31.d, z30.d // CHECK-INST: urshl z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xc3,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 c3 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 urshl z31.d, p7/m, z31.d, z30.d // CHECK-INST: urshl z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc3,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c3 44 diff --git a/llvm/test/MC/AArch64/SVE2/urshlr.s b/llvm/test/MC/AArch64/SVE2/urshlr.s index 0b5facea1b786..fee2aaf4c4fdd 100644 --- a/llvm/test/MC/AArch64/SVE2/urshlr.s +++ b/llvm/test/MC/AArch64/SVE2/urshlr.s @@ -12,25 +12,25 @@ urshlr z0.b, p0/m, z0.b, z1.b // CHECK-INST: urshlr z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x07,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 07 44 urshlr z0.h, p0/m, z0.h, z1.h // CHECK-INST: urshlr z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x47,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 47 44 urshlr z29.s, p7/m, z29.s, z30.s // CHECK-INST: urshlr z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x87,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 
9f 87 44 urshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: urshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c7 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ urshlr z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 urshlr z31.d, p0/m, z31.d, z30.d // CHECK-INST: urshlr z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xc7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 c7 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 urshlr z31.d, p7/m, z31.d, z30.d // CHECK-INST: urshlr z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xc7,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f c7 44 diff --git a/llvm/test/MC/AArch64/SVE2/urshr.s b/llvm/test/MC/AArch64/SVE2/urshr.s index 4e0b8b0436b59..f2e95b9800238 100644 --- a/llvm/test/MC/AArch64/SVE2/urshr.s +++ b/llvm/test/MC/AArch64/SVE2/urshr.s @@ -12,49 +12,49 @@ urshr z0.b, p0/m, z0.b, #1 // CHECK-INST: urshr z0.b, p0/m, z0.b, #1 // CHECK-ENCODING: [0xe0,0x81,0x0d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 81 0d 04 urshr z31.b, p0/m, z31.b, #8 // CHECK-INST: urshr z31.b, p0/m, z31.b, #8 // CHECK-ENCODING: 
[0x1f,0x81,0x0d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 81 0d 04 urshr z0.h, p0/m, z0.h, #1 // CHECK-INST: urshr z0.h, p0/m, z0.h, #1 // CHECK-ENCODING: [0xe0,0x83,0x0d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 83 0d 04 urshr z31.h, p0/m, z31.h, #16 // CHECK-INST: urshr z31.h, p0/m, z31.h, #16 // CHECK-ENCODING: [0x1f,0x82,0x0d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 82 0d 04 urshr z0.s, p0/m, z0.s, #1 // CHECK-INST: urshr z0.s, p0/m, z0.s, #1 // CHECK-ENCODING: [0xe0,0x83,0x4d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 83 4d 04 urshr z31.s, p0/m, z31.s, #32 // CHECK-INST: urshr z31.s, p0/m, z31.s, #32 // CHECK-ENCODING: [0x1f,0x80,0x4d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 4d 04 urshr z0.d, p0/m, z0.d, #1 // CHECK-INST: urshr z0.d, p0/m, z0.d, #1 // CHECK-ENCODING: [0xe0,0x83,0xcd,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e0 83 cd 04 urshr z31.d, p0/m, z31.d, #64 // CHECK-INST: urshr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x8d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 8d 04 @@ -64,23 +64,23 @@ urshr z31.d, p0/m, z31.d, #64 movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 urshr 
z31.d, p0/m, z31.d, #64 // CHECK-INST: urshr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x8d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 8d 04 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 urshr z31.d, p0/m, z31.d, #64 // CHECK-INST: urshr z31.d, p0/m, z31.d, #64 // CHECK-ENCODING: [0x1f,0x80,0x8d,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 80 8d 04 diff --git a/llvm/test/MC/AArch64/SVE2/ursqrte.s b/llvm/test/MC/AArch64/SVE2/ursqrte.s index 981cd6a511ef9..ac66a15ab566e 100644 --- a/llvm/test/MC/AArch64/SVE2/ursqrte.s +++ b/llvm/test/MC/AArch64/SVE2/ursqrte.s @@ -12,7 +12,7 @@ ursqrte z31.s, p7/m, z31.s // CHECK-INST: ursqrte z31.s, p7/m, z31.s // CHECK-ENCODING: [0xff,0xbf,0x81,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff bf 81 44 @@ -22,23 +22,23 @@ ursqrte z31.s, p7/m, z31.s movprfx z4.s, p7/z, z6.s // CHECK-INST: movprfx z4.s, p7/z, z6.s // CHECK-ENCODING: [0xc4,0x3c,0x90,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 3c 90 04 ursqrte z4.s, p7/m, z31.s // CHECK-INST: ursqrte z4.s, p7/m, z31.s // CHECK-ENCODING: [0xe4,0xbf,0x81,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 81 44 movprfx z4, z6 // CHECK-INST: movprfx z4, z6 // CHECK-ENCODING: [0xc4,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: c4 bc 20 04 ursqrte 
z4.s, p7/m, z31.s // CHECK-INST: ursqrte z4.s, p7/m, z31.s // CHECK-ENCODING: [0xe4,0xbf,0x81,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: e4 bf 81 44 diff --git a/llvm/test/MC/AArch64/SVE2/ursra.s b/llvm/test/MC/AArch64/SVE2/ursra.s index d2509e1a63b29..005e5511e7087 100644 --- a/llvm/test/MC/AArch64/SVE2/ursra.s +++ b/llvm/test/MC/AArch64/SVE2/ursra.s @@ -12,49 +12,49 @@ ursra z0.b, z0.b, #1 // CHECK-INST: ursra z0.b, z0.b, #1 // CHECK-ENCODING: [0x00,0xec,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 ec 0f 45 ursra z31.b, z31.b, #8 // CHECK-INST: ursra z31.b, z31.b, #8 // CHECK-ENCODING: [0xff,0xef,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff ef 08 45 ursra z0.h, z0.h, #1 // CHECK-INST: ursra z0.h, z0.h, #1 // CHECK-ENCODING: [0x00,0xec,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 ec 1f 45 ursra z31.h, z31.h, #16 // CHECK-INST: ursra z31.h, z31.h, #16 // CHECK-ENCODING: [0xff,0xef,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff ef 10 45 ursra z0.s, z0.s, #1 // CHECK-INST: ursra z0.s, z0.s, #1 // CHECK-ENCODING: [0x00,0xec,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 ec 5f 45 ursra z31.s, z31.s, #32 // CHECK-INST: ursra z31.s, z31.s, #32 // CHECK-ENCODING: [0xff,0xef,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff ef 40 45 ursra z0.d, z0.d, #1 // CHECK-INST: ursra z0.d, z0.d, #1 // CHECK-ENCODING: 
[0x00,0xec,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 ec df 45 ursra z31.d, z31.d, #64 // CHECK-INST: ursra z31.d, z31.d, #64 // CHECK-ENCODING: [0xff,0xef,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff ef 80 45 @@ -64,11 +64,11 @@ ursra z31.d, z31.d, #64 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 ursra z0.d, z1.d, #1 // CHECK-INST: ursra z0.d, z1.d, #1 // CHECK-ENCODING: [0x20,0xec,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 ec df 45 diff --git a/llvm/test/MC/AArch64/SVE2/ushllb.s b/llvm/test/MC/AArch64/SVE2/ushllb.s index 04bf78a7f8af1..ea1d85052f42d 100644 --- a/llvm/test/MC/AArch64/SVE2/ushllb.s +++ b/llvm/test/MC/AArch64/SVE2/ushllb.s @@ -12,35 +12,35 @@ ushllb z0.h, z0.b, #0 // CHECK-INST: ushllb z0.h, z0.b, #0 // CHECK-ENCODING: [0x00,0xa8,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a8 08 45 ushllb z31.h, z31.b, #7 // CHECK-INST: ushllb z31.h, z31.b, #7 // CHECK-ENCODING: [0xff,0xab,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff ab 0f 45 ushllb z0.s, z0.h, #0 // CHECK-INST: ushllb z0.s, z0.h, #0 // CHECK-ENCODING: [0x00,0xa8,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a8 10 45 ushllb z31.s, z31.h, #15 // CHECK-INST: ushllb z31.s, z31.h, #15 // CHECK-ENCODING: [0xff,0xab,0x1f,0x45] -// 
CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff ab 1f 45 ushllb z0.d, z0.s, #0 // CHECK-INST: ushllb z0.d, z0.s, #0 // CHECK-ENCODING: [0x00,0xa8,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 a8 40 45 ushllb z31.d, z31.s, #31 // CHECK-INST: ushllb z31.d, z31.s, #31 // CHECK-ENCODING: [0xff,0xab,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff ab 5f 45 diff --git a/llvm/test/MC/AArch64/SVE2/ushllt.s b/llvm/test/MC/AArch64/SVE2/ushllt.s index 1261942d0f6ca..c4f351fb72429 100644 --- a/llvm/test/MC/AArch64/SVE2/ushllt.s +++ b/llvm/test/MC/AArch64/SVE2/ushllt.s @@ -12,35 +12,35 @@ ushllt z0.h, z0.b, #0 // CHECK-INST: ushllt z0.h, z0.b, #0 // CHECK-ENCODING: [0x00,0xac,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 ac 08 45 ushllt z31.h, z31.b, #7 // CHECK-INST: ushllt z31.h, z31.b, #7 // CHECK-ENCODING: [0xff,0xaf,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff af 0f 45 ushllt z0.s, z0.h, #0 // CHECK-INST: ushllt z0.s, z0.h, #0 // CHECK-ENCODING: [0x00,0xac,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 ac 10 45 ushllt z31.s, z31.h, #15 // CHECK-INST: ushllt z31.s, z31.h, #15 // CHECK-ENCODING: [0xff,0xaf,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff af 1f 45 ushllt z0.d, z0.s, #0 // CHECK-INST: ushllt z0.d, z0.s, #0 // CHECK-ENCODING: [0x00,0xac,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 ac 40 45 ushllt z31.d, z31.s, #31 // CHECK-INST: ushllt z31.d, z31.s, #31 // CHECK-ENCODING: [0xff,0xaf,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff af 5f 45 diff --git a/llvm/test/MC/AArch64/SVE2/usqadd.s b/llvm/test/MC/AArch64/SVE2/usqadd.s index 3d16be59673c8..4fa372118547d 100644 --- a/llvm/test/MC/AArch64/SVE2/usqadd.s +++ b/llvm/test/MC/AArch64/SVE2/usqadd.s @@ -12,25 +12,25 @@ usqadd z0.b, p0/m, z0.b, z1.b // CHECK-INST: usqadd z0.b, p0/m, z0.b, z1.b // CHECK-ENCODING: [0x20,0x80,0x1d,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 1d 44 usqadd z0.h, p0/m, z0.h, z1.h // CHECK-INST: usqadd z0.h, p0/m, z0.h, z1.h // CHECK-ENCODING: [0x20,0x80,0x5d,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 80 5d 44 usqadd z29.s, p7/m, z29.s, z30.s // CHECK-INST: usqadd z29.s, p7/m, z29.s, z30.s // CHECK-ENCODING: [0xdd,0x9f,0x9d,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 9f 9d 44 usqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: usqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdd,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f dd 44 // --------------------------------------------------------------------------// @@ -39,23 +39,23 @@ usqadd z31.d, p7/m, z31.d, z30.d movprfx z31.d, p0/z, z6.d // CHECK-INST: movprfx z31.d, p0/z, z6.d // CHECK-ENCODING: [0xdf,0x20,0xd0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df 20 d0 04 usqadd z31.d, p0/m, 
z31.d, z30.d // CHECK-INST: usqadd z31.d, p0/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x83,0xdd,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 83 dd 44 movprfx z31, z6 // CHECK-INST: movprfx z31, z6 // CHECK-ENCODING: [0xdf,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: df bc 20 04 usqadd z31.d, p7/m, z31.d, z30.d // CHECK-INST: usqadd z31.d, p7/m, z31.d, z30.d // CHECK-ENCODING: [0xdf,0x9f,0xdd,0x44] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 9f dd 44 diff --git a/llvm/test/MC/AArch64/SVE2/usra.s b/llvm/test/MC/AArch64/SVE2/usra.s index 83cad939b6dc7..51e942a49e09e 100644 --- a/llvm/test/MC/AArch64/SVE2/usra.s +++ b/llvm/test/MC/AArch64/SVE2/usra.s @@ -12,49 +12,49 @@ usra z0.b, z0.b, #1 // CHECK-INST: usra z0.b, z0.b, #1 // CHECK-ENCODING: [0x00,0xe4,0x0f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e4 0f 45 usra z31.b, z31.b, #8 // CHECK-INST: usra z31.b, z31.b, #8 // CHECK-ENCODING: [0xff,0xe7,0x08,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e7 08 45 usra z0.h, z0.h, #1 // CHECK-INST: usra z0.h, z0.h, #1 // CHECK-ENCODING: [0x00,0xe4,0x1f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e4 1f 45 usra z31.h, z31.h, #16 // CHECK-INST: usra z31.h, z31.h, #16 // CHECK-ENCODING: [0xff,0xe7,0x10,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e7 10 45 usra z0.s, z0.s, #1 // CHECK-INST: usra z0.s, z0.s, #1 // CHECK-ENCODING: 
[0x00,0xe4,0x5f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e4 5f 45 usra z31.s, z31.s, #32 // CHECK-INST: usra z31.s, z31.s, #32 // CHECK-ENCODING: [0xff,0xe7,0x40,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e7 40 45 usra z0.d, z0.d, #1 // CHECK-INST: usra z0.d, z0.d, #1 // CHECK-ENCODING: [0x00,0xe4,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 00 e4 df 45 usra z31.d, z31.d, #64 // CHECK-INST: usra z31.d, z31.d, #64 // CHECK-ENCODING: [0xff,0xe7,0x80,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff e7 80 45 @@ -64,11 +64,11 @@ usra z31.d, z31.d, #64 movprfx z0, z7 // CHECK-INST: movprfx z0, z7 // CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: e0 bc 20 04 usra z0.d, z1.d, #1 // CHECK-INST: usra z0.d, z1.d, #1 // CHECK-ENCODING: [0x20,0xe4,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 e4 df 45 diff --git a/llvm/test/MC/AArch64/SVE2/usublb.s b/llvm/test/MC/AArch64/SVE2/usublb.s index 7225b6baa1ec5..86ab0485f82ec 100644 --- a/llvm/test/MC/AArch64/SVE2/usublb.s +++ b/llvm/test/MC/AArch64/SVE2/usublb.s @@ -13,17 +13,17 @@ usublb z0.h, z1.b, z2.b // CHECK-INST: usublb z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x18,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 18 42 45 usublb z29.s, z30.h, z31.h // CHECK-INST: usublb z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x1b,0x9f,0x45] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 1b 9f 45 usublb z31.d, z31.s, z31.s // CHECK-INST: usublb z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1b,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1b df 45 diff --git a/llvm/test/MC/AArch64/SVE2/usublt.s b/llvm/test/MC/AArch64/SVE2/usublt.s index d4000116e59f7..5cc9f7eeb255d 100644 --- a/llvm/test/MC/AArch64/SVE2/usublt.s +++ b/llvm/test/MC/AArch64/SVE2/usublt.s @@ -13,17 +13,17 @@ usublt z0.h, z1.b, z2.b // CHECK-INST: usublt z0.h, z1.b, z2.b // CHECK-ENCODING: [0x20,0x1c,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 1c 42 45 usublt z29.s, z30.h, z31.h // CHECK-INST: usublt z29.s, z30.h, z31.h // CHECK-ENCODING: [0xdd,0x1f,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 1f 9f 45 usublt z31.d, z31.s, z31.s // CHECK-INST: usublt z31.d, z31.s, z31.s // CHECK-ENCODING: [0xff,0x1f,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1f df 45 diff --git a/llvm/test/MC/AArch64/SVE2/usubwb.s b/llvm/test/MC/AArch64/SVE2/usubwb.s index 15764300da959..34ce06e520f0b 100644 --- a/llvm/test/MC/AArch64/SVE2/usubwb.s +++ b/llvm/test/MC/AArch64/SVE2/usubwb.s @@ -13,17 +13,17 @@ usubwb z0.h, z1.h, z2.b // CHECK-INST: usubwb z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x58,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 58 42 45 usubwb z29.s, z30.s, z31.h // CHECK-INST: usubwb z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x5b,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or 
sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 5b 9f 45 usubwb z31.d, z31.d, z31.s // CHECK-INST: usubwb z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x5b,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 5b df 45 diff --git a/llvm/test/MC/AArch64/SVE2/usubwt.s b/llvm/test/MC/AArch64/SVE2/usubwt.s index 4c3575cd65214..3fa8ef34cab47 100644 --- a/llvm/test/MC/AArch64/SVE2/usubwt.s +++ b/llvm/test/MC/AArch64/SVE2/usubwt.s @@ -13,17 +13,17 @@ usubwt z0.h, z1.h, z2.b // CHECK-INST: usubwt z0.h, z1.h, z2.b // CHECK-ENCODING: [0x20,0x5c,0x42,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 5c 42 45 usubwt z29.s, z30.s, z31.h // CHECK-INST: usubwt z29.s, z30.s, z31.h // CHECK-ENCODING: [0xdd,0x5f,0x9f,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: dd 5f 9f 45 usubwt z31.d, z31.d, z31.s // CHECK-INST: usubwt z31.d, z31.d, z31.s // CHECK-ENCODING: [0xff,0x5f,0xdf,0x45] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 5f df 45 diff --git a/llvm/test/MC/AArch64/SVE2/whilege.s b/llvm/test/MC/AArch64/SVE2/whilege.s index 374987e2d1752..566699a17a2a1 100644 --- a/llvm/test/MC/AArch64/SVE2/whilege.s +++ b/llvm/test/MC/AArch64/SVE2/whilege.s @@ -12,59 +12,59 @@ whilege p15.b, xzr, x0 // CHECK-INST: whilege p15.b, xzr, x0 // CHECK-ENCODING: [0xef,0x13,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ef 13 20 25 whilege p15.b, x0, xzr // CHECK-INST: whilege p15.b, x0, xzr // CHECK-ENCODING: [0x0f,0x10,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: 
sve2 or sme // CHECK-UNKNOWN: 0f 10 3f 25 whilege p15.b, wzr, w0 // CHECK-INST: whilege p15.b, wzr, w0 // CHECK-ENCODING: [0xef,0x03,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ef 03 20 25 whilege p15.b, w0, wzr // CHECK-INST: whilege p15.b, w0, wzr // CHECK-ENCODING: [0x0f,0x00,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 00 3f 25 whilege p15.h, x0, xzr // CHECK-INST: whilege p15.h, x0, xzr // CHECK-ENCODING: [0x0f,0x10,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 10 7f 25 whilege p15.h, w0, wzr // CHECK-INST: whilege p15.h, w0, wzr // CHECK-ENCODING: [0x0f,0x00,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 00 7f 25 whilege p15.s, x0, xzr // CHECK-INST: whilege p15.s, x0, xzr // CHECK-ENCODING: [0x0f,0x10,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 10 bf 25 whilege p15.s, w0, wzr // CHECK-INST: whilege p15.s, w0, wzr // CHECK-ENCODING: [0x0f,0x00,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 00 bf 25 whilege p15.d, w0, wzr // CHECK-INST: whilege p15.d, w0, wzr // CHECK-ENCODING: [0x0f,0x00,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 00 ff 25 whilege p15.d, x0, xzr // CHECK-INST: whilege p15.d, x0, xzr // CHECK-ENCODING: [0x0f,0x10,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 10 ff 25 diff 
--git a/llvm/test/MC/AArch64/SVE2/whilegt.s b/llvm/test/MC/AArch64/SVE2/whilegt.s index db2bd8cc2596c..4c14d50a40d17 100644 --- a/llvm/test/MC/AArch64/SVE2/whilegt.s +++ b/llvm/test/MC/AArch64/SVE2/whilegt.s @@ -12,59 +12,59 @@ whilegt p15.b, xzr, x0 // CHECK-INST: whilegt p15.b, xzr, x0 // CHECK-ENCODING: [0xff,0x13,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 13 20 25 whilegt p15.b, x0, xzr // CHECK-INST: whilegt p15.b, x0, xzr // CHECK-ENCODING: [0x1f,0x10,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 10 3f 25 whilegt p15.b, wzr, w0 // CHECK-INST: whilegt p15.b, wzr, w0 // CHECK-ENCODING: [0xff,0x03,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 03 20 25 whilegt p15.b, w0, wzr // CHECK-INST: whilegt p15.b, w0, wzr // CHECK-ENCODING: [0x1f,0x00,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 00 3f 25 whilegt p15.h, x0, xzr // CHECK-INST: whilegt p15.h, x0, xzr // CHECK-ENCODING: [0x1f,0x10,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 10 7f 25 whilegt p15.h, w0, wzr // CHECK-INST: whilegt p15.h, w0, wzr // CHECK-ENCODING: [0x1f,0x00,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 00 7f 25 whilegt p15.s, x0, xzr // CHECK-INST: whilegt p15.s, x0, xzr // CHECK-ENCODING: [0x1f,0x10,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 10 bf 25 whilegt p15.s, w0, wzr // CHECK-INST: whilegt p15.s, w0, wzr // 
CHECK-ENCODING: [0x1f,0x00,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 00 bf 25 whilegt p15.d, w0, wzr // CHECK-INST: whilegt p15.d, w0, wzr // CHECK-ENCODING: [0x1f,0x00,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 00 ff 25 whilegt p15.d, x0, xzr // CHECK-INST: whilegt p15.d, x0, xzr // CHECK-ENCODING: [0x1f,0x10,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 10 ff 25 diff --git a/llvm/test/MC/AArch64/SVE2/whilehi.s b/llvm/test/MC/AArch64/SVE2/whilehi.s index cdb9a048e636c..ae6404e07e53d 100644 --- a/llvm/test/MC/AArch64/SVE2/whilehi.s +++ b/llvm/test/MC/AArch64/SVE2/whilehi.s @@ -12,59 +12,59 @@ whilehi p15.b, xzr, x0 // CHECK-INST: whilehi p15.b, xzr, x0 // CHECK-ENCODING: [0xff,0x1b,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 1b 20 25 whilehi p15.b, x0, xzr // CHECK-INST: whilehi p15.b, x0, xzr // CHECK-ENCODING: [0x1f,0x18,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 18 3f 25 whilehi p15.b, wzr, w0 // CHECK-INST: whilehi p15.b, wzr, w0 // CHECK-ENCODING: [0xff,0x0b,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ff 0b 20 25 whilehi p15.b, w0, wzr // CHECK-INST: whilehi p15.b, w0, wzr // CHECK-ENCODING: [0x1f,0x08,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 08 3f 25 whilehi p15.h, x0, xzr // CHECK-INST: whilehi p15.h, x0, xzr // CHECK-ENCODING: [0x1f,0x18,0x7f,0x25] -// CHECK-ERROR: 
instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 18 7f 25 whilehi p15.h, w0, wzr // CHECK-INST: whilehi p15.h, w0, wzr // CHECK-ENCODING: [0x1f,0x08,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 08 7f 25 whilehi p15.s, x0, xzr // CHECK-INST: whilehi p15.s, x0, xzr // CHECK-ENCODING: [0x1f,0x18,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 18 bf 25 whilehi p15.s, w0, wzr // CHECK-INST: whilehi p15.s, w0, wzr // CHECK-ENCODING: [0x1f,0x08,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 08 bf 25 whilehi p15.d, w0, wzr // CHECK-INST: whilehi p15.d, w0, wzr // CHECK-ENCODING: [0x1f,0x08,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 08 ff 25 whilehi p15.d, x0, xzr // CHECK-INST: whilehi p15.d, x0, xzr // CHECK-ENCODING: [0x1f,0x18,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 1f 18 ff 25 diff --git a/llvm/test/MC/AArch64/SVE2/whilehs.s b/llvm/test/MC/AArch64/SVE2/whilehs.s index 58aa22d331b16..390d61cf79279 100644 --- a/llvm/test/MC/AArch64/SVE2/whilehs.s +++ b/llvm/test/MC/AArch64/SVE2/whilehs.s @@ -12,59 +12,59 @@ whilehs p15.b, xzr, x0 // CHECK-INST: whilehs p15.b, xzr, x0 // CHECK-ENCODING: [0xef,0x1b,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ef 1b 20 25 whilehs p15.b, x0, xzr // CHECK-INST: whilehs p15.b, x0, xzr // CHECK-ENCODING: [0x0f,0x18,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// 
CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 18 3f 25 whilehs p15.b, wzr, w0 // CHECK-INST: whilehs p15.b, wzr, w0 // CHECK-ENCODING: [0xef,0x0b,0x20,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: ef 0b 20 25 whilehs p15.b, w0, wzr // CHECK-INST: whilehs p15.b, w0, wzr // CHECK-ENCODING: [0x0f,0x08,0x3f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 08 3f 25 whilehs p15.h, x0, xzr // CHECK-INST: whilehs p15.h, x0, xzr // CHECK-ENCODING: [0x0f,0x18,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 18 7f 25 whilehs p15.h, w0, wzr // CHECK-INST: whilehs p15.h, w0, wzr // CHECK-ENCODING: [0x0f,0x08,0x7f,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 08 7f 25 whilehs p15.s, x0, xzr // CHECK-INST: whilehs p15.s, x0, xzr // CHECK-ENCODING: [0x0f,0x18,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 18 bf 25 whilehs p15.s, w0, wzr // CHECK-INST: whilehs p15.s, w0, wzr // CHECK-ENCODING: [0x0f,0x08,0xbf,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 08 bf 25 whilehs p15.d, w0, wzr // CHECK-INST: whilehs p15.d, w0, wzr // CHECK-ENCODING: [0x0f,0x08,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 0f 08 ff 25 whilehs p15.d, x0, xzr // CHECK-INST: whilehs p15.d, x0, xzr // CHECK-ENCODING: [0x0f,0x18,0xff,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme 
// CHECK-UNKNOWN: 0f 18 ff 25 diff --git a/llvm/test/MC/AArch64/SVE2/whilerw.s b/llvm/test/MC/AArch64/SVE2/whilerw.s index 611593a2107ae..4772eff64a8fd 100644 --- a/llvm/test/MC/AArch64/SVE2/whilerw.s +++ b/llvm/test/MC/AArch64/SVE2/whilerw.s @@ -12,23 +12,23 @@ whilerw p15.b, x30, x30 // CHECK-INST: whilerw p15.b, x30, x30 // CHECK-ENCODING: [0xdf,0x33,0x3e,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 33 3e 25 whilerw p15.h, x30, x30 // CHECK-INST: whilerw p15.h, x30, x30 // CHECK-ENCODING: [0xdf,0x33,0x7e,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 33 7e 25 whilerw p15.s, x30, x30 // CHECK-INST: whilerw p15.s, x30, x30 // CHECK-ENCODING: [0xdf,0x33,0xbe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 33 be 25 whilerw p15.d, x30, x30 // CHECK-INST: whilerw p15.d, x30, x30 // CHECK-ENCODING: [0xdf,0x33,0xfe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 33 fe 25 diff --git a/llvm/test/MC/AArch64/SVE2/whilewr.s b/llvm/test/MC/AArch64/SVE2/whilewr.s index 63f21f44d0266..c4b4c40ae4cfe 100644 --- a/llvm/test/MC/AArch64/SVE2/whilewr.s +++ b/llvm/test/MC/AArch64/SVE2/whilewr.s @@ -12,23 +12,23 @@ whilewr p15.b, x30, x30 // CHECK-INST: whilewr p15.b, x30, x30 // CHECK-ENCODING: [0xcf,0x33,0x3e,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: cf 33 3e 25 whilewr p15.h, x30, x30 // CHECK-INST: whilewr p15.h, x30, x30 // CHECK-ENCODING: [0xcf,0x33,0x7e,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: cf 33 7e 25 whilewr p15.s, x30, x30 // 
CHECK-INST: whilewr p15.s, x30, x30 // CHECK-ENCODING: [0xcf,0x33,0xbe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: cf 33 be 25 whilewr p15.d, x30, x30 // CHECK-INST: whilewr p15.d, x30, x30 // CHECK-ENCODING: [0xcf,0x33,0xfe,0x25] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: cf 33 fe 25 diff --git a/llvm/test/MC/AArch64/SVE2/xar.s b/llvm/test/MC/AArch64/SVE2/xar.s index 4aed0590f69d1..1e40aba3fae5e 100644 --- a/llvm/test/MC/AArch64/SVE2/xar.s +++ b/llvm/test/MC/AArch64/SVE2/xar.s @@ -12,49 +12,49 @@ xar z0.b, z0.b, z1.b, #1 // CHECK-INST: xar z0.b, z0.b, z1.b, #1 // CHECK-ENCODING: [0x20,0x34,0x2f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 34 2f 04 xar z31.b, z31.b, z30.b, #8 // CHECK-INST: xar z31.b, z31.b, z30.b, #8 // CHECK-ENCODING: [0xdf,0x37,0x28,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 37 28 04 xar z0.h, z0.h, z1.h, #1 // CHECK-INST: xar z0.h, z0.h, z1.h, #1 // CHECK-ENCODING: [0x20,0x34,0x3f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 34 3f 04 xar z31.h, z31.h, z30.h, #16 // CHECK-INST: xar z31.h, z31.h, z30.h, #16 // CHECK-ENCODING: [0xdf,0x37,0x30,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 37 30 04 xar z0.s, z0.s, z1.s, #1 // CHECK-INST: xar z0.s, z0.s, z1.s, #1 // CHECK-ENCODING: [0x20,0x34,0x7f,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 34 7f 04 xar z31.s, z31.s, z30.s, #32 // CHECK-INST: xar z31.s, z31.s, 
z30.s, #32 // CHECK-ENCODING: [0xdf,0x37,0x60,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 37 60 04 xar z0.d, z0.d, z1.d, #1 // CHECK-INST: xar z0.d, z0.d, z1.d, #1 // CHECK-ENCODING: [0x20,0x34,0xff,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: 20 34 ff 04 xar z31.d, z31.d, z30.d, #64 // CHECK-INST: xar z31.d, z31.d, z30.d, #64 // CHECK-ENCODING: [0xdf,0x37,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 37 a0 04 @@ -64,11 +64,11 @@ xar z31.d, z31.d, z30.d, #64 movprfx z31, z7 // CHECK-INST: movprfx z31, z7 // CHECK-ENCODING: [0xff,0xbc,0x20,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve +// CHECK-ERROR: instruction requires: sve or sme // CHECK-UNKNOWN: ff bc 20 04 xar z31.d, z31.d, z30.d, #64 // CHECK-INST: xar z31.d, z31.d, z30.d, #64 // CHECK-ENCODING: [0xdf,0x37,0xa0,0x04] -// CHECK-ERROR: instruction requires: streaming-sve or sve2 +// CHECK-ERROR: instruction requires: sve2 or sme // CHECK-UNKNOWN: df 37 a0 04 From c9592ae49b8c486754a745e0c7b1a30b92a12e8b Mon Sep 17 00:00:00 2001 From: Krasimir Georgiev Date: Tue, 22 Feb 2022 15:43:09 +0100 Subject: [PATCH 497/748] [clang-format] Fix preprocessor nesting after commit 529aa4b011c4ae808d658022ef643c44dd9b2c9c In https://github.com/llvm/llvm-project/commit/529aa4b011c4ae808d658022ef643c44dd9b2c9c by setting the identifier info to nullptr, we started to subtly interfere with the parts in the beginning of the function, https://github.com/llvm/llvm-project/blob/529aa4b011c4ae808d658022ef643c44dd9b2c9c/clang/lib/Format/UnwrappedLineParser.cpp#L991 causing the preprocessor nesting to change in some cases. E.g., for the added regression test, clang-format started incorrectly guessing the language as C++. 
This tries to address this by introducing an internal identifier info element to use instead. Reviewed By: curdeius, MyDeveloperDay Differential Revision: https://reviews.llvm.org/D120315 --- clang/lib/Format/FormatToken.h | 7 +++++++ clang/lib/Format/UnwrappedLineParser.cpp | 5 +++-- clang/unittests/Format/FormatTest.cpp | 7 +++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 5f05986addf6a..bd1d447328a0a 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -949,6 +949,10 @@ struct AdditionalKeywords { kw_slots = &IdentTable.get("slots"); kw_qslots = &IdentTable.get("Q_SLOTS"); + // For internal clang-format use. + kw_internal_ident_after_define = + &IdentTable.get("__CLANG_FORMAT_INTERNAL_IDENT_AFTER_DEFINE__"); + // C# keywords kw_dollar = &IdentTable.get("dollar"); kw_base = &IdentTable.get("base"); @@ -1069,6 +1073,9 @@ struct AdditionalKeywords { IdentifierInfo *kw_slots; IdentifierInfo *kw_qslots; + // For internal use by clang-format. + IdentifierInfo *kw_internal_ident_after_define; + // C# keywords IdentifierInfo *kw_dollar; IdentifierInfo *kw_base; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 7d29afb0c042c..09e209e856541 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1089,9 +1089,10 @@ void UnwrappedLineParser::parsePPDefine() { // In the context of a define, even keywords should be treated as normal // identifiers. Setting the kind to identifier is not enough, because we need // to treat additional keywords like __except as well, which are already - // identifiers. + // identifiers. Setting the identifier info to null interferes with include + // guard processing above, and changes preprocessing nesting. 
FormatTok->Tok.setKind(tok::identifier); - FormatTok->Tok.setIdentifierInfo(nullptr); + FormatTok->Tok.setIdentifierInfo(Keywords.kw_internal_ident_after_define); nextToken(); if (FormatTok->Tok.getKind() == tok::l_paren && !FormatTok->hasWhitespaceBefore()) diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 51f6239bf2100..7d8b74c9c455f 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -22724,6 +22724,13 @@ TEST_F(FormatTest, FileAndCode) { EXPECT_EQ( FormatStyle::LK_Cpp, guessLanguage("foo.h", "#define FOO(...) auto bar = [] __VA_ARGS__;")); + // Only one of the two preprocessor regions has ObjC-like code. + EXPECT_EQ(FormatStyle::LK_ObjC, + guessLanguage("foo.h", "#if A\n" + "#define B() C\n" + "#else\n" + "#define B() [NSString a:@\"\"]\n" + "#endif\n")); } TEST_F(FormatTest, GuessLanguageWithCpp11AttributeSpecifiers) { From 02571f86bb01bbea171072996533d7a6e63bba54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 22 Feb 2022 11:07:39 +0100 Subject: [PATCH 498/748] [clang][www] Port make_cxx_dr_status script to Python3 And run it to re-generate the cxx_dr_status.html Differential Revision: https://reviews.llvm.org/D120313 --- clang/www/cxx_dr_status.html | 291 ++++++++++++++++++++++++++++------- clang/www/make_cxx_dr_status | 32 ++-- 2 files changed, 249 insertions(+), 74 deletions(-) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 462b335f7801c..62ccfc518d456 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -7304,11 +7304,11 @@

C++ defect report implementation status

Updating Annex C to C99 Not resolved - +
1249 - drafting + DRWP Cv-qualification of nested lambda capture - Not resolved + Unknown 1250 @@ -10006,9 +10006,9 @@

C++ defect report implementation status

1699 - open + extension Does befriending a class befriend its friends? - Not resolved + Extension 1700 @@ -10154,11 +10154,11 @@

C++ defect report implementation status

Multicharacter user-defined character literals Not resolved - + 1724 - drafting + DRWP Unclear rules for deduction failure - Not resolved + Unknown 1725 @@ -10166,11 +10166,11 @@

C++ defect report implementation status

Trailing return type with nested function declarator Unknown - + 1726 - drafting + DR Declarator operators and conversion function - Not resolved + Unknown 1727 @@ -10208,11 +10208,11 @@

C++ defect report implementation status

Defining types in conditions and range-based for statements Unknown - + 1733 - drafting + DRWP Return type and value for operator= with ref-qualifier - Not resolved + Unknown 1734 @@ -14154,7 +14154,7 @@

C++ defect report implementation status

2390 CD5 Is the argument of __has_cpp_attribute macro-expanded? - Unknown + Yes 2391 @@ -14194,7 +14194,7 @@

C++ defect report implementation status

2397 - DR + DRWP auto specifier for pointers and references to arrays Unknown @@ -14250,7 +14250,7 @@

C++ defect report implementation status

2406 CD5 [[fallthrough]] attribute and iteration statements - Unknown + Yes 2407 @@ -14500,7 +14500,7 @@

C++ defect report implementation status

2448 - DR + DRWP Cv-qualification of arithmetic types and deprecation of volatile Unknown @@ -14540,11 +14540,11 @@

C++ defect report implementation status

Tail recursion and coroutine symmetric transfer Unknown - + 2455 - drafting + WP Concatenation of string literals vs translation phases 5 and 6 - Not resolved + Unknown 2456 @@ -14560,7 +14560,7 @@

C++ defect report implementation status

2458 - DR + DRWP Value category of expressions denoting non-static member functions Unknown @@ -14602,13 +14602,13 @@

C++ defect report implementation status

2465 - DR + DRWP Coroutine parameters passed to a promise constructor Unknown 2466 - DR + DRWP co_await should be a single evaluation Unknown @@ -14656,7 +14656,7 @@

C++ defect report implementation status

2474 - DR + DRWP Cv-qualification and deletion Unknown @@ -14674,7 +14674,7 @@

C++ defect report implementation status

2477 - DR + DRWP Defaulted vs deleted copy constructors/assignment operators Unknown @@ -14686,7 +14686,7 @@

C++ defect report implementation status

2479 - DR + DRWP Missing specifications for consteval and constinit Unknown @@ -14698,15 +14698,15 @@

C++ defect report implementation status

2481 - DR + DRWP Cv-qualification of temporary to which a reference is bound Unknown - + 2482 - review + WP bit_cast and indeterminate values - Not resolved + Unknown 2483 @@ -14714,11 +14714,11 @@

C++ defect report implementation status

Language linkage of static member functions Not resolved - + 2484 - open + DRWP char8_t and char16_t in integral promotions - Not resolved + Unknown 2485 @@ -14726,11 +14726,11 @@

C++ defect report implementation status

Bit-fields in integral promotions Not resolved - + 2486 - drafting + DRWP Call to noexcept function via noexcept(false) pointer/lvalue - Not resolved + Unknown 2487 @@ -14738,11 +14738,11 @@

C++ defect report implementation status

Type dependence of function-style cast to incomplete array type Not resolved - + 2488 - NAD + open Overloading virtual functions and functions with trailing requires-clauses - Unknown + Not resolved 2489 @@ -14750,35 +14750,35 @@

C++ defect report implementation status

Storage provided by array of char Not resolved - + 2490 - drafting + DRWP Restrictions on destruction in constant expressions - Not resolved + Unknown 2491 - ready + DRWP Export of typedef after its first declaration Unknown - + 2492 - NAD + drafting Comparing user-defined conversion sequences in list-initialization - Unknown + Not resolved - + 2493 - open + dup auto as a conversion-type-id - Not resolved + Unknown - + 2494 - drafting + DR Multiple definitions of non-odr-used entities - Not resolved + Unknown 2495 @@ -14788,13 +14788,13 @@

C++ defect report implementation status

2496 - ready + DRWP ref-qualifiers and virtual overriding Unknown 2497 - open + drafting Points of instantiation for constexpr function templates Not resolved @@ -14804,11 +14804,11 @@

C++ defect report implementation status

Partial specialization failure and the immediate context Not resolved - + 2499 - open + DR Inconsistency in definition of pointer-interconvertibility - Not resolved + Unknown 2500 @@ -14822,7 +14822,180 @@

C++ defect report implementation status

Explicit instantiation and trailing requires-clauses Not resolved - + + 2502 + accepted + Unintended declaration conflicts in nested statement scopes + Unknown + + + 2503 + drafting + Unclear relationship among name, qualified name, and unqualified name + Not resolved + + + 2504 + open + Inheriting constructors from virtual base classes + Not resolved + + + 2505 + drafting + Nested unnamed namespace of inline unnamed namespace + Not resolved + + + 2506 + DR + Structured bindings and array cv-qualifiers + Unknown + + + 2507 + review + Default arguments for operator[] + Not resolved + + + 2508 + review + Restrictions on uses of template parameter names + Not resolved + + + 2509 + DR + decl-specifier-seq in lambda-specifiers + Unknown + + + 2510 + open + noexcept-specifier of friend function vs class completeness + Not resolved + + + 2511 + DR + cv-qualified bit-fields + Unknown + + + 2512 + NAD + typeid and incomplete class types + Unknown + + + 2513 + open + Ambiguity with requires-clause and operator-function-id + Not resolved + + + 2514 + open + Modifying const subobjects + Not resolved + + + 2515 + open + Result of a function call + Not resolved + + + 2516 + open + Locus of enum-specifier or opaque-enum-declaration + Not resolved + + + 2517 + open + Useless restriction on use of parameter in constraint-expression + Not resolved + + + 2518 + open + Conformance requirements and #error/#warning + Not resolved + + + 2519 + open + Object representation of a bit-field + Not resolved + + + 2520 + open + Template signature and default template arguments + Not resolved + + + 2521 + open + User-defined literals and reserved identifiers + Not resolved + + + 2522 + open + Removing placemarker tokens and retention of whitespace + Not resolved + + + 2523 + open + Undefined behavior via omitted destructor call in constant expressions + Not resolved + + + 2524 + open + Distinguishing user-defined conversion sequences by ref-qualifier + Not resolved + + + 2525 + open + Incorrect 
definition of implicit conversion sequence + Not resolved + + + 2526 + open + Relational comparison of void* pointers + Not resolved + + + 2527 + open + Non-class potentially-overlapping objects + Not resolved + + + 2528 + open + Three-way comparison and the usual arithmetic conversions + Not resolved + + + 2529 + open + Constant destruction of constexpr references + Not resolved + + + 2530 + open + Multiple definitions of enumerators + Not resolved + diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status index d35165f90ed03..e98dfbab77402 100755 --- a/clang/www/make_cxx_dr_status +++ b/clang/www/make_cxx_dr_status @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python3 import sys, os, re index = 'cwg_index.html' @@ -10,7 +10,7 @@ if len(sys.argv) == 1: elif len(sys.argv) == 2: index = sys.argv[1] else: - print >>sys.stderr, 'Usage: make_drs []' + print('Usage: make_drs []', file=sys.stderr) sys.exit(1) class DR: @@ -38,17 +38,16 @@ for test_cpp in os.listdir(dr_test_dir): continue test_cpp = os.path.join(dr_test_dir, test_cpp) found_any = False; - for match in re.finditer(status_re, file(test_cpp, 'r').read()): + for match in re.finditer(status_re, open(test_cpp, 'r').read()): status_map[int(match.group(1))] = match.group(2) found_any = True if not found_any: - print >> sys.stderr, "warning:%s: no '// dr123: foo' comments in this file" % test_cpp + print("warning:%s: no '// dr123: foo' comments in this file" % test_cpp, file=sys.stderr) -drs = sorted((parse(dr) for dr in file(index, 'r').read().split('')[2:]), +drs = sorted((parse(dr) for dr in open(index, 'r').read().split('')[2:]), key = lambda dr: dr.issue) -out_file = file(output, 'w') - -print >> out_file, '''\ +out_file = open(output, 'w') +out_file.write('''\ @@ -90,7 +89,7 @@ print >> out_file, '''\ Status Issue title Available in Clang? 
- ''' + ''') latest_release = 13 @@ -143,7 +142,7 @@ def availability(issue): try: _, avail_style = availability(int(dup)) except: - print >>sys.stderr, "issue %s marked as sup %s" % (issue, dup) + print("issue %s marked as sup %s" % (issue, dup), file=sys.stderr) avail_style = ' class="none"' elif status.startswith('dup '): dup = int(status.split(' ', 1)[1]) @@ -174,20 +173,23 @@ for dr in drs: if not avail.startswith('Sup') and not avail.startswith('Dup'): count[avail] = count.get(avail, 0) + 1 - print >> out_file, '''\ + out_file.write(''' %s %s %s %s - ''' % (row_style, dr.issue, dr.issue, dr.issue, dr.status, dr.title, avail_style, avail) + ''' % (row_style, dr.issue, dr.issue, dr.issue, dr.status, dr.title, avail_style, avail)) for status, num in sorted(count.items()): - print "%s: %s" % (status, num) + print("%s: %s" % (status, num)) -print >> out_file, '''\ +out_file.write('''\ -''' + +''') +out_file.close() + From 126a2607a8458c7928aa1da880d45c36560017a6 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 21 Feb 2022 15:38:52 +0100 Subject: [PATCH 499/748] [lldb] Remove HostProcess:GetMainModule the function is unused, and the posix implementation is only really correct on linux. 
--- .../include/lldb/Host/HostNativeProcessBase.h | 1 - lldb/include/lldb/Host/HostProcess.h | 1 - .../lldb/Host/posix/HostProcessPosix.h | 1 - .../lldb/Host/windows/HostProcessWindows.h | 1 - lldb/source/Host/common/HostProcess.cpp | 4 --- lldb/source/Host/posix/HostProcessPosix.cpp | 25 ------------------- .../Host/windows/HostProcessWindows.cpp | 18 ------------- 7 files changed, 51 deletions(-) diff --git a/lldb/include/lldb/Host/HostNativeProcessBase.h b/lldb/include/lldb/Host/HostNativeProcessBase.h index 5469f8a50e263..349e3349183b5 100644 --- a/lldb/include/lldb/Host/HostNativeProcessBase.h +++ b/lldb/include/lldb/Host/HostNativeProcessBase.h @@ -30,7 +30,6 @@ class HostNativeProcessBase { virtual ~HostNativeProcessBase() = default; virtual Status Terminate() = 0; - virtual Status GetMainModule(FileSpec &file_spec) const = 0; virtual lldb::pid_t GetProcessId() const = 0; virtual bool IsRunning() const = 0; diff --git a/lldb/include/lldb/Host/HostProcess.h b/lldb/include/lldb/Host/HostProcess.h index 0b7c303642239..00cb6a212736c 100644 --- a/lldb/include/lldb/Host/HostProcess.h +++ b/lldb/include/lldb/Host/HostProcess.h @@ -37,7 +37,6 @@ class HostProcess { ~HostProcess(); Status Terminate(); - Status GetMainModule(FileSpec &file_spec) const; lldb::pid_t GetProcessId() const; bool IsRunning() const; diff --git a/lldb/include/lldb/Host/posix/HostProcessPosix.h b/lldb/include/lldb/Host/posix/HostProcessPosix.h index 5def1b77eefe6..eec19b621a890 100644 --- a/lldb/include/lldb/Host/posix/HostProcessPosix.h +++ b/lldb/include/lldb/Host/posix/HostProcessPosix.h @@ -27,7 +27,6 @@ class HostProcessPosix : public HostNativeProcessBase { static Status Signal(lldb::process_t process, int signo); Status Terminate() override; - Status GetMainModule(FileSpec &file_spec) const override; lldb::pid_t GetProcessId() const override; bool IsRunning() const override; diff --git a/lldb/include/lldb/Host/windows/HostProcessWindows.h 
b/lldb/include/lldb/Host/windows/HostProcessWindows.h index 925d565c275ef..dc27bdc46bb8f 100644 --- a/lldb/include/lldb/Host/windows/HostProcessWindows.h +++ b/lldb/include/lldb/Host/windows/HostProcessWindows.h @@ -25,7 +25,6 @@ class HostProcessWindows : public HostNativeProcessBase { void SetOwnsHandle(bool owns); Status Terminate() override; - Status GetMainModule(FileSpec &file_spec) const override; lldb::pid_t GetProcessId() const override; bool IsRunning() const override; diff --git a/lldb/source/Host/common/HostProcess.cpp b/lldb/source/Host/common/HostProcess.cpp index 06dd192013ba4..83b856df36eb9 100644 --- a/lldb/source/Host/common/HostProcess.cpp +++ b/lldb/source/Host/common/HostProcess.cpp @@ -22,10 +22,6 @@ HostProcess::~HostProcess() = default; Status HostProcess::Terminate() { return m_native_process->Terminate(); } -Status HostProcess::GetMainModule(FileSpec &file_spec) const { - return m_native_process->GetMainModule(file_spec); -} - lldb::pid_t HostProcess::GetProcessId() const { return m_native_process->GetProcessId(); } diff --git a/lldb/source/Host/posix/HostProcessPosix.cpp b/lldb/source/Host/posix/HostProcessPosix.cpp index 8599a94d22416..9889be07bca8b 100644 --- a/lldb/source/Host/posix/HostProcessPosix.cpp +++ b/lldb/source/Host/posix/HostProcessPosix.cpp @@ -49,31 +49,6 @@ Status HostProcessPosix::Signal(lldb::process_t process, int signo) { Status HostProcessPosix::Terminate() { return Signal(SIGKILL); } -Status HostProcessPosix::GetMainModule(FileSpec &file_spec) const { - Status error; - - // Use special code here because proc/[pid]/exe is a symbolic link. - char link_path[PATH_MAX]; - if (snprintf(link_path, PATH_MAX, "/proc/%" PRIu64 "/exe", m_process) != 1) { - error.SetErrorString("Unable to build /proc//exe string"); - return error; - } - - error = FileSystem::Instance().Readlink(FileSpec(link_path), file_spec); - if (!error.Success()) - return error; - - // If the binary has been deleted, the link name has " (deleted)" appended. 
- // Remove if there. - if (file_spec.GetFilename().GetStringRef().endswith(" (deleted)")) { - const char *filename = file_spec.GetFilename().GetCString(); - static const size_t deleted_len = strlen(" (deleted)"); - const size_t len = file_spec.GetFilename().GetLength(); - file_spec.GetFilename().SetCStringWithLength(filename, len - deleted_len); - } - return error; -} - lldb::pid_t HostProcessPosix::GetProcessId() const { return m_process; } bool HostProcessPosix::IsRunning() const { diff --git a/lldb/source/Host/windows/HostProcessWindows.cpp b/lldb/source/Host/windows/HostProcessWindows.cpp index 0dc23e1a65629..741ec68d1d1ee 100644 --- a/lldb/source/Host/windows/HostProcessWindows.cpp +++ b/lldb/source/Host/windows/HostProcessWindows.cpp @@ -48,24 +48,6 @@ Status HostProcessWindows::Terminate() { return error; } -Status HostProcessWindows::GetMainModule(FileSpec &file_spec) const { - Status error; - if (m_process == nullptr) - error.SetError(ERROR_INVALID_HANDLE, lldb::eErrorTypeWin32); - - std::vector wpath(PATH_MAX); - if (::GetProcessImageFileNameW(m_process, wpath.data(), wpath.size())) { - std::string path; - if (llvm::convertWideToUTF8(wpath.data(), path)) - file_spec.SetFile(path, FileSpec::Style::native); - else - error.SetErrorString("Error converting path to UTF-8"); - } else - error.SetError(::GetLastError(), lldb::eErrorTypeWin32); - - return error; -} - lldb::pid_t HostProcessWindows::GetProcessId() const { return (m_process == LLDB_INVALID_PROCESS) ? 
-1 : ::GetProcessId(m_process); } From 5c4f749429bb8e069f93451a56bb8af424873e95 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 22 Feb 2022 23:58:57 +0900 Subject: [PATCH 500/748] [mlir][bufferize] Fix GCC build Differential Revision: https://reviews.llvm.org/D120326 --- mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 472a0932707b8..a10d322778b8d 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -151,7 +151,7 @@ struct FinalizingBufferizePass struct OneShotBufferizePass : public OneShotBufferizeBase { - using OneShotBufferizeBase::OneShotBufferizeBase; + OneShotBufferizePass() : OneShotBufferizeBase() {} explicit OneShotBufferizePass(const AnalysisBufferizationOptions &options) : options(options) {} From f8cedc642d9b85720cb7175ef25ddde90a3fbca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 11 Feb 2022 08:27:33 +0100 Subject: [PATCH 501/748] [clang] Never wrap a nullptr in CXXNewExpr::getArraySize() Otherwise callers of these functions have to check both the return value for and the contents of the returned llvm::Optional. 
Fixes #53742 Differential Revision: https://reviews.llvm.org/D119525 --- clang/docs/ReleaseNotes.rst | 11 ++++++++++- clang/include/clang/AST/ExprCXX.h | 21 +++++++++++++++++++-- clang/lib/AST/ExprConstant.cpp | 2 +- clang/lib/AST/StmtPrinter.cpp | 4 ++-- clang/lib/Sema/TreeTransform.h | 4 ++-- clang/test/AST/issue53742.cpp | 14 ++++++++++++++ 6 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 clang/test/AST/issue53742.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 499b065fe6e07..8de1e65a83dc4 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -54,6 +54,14 @@ Major New Features There is an analogous ``zero_call_used_regs`` attribute to allow for finer control of this feature. +Bug Fixes +------------------ +- ``CXXNewExpr::getArraySize()`` previously returned a ``llvm::Optional`` + wrapping a ``nullptr`` when the ``CXXNewExpr`` did not have an array + size expression. This was fixed and ``::getArraySize()`` will now always + either return ``None`` or a ``llvm::Optional`` wrapping a valid ``Expr*``. + This fixes `Issue #53742`_. + Improvements to Clang's diagnostics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -83,7 +91,8 @@ Attribute Changes in Clang - Added support for parameter pack expansion in `clang::annotate`. - The ``overloadable`` attribute can now be written in all of the syntactic - locations a declaration attribute may appear. Fixes PR53805. + locations a declaration attribute may appear. + This fixes `Issue #53805`_. Windows Support --------------- diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 161287adce4ca..3da9290c7dfbe 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -2261,15 +2261,32 @@ class CXXNewExpr final bool isArray() const { return CXXNewExprBits.IsArray; } + /// This might return None even if isArray() returns true, + /// since there might not be an array size expression. 
+ /// If the result is not-None, it will never wrap a nullptr. Optional getArraySize() { if (!isArray()) return None; - return cast_or_null(getTrailingObjects()[arraySizeOffset()]); + + if (auto *Result = + cast_or_null(getTrailingObjects()[arraySizeOffset()])) + return Result; + + return None; } + + /// This might return None even if isArray() returns true, + /// since there might not be an array size expression. + /// If the result is not-None, it will never wrap a nullptr. Optional getArraySize() const { if (!isArray()) return None; - return cast_or_null(getTrailingObjects()[arraySizeOffset()]); + + if (auto *Result = + cast_or_null(getTrailingObjects()[arraySizeOffset()])) + return Result; + + return None; } unsigned getNumPlacementArgs() const { diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 163109bf7c9f5..99f136a72d6fe 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9427,7 +9427,7 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { bool ValueInit = false; QualType AllocType = E->getAllocatedType(); - if (Optional ArraySize = E->getArraySize()) { + if (Optional ArraySize = E->getArraySize()) { const Expr *Stripped = *ArraySize; for (; auto *ICE = dyn_cast(Stripped); Stripped = ICE->getSubExpr()) diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 746bf8c21cd72..5ad935591ecd9 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -2132,10 +2132,10 @@ void StmtPrinter::VisitCXXNewExpr(CXXNewExpr *E) { if (E->isParenTypeId()) OS << "("; std::string TypeS; - if (Optional Size = E->getArraySize()) { + if (E->isArray()) { llvm::raw_string_ostream s(TypeS); s << '['; - if (*Size) + if (Optional Size = E->getArraySize()) (*Size)->printPretty(s, Helper, Policy); s << ']'; } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 466a156add516..0716689d4b626 100644 --- 
a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -11912,9 +11912,9 @@ TreeTransform::TransformCXXNewExpr(CXXNewExpr *E) { // Transform the size of the array we're allocating (if any). Optional ArraySize; - if (Optional OldArraySize = E->getArraySize()) { + if (E->isArray()) { ExprResult NewArraySize; - if (*OldArraySize) { + if (Optional OldArraySize = E->getArraySize()) { NewArraySize = getDerived().TransformExpr(*OldArraySize); if (NewArraySize.isInvalid()) return ExprError(); diff --git a/clang/test/AST/issue53742.cpp b/clang/test/AST/issue53742.cpp new file mode 100644 index 0000000000000..93978f2bcc11d --- /dev/null +++ b/clang/test/AST/issue53742.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only %s -verify + +struct Data { + char *a; + char *b; + bool *c; +}; + +int main() { + Data in; + in.a = new char[](); // expected-error {{cannot determine allocated array size from initializer}} + in.c = new bool[100](); + in.b = new char[100](); +} From e075bf6bdbcaa2652891ebff3e7ce9ca00cadd8a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 22 Feb 2022 16:31:46 +0100 Subject: [PATCH 502/748] [CodeGen] Add test for PR53990 (NFC) --- .../X86/pr53990-incorrect-machine-sink.ll | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll diff --git a/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll b/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll new file mode 100644 index 0000000000000..3d7ff6cbe676a --- /dev/null +++ b/llvm/test/CodeGen/X86/pr53990-incorrect-machine-sink.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s + +declare void @clobber() + +define void @test(i1 %c, i64* %p, i64* noalias %p2) nounwind { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: 
pushq %rbx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: movl %edi, %r15d +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: jmpq *.LJTI0_0(,%rax,8) +; CHECK-NEXT: .LBB0_1: # %split.3 +; CHECK-NEXT: movq (%r14), %rbp +; CHECK-NEXT: testb $1, %r15b +; CHECK-NEXT: je .LBB0_3 +; CHECK-NEXT: # %bb.2: # %clobber +; CHECK-NEXT: callq clobber@PLT +; CHECK-NEXT: .LBB0_3: # %sink +; CHECK-NEXT: movq %rbp, (%rbx) +; CHECK-NEXT: .LBB0_4: # %latch +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: jmpq *.LJTI0_0(,%rax,8) +entry: + %val = load i64, i64* %p, align 8 + br label %loop + +loop: + switch i8 undef, label %unreachable [ + i8 0, label %latch + i8 1, label %split.1 + i8 2, label %split.2 + i8 3, label %split.3 + ] + +unreachable: + unreachable + +split.3: + br i1 %c, label %clobber, label %sink + +split.1: + br label %latch + +split.2: + br label %latch + +clobber: + call void @clobber() + br label %sink + +sink: + store i64 %val, i64* %p2, align 8 + br label %latch + +latch: + %phi = phi i64 [ 0, %sink ], [ 0, %split.2 ], [ 1, %split.1 ], [ 0, %loop ] + %phi.live = add i64 %phi, 0 + br label %loop +} From 535a23053bbb0945b7ed0925cbc7d6d7227487f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 22 Feb 2022 16:34:12 +0100 Subject: [PATCH 503/748] Fix docs build after f8cedc642d9b85720cb7175ef25ddde90a3fbca2 Looks like rst doesn't like '#' in link texts. Just remove it. --- clang/docs/ReleaseNotes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8de1e65a83dc4..7e92224901d41 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -60,7 +60,7 @@ Bug Fixes wrapping a ``nullptr`` when the ``CXXNewExpr`` did not have an array size expression. 
This was fixed and ``::getArraySize()`` will now always either return ``None`` or a ``llvm::Optional`` wrapping a valid ``Expr*``. - This fixes `Issue #53742`_. + This fixes `Issue 53742`_. Improvements to Clang's diagnostics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -92,7 +92,7 @@ Attribute Changes in Clang - The ``overloadable`` attribute can now be written in all of the syntactic locations a declaration attribute may appear. - This fixes `Issue #53805`_. + This fixes `Issue 53805`_. Windows Support --------------- From fee4a9712f58caa9f1c3fc6c76ac46c5407475b6 Mon Sep 17 00:00:00 2001 From: Marek Kurdej Date: Tue, 22 Feb 2022 16:40:30 +0100 Subject: [PATCH 504/748] [clang-format] Use FormatToken::is* functions without passing through `Tok`. NFC. --- clang/lib/Format/ContinuationIndenter.cpp | 2 +- clang/lib/Format/FormatTokenLexer.cpp | 12 +- clang/lib/Format/TokenAnnotator.cpp | 8 +- clang/lib/Format/UnwrappedLineParser.cpp | 160 +++++++++++----------- 4 files changed, 91 insertions(+), 91 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index ec268e74fd97e..62e0d01871e8d 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -1634,7 +1634,7 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, NewState.HasMultipleNestedBlocks = (Current.BlockParameterCount > 1); if (Style.BraceWrapping.BeforeLambdaBody && Current.Next != nullptr && - Current.Tok.is(tok::l_paren)) { + Current.is(tok::l_paren)) { // Search for any parameter that is a lambda FormatToken const *next = Current.Next; while (next != nullptr) { diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 1540c14686faa..a48db4ef6d90f 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -91,7 +91,7 @@ ArrayRef FormatTokenLexer::lex() { handleCSharpVerbatimAndInterpolatedStrings(); if 
(Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) FirstInLineIndex = Tokens.size() - 1; - } while (Tokens.back()->Tok.isNot(tok::eof)); + } while (Tokens.back()->isNot(tok::eof)); return Tokens; } @@ -851,7 +851,7 @@ FormatToken *FormatTokenLexer::getNextToken() { // Consume and record whitespace until we find a significant token. unsigned WhitespaceLength = TrailingWhitespace; - while (FormatTok->Tok.is(tok::unknown)) { + while (FormatTok->is(tok::unknown)) { StringRef Text = FormatTok->TokenText; auto EscapesNewline = [&](int pos) { // A '\r' here is just part of '\r\n'. Skip it. @@ -965,12 +965,12 @@ FormatToken *FormatTokenLexer::getNextToken() { FormatTok->OriginalColumn = Column; TrailingWhitespace = 0; - if (FormatTok->Tok.is(tok::comment)) { + if (FormatTok->is(tok::comment)) { // FIXME: Add the trimmed whitespace to Column. StringRef UntrimmedText = FormatTok->TokenText; FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f"); TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size(); - } else if (FormatTok->Tok.is(tok::raw_identifier)) { + } else if (FormatTok->is(tok::raw_identifier)) { IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText); FormatTok->Tok.setIdentifierInfo(&Info); FormatTok->Tok.setKind(Info.getTokenID()); @@ -985,12 +985,12 @@ FormatToken *FormatTokenLexer::getNextToken() { FormatTok->Tok.setKind(tok::identifier); FormatTok->Tok.setIdentifierInfo(nullptr); } - } else if (FormatTok->Tok.is(tok::greatergreater)) { + } else if (FormatTok->is(tok::greatergreater)) { FormatTok->Tok.setKind(tok::greater); FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); ++Column; StateStack.push(LexerState::TOKEN_STASHED); - } else if (FormatTok->Tok.is(tok::lessless)) { + } else if (FormatTok->is(tok::lessless)) { FormatTok->Tok.setKind(tok::less); FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); ++Column; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 
9a020eb6ca7dc..7649263a18a1e 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -114,7 +114,7 @@ class AnnotatingParser { // If there's a template keyword before the opening angle bracket, this is a // template parameter, not an argument. Contexts.back().InTemplateArgument = - Left->Previous && Left->Previous->Tok.isNot(tok::kw_template); + Left->Previous && Left->Previous->isNot(tok::kw_template); if (Style.Language == FormatStyle::LK_Java && CurrentToken->is(tok::question)) @@ -1266,7 +1266,7 @@ class AnnotatingParser { return LT_ImportStatement; } - if (CurrentToken->Tok.is(tok::numeric_constant)) { + if (CurrentToken->is(tok::numeric_constant)) { CurrentToken->SpacesRequiredBefore = 1; return Type; } @@ -1743,7 +1743,7 @@ class AnnotatingParser { tok::coloncolon)) if (FormatToken *AfterParen = Current.MatchingParen->Next) { // Make sure this isn't the return type of an Obj-C block declaration - if (AfterParen->Tok.isNot(tok::caret)) { + if (AfterParen->isNot(tok::caret)) { if (FormatToken *BeforeParen = Current.MatchingParen->Previous) if (BeforeParen->is(tok::identifier) && !BeforeParen->is(TT_TypenameMacro) && @@ -3564,7 +3564,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, // (e.g. as "const x of y" in a for loop), or after a destructuring // operation (const [x, y] of z, const {a, b} of c). 
(Left.is(Keywords.kw_of) && Left.Previous && - (Left.Previous->Tok.is(tok::identifier) || + (Left.Previous->is(tok::identifier) || Left.Previous->isOneOf(tok::r_square, tok::r_brace)))) && (!Left.Previous || !Left.Previous->is(tok::period))) return true; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 09e209e856541..35465bf9a85b5 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -589,7 +589,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { // update information about whether an lbrace starts a // braced init list or a different block during the loop. SmallVector LBraceStack; - assert(Tok->Tok.is(tok::l_brace)); + assert(Tok->is(tok::l_brace)); do { // Get next non-comment token. FormatToken *NextTok; @@ -723,7 +723,7 @@ void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) { } PrevTok = Tok; Tok = NextTok; - } while (Tok->Tok.isNot(tok::eof) && !LBraceStack.empty()); + } while (Tok->isNot(tok::eof) && !LBraceStack.empty()); // Assume other blocks for all unclosed opening braces. 
for (FormatToken *LBrace : LBraceStack) @@ -835,7 +835,7 @@ UnwrappedLineParser::parseBlock(bool MustBeDeclaration, unsigned AddLevels, parseStructuralElement(); } - if (MunchSemi && FormatTok->Tok.is(tok::semi)) + if (MunchSemi && FormatTok->is(tok::semi)) nextToken(); Line->Level = InitialLevel; @@ -923,7 +923,7 @@ void UnwrappedLineParser::parseChildBlock( } void UnwrappedLineParser::parsePPDirective() { - assert(FormatTok->Tok.is(tok::hash) && "'#' expected"); + assert(FormatTok->is(tok::hash) && "'#' expected"); ScopedMacroState MacroState(*Line, Tokens, FormatTok); nextToken(); @@ -1402,9 +1402,9 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, return; case tok::kw_extern: nextToken(); - if (FormatTok->Tok.is(tok::string_literal)) { + if (FormatTok->is(tok::string_literal)) { nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { if (Style.BraceWrapping.AfterExternBlock) addUnwrappedLine(); // Either we indent or for backwards compatibility we follow the @@ -1433,7 +1433,7 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, LLVM_FALLTHROUGH; case tok::kw_inline: nextToken(); - if (FormatTok->Tok.is(tok::kw_namespace)) { + if (FormatTok->is(tok::kw_namespace)) { parseNamespace(); return; } @@ -1498,7 +1498,7 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, switch (FormatTok->Tok.getKind()) { case tok::at: nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { nextToken(); parseBracedList(); break; @@ -1529,7 +1529,7 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, return; case tok::objc_autoreleasepool: nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { if (Style.BraceWrapping.AfterControlStatement == FormatStyle::BWACS_Always) addUnwrappedLine(); @@ -1539,10 +1539,10 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, return; case 
tok::objc_synchronized: nextToken(); - if (FormatTok->Tok.is(tok::l_paren)) + if (FormatTok->is(tok::l_paren)) // Skip synchronization object parseParens(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { if (Style.BraceWrapping.AfterControlStatement == FormatStyle::BWACS_Always) addUnwrappedLine(); @@ -1751,7 +1751,7 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, TokenCount = Line->Tokens.size(); if (TokenCount == 1 || (TokenCount == 2 && Line->Tokens.front().Tok->is(tok::comment))) { - if (FormatTok->Tok.is(tok::colon) && !Line->MustBeDeclaration) { + if (FormatTok->is(tok::colon) && !Line->MustBeDeclaration) { Line->Tokens.begin()->Tok->MustBreakBefore = true; parseLabel(!Style.IndentGotoLabels); return; @@ -1784,7 +1784,7 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, } nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { // Block kind should probably be set to BK_BracedInit for any language. // C# needs this change to ensure that array initialisers and object // initialisers are indented the same way. @@ -1793,7 +1793,7 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, nextToken(); parseBracedList(); } else if (Style.Language == FormatStyle::LK_Proto && - FormatTok->Tok.is(tok::less)) { + FormatTok->is(tok::less)) { nextToken(); parseBracedList(/*ContinueOnSemicolons=*/false, /*IsEnum=*/false, /*ClosingBraceKind=*/tok::greater); @@ -2185,7 +2185,7 @@ bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons, /// \param AmpAmpTokenType If different than TT_Unknown sets this type for all /// double ampersands. This only counts for the current parens scope. 
void UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { - assert(FormatTok->Tok.is(tok::l_paren) && "'(' expected."); + assert(FormatTok->is(tok::l_paren) && "'(' expected."); nextToken(); do { switch (FormatTok->Tok.getKind()) { @@ -2209,7 +2209,7 @@ void UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { break; case tok::at: nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { nextToken(); parseBracedList(); } @@ -2253,7 +2253,7 @@ void UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) { void UnwrappedLineParser::parseSquare(bool LambdaIntroducer) { if (!LambdaIntroducer) { - assert(FormatTok->Tok.is(tok::l_square) && "'[' expected."); + assert(FormatTok->is(tok::l_square) && "'[' expected."); if (tryToParseLambda()) return; } @@ -2278,7 +2278,7 @@ void UnwrappedLineParser::parseSquare(bool LambdaIntroducer) { } case tok::at: nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { nextToken(); parseBracedList(); } @@ -2379,11 +2379,11 @@ FormatToken *UnwrappedLineParser::parseIfThenElse(IfStmtKind *IfKind, parseSquare(); }; - assert(FormatTok->Tok.is(tok::kw_if) && "'if' expected"); + assert(FormatTok->is(tok::kw_if) && "'if' expected"); nextToken(); - if (FormatTok->Tok.isOneOf(tok::kw_constexpr, tok::identifier)) + if (FormatTok->isOneOf(tok::kw_constexpr, tok::identifier)) nextToken(); - if (FormatTok->Tok.is(tok::l_paren)) + if (FormatTok->is(tok::l_paren)) parseParens(); HandleAttributes(); @@ -2393,7 +2393,7 @@ FormatToken *UnwrappedLineParser::parseIfThenElse(IfStmtKind *IfKind, FormatToken *IfLeftBrace = nullptr; IfStmtKind IfBlockKind = IfStmtKind::NotIf; - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { IfLeftBrace = FormatTok; CompoundStatementIndenter Indenter(this, Style, Line->Level); IfBlockKind = parseBlock(); @@ -2416,20 +2416,20 @@ FormatToken *UnwrappedLineParser::parseIfThenElse(IfStmtKind *IfKind, FormatToken 
*ElseLeftBrace = nullptr; IfStmtKind Kind = IfStmtKind::IfOnly; - if (FormatTok->Tok.is(tok::kw_else)) { + if (FormatTok->is(tok::kw_else)) { if (Style.RemoveBracesLLVM) { NestedTooDeep.back() = false; Kind = IfStmtKind::IfElse; } nextToken(); HandleAttributes(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { ElseLeftBrace = FormatTok; CompoundStatementIndenter Indenter(this, Style, Line->Level); if (parseBlock() == IfStmtKind::IfOnly) Kind = IfStmtKind::IfElseIf; addUnwrappedLine(); - } else if (FormatTok->Tok.is(tok::kw_if)) { + } else if (FormatTok->is(tok::kw_if)) { FormatToken *Previous = Tokens->getPreviousToken(); const bool IsPrecededByComment = Previous && Previous->is(tok::comment); if (IsPrecededByComment) { @@ -2546,8 +2546,8 @@ void UnwrappedLineParser::parseTryCatch() { tok::kw___finally) || ((Style.Language == FormatStyle::LK_Java || Style.isJavaScript()) && FormatTok->is(Keywords.kw_finally)) || - (FormatTok->Tok.isObjCAtKeyword(tok::objc_catch) || - FormatTok->Tok.isObjCAtKeyword(tok::objc_finally)))) + (FormatTok->isObjCAtKeyword(tok::objc_catch) || + FormatTok->isObjCAtKeyword(tok::objc_finally)))) break; nextToken(); while (FormatTok->isNot(tok::l_brace)) { @@ -2595,7 +2595,7 @@ void UnwrappedLineParser::parseNamespace() { else nextToken(); } - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { if (ShouldBreakBeforeBrace(Style, InitialToken)) addUnwrappedLine(); @@ -2620,7 +2620,7 @@ void UnwrappedLineParser::parseNamespace() { // Munch the semicolon after a namespace. This is more common than one would // think. Putting the semicolon into its own line is very ugly. - if (FormatTok->Tok.is(tok::semi)) + if (FormatTok->is(tok::semi)) nextToken(); addUnwrappedLine(AddLevels > 0 ? 
LineLevel::Remove : LineLevel::Keep); @@ -2678,12 +2678,12 @@ void UnwrappedLineParser::parseForOrWhileLoop() { nextToken(); if (Style.isCpp() && FormatTok->is(tok::kw_co_await)) nextToken(); - if (FormatTok->Tok.is(tok::l_paren)) + if (FormatTok->is(tok::l_paren)) parseParens(); keepAncestorBraces(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { FormatToken *LeftBrace = FormatTok; CompoundStatementIndenter Indenter(this, Style, Line->Level); parseBlock(); @@ -2702,12 +2702,12 @@ void UnwrappedLineParser::parseForOrWhileLoop() { } void UnwrappedLineParser::parseDoWhile() { - assert(FormatTok->Tok.is(tok::kw_do) && "'do' expected"); + assert(FormatTok->is(tok::kw_do) && "'do' expected"); nextToken(); keepAncestorBraces(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { CompoundStatementIndenter Indenter(this, Style, Line->Level); parseBlock(); if (Style.BraceWrapping.BeforeWhile) @@ -2720,7 +2720,7 @@ void UnwrappedLineParser::parseDoWhile() { NestedTooDeep.pop_back(); // FIXME: Add error handling. 
- if (!FormatTok->Tok.is(tok::kw_while)) { + if (!FormatTok->is(tok::kw_while)) { addUnwrappedLine(); return; } @@ -2743,13 +2743,13 @@ void UnwrappedLineParser::parseLabel(bool LeftAlignLabel) { Line->Level = 0; if (!Style.IndentCaseBlocks && CommentsBeforeNextToken.empty() && - FormatTok->Tok.is(tok::l_brace)) { + FormatTok->is(tok::l_brace)) { CompoundStatementIndenter Indenter(this, Line->Level, Style.BraceWrapping.AfterCaseLabel, Style.BraceWrapping.IndentBraces); parseBlock(); - if (FormatTok->Tok.is(tok::kw_break)) { + if (FormatTok->is(tok::kw_break)) { if (Style.BraceWrapping.AfterControlStatement == FormatStyle::BWACS_Always) { addUnwrappedLine(); @@ -2773,24 +2773,24 @@ void UnwrappedLineParser::parseLabel(bool LeftAlignLabel) { } void UnwrappedLineParser::parseCaseLabel() { - assert(FormatTok->Tok.is(tok::kw_case) && "'case' expected"); + assert(FormatTok->is(tok::kw_case) && "'case' expected"); // FIXME: fix handling of complex expressions here. do { nextToken(); - } while (!eof() && !FormatTok->Tok.is(tok::colon)); + } while (!eof() && !FormatTok->is(tok::colon)); parseLabel(); } void UnwrappedLineParser::parseSwitch() { - assert(FormatTok->Tok.is(tok::kw_switch) && "'switch' expected"); + assert(FormatTok->is(tok::kw_switch) && "'switch' expected"); nextToken(); - if (FormatTok->Tok.is(tok::l_paren)) + if (FormatTok->is(tok::l_paren)) parseParens(); keepAncestorBraces(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { CompoundStatementIndenter Indenter(this, Style, Line->Level); parseBlock(); addUnwrappedLine(); @@ -2812,10 +2812,10 @@ void UnwrappedLineParser::parseAccessSpecifier() { if (FormatTok->isOneOf(Keywords.kw_slots, Keywords.kw_qslots)) nextToken(); // Otherwise, we don't know what it is, and we'd better keep the next token. 
- if (FormatTok->Tok.is(tok::colon)) { + if (FormatTok->is(tok::colon)) { nextToken(); addUnwrappedLine(); - } else if (!FormatTok->Tok.is(tok::coloncolon) && + } else if (!FormatTok->is(tok::coloncolon) && !std::binary_search(COperatorsFollowingVar.begin(), COperatorsFollowingVar.end(), FormatTok->Tok.getKind())) { @@ -2833,16 +2833,16 @@ void UnwrappedLineParser::parseAccessSpecifier() { /// Returns if either the concept has been completely parsed, or if it detects /// that the concept definition is incorrect. void UnwrappedLineParser::parseConcept() { - assert(FormatTok->Tok.is(tok::kw_concept) && "'concept' expected"); + assert(FormatTok->is(tok::kw_concept) && "'concept' expected"); nextToken(); - if (!FormatTok->Tok.is(tok::identifier)) + if (!FormatTok->is(tok::identifier)) return; nextToken(); - if (!FormatTok->Tok.is(tok::equal)) + if (!FormatTok->is(tok::equal)) return; nextToken(); parseConstraintExpression(); - if (FormatTok->Tok.is(tok::semi)) + if (FormatTok->is(tok::semi)) nextToken(); addUnwrappedLine(); } @@ -2851,7 +2851,7 @@ void UnwrappedLineParser::parseConcept() { /// \pre The current token has to be the requires keyword. /// \returns true if it parsed a clause. bool clang::format::UnwrappedLineParser::parseRequires() { - assert(FormatTok->Tok.is(tok::kw_requires) && "'requires' expected"); + assert(FormatTok->is(tok::kw_requires) && "'requires' expected"); auto RequiresToken = FormatTok; // We try to guess if it is a requires clause, or a requires expression. For @@ -2990,7 +2990,7 @@ bool clang::format::UnwrappedLineParser::parseRequires() { /// the clause is incorrect. 
void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) { assert(FormatTok->getPreviousNonComment() == RequiresToken); - assert(RequiresToken->Tok.is(tok::kw_requires) && "'requires' expected"); + assert(RequiresToken->is(tok::kw_requires) && "'requires' expected"); assert(RequiresToken->getType() == TT_Unknown); // If there is no previous token, we are within a requires expression, @@ -3019,7 +3019,7 @@ void UnwrappedLineParser::parseRequiresClause(FormatToken *RequiresToken) { /// that the expression is incorrect. void UnwrappedLineParser::parseRequiresExpression(FormatToken *RequiresToken) { assert(FormatTok->getPreviousNonComment() == RequiresToken); - assert(RequiresToken->Tok.is(tok::kw_requires) && "'requires' expected"); + assert(RequiresToken->is(tok::kw_requires) && "'requires' expected"); assert(RequiresToken->getType() == TT_Unknown); RequiresToken->setType(TT_RequiresExpression); @@ -3088,7 +3088,7 @@ void UnwrappedLineParser::parseConstraintExpression() { // Read identifier with optional template declaration. nextToken(); - if (FormatTok->Tok.is(tok::less)) + if (FormatTok->is(tok::less)) parseBracedList(/*ContinueOnSemicolons=*/false, /*IsEnum=*/false, /*ClosingBraceKind=*/tok::greater); break; @@ -3178,7 +3178,7 @@ bool UnwrappedLineParser::parseEnum() { const FormatToken &InitialToken = *FormatTok; // Won't be 'enum' for NS_ENUMs. - if (FormatTok->Tok.is(tok::kw_enum)) + if (FormatTok->is(tok::kw_enum)) nextToken(); // In TypeScript, "enum" can also be used as property name, e.g. in interface @@ -3192,7 +3192,7 @@ bool UnwrappedLineParser::parseEnum() { return false; // Eat up enum class ... 
- if (FormatTok->Tok.is(tok::kw_class) || FormatTok->Tok.is(tok::kw_struct)) + if (FormatTok->isOneOf(tok::kw_class, tok::kw_struct)) nextToken(); while (FormatTok->Tok.getIdentifierInfo() || @@ -3402,7 +3402,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { nextToken(); // We can have macros or attributes in between 'class' and the class name. if (!IsNonMacroIdentifier) { - if (FormatTok->Tok.is(tok::l_paren)) { + if (FormatTok->is(tok::l_paren)) { parseParens(); } else if (FormatTok->is(TT_AttributeSquare)) { parseSquare(); @@ -3440,7 +3440,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { break; } } - if (FormatTok->Tok.is(tok::semi)) + if (FormatTok->is(tok::semi)) return; if (Style.isCSharp() && FormatTok->is(Keywords.kw_where)) { addUnwrappedLine(); @@ -3465,7 +3465,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { return TT_RecordLBrace; } }; - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { FormatTok->setType(GetBraceType(InitialToken)); if (ParseAsExpr) { parseChildBlock(); @@ -3483,14 +3483,14 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { } void UnwrappedLineParser::parseObjCMethod() { - assert(FormatTok->Tok.isOneOf(tok::l_paren, tok::identifier) && + assert(FormatTok->isOneOf(tok::l_paren, tok::identifier) && "'(' or identifier expected."); do { - if (FormatTok->Tok.is(tok::semi)) { + if (FormatTok->is(tok::semi)) { nextToken(); addUnwrappedLine(); return; - } else if (FormatTok->Tok.is(tok::l_brace)) { + } else if (FormatTok->is(tok::l_brace)) { if (Style.BraceWrapping.AfterFunction) addUnwrappedLine(); parseBlock(); @@ -3503,20 +3503,20 @@ void UnwrappedLineParser::parseObjCMethod() { } void UnwrappedLineParser::parseObjCProtocolList() { - assert(FormatTok->Tok.is(tok::less) && "'<' expected."); + assert(FormatTok->is(tok::less) && "'<' expected."); do { nextToken(); // Early exit in case someone forgot a close angle. 
if (FormatTok->isOneOf(tok::semi, tok::l_brace) || - FormatTok->Tok.isObjCAtKeyword(tok::objc_end)) + FormatTok->isObjCAtKeyword(tok::objc_end)) return; - } while (!eof() && FormatTok->Tok.isNot(tok::greater)); + } while (!eof() && FormatTok->isNot(tok::greater)); nextToken(); // Skip '>'. } void UnwrappedLineParser::parseObjCUntilAtEnd() { do { - if (FormatTok->Tok.isObjCAtKeyword(tok::objc_end)) { + if (FormatTok->isObjCAtKeyword(tok::objc_end)) { nextToken(); addUnwrappedLine(); break; @@ -3546,22 +3546,22 @@ void UnwrappedLineParser::parseObjCInterfaceOrImplementation() { // @interface can be followed by a lightweight generic // specialization list, then either a base class or a category. - if (FormatTok->Tok.is(tok::less)) + if (FormatTok->is(tok::less)) parseObjCLightweightGenerics(); - if (FormatTok->Tok.is(tok::colon)) { + if (FormatTok->is(tok::colon)) { nextToken(); nextToken(); // base class name // The base class can also have lightweight generics applied to it. - if (FormatTok->Tok.is(tok::less)) + if (FormatTok->is(tok::less)) parseObjCLightweightGenerics(); - } else if (FormatTok->Tok.is(tok::l_paren)) + } else if (FormatTok->is(tok::l_paren)) // Skip category, if present. parseParens(); - if (FormatTok->Tok.is(tok::less)) + if (FormatTok->is(tok::less)) parseObjCProtocolList(); - if (FormatTok->Tok.is(tok::l_brace)) { + if (FormatTok->is(tok::l_brace)) { if (Style.BraceWrapping.AfterObjCDeclaration) addUnwrappedLine(); parseBlock(/*MustBeDeclaration=*/true); @@ -3575,7 +3575,7 @@ void UnwrappedLineParser::parseObjCInterfaceOrImplementation() { } void UnwrappedLineParser::parseObjCLightweightGenerics() { - assert(FormatTok->Tok.is(tok::less)); + assert(FormatTok->is(tok::less)); // Unlike protocol lists, generic parameterizations support // nested angles: // @@ -3588,11 +3588,11 @@ void UnwrappedLineParser::parseObjCLightweightGenerics() { nextToken(); // Early exit in case someone forgot a close angle. 
if (FormatTok->isOneOf(tok::semi, tok::l_brace) || - FormatTok->Tok.isObjCAtKeyword(tok::objc_end)) + FormatTok->isObjCAtKeyword(tok::objc_end)) break; - if (FormatTok->Tok.is(tok::less)) + if (FormatTok->is(tok::less)) ++NumOpenAngles; - else if (FormatTok->Tok.is(tok::greater)) { + else if (FormatTok->is(tok::greater)) { assert(NumOpenAngles > 0 && "'>' makes NumOpenAngles negative"); --NumOpenAngles; } @@ -3617,11 +3617,11 @@ bool UnwrappedLineParser::parseObjCProtocol() { nextToken(); // protocol name - if (FormatTok->Tok.is(tok::less)) + if (FormatTok->is(tok::less)) parseObjCProtocolList(); // Check for protocol declaration. - if (FormatTok->Tok.is(tok::semi)) { + if (FormatTok->is(tok::semi)) { nextToken(); addUnwrappedLine(); return true; @@ -3736,7 +3736,7 @@ void UnwrappedLineParser::addUnwrappedLine(LineLevel AdjustLevel) { FormatTok->Previous = nullptr; } -bool UnwrappedLineParser::eof() const { return FormatTok->Tok.is(tok::eof); } +bool UnwrappedLineParser::eof() const { return FormatTok->is(tok::eof); } bool UnwrappedLineParser::isOnNewLine(const FormatToken &FormatTok) { return (Line->InPPDirective || FormatTok.HasUnescapedNewline) && @@ -3973,9 +3973,9 @@ void UnwrappedLineParser::readToken(int LevelDifference) { FirstNonCommentOnLine = IsFirstNonCommentOnLine( FirstNonCommentOnLine, *FormatTok, PreviousWasComment); - PreviousWasComment = FormatTok->Tok.is(tok::comment); + PreviousWasComment = FormatTok->is(tok::comment); - while (!Line->InPPDirective && FormatTok->Tok.is(tok::hash) && + while (!Line->InPPDirective && FormatTok->is(tok::hash) && FirstNonCommentOnLine) { distributeComments(Comments, FormatTok); Comments.clear(); @@ -3995,7 +3995,7 @@ void UnwrappedLineParser::readToken(int LevelDifference) { Line->Level += PPBranchLevel; flushComments(isOnNewLine(*FormatTok)); parsePPDirective(); - PreviousWasComment = FormatTok->Tok.is(tok::comment); + PreviousWasComment = FormatTok->is(tok::comment); FirstNonCommentOnLine = 
IsFirstNonCommentOnLine( FirstNonCommentOnLine, *FormatTok, PreviousWasComment); } @@ -4004,7 +4004,7 @@ void UnwrappedLineParser::readToken(int LevelDifference) { !Line->InPPDirective) continue; - if (!FormatTok->Tok.is(tok::comment)) { + if (!FormatTok->is(tok::comment)) { distributeComments(Comments, FormatTok); Comments.clear(); return; From 2aaba44b5c2265f90ac9f0ae188417ef79201c82 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Tue, 22 Feb 2022 09:52:19 -0600 Subject: [PATCH 505/748] [PowerPC] Allow absolute expressions in relocations The Linux kernel build uses absolute expressions suffixed with @lo/@ha relocations. This currently doesn't work for DS/DQ form instructions and there is no reason for it not to. It also works with GAS. This patch allows this as long as the value is a multiple of 4/16 for DS/DQ form. Differential revision: https://reviews.llvm.org/D115419 --- .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 63 +++++++------------ .../PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 2 + .../MCTargetDesc/PPCELFObjectWriter.cpp | 2 + .../PowerPC/MCTargetDesc/PPCFixupKinds.h | 4 ++ .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 4 +- .../Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 13 +++- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 4 +- llvm/test/MC/PowerPC/ppc64-abs-reloc.s | 22 +++++++ 8 files changed, 69 insertions(+), 45 deletions(-) create mode 100644 llvm/test/MC/PowerPC/ppc64-abs-reloc.s diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 715cff72dcab4..7113fe33b5d7a 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -341,31 +341,11 @@ struct PPCOperand : public MCParsedAsmOperand { bool isU10Imm() const { return Kind == Immediate && isUInt<10>(getImm()); } bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); } - bool isU16Imm() const { - switch (Kind) { - case Expression: - return true; 
- case Immediate: - case ContextImmediate: - return isUInt<16>(getImmU16Context()); - default: - return false; - } - } - bool isS16Imm() const { - switch (Kind) { - case Expression: - return true; - case Immediate: - case ContextImmediate: - return isInt<16>(getImmS16Context()); - default: - return false; - } - } - bool isS16ImmX4() const { return Kind == Expression || - (Kind == Immediate && isInt<16>(getImm()) && - (getImm() & 3) == 0); } + bool isU16Imm() const { return isExtImm<16>(/*Signed*/ false, 1); } + bool isS16Imm() const { return isExtImm<16>(/*Signed*/ true, 1); } + bool isS16ImmX4() const { return isExtImm<16>(/*Signed*/ true, 4); } + bool isS16ImmX16() const { return isExtImm<16>(/*Signed*/ true, 16); } + bool isS17Imm() const { return isExtImm<17>(/*Signed*/ true, 1); } bool isHashImmX8() const { // The Hash Imm form is used for instructions that check or store a hash. @@ -375,9 +355,6 @@ struct PPCOperand : public MCParsedAsmOperand { (getImm() & 7) == 0); } - bool isS16ImmX16() const { return Kind == Expression || - (Kind == Immediate && isInt<16>(getImm()) && - (getImm() & 15) == 0); } bool isS34ImmX16() const { return Kind == Expression || (Kind == Immediate && isInt<34>(getImm()) && (getImm() & 15) == 0); @@ -388,17 +365,6 @@ struct PPCOperand : public MCParsedAsmOperand { return Kind == Expression || (Kind == Immediate && isInt<34>(getImm())); } - bool isS17Imm() const { - switch (Kind) { - case Expression: - return true; - case Immediate: - case ContextImmediate: - return isInt<17>(getImmS16Context()); - default: - return false; - } - } bool isTLSReg() const { return Kind == TLSRegister; } bool isDirectBr() const { if (Kind == Expression) @@ -712,6 +678,25 @@ struct PPCOperand : public MCParsedAsmOperand { return CreateExpr(Val, S, E, IsPPC64); } + +private: + template + bool isExtImm(bool Signed, unsigned Multiple) const { + switch (Kind) { + default: + return false; + case Expression: + return true; + case Immediate: + case 
ContextImmediate: + if (Signed) + return isInt(getImmS16Context()) && + (getImmS16Context() & (Multiple - 1)) == 0; + else + return isUInt(getImmU16Context()) && + (getImmU16Context() & (Multiple - 1)) == 0; + } + } }; } // end anonymous namespace. diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 9df94edc8cdff..2e678ffd58c2a 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -44,6 +44,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case PPC::fixup_ppc_half16: return Value & 0xffff; case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: return Value & 0xfffc; case PPC::fixup_ppc_pcrel34: case PPC::fixup_ppc_imm34: @@ -60,6 +61,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_2: case PPC::fixup_ppc_half16: case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: return 2; case FK_Data_4: case PPC::fixup_ppc_brcond14: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 94ef7b45434f3..1e58039582c25 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -125,6 +125,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: Target.print(errs()); errs() << '\n'; report_fatal_error("Invalid PC-relative half16ds relocation"); @@ -349,6 +350,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case PPC::fixup_ppc_half16ds: + case PPC::fixup_ppc_half16dq: switch (Modifier) { default: llvm_unreachable("Unsupported Modifier"); case MCSymbolRefExpr::VK_None: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h 
b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 73292f7b7938f..df0c666f5b113 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -51,6 +51,10 @@ enum Fixups { /// register number. fixup_ppc_nofixup, + /// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for + /// instrs like 'lxv'. Produces the same relocation as fixup_ppc_half16ds. + fixup_ppc_half16dq, + // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a42824fb36f08..5255c26a8bafc 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -197,8 +197,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, } // Otherwise add a fixup for the displacement field. - Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_half16ds)); + Fixups.push_back(MCFixup::create(IsLittleEndian ? 
0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16dq)); return RegBits; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index abff444491313..6cd04ee018fd7 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -110,9 +110,18 @@ PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res, if (Value.isAbsolute()) { int64_t Result = evaluateAsInt64(Value.getConstant()); - if ((Fixup == nullptr || (unsigned)Fixup->getKind() != PPC::fixup_ppc_half16) && - (Result >= 0x8000)) + bool IsHalf16 = Fixup && Fixup->getTargetKind() == PPC::fixup_ppc_half16; + bool IsHalf16DS = + Fixup && Fixup->getTargetKind() == PPC::fixup_ppc_half16ds; + bool IsHalf16DQ = + Fixup && Fixup->getTargetKind() == PPC::fixup_ppc_half16dq; + bool IsHalf = IsHalf16 || IsHalf16DS || IsHalf16DQ; + + if (!IsHalf && Result >= 0x8000) return false; + if ((IsHalf16DS && (Result & 0x3)) || (IsHalf16DQ && (Result & 0xf))) + return false; + Res = MCValue::get(Result); } else { if (!Layout) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 0f2903acad06c..54ba48f8c1298 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1016,7 +1016,7 @@ def dispRI : Operand { } def PPCDispRIXOperand : AsmOperandClass { let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4"; - let RenderMethod = "addImmOperands"; + let RenderMethod = "addS16ImmOperands"; } def dispRIX : Operand { let ParserMatchClass = PPCDispRIXOperand; @@ -1030,7 +1030,7 @@ def dispRIHash : Operand { } def PPCDispRIX16Operand : AsmOperandClass { let Name = "DispRIX16"; let PredicateMethod = "isS16ImmX16"; - let RenderMethod = "addImmOperands"; + let RenderMethod = "addS16ImmOperands"; } def dispRIX16 : Operand { let ParserMatchClass = PPCDispRIX16Operand; diff --git a/llvm/test/MC/PowerPC/ppc64-abs-reloc.s 
b/llvm/test/MC/PowerPC/ppc64-abs-reloc.s new file mode 100644 index 0000000000000..8b0d0b4471783 --- /dev/null +++ b/llvm/test/MC/PowerPC/ppc64-abs-reloc.s @@ -0,0 +1,22 @@ +# RUN: llvm-mc -triple powerpc64le-unknown-linux-gnu %s -filetype=obj -o - | \ +# RUN: llvm-objdump -D -r - | FileCheck %s + .text +test: # @test + add 5, 3, 4 + extsw 3, 5 + .space 32776 +lab2: + lxv 5, (lab2-test)@l(4) + ld 5, (lab2-test)@l(4) + lwz 5, (lab2-test)@l(4) + lxv 5, 8389632@l(4) + ld 5, 8389632@l(4) + lwz 5, 8389632@l(4) + blr + +# CHECK: lxv 5, -32752(4) +# CHECK: ld 5, -32752(4) +# CHECK: lwz 5, -32752(4) +# CHECK: lxv 5, 1024(4) +# CHECK: ld 5, 1024(4) +# CHECK: lwz 5, 1024(4) From 071f870e7ff0a3d04f0d93852ff7c29b59111f78 Mon Sep 17 00:00:00 2001 From: Marek Kurdej Date: Tue, 22 Feb 2022 15:46:28 +0100 Subject: [PATCH 506/748] [clang-format] Avoid parsing "requires" as a keyword in non-C++-like languages. Fixes the issue raised post-review in D113319 (cf. https://reviews.llvm.org/D113319#3337485). Reviewed By: krasimir Differential Revision: https://reviews.llvm.org/D120324 --- clang/lib/Format/UnwrappedLineParser.cpp | 10 +++++++--- clang/unittests/Format/FormatTestJS.cpp | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 35465bf9a85b5..e2cbcea14d7a9 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1563,9 +1563,13 @@ void UnwrappedLineParser::parseStructuralElement(IfStmtKind *IfKind, parseConcept(); return; case tok::kw_requires: { - bool ParsedClause = parseRequires(); - if (ParsedClause) - return; + if (Style.isCpp()) { + bool ParsedClause = parseRequires(); + if (ParsedClause) + return; + } else { + nextToken(); + } break; } case tok::kw_enum: diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp index d84533e8a2b03..67df2d41731a6 100644 --- 
a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -323,6 +323,7 @@ TEST_F(FormatTestJS, ReservedWords) { verifyFormat("var struct = 2;"); verifyFormat("var union = 2;"); verifyFormat("var interface = 2;"); + verifyFormat("var requires = {};"); verifyFormat("interface = 2;"); verifyFormat("x = interface instanceof y;"); verifyFormat("interface Test {\n" From 8e10448cbbd98e2496f8bc8985b6ee499b7ffcdd Mon Sep 17 00:00:00 2001 From: tyb0807 Date: Tue, 1 Feb 2022 21:41:23 +0000 Subject: [PATCH 507/748] [AArch64] Remove unused feature flags from AArch64TargetInfo This removes two feature flags from `AArch64TargetInfo` class: - `HasHBC`: this feature does not involve generating any IR intrinsics, so clang does not need to know about whether it is set - `HasCrypto`: this feature is deprecated in favor of finer grained features such as AES, SHA2, SHA3 and SM4. The associated ACLE macro __ARM_FEATURE_CRYPTO is thus no longer used. Differential Revision: https://reviews.llvm.org/D118757 --- clang/lib/Basic/Targets/AArch64.cpp | 6 ------ clang/lib/Basic/Targets/AArch64.h | 2 -- 2 files changed, 8 deletions(-) diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index b04508570ad75..bd03d88b2d5aa 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -525,7 +525,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, DiagnosticsEngine &Diags) { FPU = FPUMode; HasCRC = false; - HasCrypto = false; HasAES = false; HasSHA2 = false; HasSHA3 = false; @@ -548,7 +547,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasMatmulFP64 = false; HasMatmulFP32 = false; HasLSE = false; - HasHBC = false; HasMOPS = false; ArchKind = llvm::AArch64::ArchKind::INVALID; @@ -599,8 +597,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, } if (Feature == "+crc") HasCRC = true; - if (Feature == "+crypto") - HasCrypto = true; if 
(Feature == "+aes") HasAES = true; if (Feature == "+sha2") @@ -665,8 +661,6 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasRandGen = true; if (Feature == "+flagm") HasFlagM = true; - if (Feature == "+hbc") - HasHBC = true; if (Feature == "+mops") HasMOPS = true; } diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 9e22aeaff251f..bd6812d1257cf 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -30,7 +30,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { unsigned FPU; bool HasCRC; - bool HasCrypto; bool HasAES; bool HasSHA2; bool HasSHA3; @@ -54,7 +53,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasMatmulFP32; bool HasLSE; bool HasFlagM; - bool HasHBC; bool HasMOPS; llvm::AArch64::ArchKind ArchKind; From 79c9072dc009693477242bc1347a2a6c3e419423 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 20 Feb 2022 09:05:46 +0100 Subject: [PATCH 508/748] Restore documentation for __builtin_assume This got removed by 6cacd420a1d72bca7809e6b516fb1e18ac6056c8, and that was a mistake. Differential Revision: https://reviews.llvm.org/D120205 --- clang/docs/LanguageExtensions.rst | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index f45d88092eb4a..865a877b02190 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2205,6 +2205,39 @@ Query for this feature with ``__has_builtin(__builtin_alloca_with_align)``. .. _langext-__builtin_assume: +``__builtin_assume`` +-------------------- + +``__builtin_assume`` is used to provide the optimizer with a boolean +invariant that is defined to be true. + +**Syntax**: + +.. code-block:: c++ + + __builtin_assume(bool) + +**Example of Use**: + +.. 
code-block:: c++ + + int foo(int x) { + __builtin_assume(x != 0); + // The optimizer may short-circuit this check using the invariant. + if (x == 0) + return do_something(); + return do_something_else(); + } + +**Description**: + +The boolean argument to this function is defined to be true. The optimizer may +analyze the form of the expression provided as the argument and deduce from +that information used to optimize the program. If the condition is violated +during execution, the behavior is undefined. The argument itself is + +Query for this feature with ``__has_builtin(__builtin_assume)``. + ``__builtin_call_with_static_chain`` ------------------------------------ From b6eafba296fc0444892a176ccc3cb947399b408c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 14 Feb 2022 17:00:41 +0100 Subject: [PATCH 509/748] [Bitcode] Store type IDs for values This is the next step towards supporting bitcode auto upgrade with opaque pointers. The ValueList now stores the Value* together with its associated type ID, which allows inspecting the original pointer element type of arbitrary values. This is a largely mechanical change threading the type ID through various places. I've left TODOTypeID placeholders in a number of places where determining the type ID is either non-trivial or requires allocating a new type ID not present in the original bitcode. For this reason, the new type IDs are also not used for anything yet (apart from propagation). They will get used once the TODOs are resolved. 
Differential Revision: https://reviews.llvm.org/D119821 --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 435 +++++++++++++-------- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 17 +- llvm/lib/Bitcode/Reader/ValueList.cpp | 35 +- llvm/lib/Bitcode/Reader/ValueList.h | 22 +- 4 files changed, 323 insertions(+), 186 deletions(-) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index cf8381ef10768..0433b8fd215ba 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -595,15 +595,17 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { StructType *createIdentifiedStructType(LLVMContext &Context); static constexpr unsigned InvalidTypeID = ~0u; + /// Placeholder for value type IDs we don't yet determine. + static constexpr unsigned TODOTypeID = InvalidTypeID - 1; Type *getTypeByID(unsigned ID); Type *getPtrElementTypeByID(unsigned ID); - unsigned getContainedTypeID(unsigned ID, unsigned Idx); + unsigned getContainedTypeID(unsigned ID, unsigned Idx = 0); - Value *getFnValueByID(unsigned ID, Type *Ty) { + Value *getFnValueByID(unsigned ID, Type *Ty, unsigned TyID) { if (Ty && Ty->isMetadataTy()) return MetadataAsValue::get(Ty->getContext(), getFnMetadataByID(ID)); - return ValueList.getValueFwdRef(ID, Ty); + return ValueList.getValueFwdRef(ID, Ty, TyID); } Metadata *getFnMetadataByID(unsigned ID) { @@ -625,7 +627,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { /// Increment Slot past the number of slots used in the record. Return true on /// failure. bool getValueTypePair(const SmallVectorImpl &Record, unsigned &Slot, - unsigned InstNum, Value *&ResVal) { + unsigned InstNum, Value *&ResVal, unsigned &TypeID) { if (Slot == Record.size()) return true; unsigned ValNo = (unsigned)Record[Slot++]; // Adjust the ValNo, if it was encoded relative to the InstNum. 
@@ -634,14 +636,15 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { if (ValNo < InstNum) { // If this is not a forward reference, just return the value we already // have. - ResVal = getFnValueByID(ValNo, nullptr); + TypeID = ValueList.getTypeID(ValNo); + ResVal = getFnValueByID(ValNo, nullptr, TypeID); return ResVal == nullptr; } if (Slot == Record.size()) return true; - unsigned TypeNo = (unsigned)Record[Slot++]; - ResVal = getFnValueByID(ValNo, getTypeByID(TypeNo)); + TypeID = (unsigned)Record[Slot++]; + ResVal = getFnValueByID(ValNo, getTypeByID(TypeID), TypeID); return ResVal == nullptr; } @@ -649,8 +652,8 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { /// past the number of slots used by the value in the record. Return true if /// there is an error. bool popValue(const SmallVectorImpl &Record, unsigned &Slot, - unsigned InstNum, Type *Ty, Value *&ResVal) { - if (getValue(Record, Slot, InstNum, Ty, ResVal)) + unsigned InstNum, Type *Ty, unsigned TyID, Value *&ResVal) { + if (getValue(Record, Slot, InstNum, Ty, TyID, ResVal)) return true; // All values currently take a single record slot. ++Slot; @@ -659,32 +662,32 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { /// Like popValue, but does not increment the Slot number. bool getValue(const SmallVectorImpl &Record, unsigned Slot, - unsigned InstNum, Type *Ty, Value *&ResVal) { - ResVal = getValue(Record, Slot, InstNum, Ty); + unsigned InstNum, Type *Ty, unsigned TyID, Value *&ResVal) { + ResVal = getValue(Record, Slot, InstNum, Ty, TyID); return ResVal == nullptr; } /// Version of getValue that returns ResVal directly, or 0 if there is an /// error. Value *getValue(const SmallVectorImpl &Record, unsigned Slot, - unsigned InstNum, Type *Ty) { + unsigned InstNum, Type *Ty, unsigned TyID) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)Record[Slot]; // Adjust the ValNo, if it was encoded relative to the InstNum. 
if (UseRelativeIDs) ValNo = InstNum - ValNo; - return getFnValueByID(ValNo, Ty); + return getFnValueByID(ValNo, Ty, TyID); } /// Like getValue, but decodes signed VBRs. Value *getValueSigned(const SmallVectorImpl &Record, unsigned Slot, - unsigned InstNum, Type *Ty) { + unsigned InstNum, Type *Ty, unsigned TyID) { if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]); // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) ValNo = InstNum - ValNo; - return getFnValueByID(ValNo, Ty); + return getFnValueByID(ValNo, Ty, TyID); } /// Upgrades old-style typeless byval/sret/inalloca attributes by adding the @@ -1185,6 +1188,9 @@ Type *BitcodeReader::getTypeByID(unsigned ID) { } unsigned BitcodeReader::getContainedTypeID(unsigned ID, unsigned Idx) { + if (ID == TODOTypeID) + return TODOTypeID; + auto It = ContainedTypeIDs.find(ID); if (It == ContainedTypeIDs.end()) return InvalidTypeID; @@ -2398,12 +2404,14 @@ Error BitcodeReader::parseConstants() { SmallVector Record; // Read all the records for this value table. + unsigned CurTyID = TODOTypeID; Type *CurTy = Type::getInt32Ty(Context); Type *CurElemTy = nullptr; unsigned NextCstNo = ValueList.size(); struct DelayedShufTy { VectorType *OpTy; + unsigned OpTyID; VectorType *RTy; uint64_t Op0Idx; uint64_t Op1Idx; @@ -2413,6 +2421,7 @@ Error BitcodeReader::parseConstants() { std::vector DelayedShuffles; struct DelayedSelTy { Type *OpTy; + unsigned OpTyID; uint64_t Op0Idx; uint64_t Op1Idx; uint64_t Op2Idx; @@ -2439,32 +2448,34 @@ Error BitcodeReader::parseConstants() { // and we can't convert a forward reference. 
for (auto &DelayedShuffle : DelayedShuffles) { VectorType *OpTy = DelayedShuffle.OpTy; + unsigned OpTyID = DelayedShuffle.OpTyID; VectorType *RTy = DelayedShuffle.RTy; uint64_t Op0Idx = DelayedShuffle.Op0Idx; uint64_t Op1Idx = DelayedShuffle.Op1Idx; uint64_t Op2Idx = DelayedShuffle.Op2Idx; uint64_t CstNo = DelayedShuffle.CstNo; - Constant *Op0 = ValueList.getConstantFwdRef(Op0Idx, OpTy); - Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy); + Constant *Op0 = ValueList.getConstantFwdRef(Op0Idx, OpTy, OpTyID); + Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy, OpTyID); Type *ShufTy = VectorType::get(Type::getInt32Ty(Context), RTy->getElementCount()); - Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, ShufTy); + Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, ShufTy, TODOTypeID); if (!ShuffleVectorInst::isValidOperands(Op0, Op1, Op2)) return error("Invalid shufflevector operands"); SmallVector Mask; ShuffleVectorInst::getShuffleMask(Op2, Mask); Value *V = ConstantExpr::getShuffleVector(Op0, Op1, Mask); - ValueList.assignValue(CstNo, V); + ValueList.assignValue(CstNo, V, TODOTypeID); } for (auto &DelayedSelector : DelayedSelectors) { Type *OpTy = DelayedSelector.OpTy; + unsigned OpTyID = DelayedSelector.OpTyID; Type *SelectorTy = Type::getInt1Ty(Context); uint64_t Op0Idx = DelayedSelector.Op0Idx; uint64_t Op1Idx = DelayedSelector.Op1Idx; uint64_t Op2Idx = DelayedSelector.Op2Idx; uint64_t CstNo = DelayedSelector.CstNo; - Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy); - Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, OpTy); + Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy, OpTyID); + Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, OpTy, OpTyID); // The selector might be an i1 or an // Get the type from the ValueList before getting a forward ref. 
if (VectorType *VTy = dyn_cast(OpTy)) { @@ -2473,9 +2484,10 @@ Error BitcodeReader::parseConstants() { if (SelectorTy != V->getType()) SelectorTy = VectorType::get(SelectorTy, VTy->getElementCount()); } - Constant *Op0 = ValueList.getConstantFwdRef(Op0Idx, SelectorTy); + Constant *Op0 = + ValueList.getConstantFwdRef(Op0Idx, SelectorTy, TODOTypeID); Value *V = ConstantExpr::getSelect(Op0, Op1, Op2); - ValueList.assignValue(CstNo, V); + ValueList.assignValue(CstNo, V, TODOTypeID); } if (NextCstNo != ValueList.size()) @@ -2510,8 +2522,9 @@ Error BitcodeReader::parseConstants() { return error("Invalid record"); if (TypeList[Record[0]] == VoidType) return error("Invalid constant type"); - CurTy = TypeList[Record[0]]; - CurElemTy = getPtrElementTypeByID(Record[0]); + CurTyID = Record[0]; + CurTy = TypeList[CurTyID]; + CurElemTy = getPtrElementTypeByID(CurTyID); continue; // Skip the ValueList manipulation. case bitc::CST_CODE_NULL: // NULL if (CurTy->isVoidTy() || CurTy->isFunctionTy() || CurTy->isLabelTy()) @@ -2575,18 +2588,23 @@ Error BitcodeReader::parseConstants() { if (StructType *STy = dyn_cast(CurTy)) { for (unsigned i = 0; i != Size; ++i) - Elts.push_back(ValueList.getConstantFwdRef(Record[i], - STy->getElementType(i))); + Elts.push_back(ValueList.getConstantFwdRef( + Record[i], STy->getElementType(i), + getContainedTypeID(CurTyID, i))); V = ConstantStruct::get(STy, Elts); } else if (ArrayType *ATy = dyn_cast(CurTy)) { Type *EltTy = ATy->getElementType(); + unsigned EltTyID = getContainedTypeID(CurTyID); for (unsigned i = 0; i != Size; ++i) - Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy)); + Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy, + EltTyID)); V = ConstantArray::get(ATy, Elts); } else if (VectorType *VTy = dyn_cast(CurTy)) { Type *EltTy = VTy->getElementType(); + unsigned EltTyID = getContainedTypeID(CurTyID); for (unsigned i = 0; i != Size; ++i) - Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy)); + 
Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy, + EltTyID)); V = ConstantVector::get(Elts); } else { V = UndefValue::get(CurTy); @@ -2672,7 +2690,7 @@ Error BitcodeReader::parseConstants() { if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown unop. } else { - Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); + Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy, CurTyID); unsigned Flags = 0; V = ConstantExpr::get(Opc, LHS, Flags); } @@ -2685,8 +2703,8 @@ Error BitcodeReader::parseConstants() { if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown binop. } else { - Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy); - Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy); + Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy, CurTyID); + Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy, CurTyID); unsigned Flags = 0; if (Record.size() >= 4) { if (Opc == Instruction::Add || @@ -2716,10 +2734,11 @@ Error BitcodeReader::parseConstants() { if (Opc < 0) { V = UndefValue::get(CurTy); // Unknown cast. 
} else { - Type *OpTy = getTypeByID(Record[1]); + unsigned OpTyID = Record[1]; + Type *OpTy = getTypeByID(OpTyID); if (!OpTy) return error("Invalid record"); - Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy); + Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy, OpTyID); V = UpgradeBitCastExpr(Opc, Op, CurTy); if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy); } @@ -2749,10 +2768,12 @@ Error BitcodeReader::parseConstants() { SmallVector Elts; unsigned BaseTypeID = Record[OpNum]; while (OpNum != Record.size()) { - Type *ElTy = getTypeByID(Record[OpNum++]); + unsigned ElTyID = Record[OpNum++]; + Type *ElTy = getTypeByID(ElTyID); if (!ElTy) return error("Invalid record"); - Elts.push_back(ValueList.getConstantFwdRef(Record[OpNum++], ElTy)); + Elts.push_back(ValueList.getConstantFwdRef(Record[OpNum++], ElTy, + ElTyID)); } if (Elts.size() < 1) @@ -2786,8 +2807,8 @@ Error BitcodeReader::parseConstants() { return error("Invalid record"); DelayedSelectors.push_back( - {CurTy, Record[0], Record[1], Record[2], NextCstNo}); - (void)ValueList.getConstantFwdRef(NextCstNo, CurTy); + {CurTy, CurTyID, Record[0], Record[1], Record[2], NextCstNo}); + (void)ValueList.getConstantFwdRef(NextCstNo, CurTy, CurTyID); ++NextCstNo; continue; } @@ -2795,20 +2816,23 @@ Error BitcodeReader::parseConstants() { : { // CE_EXTRACTELT: [opty, opval, opty, opval] if (Record.size() < 3) return error("Invalid record"); + unsigned OpTyID = Record[0]; VectorType *OpTy = - dyn_cast_or_null(getTypeByID(Record[0])); + dyn_cast_or_null(getTypeByID(OpTyID)); if (!OpTy) return error("Invalid record"); - Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); + Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy, OpTyID); Constant *Op1 = nullptr; if (Record.size() == 4) { - Type *IdxTy = getTypeByID(Record[2]); + unsigned IdxTyID = Record[2]; + Type *IdxTy = getTypeByID(IdxTyID); if (!IdxTy) return error("Invalid record"); - Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy); 
+ Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy, IdxTyID); } else { // Deprecated, but still needed to read old bitcode files. - Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); + Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context), + TODOTypeID); } if (!Op1) return error("Invalid record"); @@ -2820,18 +2844,21 @@ Error BitcodeReader::parseConstants() { VectorType *OpTy = dyn_cast(CurTy); if (Record.size() < 3 || !OpTy) return error("Invalid record"); - Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); + Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy, CurTyID); Constant *Op1 = ValueList.getConstantFwdRef(Record[1], - OpTy->getElementType()); + OpTy->getElementType(), + getContainedTypeID(CurTyID)); Constant *Op2 = nullptr; if (Record.size() == 4) { - Type *IdxTy = getTypeByID(Record[2]); + unsigned IdxTyID = Record[2]; + Type *IdxTy = getTypeByID(IdxTyID); if (!IdxTy) return error("Invalid record"); - Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy); + Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy, IdxTyID); } else { // Deprecated, but still needed to read old bitcode files. 
- Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); + Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context), + TODOTypeID); } if (!Op2) return error("Invalid record"); @@ -2843,7 +2870,7 @@ Error BitcodeReader::parseConstants() { if (Record.size() < 3 || !OpTy) return error("Invalid record"); DelayedShuffles.push_back( - {OpTy, OpTy, Record[0], Record[1], Record[2], NextCstNo}); + {OpTy, CurTyID, OpTy, Record[0], Record[1], Record[2], NextCstNo}); ++NextCstNo; continue; } @@ -2854,18 +2881,19 @@ Error BitcodeReader::parseConstants() { if (Record.size() < 4 || !RTy || !OpTy) return error("Invalid record"); DelayedShuffles.push_back( - {OpTy, RTy, Record[1], Record[2], Record[3], NextCstNo}); + {OpTy, CurTyID, RTy, Record[1], Record[2], Record[3], NextCstNo}); ++NextCstNo; continue; } case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred] if (Record.size() < 4) return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); if (!OpTy) return error("Invalid record"); - Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); - Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy); + Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy, OpTyID); + Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy, OpTyID); if (OpTy->isFPOrFPVectorTy()) V = ConstantExpr::getFCmp(Record[3], Op0, Op1); @@ -2995,11 +3023,12 @@ Error BitcodeReader::parseConstants() { case bitc::CST_CODE_BLOCKADDRESS:{ if (Record.size() < 3) return error("Invalid record"); - Type *FnTy = getTypeByID(Record[0]); + unsigned FnTyID = Record[0]; + Type *FnTy = getTypeByID(FnTyID); if (!FnTy) return error("Invalid record"); - Function *Fn = - dyn_cast_or_null(ValueList.getConstantFwdRef(Record[1],FnTy)); + Function *Fn = dyn_cast_or_null( + ValueList.getConstantFwdRef(Record[1], FnTy, FnTyID)); if (!Fn) return error("Invalid record"); @@ -3036,11 +3065,12 @@ 
Error BitcodeReader::parseConstants() { case bitc::CST_CODE_DSO_LOCAL_EQUIVALENT: { if (Record.size() < 2) return error("Invalid record"); - Type *GVTy = getTypeByID(Record[0]); + unsigned GVTyID = Record[0]; + Type *GVTy = getTypeByID(GVTyID); if (!GVTy) return error("Invalid record"); GlobalValue *GV = dyn_cast_or_null( - ValueList.getConstantFwdRef(Record[1], GVTy)); + ValueList.getConstantFwdRef(Record[1], GVTy, GVTyID)); if (!GV) return error("Invalid record"); @@ -3050,11 +3080,12 @@ Error BitcodeReader::parseConstants() { case bitc::CST_CODE_NO_CFI_VALUE: { if (Record.size() < 2) return error("Invalid record"); - Type *GVTy = getTypeByID(Record[0]); + unsigned GVTyID = Record[0]; + Type *GVTy = getTypeByID(GVTyID); if (!GVTy) return error("Invalid record"); GlobalValue *GV = dyn_cast_or_null( - ValueList.getConstantFwdRef(Record[1], GVTy)); + ValueList.getConstantFwdRef(Record[1], GVTy, GVTyID)); if (!GV) return error("Invalid record"); V = NoCFIValue::get(GV); @@ -3062,7 +3093,7 @@ Error BitcodeReader::parseConstants() { } } - ValueList.assignValue(NextCstNo, V); + ValueList.assignValue(NextCstNo, V, CurTyID); ++NextCstNo; } } @@ -3341,7 +3372,8 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { if (Record.size() < 6) return error("Invalid record"); - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (!Ty) return error("Invalid record"); bool isConstant = Record[1] & 1; @@ -3401,7 +3433,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { else upgradeDLLImportExportLinkage(NewGV, RawLinkage); - ValueList.push_back(NewGV); + ValueList.push_back(NewGV, TyID); // Remember which value to use for the global initializer. 
if (unsigned InitID = Record[2]) @@ -3579,7 +3611,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) { Func->setPartition(StringRef(Strtab.data() + Record[17], Record[18])); } - ValueList.push_back(Func); + ValueList.push_back(Func, TODOTypeID); if (OperandInfo.PersonalityFn || OperandInfo.Prefix || OperandInfo.Prologue) FunctionOperands.push_back(OperandInfo); @@ -3669,7 +3701,7 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord( OpNum += 2; } - ValueList.push_back(NewGA); + ValueList.push_back(NewGA, TypeID); IndirectSymbolInits.push_back(std::make_pair(NewGA, Val)); return Error::success(); } @@ -4067,7 +4099,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { for (Argument &I : F->args()) { assert(I.getType() == FTy->getParamType(ArgNo++) && "Incorrect fully specified type for Function Argument"); - ValueList.push_back(&I); + ValueList.push_back(&I, TODOTypeID); } unsigned NextValueNo = ValueList.size(); BasicBlock *CurBB = nullptr; @@ -4140,6 +4172,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // Read a record. 
Record.clear(); Instruction *I = nullptr; + unsigned ResTypeID = InvalidTypeID; Expected MaybeBitCode = Stream.readRecord(Entry.ID, Record); if (!MaybeBitCode) return MaybeBitCode.takeError(); @@ -4223,7 +4256,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_UNOP: { // UNOP: [opval, ty, opcode] unsigned OpNum = 0; Value *LHS; - if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || + unsigned TypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, LHS, TypeID) || OpNum+1 > Record.size()) return error("Invalid record"); @@ -4231,6 +4265,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Opc == -1) return error("Invalid record"); I = UnaryOperator::Create((Instruction::UnaryOps)Opc, LHS); + ResTypeID = TypeID; InstructionList.push_back(I); if (OpNum < Record.size()) { if (isa(I)) { @@ -4244,8 +4279,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode] unsigned OpNum = 0; Value *LHS, *RHS; - if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || - popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) || + unsigned TypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, LHS, TypeID) || + popValue(Record, OpNum, NextValueNo, LHS->getType(), TypeID, RHS) || OpNum+1 > Record.size()) return error("Invalid record"); @@ -4253,6 +4289,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Opc == -1) return error("Invalid record"); I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); + ResTypeID = TypeID; InstructionList.push_back(I); if (OpNum < Record.size()) { if (Opc == Instruction::Add || @@ -4281,11 +4318,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_CAST: { // CAST: [opval, opty, destty, castopc] unsigned OpNum = 0; Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op) || + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID) || 
OpNum+2 != Record.size()) return error("Invalid record"); - Type *ResTy = getTypeByID(Record[OpNum]); + ResTypeID = Record[OpNum]; + Type *ResTy = getTypeByID(ResTypeID); int Opc = getDecodedCastOpcode(Record[OpNum + 1]); if (Opc == -1 || !ResTy) return error("Invalid record"); @@ -4322,7 +4361,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } Value *BasePtr; - if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr)) + unsigned BasePtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr, BasePtrTypeID)) return error("Invalid record"); if (!Ty) { @@ -4336,12 +4376,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { SmallVector GEPIdx; while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID)) return error("Invalid record"); GEPIdx.push_back(Op); } I = GetElementPtrInst::Create(Ty, BasePtr, GEPIdx); + ResTypeID = TODOTypeID; InstructionList.push_back(I); if (InBounds) @@ -4353,7 +4395,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // EXTRACTVAL: [opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; - if (getValueTypePair(Record, OpNum, NextValueNo, Agg)) + unsigned AggTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Agg, AggTypeID)) return error("Invalid record"); Type *Ty = Agg->getType(); @@ -4362,6 +4405,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("EXTRACTVAL: Invalid instruction with 0 indices"); SmallVector EXTRACTVALIdx; + ResTypeID = AggTypeID; for (; OpNum != RecSize; ++OpNum) { bool IsArray = Ty->isArrayTy(); bool IsStruct = Ty->isStructTy(); @@ -4377,10 +4421,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("EXTRACTVAL: Invalid array index"); EXTRACTVALIdx.push_back((unsigned)Index); - if (IsStruct) + if (IsStruct) { Ty = Ty->getStructElementType(Index); - else + ResTypeID = getContainedTypeID(ResTypeID, Index); + } 
else { Ty = Ty->getArrayElementType(); + ResTypeID = getContainedTypeID(ResTypeID); + } } I = ExtractValueInst::Create(Agg, EXTRACTVALIdx); @@ -4392,10 +4439,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // INSERTVAL: [opty, opval, opty, opval, n x indices] unsigned OpNum = 0; Value *Agg; - if (getValueTypePair(Record, OpNum, NextValueNo, Agg)) + unsigned AggTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Agg, AggTypeID)) return error("Invalid record"); Value *Val; - if (getValueTypePair(Record, OpNum, NextValueNo, Val)) + unsigned ValTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID)) return error("Invalid record"); unsigned RecSize = Record.size(); @@ -4429,6 +4478,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Inserted value type doesn't match aggregate type"); I = InsertValueInst::Create(Agg, Val, INSERTVALIdx); + ResTypeID = AggTypeID; InstructionList.push_back(I); break; } @@ -4438,12 +4488,16 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // handles select i1 ... 
in old bitcode unsigned OpNum = 0; Value *TrueVal, *FalseVal, *Cond; - if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) || - popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) || - popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond)) + unsigned TypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal, TypeID) || + popValue(Record, OpNum, NextValueNo, TrueVal->getType(), TypeID, + FalseVal) || + popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), + TODOTypeID, Cond)) return error("Invalid record"); I = SelectInst::Create(Cond, TrueVal, FalseVal); + ResTypeID = TypeID; InstructionList.push_back(I); break; } @@ -4453,9 +4507,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // handles select i1 or select [N x i1] unsigned OpNum = 0; Value *TrueVal, *FalseVal, *Cond; - if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) || - popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) || - getValueTypePair(Record, OpNum, NextValueNo, Cond)) + unsigned ValTypeID, CondTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal, ValTypeID) || + popValue(Record, OpNum, NextValueNo, TrueVal->getType(), ValTypeID, + FalseVal) || + getValueTypePair(Record, OpNum, NextValueNo, Cond, CondTypeID)) return error("Invalid record"); // select condition can be either i1 or [N x i1] @@ -4471,6 +4527,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } I = SelectInst::Create(Cond, TrueVal, FalseVal); + ResTypeID = ValTypeID; InstructionList.push_back(I); if (OpNum < Record.size() && isa(I)) { FastMathFlags FMF = getDecodedFastMathFlags(Record[OpNum]); @@ -4483,12 +4540,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_EXTRACTELT: { // EXTRACTELT: [opty, opval, opval] unsigned OpNum = 0; Value *Vec, *Idx; - if (getValueTypePair(Record, OpNum, NextValueNo, Vec) || - getValueTypePair(Record, OpNum, NextValueNo, Idx)) + unsigned VecTypeID, IdxTypeID; + if 
(getValueTypePair(Record, OpNum, NextValueNo, Vec, VecTypeID) || + getValueTypePair(Record, OpNum, NextValueNo, Idx, IdxTypeID)) return error("Invalid record"); if (!Vec->getType()->isVectorTy()) return error("Invalid type for value"); I = ExtractElementInst::Create(Vec, Idx); + ResTypeID = getContainedTypeID(VecTypeID); InstructionList.push_back(I); break; } @@ -4496,15 +4555,18 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_INSERTELT: { // INSERTELT: [ty, opval,opval,opval] unsigned OpNum = 0; Value *Vec, *Elt, *Idx; - if (getValueTypePair(Record, OpNum, NextValueNo, Vec)) + unsigned VecTypeID, IdxTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Vec, VecTypeID)) return error("Invalid record"); if (!Vec->getType()->isVectorTy()) return error("Invalid type for value"); if (popValue(Record, OpNum, NextValueNo, - cast(Vec->getType())->getElementType(), Elt) || - getValueTypePair(Record, OpNum, NextValueNo, Idx)) + cast(Vec->getType())->getElementType(), + getContainedTypeID(VecTypeID), Elt) || + getValueTypePair(Record, OpNum, NextValueNo, Idx, IdxTypeID)) return error("Invalid record"); I = InsertElementInst::Create(Vec, Elt, Idx); + ResTypeID = VecTypeID; InstructionList.push_back(I); break; } @@ -4512,16 +4574,20 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_SHUFFLEVEC: {// SHUFFLEVEC: [opval,ty,opval,opval] unsigned OpNum = 0; Value *Vec1, *Vec2, *Mask; - if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) || - popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2)) + unsigned Vec1TypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Vec1, Vec1TypeID) || + popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec1TypeID, + Vec2)) return error("Invalid record"); - if (getValueTypePair(Record, OpNum, NextValueNo, Mask)) + unsigned MaskTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Mask, MaskTypeID)) return error("Invalid record"); if 
(!Vec1->getType()->isVectorTy() || !Vec2->getType()->isVectorTy()) return error("Invalid type for value"); I = new ShuffleVectorInst(Vec1, Vec2, Mask); + ResTypeID = TODOTypeID; InstructionList.push_back(I); break; } @@ -4535,8 +4601,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 0; Value *LHS, *RHS; - if (getValueTypePair(Record, OpNum, NextValueNo, LHS) || - popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS)) + unsigned LHSTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, LHS, LHSTypeID) || + popValue(Record, OpNum, NextValueNo, LHS->getType(), LHSTypeID, RHS)) return error("Invalid record"); if (OpNum >= Record.size()) @@ -4556,6 +4623,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = new FCmpInst((FCmpInst::Predicate)PredVal, LHS, RHS); else I = new ICmpInst((ICmpInst::Predicate)PredVal, LHS, RHS); + ResTypeID = TODOTypeID; if (FMF.any()) I->setFastMathFlags(FMF); @@ -4574,7 +4642,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 0; Value *Op = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID)) return error("Invalid record"); if (OpNum != Record.size()) return error("Invalid record"); @@ -4597,7 +4666,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { else { BasicBlock *FalseDest = getBasicBlock(Record[1]); Value *Cond = getValue(Record, 2, NextValueNo, - Type::getInt1Ty(Context)); + Type::getInt1Ty(Context), TODOTypeID); if (!FalseDest || !Cond) return error("Invalid record"); I = BranchInst::Create(TrueDest, FalseDest, Cond); @@ -4609,8 +4678,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() != 1 && Record.size() != 2) return error("Invalid record"); unsigned Idx = 0; - Value *CleanupPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Value *CleanupPad = getValue( + Record, Idx++, NextValueNo, Type::getTokenTy(Context), 
TODOTypeID); if (!CleanupPad) return error("Invalid record"); BasicBlock *UnwindDest = nullptr; @@ -4628,8 +4697,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() != 2) return error("Invalid record"); unsigned Idx = 0; - Value *CatchPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Value *CatchPad = getValue( + Record, Idx++, NextValueNo, Type::getTokenTy(Context), TODOTypeID); if (!CatchPad) return error("Invalid record"); BasicBlock *BB = getBasicBlock(Record[Idx++]); @@ -4647,8 +4716,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned Idx = 0; - Value *ParentPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Value *ParentPad = getValue( + Record, Idx++, NextValueNo, Type::getTokenTy(Context), TODOTypeID); unsigned NumHandlers = Record[Idx++]; @@ -4675,6 +4744,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { for (BasicBlock *Handler : Handlers) CatchSwitch->addHandler(Handler); I = CatchSwitch; + ResTypeID = TODOTypeID; InstructionList.push_back(I); break; } @@ -4686,15 +4756,16 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned Idx = 0; - Value *ParentPad = - getValue(Record, Idx++, NextValueNo, Type::getTokenTy(Context)); + Value *ParentPad = getValue( + Record, Idx++, NextValueNo, Type::getTokenTy(Context), TODOTypeID); unsigned NumArgOperands = Record[Idx++]; SmallVector Args; for (unsigned Op = 0; Op != NumArgOperands; ++Op) { Value *Val; - if (getValueTypePair(Record, Idx, NextValueNo, Val)) + unsigned ValTypeID; + if (getValueTypePair(Record, Idx, NextValueNo, Val, ValTypeID)) return error("Invalid record"); Args.push_back(Val); } @@ -4706,6 +4777,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = CleanupPadInst::Create(ParentPad, Args); else I = CatchPadInst::Create(ParentPad, Args); + ResTypeID = TODOTypeID; InstructionList.push_back(I); break; } @@ -4717,10 +4789,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // 
Hopefully someday we will have support for case ranges and can use // this format again. - Type *OpTy = getTypeByID(Record[1]); + unsigned OpTyID = Record[1]; + Type *OpTy = getTypeByID(OpTyID); unsigned ValueBitWidth = cast(OpTy)->getBitWidth(); - Value *Cond = getValue(Record, 2, NextValueNo, OpTy); + Value *Cond = getValue(Record, 2, NextValueNo, OpTy, OpTyID); BasicBlock *Default = getBasicBlock(Record[3]); if (!OpTy || !Cond || !Default) return error("Invalid record"); @@ -4774,8 +4847,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() < 3 || (Record.size() & 1) == 0) return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); - Value *Cond = getValue(Record, 1, NextValueNo, OpTy); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); + Value *Cond = getValue(Record, 1, NextValueNo, OpTy, OpTyID); BasicBlock *Default = getBasicBlock(Record[2]); if (!OpTy || !Cond || !Default) return error("Invalid record"); @@ -4783,8 +4857,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); InstructionList.push_back(SI); for (unsigned i = 0, e = NumCases; i != e; ++i) { - ConstantInt *CaseVal = - dyn_cast_or_null(getFnValueByID(Record[3+i*2], OpTy)); + ConstantInt *CaseVal = dyn_cast_or_null( + getFnValueByID(Record[3+i*2], OpTy, OpTyID)); BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]); if (!CaseVal || !DestBB) { delete SI; @@ -4798,8 +4872,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...] 
if (Record.size() < 2) return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); - Value *Address = getValue(Record, 1, NextValueNo, OpTy); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); + Value *Address = getValue(Record, 1, NextValueNo, OpTy, OpTyID); if (!OpTy || !Address) return error("Invalid record"); unsigned NumDests = Record.size()-2; @@ -4835,7 +4910,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } Value *Callee; - if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) + unsigned CalleeTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Callee, CalleeTypeID)) return error("Invalid record"); PointerType *CalleeTy = dyn_cast(Callee->getType()); @@ -4856,7 +4932,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { SmallVector ArgsTys; for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { Ops.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); + FTy->getParamType(i), TODOTypeID)); ArgsTys.push_back(FTy->getParamType(i)); if (!Ops.back()) return error("Invalid record"); @@ -4869,7 +4945,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // Read type/value pairs for varargs params. 
while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID)) return error("Invalid record"); Ops.push_back(Op); ArgsTys.push_back(Op->getType()); @@ -4878,6 +4955,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = InvokeInst::Create(FTy, Callee, NormalBB, UnwindBB, Ops, OperandBundles); + ResTypeID = TODOTypeID; OperandBundles.clear(); InstructionList.push_back(I); cast(I)->setCallingConv( @@ -4890,7 +4968,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_RESUME: { // RESUME: [opval] unsigned Idx = 0; Value *Val = nullptr; - if (getValueTypePair(Record, Idx, NextValueNo, Val)) + unsigned ValTypeID; + if (getValueTypePair(Record, Idx, NextValueNo, Val, ValTypeID)) return error("Invalid record"); I = ResumeInst::Create(Val); InstructionList.push_back(I); @@ -4916,7 +4995,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } Value *Callee; - if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) + unsigned CalleeTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Callee, CalleeTypeID)) return error("Invalid record"); PointerType *OpTy = dyn_cast(Callee->getType()); @@ -4941,7 +5021,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (FTy->getParamType(i)->isLabelTy()) Arg = getBasicBlock(Record[OpNum]); else - Arg = getValue(Record, OpNum, NextValueNo, FTy->getParamType(i)); + Arg = getValue(Record, OpNum, NextValueNo, FTy->getParamType(i), + TODOTypeID); if (!Arg) return error("Invalid record"); Args.push_back(Arg); @@ -4955,7 +5036,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } else { while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID)) return error("Invalid record"); Args.push_back(Op); ArgsTys.push_back(Op->getType()); @@ 
-4964,6 +5046,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = CallBrInst::Create(FTy, Callee, DefaultDest, IndirectDests, Args, OperandBundles); + ResTypeID = TODOTypeID; OperandBundles.clear(); InstructionList.push_back(I); cast(I)->setCallingConv( @@ -4980,7 +5063,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.empty()) return error("Invalid record"); // The first record specifies the type. - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (!Ty) return error("Invalid record"); @@ -4999,15 +5083,16 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // negative IDs (for forward references). Use a signed VBR // representation to keep the encoding small. if (UseRelativeIDs) - V = getValueSigned(Record, i * 2 + 1, NextValueNo, Ty); + V = getValueSigned(Record, i * 2 + 1, NextValueNo, Ty, TyID); else - V = getValue(Record, i * 2 + 1, NextValueNo, Ty); + V = getValue(Record, i * 2 + 1, NextValueNo, Ty, TyID); BasicBlock *BB = getBasicBlock(Record[i * 2 + 2]); if (!V || !BB) return error("Invalid record"); PN->addIncoming(V, BB); } I = PN; + ResTypeID = TyID; // If there are an even number of records, the final record must be FMF. 
if (Record.size() % 2 == 0) { @@ -5032,12 +5117,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (Record.size() < 4) return error("Invalid record"); } - Type *Ty = getTypeByID(Record[Idx++]); + ResTypeID = Record[Idx++]; + Type *Ty = getTypeByID(ResTypeID); if (!Ty) return error("Invalid record"); if (BitCode == bitc::FUNC_CODE_INST_LANDINGPAD_OLD) { Value *PersFn = nullptr; - if (getValueTypePair(Record, Idx, NextValueNo, PersFn)) + unsigned PersFnTypeID; + if (getValueTypePair(Record, Idx, NextValueNo, PersFn, PersFnTypeID)) return error("Invalid record"); if (!F->hasPersonalityFn()) @@ -5054,8 +5141,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { LandingPadInst::ClauseType CT = LandingPadInst::ClauseType(Record[Idx++]); (void)CT; Value *Val; + unsigned ValTypeID; - if (getValueTypePair(Record, Idx, NextValueNo, Val)) { + if (getValueTypePair(Record, Idx, NextValueNo, Val, ValTypeID)) { delete LP; return error("Invalid record"); } @@ -5087,8 +5175,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if (!Ty) return error("Missing element type for old-style alloca"); } - Type *OpTy = getTypeByID(Record[1]); - Value *Size = getFnValueByID(Record[2], OpTy); + unsigned OpTyID = Record[1]; + Type *OpTy = getTypeByID(OpTyID); + Value *Size = getFnValueByID(Record[2], OpTy, OpTyID); MaybeAlign Align; uint64_t AlignExp = Bitfield::get(Rec) | @@ -5113,13 +5202,15 @@ Error BitcodeReader::parseFunctionBody(Function *F) { AI->setUsedWithInAlloca(InAlloca); AI->setSwiftError(SwiftError); I = AI; + ResTypeID = TODOTypeID; InstructionList.push_back(I); break; } case bitc::FUNC_CODE_INST_LOAD: { // LOAD: [opty, op, align, vol] unsigned OpNum = 0; Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op) || + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID) || (OpNum + 2 != Record.size() && OpNum + 3 != Record.size())) return error("Invalid record"); @@ -5128,8 +5219,10 @@ Error 
BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 3 == Record.size()) { - Ty = getTypeByID(Record[OpNum++]); + ResTypeID = Record[OpNum++]; + Ty = getTypeByID(ResTypeID); } else { + ResTypeID = getContainedTypeID(OpTypeID); Ty = Op->getType()->getPointerElementType(); } @@ -5152,7 +5245,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // LOADATOMIC: [opty, op, align, vol, ordering, ssid] unsigned OpNum = 0; Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op) || + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID) || (OpNum + 4 != Record.size() && OpNum + 5 != Record.size())) return error("Invalid record"); @@ -5161,8 +5255,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Type *Ty = nullptr; if (OpNum + 5 == Record.size()) { - Ty = getTypeByID(Record[OpNum++]); + ResTypeID = Record[OpNum++]; + Ty = getTypeByID(ResTypeID); } else { + ResTypeID = getContainedTypeID(OpTypeID); Ty = Op->getType()->getPointerElementType(); } @@ -5191,11 +5287,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol] unsigned OpNum = 0; Value *Val, *Ptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || + unsigned PtrTypeID, ValTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID) || (BitCode == bitc::FUNC_CODE_INST_STORE - ? getValueTypePair(Record, OpNum, NextValueNo, Val) + ? 
getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID) : popValue(Record, OpNum, NextValueNo, - Ptr->getType()->getPointerElementType(), Val)) || + Ptr->getType()->getPointerElementType(), + getContainedTypeID(PtrTypeID), Val)) || OpNum + 2 != Record.size()) return error("Invalid record"); @@ -5218,12 +5316,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // STOREATOMIC: [ptrty, ptr, val, align, vol, ordering, ssid] unsigned OpNum = 0; Value *Val, *Ptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) || + unsigned PtrTypeID, ValTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID) || !isa(Ptr->getType()) || (BitCode == bitc::FUNC_CODE_INST_STOREATOMIC - ? getValueTypePair(Record, OpNum, NextValueNo, Val) + ? getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID) : popValue(Record, OpNum, NextValueNo, - Ptr->getType()->getPointerElementType(), Val)) || + Ptr->getType()->getPointerElementType(), + getContainedTypeID(PtrTypeID), Val)) || OpNum + 4 != Record.size()) return error("Invalid record"); @@ -5253,20 +5353,23 @@ Error BitcodeReader::parseFunctionBody(Function *F) { const size_t NumRecords = Record.size(); unsigned OpNum = 0; Value *Ptr = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr)) + unsigned PtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID)) return error("Invalid record"); if (!isa(Ptr->getType())) return error("Cmpxchg operand is not a pointer type"); Value *Cmp = nullptr; + unsigned CmpTypeID = getContainedTypeID(PtrTypeID); if (popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getPointerElementType(), - Cmp)) + CmpTypeID, Cmp)) return error("Invalid record"); Value *New = nullptr; - if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), New) || + if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), CmpTypeID, + New) || NumRecords < OpNum + 3 || NumRecords > OpNum + 5) return error("Invalid record"); @@ -5295,6 +5398,7 @@ Error 
BitcodeReader::parseFunctionBody(Function *F) { I = new AtomicCmpXchgInst(Ptr, Cmp, New, Alignment, SuccessOrdering, FailureOrdering, SSID); + ResTypeID = TODOTypeID; cast(I)->setVolatile(Record[OpNum]); if (NumRecords < 8) { @@ -5303,6 +5407,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { // expecting the first component of a modern cmpxchg. CurBB->getInstList().push_back(I); I = ExtractValueInst::Create(I, 0); + ResTypeID = TODOTypeID; } else { cast(I)->setWeak(Record[OpNum + 4]); } @@ -5316,18 +5421,20 @@ Error BitcodeReader::parseFunctionBody(Function *F) { const size_t NumRecords = Record.size(); unsigned OpNum = 0; Value *Ptr = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr)) + unsigned PtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID)) return error("Invalid record"); if (!isa(Ptr->getType())) return error("Cmpxchg operand is not a pointer type"); Value *Cmp = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Cmp)) + unsigned CmpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Cmp, CmpTypeID)) return error("Invalid record"); Value *Val = nullptr; - if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), Val)) + if (popValue(Record, OpNum, NextValueNo, Cmp->getType(), CmpTypeID, Val)) return error("Invalid record"); if (NumRecords < OpNum + 3 || NumRecords > OpNum + 6) @@ -5366,6 +5473,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { FailureOrdering, SSID); cast(I)->setVolatile(IsVol); cast(I)->setWeak(IsWeak); + ResTypeID = TODOTypeID; InstructionList.push_back(I); break; @@ -5378,7 +5486,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 0; Value *Ptr = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Ptr)) + unsigned PtrTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Ptr, PtrTypeID)) return error("Invalid record"); if (!isa(Ptr->getType())) @@ -5388,10 +5497,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) { if 
(BitCode == bitc::FUNC_CODE_INST_ATOMICRMW_OLD) { if (popValue(Record, OpNum, NextValueNo, cast(Ptr->getType())->getPointerElementType(), - Val)) + getContainedTypeID(PtrTypeID), Val)) return error("Invalid record"); } else { - if (getValueTypePair(Record, OpNum, NextValueNo, Val)) + unsigned ValTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Val, ValTypeID)) return error("Invalid record"); } @@ -5425,6 +5535,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Align(TheModule->getDataLayout().getTypeStoreSize(Val->getType())); I = new AtomicRMWInst(Operation, Ptr, Val, *Alignment, Ordering, SSID); + ResTypeID = TODOTypeID; cast(I)->setVolatile(IsVol); InstructionList.push_back(I); @@ -5467,7 +5578,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } Value *Callee; - if (getValueTypePair(Record, OpNum, NextValueNo, Callee)) + unsigned CalleeTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Callee, CalleeTypeID)) return error("Invalid record"); PointerType *OpTy = dyn_cast(Callee->getType()); @@ -5492,7 +5604,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { Args.push_back(getBasicBlock(Record[OpNum])); else Args.push_back(getValue(Record, OpNum, NextValueNo, - FTy->getParamType(i))); + FTy->getParamType(i), TODOTypeID)); ArgsTys.push_back(FTy->getParamType(i)); if (!Args.back()) return error("Invalid record"); @@ -5505,7 +5617,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } else { while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID)) return error("Invalid record"); Args.push_back(Op); ArgsTys.push_back(Op->getType()); @@ -5513,6 +5626,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } I = CallInst::Create(FTy, Callee, Args, OperandBundles); + ResTypeID = TODOTypeID; OperandBundles.clear(); InstructionList.push_back(I); cast(I)->setCallingConv( @@ -5538,9 +5652,11 @@ 
Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty] if (Record.size() < 3) return error("Invalid record"); - Type *OpTy = getTypeByID(Record[0]); - Value *Op = getValue(Record, 1, NextValueNo, OpTy); - Type *ResTy = getTypeByID(Record[2]); + unsigned OpTyID = Record[0]; + Type *OpTy = getTypeByID(OpTyID); + Value *Op = getValue(Record, 1, NextValueNo, OpTy, OpTyID); + ResTypeID = Record[2]; + Type *ResTy = getTypeByID(ResTypeID); if (!OpTy || !Op || !ResTy) return error("Invalid record"); I = new VAArgInst(Op, ResTy); @@ -5561,7 +5677,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 1; while (OpNum != Record.size()) { Value *Op; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID)) return error("Invalid record"); Inputs.push_back(Op); } @@ -5573,12 +5690,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) { case bitc::FUNC_CODE_INST_FREEZE: { // FREEZE: [opty,opval] unsigned OpNum = 0; Value *Op = nullptr; - if (getValueTypePair(Record, OpNum, NextValueNo, Op)) + unsigned OpTypeID; + if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID)) return error("Invalid record"); if (OpNum != Record.size()) return error("Invalid record"); I = new FreezeInst(Op); + ResTypeID = OpTypeID; InstructionList.push_back(I); break; } @@ -5603,8 +5722,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) { } // Non-void values get registered in the value table for future use. 
- if (!I->getType()->isVoidTy()) - ValueList.assignValue(NextValueNo++, I); + if (!I->getType()->isVoidTy()) { + assert(ResTypeID != InvalidTypeID && "Should have ID for non-void type"); + ValueList.assignValue(NextValueNo++, I, ResTypeID); + } } OutOfRecordLoop: diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 1955443e73db6..fb5491a4409b3 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1219,14 +1219,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (Ty->isMetadataTy() || Ty->isVoidTy()) { dropRecord(); break; } MetadataList.assignValue( - LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), + LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty, TyID)), NextMetadataNo); NextMetadataNo++; break; @@ -1239,14 +1240,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( unsigned Size = Record.size(); SmallVector Elts; for (unsigned i = 0; i != Size; i += 2) { - Type *Ty = getTypeByID(Record[i]); + unsigned TyID = Record[i]; + Type *Ty = getTypeByID(TyID); if (!Ty) return error("Invalid record"); if (Ty->isMetadataTy()) Elts.push_back(getMD(Record[i + 1])); else if (!Ty->isVoidTy()) { - auto *MD = - ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty)); + auto *MD = ValueAsMetadata::get( + ValueList.getValueFwdRef(Record[i + 1], Ty, TyID)); assert(isa(MD) && "Expected non-function-local metadata"); Elts.push_back(MD); @@ -1261,12 +1263,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( if (Record.size() != 2) return error("Invalid record"); - Type *Ty = getTypeByID(Record[0]); + unsigned TyID = Record[0]; + Type *Ty = getTypeByID(TyID); if (Ty->isMetadataTy() || Ty->isVoidTy()) return error("Invalid record"); MetadataList.assignValue( - 
ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)), + ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty, TyID)), NextMetadataNo); NextMetadataNo++; break; diff --git a/llvm/lib/Bitcode/Reader/ValueList.cpp b/llvm/lib/Bitcode/Reader/ValueList.cpp index 2be1f625d0537..e9be518b0f247 100644 --- a/llvm/lib/Bitcode/Reader/ValueList.cpp +++ b/llvm/lib/Bitcode/Reader/ValueList.cpp @@ -60,35 +60,39 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPlaceHolder, Value) } // end namespace llvm -void BitcodeReaderValueList::assignValue(unsigned Idx, Value *V) { +void BitcodeReaderValueList::assignValue(unsigned Idx, Value *V, + unsigned TypeID) { if (Idx == size()) { - push_back(V); + push_back(V, TypeID); return; } if (Idx >= size()) resize(Idx + 1); - WeakTrackingVH &OldV = ValuePtrs[Idx]; - if (!OldV) { - OldV = V; + auto &Old = ValuePtrs[Idx]; + if (!Old.first) { + Old.first = V; + Old.second = TypeID; return; } // Handle constants and non-constants (e.g. instrs) differently for // efficiency. - if (Constant *PHC = dyn_cast(&*OldV)) { + if (Constant *PHC = dyn_cast(&*Old.first)) { ResolveConstants.push_back(std::make_pair(PHC, Idx)); - OldV = V; + Old.first = V; + Old.second = TypeID; } else { // If there was a forward reference to this value, replace it. - Value *PrevVal = OldV; - OldV->replaceAllUsesWith(V); + Value *PrevVal = Old.first; + Old.first->replaceAllUsesWith(V); PrevVal->deleteValue(); } } -Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx, Type *Ty) { +Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx, Type *Ty, + unsigned TyID) { // Bail out for a clearly invalid value. 
if (Idx >= RefsUpperBound) return nullptr; @@ -96,7 +100,7 @@ Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx, Type *Ty) { if (Idx >= size()) resize(Idx + 1); - if (Value *V = ValuePtrs[Idx]) { + if (Value *V = ValuePtrs[Idx].first) { if (Ty != V->getType()) report_fatal_error("Type mismatch in constant table!"); return cast(V); @@ -104,11 +108,12 @@ Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx, Type *Ty) { // Create and return a placeholder, which will later be RAUW'd. Constant *C = new ConstantPlaceHolder(Ty, Context); - ValuePtrs[Idx] = C; + ValuePtrs[Idx] = {C, TyID}; return C; } -Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) { +Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty, + unsigned TyID) { // Bail out for a clearly invalid value. if (Idx >= RefsUpperBound) return nullptr; @@ -116,7 +121,7 @@ Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) { if (Idx >= size()) resize(Idx + 1); - if (Value *V = ValuePtrs[Idx]) { + if (Value *V = ValuePtrs[Idx].first) { // If the types don't match, it's invalid. if (Ty && Ty != V->getType()) return nullptr; @@ -129,7 +134,7 @@ Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) { // Create and return a placeholder, which will later be RAUW'd. Value *V = new Argument(Ty); - ValuePtrs[Idx] = V; + ValuePtrs[Idx] = {V, TyID}; return V; } diff --git a/llvm/lib/Bitcode/Reader/ValueList.h b/llvm/lib/Bitcode/Reader/ValueList.h index 9723781ae5160..8058a9abe95fe 100644 --- a/llvm/lib/Bitcode/Reader/ValueList.h +++ b/llvm/lib/Bitcode/Reader/ValueList.h @@ -26,7 +26,8 @@ class Type; class Value; class BitcodeReaderValueList { - std::vector ValuePtrs; + /// Maps Value ID to pair of Value* and Type ID. + std::vector> ValuePtrs; /// As we resolve forward-referenced constants, we add information about them /// to this vector. 
This allows us to resolve them in bulk instead of @@ -58,7 +59,9 @@ class BitcodeReaderValueList { void resize(unsigned N) { ValuePtrs.resize(N); } - void push_back(Value *V) { ValuePtrs.emplace_back(V); } + void push_back(Value *V, unsigned TypeID) { + ValuePtrs.emplace_back(V, TypeID); + } void clear() { assert(ResolveConstants.empty() && "Constants not resolved?"); @@ -67,10 +70,15 @@ class BitcodeReaderValueList { Value *operator[](unsigned i) const { assert(i < ValuePtrs.size()); - return ValuePtrs[i]; + return ValuePtrs[i].first; + } + + unsigned getTypeID(unsigned ValNo) const { + assert(ValNo < ValuePtrs.size()); + return ValuePtrs[ValNo].second; } - Value *back() const { return ValuePtrs.back(); } + Value *back() const { return ValuePtrs.back().first; } void pop_back() { ValuePtrs.pop_back(); } @@ -81,10 +89,10 @@ class BitcodeReaderValueList { ValuePtrs.resize(N); } - Constant *getConstantFwdRef(unsigned Idx, Type *Ty); - Value *getValueFwdRef(unsigned Idx, Type *Ty); + Constant *getConstantFwdRef(unsigned Idx, Type *Ty, unsigned TyID); + Value *getValueFwdRef(unsigned Idx, Type *Ty, unsigned TyID); - void assignValue(unsigned Idx, Value *V); + void assignValue(unsigned Idx, Value *V, unsigned TypeID); /// Once all constants are read, this method bulk resolves any forward /// references. From ba17bd2674c07e4ba79f3aede0f6dbb0c9ef592c Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 21 Feb 2022 11:17:58 -0800 Subject: [PATCH 510/748] [AMDGPU] Extend SILoadStoreOptimizer to handle global loads There can be situations where global and flat loads and stores are not combined by the vectorizer, in particular if their address space differ in the IR but they end up the same class instructions after selection. For example a divergent load from constant address space ends up being the same global_load as a load from global address space. TODO: merge global stores. TODO: handle SADDR forms. TODO: merge flat load/stores. 
TODO: merge flat with global promoting to flat. Differential Revision: https://reviews.llvm.org/D120279 --- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 83 +++++++ .../test/CodeGen/AMDGPU/memcpy-fixed-align.ll | 8 +- .../AMDGPU/merge-global-load-store.mir | 230 ++++++++++++++++++ .../AMDGPU/promote-kernel-arguments.ll | 12 +- 4 files changed, 318 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ed2b957e28d9a..2a3271b89c1d4 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,6 +79,7 @@ enum InstClassEnum { MIMG, TBUFFER_LOAD, TBUFFER_STORE, + GLOBAL_LOAD }; struct AddressRegs { @@ -233,6 +234,9 @@ class SILoadStoreOptimizer : public MachineFunctionPass { MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; @@ -300,10 +304,15 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::GLOBAL_LOAD_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX2: return 2; + case AMDGPU::GLOBAL_LOAD_DWORDX3: + return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return 8; @@ -388,6 +397,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case 
AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + return GLOBAL_LOAD; } } @@ -421,6 +435,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + return AMDGPU::GLOBAL_LOAD_DWORD; } } @@ -483,6 +502,12 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64_gfx9: Result.Addr = true; return Result; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + Result.VAddr = true; + return Result; } } @@ -1364,6 +1389,49 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( return New; } +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + + const unsigned Opcode = getNewOpcode(CI, Paired); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register DestReg = MRI->createVirtualRegister(SuperRC); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + + std::pair SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination 
registers. + const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); + + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; @@ -1392,6 +1460,17 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } + case GLOBAL_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4; + } case MIMG: assert((countPopulation(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -2035,6 +2114,10 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( NewMI = mergeTBufferStorePair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; + case GLOBAL_LOAD: + NewMI = mergeGlobalLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; } CI.setMI(NewMI, *this); CI.Order = Where->Order; diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll index 61579ba3c2212..6be01670a7beb 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -7,13 +7,11 @@ define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) { ; MUBUF-LABEL: memcpy_fixed_align: ; MUBUF: ; %bb.0: ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MUBUF-NEXT: 
global_load_dword v0, v[1:2], off offset:36 -; MUBUF-NEXT: global_load_dword v11, v[1:2], off offset:32 +; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 ; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 ; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off -; MUBUF-NEXT: s_waitcnt vmcnt(3) -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 -; MUBUF-NEXT: s_waitcnt vmcnt(3) +; MUBUF-NEXT: s_waitcnt vmcnt(2) +; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; MUBUF-NEXT: s_waitcnt vmcnt(3) ; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir new file mode 100644 index 0000000000000..f84184e3f5f52 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir @@ -0,0 +1,230 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: merge_global_load_dword_2 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 
addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: merge_global_load_dword_3 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_3 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 [[DEF]], 0, 1, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX3_]].sub0_sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2, implicit %3 +... 
+ +--- +name: merge_global_load_dword_4 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_4 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 2, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4 +... 
+ +--- +name: merge_global_load_dword_5 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_5 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 3, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[GLOBAL_LOAD_DWORD]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5 +... 
+ +--- +name: merge_global_load_dword_6 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_6 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 16, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 
20, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6 +... + +--- +name: merge_global_load_dwordx2 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dwordx2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec :: (load (s128) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
+ +--- +name: merge_global_load_dwordx3_with_dwordx1 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dwordx3_with_dwordx1 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[DEF]], 12, 0, implicit $exec :: (load (s128) from `i128 addrspace(1)* undef`, align 8, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `i128 addrspace(1)* undef`, align 8, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: merge_global_load_dwordx1_with_dwordx2 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dwordx1_with_dwordx2 + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 [[DEF]], 12, 0, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub1_sub2 + ; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]] + %0:vreg_64_align2 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 8, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
+ +--- +name: no_merge_global_load_dword_agpr_with_vgpr +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_agpr_with_vgpr + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: no_merge_global_load_dword_disjoint +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_disjoint + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
+ +--- +name: no_merge_global_load_dword_overlap +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_overlap + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 3, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: no_merge_global_load_dword_different_cpol +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_different_cpol + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll index 5cc37b45e0cc4..2a2f797b3d65b 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll @@ -87,14 +87,7 @@ entry: ; GCN-LABEL: flat_ptr_arg: ; GCN-COUNT-2: global_load_dwordx2 - -; FIXME: First load is in the constant address space and second is in global -; because it is clobbered by store. GPU load store vectorizer cannot -; combine them. Note, this does not happen with -O3 because loads are -; vectorized in pairs earlier and stay in the global address space. - -; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}} -; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4 +; GCN: global_load_dwordx4 ; GCN: global_store_dword define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) { ; CHECK-LABEL: @flat_ptr_arg( @@ -177,8 +170,7 @@ entry: ; GCN-LABEL: global_ptr_arg: ; GCN: global_load_dwordx2 -; GCN: global_load_dword v{{[0-9]+}}, [[PTR:v\[[0-9:]+\]]], off{{$}} -; GCN: global_load_dwordx3 v[{{[0-9:]+}}], [[PTR]], off offset:4 +; GCN: global_load_dwordx4 ; GCN: global_store_dword define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { ; CHECK-LABEL: @global_ptr_arg( From f4e9df22b5c59692f79fb4d5f9da07d139f6d5aa Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 22 Feb 2022 17:56:01 +0100 Subject: [PATCH 511/748] [InstCombine] Add test for missed select fold due to one use limitation (NFC) The eq sub zero fold currently has an artificial one-use limitation, causing us to miss this fold. 
--- llvm/test/Transforms/InstCombine/icmp-sub.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/icmp-sub.ll b/llvm/test/Transforms/InstCombine/icmp-sub.ll index 5088cb5a73d1f..2efa2bbe269ad 100644 --- a/llvm/test/Transforms/InstCombine/icmp-sub.ll +++ b/llvm/test/Transforms/InstCombine/icmp-sub.ll @@ -488,3 +488,18 @@ define <2 x i1> @sub_ne_zero_use(<2 x i8> %x, <2 x i8> %y) { %r = icmp eq <2 x i8> %sub, zeroinitializer ret <2 x i1> %r } + +define i32 @sub_eq_zero_select(i32 %a, i32 %b, i32* %p) { +; CHECK-LABEL: @sub_eq_zero_select( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: store i32 [[SUB]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SUB]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[B]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %sub = sub i32 %a, %b + store i32 %sub, i32* %p + %cmp = icmp eq i32 %sub, 0 + %sel = select i1 %cmp, i32 %a, i32 %b + ret i32 %sel +} From 9e055c0fffa10e7bb02710c989129d503b39fc94 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 21 Feb 2022 14:48:59 -0800 Subject: [PATCH 512/748] [AMDGPU] Extend SILoadStoreOptimizer to handle global saddr loads This adds handling of the _SADDR forms to the GLOBAL_LOAD combining. TODO: merge global stores. TODO: merge flat load/stores. TODO: merge flat with global promoting to flat. 
Differential Revision: https://reviews.llvm.org/D120285 --- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 42 ++++- .../AMDGPU/merge-global-load-store.mir | 169 ++++++++++++++++++ 2 files changed, 210 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 2a3271b89c1d4..851346c354e2e 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,7 +79,8 @@ enum InstClassEnum { MIMG, TBUFFER_LOAD, TBUFFER_STORE, - GLOBAL_LOAD + GLOBAL_LOAD, + GLOBAL_LOAD_SADDR }; struct AddressRegs { @@ -87,6 +88,7 @@ struct AddressRegs { bool SBase = false; bool SRsrc = false; bool SOffset = false; + bool SAddr = false; bool VAddr = false; bool Addr = false; bool SSamp = false; @@ -305,14 +307,18 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: return 2; case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return 8; @@ -402,6 +408,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX4: return GLOBAL_LOAD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return GLOBAL_LOAD_SADDR; } } @@ -440,6 +451,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX4: return 
AMDGPU::GLOBAL_LOAD_DWORD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; } } @@ -502,6 +518,12 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64_gfx9: Result.Addr = true; return Result; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + Result.SAddr = true; + LLVM_FALLTHROUGH; case AMDGPU::GLOBAL_LOAD_DWORD: case AMDGPU::GLOBAL_LOAD_DWORDX2: case AMDGPU::GLOBAL_LOAD_DWORDX3: @@ -579,6 +601,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (Regs.SOffset) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); + if (Regs.SAddr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); if (Regs.VAddr) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); @@ -1402,6 +1427,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair( auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); @@ -1471,6 +1499,17 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 4: return AMDGPU::GLOBAL_LOAD_DWORDX4; } + case GLOBAL_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; + } case MIMG: assert((countPopulation(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -2115,6 +2154,7 @@ 
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case GLOBAL_LOAD: + case GLOBAL_LOAD_SADDR: NewMI = mergeGlobalLoadPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir index f84184e3f5f52..d2404fca19b50 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir @@ -228,3 +228,172 @@ body: | %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... + +--- +name: merge_global_load_dword_saddr_2 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_saddr_2 + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3 +... 
+ +--- +name: merge_global_load_dword_saddr_3 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_saddr_3 + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0_sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX3_SADDR]].sub2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3, implicit %4 +... 
+ +--- +name: merge_global_load_dword_saddr_4 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_saddr_4 + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 0, 2, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_SADDR]].sub3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5 +... 
+ +--- +name: merge_global_load_dword_saddr_6 +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_saddr_6 + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 4, 3, implicit $exec :: (load (s128) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_SADDR]].sub3 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1 + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 20, 3, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* 
undef`, align 4, addrspace 1) + %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 +... + +--- +name: merge_global_load_dwordx2_saddr +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dwordx2_saddr + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s128) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX4_SADDR]].sub0_sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX4_SADDR]].sub2_sub3 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3 +... 
+ +--- +name: no_merge_global_load_dword_and_global_load_dword_saddr +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_and_global_load_dword_saddr + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3 +... 
+ +--- +name: no_merge_global_load_dword_saddr_different_saddr +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_saddr_different_saddr + ; GCN: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]].sub0_sub1, [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]].sub2_sub3, [[DEF1]], 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]] + %0:sgpr_128 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3 +... 
+ +--- +name: no_merge_global_load_dword_saddr_different_vaddr +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_saddr_different_vaddr + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vreg_64_align2 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %2, implicit %3 +... 
From cedc23bc8612feea61c0a7d92ccaae1ce51520ce Mon Sep 17 00:00:00 2001 From: Simon Atanasyan Date: Tue, 22 Feb 2022 16:46:28 +0300 Subject: [PATCH 513/748] [MIPS] Add `-no-pie` option to the clang driver's tests depend on it --- clang/test/Driver/mips-cs.cpp | 48 +++---- clang/test/Driver/mips-fsf.cpp | 208 ++++++++++++++--------------- clang/test/Driver/mips-img-v2.cpp | 24 ++-- clang/test/Driver/mips-img.cpp | 12 +- clang/test/Driver/mips-mti-linux.c | 6 +- 5 files changed, 150 insertions(+), 148 deletions(-) diff --git a/clang/test/Driver/mips-cs.cpp b/clang/test/Driver/mips-cs.cpp index 6ef4c5d4350cd..39f87d8fd8354 100644 --- a/clang/test/Driver/mips-cs.cpp +++ b/clang/test/Driver/mips-cs.cpp @@ -4,7 +4,7 @@ // // = Big-endian, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu \ +// RUN: --target=mips-linux-gnu -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-32 %s // CHECK-BE-HF-32: "-internal-isystem" @@ -32,7 +32,7 @@ // // = Big-endian, hard float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -muclibc \ +// RUN: --target=mips-linux-gnu -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-UC-HF-32 %s // CHECK-BE-UC-HF-32: "-internal-isystem" @@ -61,7 +61,7 @@ // // = Big-endian, hard float, mips16 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -mips16 \ +// RUN: --target=mips-linux-gnu -mips16 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-16 %s // CHECK-BE-HF-16: "-internal-isystem" @@ -90,7 +90,7 @@ // // = Big-endian, hard float, mmicromips // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -mmicromips \ +// RUN: 
--target=mips-linux-gnu -mmicromips -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-MICRO %s // CHECK-BE-HF-MICRO: "-internal-isystem" @@ -119,7 +119,7 @@ // // = Big-endian, hard float, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -mnan=2008 \ +// RUN: --target=mips-linux-gnu -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-NAN %s // CHECK-BE-HF-NAN: "-internal-isystem" @@ -148,7 +148,7 @@ // // = Big-endian, hard float, uclibc, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -muclibc -mnan=2008 \ +// RUN: --target=mips-linux-gnu -muclibc -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-UC-HF-NAN %s // CHECK-BE-UC-HF-NAN: "-internal-isystem" @@ -177,7 +177,7 @@ // // = Big-endian, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -msoft-float \ +// RUN: --target=mips-linux-gnu -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-32 %s // CHECK-BE-SF-32: "-internal-isystem" @@ -206,7 +206,7 @@ // // = Big-endian, soft float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -muclibc -msoft-float \ +// RUN: --target=mips-linux-gnu -muclibc -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-UC-SF-32 %s // CHECK-BE-UC-SF-32: "-internal-isystem" @@ -235,7 +235,7 @@ // // = Big-endian, soft float, mips16 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -msoft-float -mips16 \ +// RUN: --target=mips-linux-gnu 
-msoft-float -mips16 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-16 %s // CHECK-BE-SF-16: "-internal-isystem" @@ -264,7 +264,7 @@ // // = Big-endian, soft float, micromips // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-linux-gnu -msoft-float -mmicromips \ +// RUN: --target=mips-linux-gnu -msoft-float -mmicromips -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-MICRO %s // CHECK-BE-SF-MICRO: "-internal-isystem" @@ -293,7 +293,7 @@ // // = Big-endian, hard float, 64-bit // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-linux-gnu \ +// RUN: --target=mips64-linux-gnu -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-64 %s // CHECK-BE-HF-64: "-internal-isystem" @@ -322,7 +322,7 @@ // // = Big-endian, soft float, 64-bit // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-linux-gnu -msoft-float \ +// RUN: --target=mips64-linux-gnu -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-64 %s // CHECK-BE-SF-64: "-internal-isystem" @@ -351,7 +351,7 @@ // // = Little-endian, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mhard-float \ +// RUN: --target=mipsel-linux-gnu -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-32 %s // CHECK-EL-HF-32: "-internal-isystem" @@ -380,7 +380,7 @@ // // = Little-endian, hard float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mhard-float -muclibc \ +// RUN: --target=mipsel-linux-gnu -mhard-float -muclibc -no-pie \ // RUN: 
-stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-UC-HF-32 %s // CHECK-EL-UC-HF-32: "-internal-isystem" @@ -409,7 +409,7 @@ // // = Little-endian, hard float, mips16 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mips16 \ +// RUN: --target=mipsel-linux-gnu -mips16 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-16 %s // CHECK-EL-HF-16: "-internal-isystem" @@ -438,7 +438,7 @@ // // = Little-endian, hard float, micromips // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mmicromips \ +// RUN: --target=mipsel-linux-gnu -mmicromips -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-MICRO %s // CHECK-EL-HF-MICRO: "-internal-isystem" @@ -467,7 +467,7 @@ // // = Little-endian, hard float, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mnan=2008 \ +// RUN: --target=mipsel-linux-gnu -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-NAN %s // CHECK-EL-HF-NAN: "-internal-isystem" @@ -496,7 +496,7 @@ // // = Little-endian, hard float, uclibc, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -muclibc -mnan=2008 \ +// RUN: --target=mipsel-linux-gnu -muclibc -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-UC-HF-NAN %s // CHECK-EL-UC-HF-NAN: "-internal-isystem" @@ -525,7 +525,7 @@ // // = Little-endian, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mfloat-abi=soft \ +// RUN: --target=mipsel-linux-gnu -mfloat-abi=soft -no-pie \ // RUN: 
-stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-32 %s // CHECK-EL-SF-32: "-internal-isystem" @@ -554,7 +554,7 @@ // // = Little-endian, soft float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mfloat-abi=soft -muclibc \ +// RUN: --target=mipsel-linux-gnu -mfloat-abi=soft -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-UC-SF-32 %s // CHECK-EL-UC-SF-32: "-internal-isystem" @@ -583,7 +583,7 @@ // // = Little-endian, soft float, mips16 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mips16 -msoft-float \ +// RUN: --target=mipsel-linux-gnu -mips16 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-16 %s // CHECK-EL-SF-16: "-internal-isystem" @@ -612,7 +612,7 @@ // // = Little-endian, soft float, micromips // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-linux-gnu -mmicromips -msoft-float \ +// RUN: --target=mipsel-linux-gnu -mmicromips -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-MICRO %s // CHECK-EL-SF-MICRO: "-internal-isystem" @@ -641,7 +641,7 @@ // // = Little-endian, hard float, 64-bit // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-linux-gnu \ +// RUN: --target=mips64el-linux-gnu -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-64 %s // CHECK-EL-HF-64: "-internal-isystem" @@ -670,7 +670,7 @@ // // = Little-endian, soft float, 64-bit // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-linux-gnu -msoft-float \ +// RUN: --target=mips64el-linux-gnu -msoft-float 
-no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-64 %s // CHECK-EL-SF-64: "-internal-isystem" diff --git a/clang/test/Driver/mips-fsf.cpp b/clang/test/Driver/mips-fsf.cpp index b94da69789169..f67ad4e18ff29 100644 --- a/clang/test/Driver/mips-fsf.cpp +++ b/clang/test/Driver/mips-fsf.cpp @@ -4,7 +4,7 @@ // // = Big-endian, mips32, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-32 %s // CHECK-BE-HF-32: "-internal-isystem" @@ -31,7 +31,7 @@ // // = Big-endian, mips32, hard float, fp64 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mfp64 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-32 %s // CHECK-BE-HF64-32: "-internal-isystem" @@ -58,7 +58,7 @@ // // = Big-endian, mips32, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -msoft-float \ +// RUN: --target=mips-mti-linux-gnu -mips32 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-32 %s // CHECK-BE-SF-32: "-internal-isystem" @@ -85,7 +85,7 @@ // // = Big-endian, mips16 / mips32, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-16 %s // 
CHECK-BE-HF-16: "-internal-isystem" @@ -112,7 +112,7 @@ // // = Big-endian, mips16 / mips32, hard float, fp64 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-16 %s // CHECK-BE-HF64-16: "-internal-isystem" @@ -139,7 +139,7 @@ // // = Big-endian, mips16 / mips32, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -msoft-float \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-16 %s // CHECK-BE-SF-16: "-internal-isystem" @@ -166,7 +166,7 @@ // // = Big-endian, mips32 / mips16, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-16 %s // CHECK-BE-NAN-16: "-internal-isystem" @@ -193,7 +193,7 @@ // // = Big-endian, mips32 / mips16, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-16 %s // CHECK-BE-NAN64-16: "-internal-isystem" @@ -220,7 +220,7 @@ // // = Big-endian, mips32, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mnan=2008 \ +// RUN: 
--target=mips-mti-linux-gnu -mips32 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-32 %s // CHECK-BE-NAN-32: "-internal-isystem" @@ -247,7 +247,7 @@ // // = Big-endian, mips32, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32 -mfp64 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mips32 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-32 %s // CHECK-BE-NAN64-32: "-internal-isystem" @@ -274,7 +274,7 @@ // // = Big-endian, mips32r2, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-32R2 %s // CHECK-BE-HF-32R2: "-internal-isystem" @@ -301,7 +301,7 @@ // // = Big-endian, mips32r2, hard float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mhard-float -muclibc \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mhard-float -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-UC-HF-32R2 %s // CHECK-BE-UC-HF-32R2: "-internal-isystem" @@ -328,7 +328,7 @@ // // = Big-endian, mips32r2, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-32R2 %s // CHECK-BE-HF64-32R2: "-internal-isystem" @@ -355,7 +355,7 @@ // // = Big-endian, 
mips32r2, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -msoft-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-32R2 %s // CHECK-BE-SF-32R2: "-internal-isystem" @@ -382,7 +382,7 @@ // // = Big-endian, mips32r2, soft float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -msoft-float -muclibc \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -msoft-float -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-UC-SF-32R2 %s // CHECK-BE-UC-SF-32R2: "-internal-isystem" @@ -409,7 +409,7 @@ // // = Big-endian, mips32r2 / mips16, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-16R2 %s // CHECK-BE-HF-16R2: "-internal-isystem" @@ -436,7 +436,7 @@ // // = Big-endian, mips32r2 / mips16, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-16R2 %s // CHECK-BE-HF64-16R2: "-internal-isystem" @@ -463,7 +463,7 @@ // // = Big-endian, mips32r2 / mips16, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -msoft-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 
-msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-16R2 %s // CHECK-BE-SF-16R2: "-internal-isystem" @@ -490,7 +490,7 @@ // // = Big-endian, mips32r2 / mips16, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-16R2 %s // CHECK-BE-NAN-16R2: "-internal-isystem" @@ -517,7 +517,7 @@ // // = Big-endian, mips32r2 / mips16, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-16R2 %s // CHECK-BE-NAN64-16R2: "-internal-isystem" @@ -544,7 +544,7 @@ // // = Big-endian, mips32r2, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-32R2 %s // CHECK-BE-NAN-32R2: "-internal-isystem" @@ -571,7 +571,7 @@ // // = Big-endian, mips32r2, nan2008, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-UC-NAN-32R2 %s // CHECK-BE-UC-NAN-32R2: "-internal-isystem" @@ -598,7 +598,7 @@ // // = Big-endian, mips32r2, 
fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-32R2 %s // CHECK-BE-NAN64-32R2: "-internal-isystem" @@ -625,7 +625,7 @@ // // = Big-endian, default (mips32r2), fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mfp64 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-32R2-DEF %s // CHECK-BE-NAN64-32R2-DEF: "-internal-isystem" @@ -652,7 +652,7 @@ // // = Big-endian, micromips, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mmicromips -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mmicromips -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-MM %s // CHECK-BE-HF-MM: "-internal-isystem" @@ -679,7 +679,7 @@ // // = Big-endian, micromips, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mmicromips -mfp64 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mmicromips -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-MM %s // CHECK-BE-HF64-MM: "-internal-isystem" @@ -706,7 +706,7 @@ // // = Big-endian, micromips, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mmicromips -msoft-float \ +// RUN: --target=mips-mti-linux-gnu -mmicromips -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ 
--gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-MM %s // CHECK-BE-SF-MM: "-internal-isystem" @@ -733,7 +733,7 @@ // // = Big-endian, micromips, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mmicromips -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mmicromips -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-MM %s // CHECK-BE-NAN-MM: "-internal-isystem" @@ -760,7 +760,7 @@ // // = Big-endian, micromips, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 \ +// RUN: --target=mips-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-MM %s // CHECK-BE-NAN64-MM: "-internal-isystem" @@ -787,7 +787,7 @@ // // = Big-endian, mips64, ABI n32, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-64-N32 %s // CHECK-BE-HF-64-N32: "-internal-isystem" @@ -814,7 +814,7 @@ // // = Big-endian, mips64, ABI n32, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-64-N32 %s // CHECK-BE-HF64-64-N32: "-internal-isystem" @@ -841,7 +841,7 @@ // // = Big-endian, mips64, ABI n32, soft float // RUN: %clang 
-no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -msoft-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-64-N32 %s // CHECK-BE-SF-64-N32: "-internal-isystem" @@ -868,7 +868,7 @@ // // = Big-endian, mips64, ABI n32, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-64-N32 %s // CHECK-BE-NAN-64-N32: "-internal-isystem" @@ -895,7 +895,7 @@ // // = Big-endian, mips64, ABI n32, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-64-N32 %s // CHECK-BE-NAN64-64-N32: "-internal-isystem" @@ -922,7 +922,7 @@ // // = Big-endian, mips64, ABI 64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-64-64 %s // CHECK-BE-HF-64-64: "-internal-isystem" @@ -949,7 +949,7 @@ // // = Big-endian, mips64, ABI 64, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 
-mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-64-64 %s // CHECK-BE-HF64-64-64: "-internal-isystem" @@ -976,7 +976,7 @@ // // = Big-endian, mips64, ABI 64, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -msoft-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-64-64 %s // CHECK-BE-SF-64-64: "-internal-isystem" @@ -1003,7 +1003,7 @@ // // = Big-endian, mips64, ABI 64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-64-64 %s // CHECK-BE-NAN-64-64: "-internal-isystem" @@ -1030,7 +1030,7 @@ // // = Big-endian, mips64, ABI 64, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-64-64 %s // CHECK-BE-NAN64-64-64: "-internal-isystem" @@ -1057,7 +1057,7 @@ // // = Big-endian, mips64r2, ABI n32, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-64R2-N32 %s // CHECK-BE-HF-64R2-N32: 
"-internal-isystem" @@ -1084,7 +1084,7 @@ // // = Big-endian, mips64r2, ABI n32, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-64R2-N32 %s // CHECK-BE-HF64-64R2-N32: "-internal-isystem" @@ -1111,7 +1111,7 @@ // // = Big-endian, mips64r2, ABI n32, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-64R2-N32 %s // CHECK-BE-SF-64R2-N32: "-internal-isystem" @@ -1138,7 +1138,7 @@ // // = Big-endian, mips64r2, ABI n32, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-64R2-N32 %s // CHECK-BE-NAN-64R2-N32: "-internal-isystem" @@ -1165,7 +1165,7 @@ // // = Big-endian, mips64r2, ABI n32, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-N32 %s // CHECK-BE-NAN64-64R2-N32: "-internal-isystem" @@ -1192,7 +1192,7 @@ // // = Big-endian, mips64r2, ABI 64, hard float // RUN: %clang 
-no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-64R2-64 %s // CHECK-BE-HF-64R2-64: "-internal-isystem" @@ -1219,7 +1219,7 @@ // // = Big-endian, mips64r2, ABI 64, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF64-64R2-64 %s // CHECK-BE-HF64-64R2-64: "-internal-isystem" @@ -1246,7 +1246,7 @@ // // = Big-endian, mips64r2, ABI 64, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-SF-64R2-64 %s // CHECK-BE-SF-64R2-64: "-internal-isystem" @@ -1273,7 +1273,7 @@ // // = Big-endian, mips64r2, ABI 64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN-64R2-64 %s // CHECK-BE-NAN-64R2-64: "-internal-isystem" @@ -1300,7 +1300,7 @@ // // = Big-endian, mips64r2, ABI 64, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \ +// RUN: 
--target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-64 %s // CHECK-BE-NAN64-64R2-64: "-internal-isystem" @@ -1327,7 +1327,7 @@ // // = Big-endian, default (mips64r2), ABI 64, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 \ +// RUN: --target=mips64-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-64-DEF %s // CHECK-BE-NAN64-64R2-64-DEF: "-internal-isystem" @@ -1354,7 +1354,7 @@ // // = Little-endian, mips32, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-32 %s // CHECK-EL-HF-32: "-internal-isystem" @@ -1381,7 +1381,7 @@ // // = Little-endian, mips32, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-32 %s // CHECK-EL-HF64-32: "-internal-isystem" @@ -1408,7 +1408,7 @@ // // = Little-endian, mips32, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -msoft-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-32 %s // CHECK-EL-SF-32: 
"-internal-isystem" @@ -1435,7 +1435,7 @@ // // = Little-endian, mips32 / mips16, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-16 %s // CHECK-EL-HF-16: "-internal-isystem" @@ -1462,7 +1462,7 @@ // // = Little-endian, mips32 / mips16, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-16 %s // CHECK-EL-HF64-16: "-internal-isystem" @@ -1489,7 +1489,7 @@ // // = Little-endian, mips32 / mips16, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -msoft-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-16 %s // CHECK-EL-SF-16: "-internal-isystem" @@ -1516,7 +1516,7 @@ // // = Little-endian, mips32 / mips16, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-16 %s // CHECK-EL-NAN-16: "-internal-isystem" @@ -1543,7 +1543,7 @@ // // = Little-endian, mips32 / mips16, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 
-mfp64 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-16 %s // CHECK-EL-NAN64-16: "-internal-isystem" @@ -1570,7 +1570,7 @@ // // = Little-endian, mips32, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-32 %s // CHECK-EL-NAN-32: "-internal-isystem" @@ -1597,7 +1597,7 @@ // // = Little-endian, mips32, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-32 %s // CHECK-EL-NAN64-32: "-internal-isystem" @@ -1624,7 +1624,7 @@ // // = Little-endian, mips32r2, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-32R2 %s // CHECK-EL-HF-32R2: "-internal-isystem" @@ -1651,7 +1651,7 @@ // // = Little-endian, mips32r2, hard float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float -muclibc \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-UC-HF-32R2 %s // CHECK-EL-UC-HF-32R2: 
"-internal-isystem" @@ -1678,7 +1678,7 @@ // // = Little-endian, mips32r2, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-32R2 %s // CHECK-EL-HF64-32R2: "-internal-isystem" @@ -1705,7 +1705,7 @@ // // = Little-endian, mips32r2, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-32R2 %s // CHECK-EL-SF-32R2: "-internal-isystem" @@ -1732,7 +1732,7 @@ // // = Little-endian, mips32r2, soft float, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float -muclibc \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-UC-SF-32R2 %s // CHECK-EL-UC-SF-32R2: "-internal-isystem" @@ -1759,7 +1759,7 @@ // // = Little-endian, mips32r2 / mips16, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-16R2 %s // CHECK-EL-HF-16R2: "-internal-isystem" @@ -1786,7 +1786,7 @@ // // = Little-endian, mips32r2 / mips16, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 
-mips16 -mfp64 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-16R2 %s // CHECK-EL-HF64-16R2: "-internal-isystem" @@ -1813,7 +1813,7 @@ // // = Little-endian, mips32r2 / mips16, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -msoft-float \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-16R2 %s // CHECK-EL-SF-16R2: "-internal-isystem" @@ -1840,7 +1840,7 @@ // // = Little-endian, mips32r2 / mips16, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-16R2 %s // CHECK-EL-NAN-16R2: "-internal-isystem" @@ -1867,7 +1867,7 @@ // // = Little-endian, mips32r2 / mips16, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-16R2 %s // CHECK-EL-NAN64-16R2: "-internal-isystem" @@ -1894,7 +1894,7 @@ // // = Little-endian, mips32r2, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck 
--check-prefix=CHECK-EL-NAN-32R2 %s // CHECK-EL-NAN-32R2: "-internal-isystem" @@ -1921,7 +1921,7 @@ // // = Little-endian, mips32r2, nan2008, uclibc // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-UC-NAN-32R2 %s // CHECK-EL-UC-NAN-32R2: "-internal-isystem" @@ -1948,7 +1948,7 @@ // // = Little-endian, mips32r2, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-32R2 %s // CHECK-EL-NAN64-32R2: "-internal-isystem" @@ -1975,7 +1975,7 @@ // // = Little-endian, default (mips32r2), fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mfp64 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-32R2-DEF %s // CHECK-EL-NAN64-32R2-DEF: "-internal-isystem" @@ -2002,7 +2002,7 @@ // // = Little-endian, micromips, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mmicromips -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mmicromips -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-MM %s // CHECK-EL-HF-MM: "-internal-isystem" @@ -2029,7 +2029,7 @@ // // = Little-endian, micromips, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: 
--target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mhard-float \ +// RUN: --target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-MM %s // CHECK-EL-HF64-MM: "-internal-isystem" @@ -2056,7 +2056,7 @@ // // = Little-endian, micromips, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mmicromips -msoft-float \ +// RUN: --target=mipsel-mti-linux-gnu -mmicromips -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-MM %s // CHECK-EL-SF-MM: "-internal-isystem" @@ -2083,7 +2083,7 @@ // // = Little-endian, micromips, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mmicromips -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mmicromips -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-MM %s // CHECK-EL-NAN-MM: "-internal-isystem" @@ -2110,7 +2110,7 @@ // // = Little-endian, micromips, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 \ +// RUN: --target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-MM %s // CHECK-EL-NAN64-MM: "-internal-isystem" @@ -2137,7 +2137,7 @@ // // = Little-endian, mips64, ABI n32, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck 
--check-prefix=CHECK-EL-HF-64-N32 %s // CHECK-EL-HF-64-N32: "-internal-isystem" @@ -2164,7 +2164,7 @@ // // = Little-endian, mips64, ABI n32, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-64-N32 %s // CHECK-EL-HF64-64-N32: "-internal-isystem" @@ -2191,7 +2191,7 @@ // // = Little-endian, mips64, ABI n32, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -msoft-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-64-N32 %s // CHECK-EL-SF-64-N32: "-internal-isystem" @@ -2218,7 +2218,7 @@ // // = Little-endian, mips64, ABI n32, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-64-N32 %s // CHECK-EL-NAN-64-N32: "-internal-isystem" @@ -2245,7 +2245,7 @@ // // = Little-endian, mips64, ABI n32, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-64-N32 %s // CHECK-EL-NAN64-64-N32: "-internal-isystem" @@ -2272,7 +2272,7 @@ // // = Little-endian, 
mips64, ABI 64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-64-64 %s // CHECK-EL-HF-64-64: "-internal-isystem" @@ -2299,7 +2299,7 @@ // // = Little-endian, mips64, ABI 64, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-64-64 %s // CHECK-EL-HF64-64-64: "-internal-isystem" @@ -2326,7 +2326,7 @@ // // = Little-endian, mips64, ABI 64, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -msoft-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-64-64 %s // CHECK-EL-SF-64-64: "-internal-isystem" @@ -2353,7 +2353,7 @@ // // = Little-endian, mips64, ABI 64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-64-64 %s // CHECK-EL-NAN-64-64: "-internal-isystem" @@ -2380,7 +2380,7 @@ // // = Little-endian, mips64, ABI 64, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \ 
+// RUN: --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-64-64 %s // CHECK-EL-NAN64-64-64: "-internal-isystem" @@ -2407,7 +2407,7 @@ // // = Little-endian, mips64r2, ABI n32, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-64R2-N32 %s // CHECK-EL-HF-64R2-N32: "-internal-isystem" @@ -2434,7 +2434,7 @@ // // = Little-endian, mips64r2, ABI n32, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-64R2-N32 %s // CHECK-EL-HF64-64R2-N32: "-internal-isystem" @@ -2461,7 +2461,7 @@ // // = Little-endian, mips64r2, ABI n32, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-SF-64R2-N32 %s // CHECK-EL-SF-64R2-N32: "-internal-isystem" @@ -2488,7 +2488,7 @@ // // = Little-endian, mips64r2, ABI n32, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 -no-pie \ // RUN: 
-stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-64R2-N32 %s // CHECK-EL-NAN-64R2-N32: "-internal-isystem" @@ -2515,7 +2515,7 @@ // // = Little-endian, mips64r2, ABI n32, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-N32 %s // CHECK-EL-NAN64-64R2-N32: "-internal-isystem" @@ -2542,7 +2542,7 @@ // // = Little-endian, mips64r2, ABI 64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF-64R2-64 %s // CHECK-EL-HF-64R2-64: "-internal-isystem" @@ -2569,7 +2569,7 @@ // // = Little-endian, mips64r2, ABI 64, fp64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-HF64-64R2-64 %s // CHECK-EL-HF64-64R2-64: "-internal-isystem" @@ -2596,7 +2596,7 @@ // // = Little-endian, mips64r2, ABI 64, soft float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck 
--check-prefix=CHECK-EL-SF-64R2-64 %s // CHECK-EL-SF-64R2-64: "-internal-isystem" @@ -2623,7 +2623,7 @@ // // = Little-endian, mips64r2, ABI 64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN-64R2-64 %s // CHECK-EL-NAN-64R2-64: "-internal-isystem" @@ -2650,7 +2650,7 @@ // // = Little-endian, mips64r2, ABI 64, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-64 %s // CHECK-EL-NAN64-64R2-64: "-internal-isystem" @@ -2677,7 +2677,7 @@ // // = Little-endian, default (mips64r2), ABI 64, fp64, nan2008 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64el-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 \ +// RUN: --target=mips64el-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-64-DEF %s // CHECK-EL-NAN64-64R2-64-DEF: "-internal-isystem" @@ -2706,7 +2706,7 @@ // // = Big-endian, mips32r3, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r3 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r3 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-32R3 %s // CHECK-BE-HF-32R3: "-internal-isystem" @@ -2733,7 +2733,7 @@ // // = Big-endian, mips32r5, hard float // RUN: %clang 
-no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux-gnu -mips32r5 -mhard-float \ +// RUN: --target=mips-mti-linux-gnu -mips32r5 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-32R5 %s // CHECK-BE-HF-32R5: "-internal-isystem" @@ -2760,7 +2760,7 @@ // // = Big-endian, mips64r3, ABI 64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r3 -mabi=64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r3 -mabi=64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-64R3-64 %s // CHECK-BE-HF-64R3-64: "-internal-isystem" @@ -2787,7 +2787,7 @@ // // = Big-endian, mips64r5, ABI 64, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-mti-linux-gnu -mips64r5 -mabi=64 -mhard-float \ +// RUN: --target=mips64-mti-linux-gnu -mips64r5 -mabi=64 -mhard-float -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-HF-64R5-64 %s // CHECK-BE-HF-64R5-64: "-internal-isystem" diff --git a/clang/test/Driver/mips-img-v2.cpp b/clang/test/Driver/mips-img-v2.cpp index 69250b716e5b1..e031466ea2ac4 100644 --- a/clang/test/Driver/mips-img-v2.cpp +++ b/clang/test/Driver/mips-img-v2.cpp @@ -7,7 +7,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EB -mips32r6 -mhard-float -mabi=32 \ +// RUN: -EB -mips32r6 -mhard-float -mabi=32 -no-pie \ // RUN: | FileCheck --check-prefix=EB-HARD-O32 %s // EB-HARD-O32: "-internal-isystem" // EB-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -35,7 +35,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: 
--gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EB -mips64r6 -mhard-float -mabi=n32 \ +// RUN: -EB -mips64r6 -mhard-float -mabi=n32 -no-pie \ // RUN: | FileCheck --check-prefix=EB-HARD-N32 %s // EB-HARD-N32: "-internal-isystem" // EB-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -63,7 +63,7 @@ // RUN: --target=mips64-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EB -mips64r6 -mhard-float -mabi=64 \ +// RUN: -EB -mips64r6 -mhard-float -mabi=64 -no-pie \ // RUN: | FileCheck --check-prefix=EB-HARD-N64 %s // EB-HARD-N64: "-internal-isystem" // EB-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -91,7 +91,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EL -mips32r6 -mhard-float -mabi=32 \ +// RUN: -EL -mips32r6 -mhard-float -mabi=32 -no-pie \ // RUN: | FileCheck --check-prefix=EL-HARD-O32 %s // EL-HARD-O32: "-internal-isystem" // EL-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -119,7 +119,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EL -mips64r6 -mhard-float -mabi=n32 \ +// RUN: -EL -mips64r6 -mhard-float -mabi=n32 -no-pie \ // RUN: | FileCheck --check-prefix=EL-HARD-N32 %s // EL-HARD-N32: "-internal-isystem" // EL-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -147,7 +147,7 @@ // RUN: --target=mips64-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EL -mips64r6 -mhard-float -mabi=64 \ +// RUN: -EL -mips64r6 -mhard-float -mabi=64 -no-pie \ // RUN: | FileCheck --check-prefix=EL-HARD-N64 %s // 
EL-HARD-N64: "-internal-isystem" // EL-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -175,7 +175,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EB -mips32r6 -msoft-float \ +// RUN: -EB -mips32r6 -msoft-float -no-pie \ // RUN: | FileCheck --check-prefix=EB-SOFT %s // EB-SOFT: "-internal-isystem" // EB-SOFT: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -203,7 +203,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EL -mips32r6 -msoft-float \ +// RUN: -EL -mips32r6 -msoft-float -no-pie \ // RUN: | FileCheck --check-prefix=EL-SOFT %s // EL-SOFT: "-internal-isystem" // EL-SOFT: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -231,7 +231,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EB -mips32r6 -mhard-float -mmicromips \ +// RUN: -EB -mips32r6 -mhard-float -mmicromips -no-pie \ // RUN: | FileCheck --check-prefix=EB-HARD-MICRO %s // EB-HARD-MICRO: "-internal-isystem" // EB-HARD-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -259,7 +259,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EB -mips32r6 -msoft-float -mmicromips \ +// RUN: -EB -mips32r6 -msoft-float -mmicromips -no-pie \ // RUN: | FileCheck --check-prefix=EB-SOFT-MICRO %s // EB-SOFT-MICRO: "-internal-isystem" // EB-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -287,7 +287,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: 
-stdlib=libstdc++ \ -// RUN: -EL -mips32r6 -mhard-float -mmicromips \ +// RUN: -EL -mips32r6 -mhard-float -mmicromips -no-pie \ // RUN: | FileCheck --check-prefix=EL-HARD-MICRO %s // EL-HARD-MICRO: "-internal-isystem" // EL-HARD-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" @@ -315,7 +315,7 @@ // RUN: --target=mips-img-linux-gnu \ // RUN: --gcc-toolchain=%S/Inputs/mips_img_v2_tree \ // RUN: -stdlib=libstdc++ \ -// RUN: -EL -mips32r6 -msoft-float -mmicromips \ +// RUN: -EL -mips32r6 -msoft-float -mmicromips -no-pie \ // RUN: | FileCheck --check-prefix=EL-SOFT-MICRO %s // EL-SOFT-MICRO: "-internal-isystem" // EL-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2" diff --git a/clang/test/Driver/mips-img.cpp b/clang/test/Driver/mips-img.cpp index c97bb9478e7cb..272fa724102c4 100644 --- a/clang/test/Driver/mips-img.cpp +++ b/clang/test/Driver/mips-img.cpp @@ -4,7 +4,7 @@ // // = Big-endian, mips32r6 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-img-linux-gnu -mips32r6 \ +// RUN: --target=mips-img-linux-gnu -mips32r6 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-32R6 %s // CHECK-BE-32R6: "-internal-isystem" @@ -31,7 +31,7 @@ // // = Little-endian, mips32r6 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-img-linux-gnu -mips32r6 -EL \ +// RUN: --target=mips-img-linux-gnu -mips32r6 -EL -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \ // RUN: | FileCheck --check-prefix=CHECK-LE-32R6 %s // CHECK-LE-32R6: "-internal-isystem" @@ -58,7 +58,7 @@ // // = Big-endian, mips64r6, N32 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-img-linux-gnu -mips64r6 -mabi=n32 \ +// RUN: --target=mips64-img-linux-gnu -mips64r6 -mabi=n32 -no-pie \ // RUN: -stdlib=libstdc++ 
--gcc-toolchain=%S/Inputs/mips_img_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-64R6-N32 %s // CHECK-BE-64R6-N32: "-internal-isystem" @@ -85,7 +85,7 @@ // // = Little-endian, mips64r6, N32 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=n32 \ +// RUN: --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=n32 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \ // RUN: | FileCheck --check-prefix=CHECK-LE-64R6-N32 %s // CHECK-LE-64R6-N32: "-internal-isystem" @@ -112,7 +112,7 @@ // // = Big-endian, mips64r6, N64 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-img-linux-gnu -mips64r6 -mabi=64 \ +// RUN: --target=mips64-img-linux-gnu -mips64r6 -mabi=64 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \ // RUN: | FileCheck --check-prefix=CHECK-BE-64R6-N64 %s // CHECK-BE-64R6-N64: "-internal-isystem" @@ -139,7 +139,7 @@ // // = Little-endian, mips64r6, N64 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=64 \ +// RUN: --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=64 -no-pie \ // RUN: -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \ // RUN: | FileCheck --check-prefix=CHECK-LE-64R6-N64 %s // CHECK-LE-64R6-N64: "-internal-isystem" diff --git a/clang/test/Driver/mips-mti-linux.c b/clang/test/Driver/mips-mti-linux.c index 21b270c0a4b76..0c5efa536829f 100644 --- a/clang/test/Driver/mips-mti-linux.c +++ b/clang/test/Driver/mips-mti-linux.c @@ -8,7 +8,8 @@ // = Big-endian, mips32r2, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux -mips32r2 -mhard-float -rtlib=platform -fuse-ld=ld \ +// RUN: --target=mips-mti-linux -mips32r2 -mhard-float -no-pie \ +// RUN: -rtlib=platform -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/mips_mti_linux/sysroot \ // RUN: | FileCheck 
--check-prefix=CHECK-BE-HF-32R2 %s // @@ -26,7 +27,8 @@ // = Little-endian, mips32r2, hard float // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=mips-mti-linux -mips32r2 -EL -mhard-float -rtlib=platform -fuse-ld=ld \ +// RUN: --target=mips-mti-linux -mips32r2 -EL -mhard-float -no-pie \ +// RUN: -rtlib=platform -fuse-ld=ld \ // RUN: --sysroot=%S/Inputs/mips_mti_linux/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-LE-HF-32R2 %s // From 0e74d75a295729bc145724ffa0495fee4d1b598c Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 22 Feb 2022 09:49:19 +0000 Subject: [PATCH 514/748] [StructurizeCFG] Fix boolean not bug D118623 added code to fold not-of-compare into a compare with the inverted predicate, if the compare had no other uses. This relies on accurate use lists in the IR but it was run before setPhiValues, when some phi inputs are still stored in a data structure on the side, instead of being real uses in the IR. The effect was that a phi that should be using the original compare result would now get an inverted result instead. Fix this by moving simplifyConditions after setPhiValues. 
Differential Revision: https://reviews.llvm.org/D120312 --- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 2 +- llvm/test/Transforms/StructurizeCFG/invert-condition.ll | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index b3a445368537d..14c7433531e5c 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -1089,8 +1089,8 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) { createFlow(); insertConditions(false); insertConditions(true); - simplifyConditions(); setPhiValues(); + simplifyConditions(); simplifyAffectedPhis(); rebuildSSA(); diff --git a/llvm/test/Transforms/StructurizeCFG/invert-condition.ll b/llvm/test/Transforms/StructurizeCFG/invert-condition.ll index 5b6f1d8545175..aa8589563484c 100644 --- a/llvm/test/Transforms/StructurizeCFG/invert-condition.ll +++ b/llvm/test/Transforms/StructurizeCFG/invert-condition.ll @@ -29,13 +29,12 @@ bb5: ; preds = %bb2 ret void } -; FIXME: StructurizeCFG modifies I5 in-place without updating the use of I5 in -; the phi instruction. 
define void @invert_condition_phi(i32 %arg) { ; CHECK-LABEL: @invert_condition_phi( ; CHECK-NEXT: main_body: -; CHECK-NEXT: [[I5:%.*]] = icmp ne i32 [[ARG:%.*]], 0 -; CHECK-NEXT: br i1 [[I5]], label [[IF1:%.*]], label [[ENDIF1:%.*]] +; CHECK-NEXT: [[I5:%.*]] = icmp eq i32 [[ARG:%.*]], 0 +; CHECK-NEXT: [[I5_INV:%.*]] = xor i1 [[I5]], true +; CHECK-NEXT: br i1 [[I5_INV]], label [[IF1:%.*]], label [[ENDIF1:%.*]] ; CHECK: if1: ; CHECK-NEXT: br label [[ENDIF1]] ; CHECK: endif1: From b47e2dc91f62c6d32875a9252e520e72a55f2cc3 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 18 Feb 2022 11:24:05 +0000 Subject: [PATCH 515/748] [StableHashing] Hash machine basic blocks and functions This adds very basic support for hashing MachineBasicBlock and MachineFunction, for use in MachineFunctionPass to detect passes that modify the MachineFunction wrongly. Differential Revision: https://reviews.llvm.org/D120122 --- llvm/include/llvm/CodeGen/MachineStableHash.h | 4 ++++ llvm/lib/CodeGen/MachineStableHash.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/llvm/include/llvm/CodeGen/MachineStableHash.h b/llvm/include/llvm/CodeGen/MachineStableHash.h index 8423b2da1c785..43571b7b8afd2 100644 --- a/llvm/include/llvm/CodeGen/MachineStableHash.h +++ b/llvm/include/llvm/CodeGen/MachineStableHash.h @@ -17,6 +17,8 @@ #include "llvm/CodeGen/StableHashing.h" namespace llvm { +class MachineBasicBlock; +class MachineFunction; class MachineInstr; class MachineOperand; @@ -24,6 +26,8 @@ stable_hash stableHashValue(const MachineOperand &MO); stable_hash stableHashValue(const MachineInstr &MI, bool HashVRegs = false, bool HashConstantPoolIndices = false, bool HashMemOperands = false); +stable_hash stableHashValue(const MachineBasicBlock &MBB); +stable_hash stableHashValue(const MachineFunction &MF); } // namespace llvm diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp index 6b213f8d0bdfc..49b69b105c607 100644 --- 
a/llvm/lib/CodeGen/MachineStableHash.cpp +++ b/llvm/lib/CodeGen/MachineStableHash.cpp @@ -195,3 +195,21 @@ stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs, return stable_hash_combine_range(HashComponents.begin(), HashComponents.end()); } + +stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) { + SmallVector HashComponents; + // TODO: Hash more stuff like block alignment and branch probabilities. + for (auto &MI : MBB) + HashComponents.push_back(stableHashValue(MI)); + return stable_hash_combine_range(HashComponents.begin(), + HashComponents.end()); +} + +stable_hash llvm::stableHashValue(const MachineFunction &MF) { + SmallVector HashComponents; + // TODO: Hash lots more stuff like function alignment and stack objects. + for (auto &MBB : MF) + HashComponents.push_back(stableHashValue(MBB)); + return stable_hash_combine_range(HashComponents.begin(), + HashComponents.end()); +} From 16994a2cfaac6305f49a02afc1218e5c33199ea4 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 22 Feb 2022 12:50:39 -0500 Subject: [PATCH 516/748] Fix the Sphinx build after f8cedc642d9b85720cb7175ef25ddde90a3fbca2 --- clang/docs/ReleaseNotes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7e92224901d41..68a867409c160 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -60,7 +60,7 @@ Bug Fixes wrapping a ``nullptr`` when the ``CXXNewExpr`` did not have an array size expression. This was fixed and ``::getArraySize()`` will now always either return ``None`` or a ``llvm::Optional`` wrapping a valid ``Expr*``. - This fixes `Issue 53742`_. + This fixes `Issue 53742 `_. Improvements to Clang's diagnostics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -92,7 +92,7 @@ Attribute Changes in Clang - The ``overloadable`` attribute can now be written in all of the syntactic locations a declaration attribute may appear. - This fixes `Issue 53805`_. 
+ This fixes `Issue 53805 `_. Windows Support --------------- From 104d9a674312c314699558ad8ee48b70624fdb6c Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 22 Feb 2022 12:59:56 -0500 Subject: [PATCH 517/748] [Clang][OpenMP] Add the codegen support for `atomic compare` This patch adds the codegen support for `atomic compare` in clang. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D118632 --- clang/include/clang/AST/StmtOpenMP.h | 25 +- clang/lib/AST/StmtOpenMP.cpp | 16 +- clang/lib/CodeGen/CGStmtOpenMP.cpp | 53 +- clang/lib/Sema/SemaOpenMP.cpp | 28 +- clang/test/OpenMP/atomic_compare_codegen.cpp | 4021 ++++++++++++++++++ 5 files changed, 4116 insertions(+), 27 deletions(-) create mode 100644 clang/test/OpenMP/atomic_compare_codegen.cpp diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index d5b5c9580da9e..1702cafaf462f 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -2863,6 +2863,8 @@ class OMPAtomicDirective : public OMPExecutableDirective { POS_V, POS_E, POS_UpdateExpr, + POS_D, + POS_Cond, }; /// Set 'x' part of the associated expression/statement. @@ -2877,6 +2879,10 @@ class OMPAtomicDirective : public OMPExecutableDirective { void setV(Expr *V) { Data->getChildren()[DataPositionTy::POS_V] = V; } /// Set 'expr' part of the associated expression/statement. void setExpr(Expr *E) { Data->getChildren()[DataPositionTy::POS_E] = E; } + /// Set 'd' part of the associated expression/statement. + void setD(Expr *D) { Data->getChildren()[DataPositionTy::POS_D] = D; } + /// Set conditional expression in `atomic compare`. 
+ void setCond(Expr *C) { Data->getChildren()[DataPositionTy::POS_Cond] = C; } public: /// Creates directive with a list of \a Clauses and 'x', 'v' and 'expr' @@ -2894,6 +2900,8 @@ class OMPAtomicDirective : public OMPExecutableDirective { /// \param UE Helper expression of the form /// 'OpaqueValueExpr(x) binop OpaqueValueExpr(expr)' or /// 'OpaqueValueExpr(expr) binop OpaqueValueExpr(x)'. + /// \param D 'd' part of the associated expression/statement. + /// \param Cond Conditional expression in `atomic compare` construct. /// \param IsXLHSInRHSPart true if \a UE has the first form and false if the /// second. /// \param IsPostfixUpdate true if original value of 'x' must be stored in @@ -2901,7 +2909,8 @@ class OMPAtomicDirective : public OMPExecutableDirective { static OMPAtomicDirective * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, ArrayRef Clauses, Stmt *AssociatedStmt, Expr *X, Expr *V, - Expr *E, Expr *UE, bool IsXLHSInRHSPart, bool IsPostfixUpdate); + Expr *E, Expr *UE, Expr *D, Expr *Cond, bool IsXLHSInRHSPart, + bool IsPostfixUpdate); /// Creates an empty directive with the place for \a NumClauses /// clauses. @@ -2951,6 +2960,20 @@ class OMPAtomicDirective : public OMPExecutableDirective { const Expr *getExpr() const { return cast_or_null(Data->getChildren()[DataPositionTy::POS_E]); } + /// Get 'd' part of the associated expression/statement. + Expr *getD() { + return cast_or_null(Data->getChildren()[DataPositionTy::POS_D]); + } + Expr *getD() const { + return cast_or_null(Data->getChildren()[DataPositionTy::POS_D]); + } + /// Get the 'cond' part of the source atomic expression. 
+ Expr *getCondExpr() { + return cast_or_null(Data->getChildren()[DataPositionTy::POS_Cond]); + } + Expr *getCondExpr() const { + return cast_or_null(Data->getChildren()[DataPositionTy::POS_Cond]); + } static bool classof(const Stmt *T) { return T->getStmtClass() == OMPAtomicDirectiveClass; diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 8a9f73d3dbf0b..cd81271ba3e34 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -863,16 +863,20 @@ OMPOrderedDirective *OMPOrderedDirective::CreateEmpty(const ASTContext &C, !IsStandalone); } -OMPAtomicDirective *OMPAtomicDirective::Create( - const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, - ArrayRef Clauses, Stmt *AssociatedStmt, Expr *X, Expr *V, - Expr *E, Expr *UE, bool IsXLHSInRHSPart, bool IsPostfixUpdate) { +OMPAtomicDirective * +OMPAtomicDirective::Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation EndLoc, ArrayRef Clauses, + Stmt *AssociatedStmt, Expr *X, Expr *V, Expr *E, + Expr *UE, Expr *D, Expr *Cond, bool IsXLHSInRHSPart, + bool IsPostfixUpdate) { auto *Dir = createDirective( - C, Clauses, AssociatedStmt, /*NumChildren=*/4, StartLoc, EndLoc); + C, Clauses, AssociatedStmt, /*NumChildren=*/6, StartLoc, EndLoc); Dir->setX(X); Dir->setV(V); Dir->setExpr(E); Dir->setUpdateExpr(UE); + Dir->setD(D); + Dir->setCond(Cond); Dir->IsXLHSInRHSPart = IsXLHSInRHSPart; Dir->IsPostfixUpdate = IsPostfixUpdate; return Dir; @@ -882,7 +886,7 @@ OMPAtomicDirective *OMPAtomicDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses, EmptyShell) { return createEmptyDirective( - C, NumClauses, /*HasAssociatedStmt=*/true, /*NumChildren=*/4); + C, NumClauses, /*HasAssociatedStmt=*/true, /*NumChildren=*/6); } OMPTargetDirective *OMPTargetDirective::Create(const ASTContext &C, diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 8ea4968f4b11b..4bdeff4d41f34 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp 
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -6017,11 +6017,50 @@ static void emitOMPAtomicCaptureExpr(CodeGenFunction &CGF, } } +static void emitOMPAtomicCompareExpr(CodeGenFunction &CGF, + llvm::AtomicOrdering AO, const Expr *X, + const Expr *E, const Expr *D, + const Expr *CE, bool IsXBinopExpr, + SourceLocation Loc) { + llvm::OpenMPIRBuilder &OMPBuilder = + CGF.CGM.getOpenMPRuntime().getOMPBuilder(); + + OMPAtomicCompareOp Op; + assert(isa(CE) && "CE is not a BinaryOperator"); + switch (cast(CE)->getOpcode()) { + case BO_EQ: + Op = OMPAtomicCompareOp::EQ; + break; + case BO_LT: + Op = OMPAtomicCompareOp::MIN; + break; + case BO_GT: + Op = OMPAtomicCompareOp::MAX; + break; + default: + llvm_unreachable("unsupported atomic compare binary operator"); + } + + LValue XLVal = CGF.EmitLValue(X); + llvm::Value *XPtr = XLVal.getPointer(CGF); + llvm::Value *EVal = CGF.EmitScalarExpr(E); + llvm::Value *DVal = D ? CGF.EmitScalarExpr(D) : nullptr; + + llvm::OpenMPIRBuilder::AtomicOpValue XOpVal{ + XPtr, XPtr->getType()->getPointerElementType(), + X->getType().isVolatileQualified(), + X->getType()->hasSignedIntegerRepresentation()}; + + CGF.Builder.restoreIP(OMPBuilder.createAtomicCompare( + CGF.Builder, XOpVal, EVal, DVal, AO, Op, IsXBinopExpr)); +} + static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, llvm::AtomicOrdering AO, bool IsPostfixUpdate, const Expr *X, const Expr *V, const Expr *E, - const Expr *UE, bool IsXLHSInRHSPart, - bool IsCompareCapture, SourceLocation Loc) { + const Expr *UE, const Expr *D, const Expr *CE, + bool IsXLHSInRHSPart, bool IsCompareCapture, + SourceLocation Loc) { switch (Kind) { case OMPC_read: emitOMPAtomicReadExpr(CGF, AO, X, V, Loc); @@ -6045,11 +6084,7 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, "'atomic compare capture' is not supported for now"); CGF.CGM.getDiags().Report(DiagID); } else { - // Emit an error here. 
- unsigned DiagID = CGF.CGM.getDiags().getCustomDiagID( - DiagnosticsEngine::Error, - "'atomic compare' is not supported for now"); - CGF.CGM.getDiags().Report(DiagID); + emitOMPAtomicCompareExpr(CGF, AO, X, E, D, CE, IsXLHSInRHSPart, Loc); } break; } @@ -6202,8 +6237,8 @@ void CodeGenFunction::EmitOMPAtomicDirective(const OMPAtomicDirective &S) { LexicalScope Scope(*this, S.getSourceRange()); EmitStopPoint(S.getAssociatedStmt()); emitOMPAtomicExpr(*this, Kind, AO, S.isPostfixUpdate(), S.getX(), S.getV(), - S.getExpr(), S.getUpdateExpr(), S.isXLHSInRHSPart(), - IsCompareCapture, S.getBeginLoc()); + S.getExpr(), S.getUpdateExpr(), S.getD(), S.getCondExpr(), + S.isXLHSInRHSPart(), IsCompareCapture, S.getBeginLoc()); } static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index ad8d304ef43c3..55cee7813b26b 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11133,11 +11133,11 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, switch (Cond->getOpcode()) { case BO_EQ: { C = Cond; - D = BO->getRHS(); + D = BO->getRHS()->IgnoreImpCasts(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { - E = Cond->getRHS(); + E = Cond->getRHS()->IgnoreImpCasts(); } else if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { - E = Cond->getLHS(); + E = Cond->getLHS()->IgnoreImpCasts(); } else { ErrorInfo.Error = ErrorTy::InvalidComparison; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); @@ -11148,7 +11148,7 @@ bool OpenMPAtomicCompareChecker::checkCondUpdateStmt(IfStmt *S, } case BO_LT: case BO_GT: { - E = BO->getRHS(); + E = BO->getRHS()->IgnoreImpCasts(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS()) && checkIfTwoExprsAreSame(ContextRef, E, Cond->getRHS())) { C = Cond; @@ -11228,11 +11228,11 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, switch (Cond->getOpcode()) { case BO_EQ: { C = Cond; - D = 
CO->getTrueExpr(); + D = CO->getTrueExpr()->IgnoreImpCasts(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS())) { - E = Cond->getRHS(); + E = Cond->getRHS()->IgnoreImpCasts(); } else if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getRHS())) { - E = Cond->getLHS(); + E = Cond->getLHS()->IgnoreImpCasts(); } else { ErrorInfo.Error = ErrorTy::InvalidComparison; ErrorInfo.ErrorLoc = ErrorInfo.NoteLoc = Cond->getExprLoc(); @@ -11243,7 +11243,7 @@ bool OpenMPAtomicCompareChecker::checkCondExprStmt(Stmt *S, } case BO_LT: case BO_GT: { - E = CO->getTrueExpr(); + E = CO->getTrueExpr()->IgnoreImpCasts(); if (checkIfTwoExprsAreSame(ContextRef, X, Cond->getLHS()) && checkIfTwoExprsAreSame(ContextRef, E, Cond->getRHS())) { C = Cond; @@ -11843,6 +11843,8 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, Expr *V = nullptr; Expr *E = nullptr; Expr *UE = nullptr; + Expr *D = nullptr; + Expr *CE = nullptr; bool IsXLHSInRHSPart = false; bool IsPostfixUpdate = false; // OpenMP [2.12.6, atomic Construct] @@ -12252,15 +12254,19 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, << ErrorInfo.Error << ErrorInfo.NoteRange; return StmtError(); } - // TODO: We don't set X, D, E, etc. here because in code gen we will emit - // error directly. + X = Checker.getX(); + E = Checker.getE(); + D = Checker.getD(); + CE = Checker.getCond(); + // We reuse IsXLHSInRHSPart to tell if it is in the form 'x ordop expr'. 
+ IsXLHSInRHSPart = Checker.isXBinopExpr(); } } setFunctionHasBranchProtectedScope(); return OMPAtomicDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - X, V, E, UE, IsXLHSInRHSPart, + X, V, E, UE, D, CE, IsXLHSInRHSPart, IsPostfixUpdate); } diff --git a/clang/test/OpenMP/atomic_compare_codegen.cpp b/clang/test/OpenMP/atomic_compare_codegen.cpp new file mode 100644 index 0000000000000..9d3dcf0fee138 --- /dev/null +++ b/clang/test/OpenMP/atomic_compare_codegen.cpp @@ -0,0 +1,4021 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp -fopenmp-version=51 -x c -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=51 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -target-cpu core2 -fopenmp-simd -fopenmp-version=51 -x c -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=51 -x c -triple x86_64-apple-darwin10 -target-cpu core2 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void foo() { + char cx, ce, cd; + unsigned char ucx, uce, ucd; + short sx, se, sd; + unsigned short usx, use, usd; + int ix, ie, id; + unsigned int uix, uie, uid; + long lx, le, ld; + unsigned long ulx, ule, uld; + long long llx, lle, lld; + unsigned long long ullx, ulle, ulld; + +#pragma omp atomic compare + cx = cx > ce ? 
ce : cx; +#pragma omp atomic compare + cx = cx < ce ? ce : cx; +#pragma omp atomic compare + cx = ce > cx ? ce : cx; +#pragma omp atomic compare + cx = ce < cx ? ce : cx; +#pragma omp atomic compare + if (cx > ce) + cx = ce; +#pragma omp atomic compare + if (cx < ce) + cx = ce; +#pragma omp atomic compare + if (ce > cx) + cx = ce; +#pragma omp atomic compare + if (ce < cx) + cx = ce; + +#pragma omp atomic compare + cx = cx == ce ? cd : cx; +#pragma omp atomic compare + cx = ce == cx ? cd : cx; +#pragma omp atomic compare + if (cx == ce) + cx = cd; +#pragma omp atomic compare + if (ce == cx) + cx = cd; + +#pragma omp atomic compare + ucx = ucx > uce ? uce : ucx; +#pragma omp atomic compare + ucx = ucx < uce ? uce : ucx; +#pragma omp atomic compare + ucx = uce > ucx ? uce : ucx; +#pragma omp atomic compare + ucx = uce < ucx ? uce : ucx; +#pragma omp atomic compare + if (ucx > uce) + ucx = uce; +#pragma omp atomic compare + if (ucx < uce) + ucx = uce; +#pragma omp atomic compare + if (uce > ucx) + ucx = uce; +#pragma omp atomic compare + if (uce < ucx) + ucx = uce; + +#pragma omp atomic compare + ucx = ucx == uce ? ucd : ucx; +#pragma omp atomic compare + ucx = uce == ucx ? ucd : ucx; +#pragma omp atomic compare + if (ucx == uce) + ucx = ucd; +#pragma omp atomic compare + if (uce == ucx) + ucx = ucd; + +#pragma omp atomic compare acq_rel + cx = cx > ce ? ce : cx; +#pragma omp atomic compare acq_rel + cx = cx < ce ? ce : cx; +#pragma omp atomic compare acq_rel + cx = ce > cx ? ce : cx; +#pragma omp atomic compare acq_rel + cx = ce < cx ? ce : cx; +#pragma omp atomic compare acq_rel + if (cx > ce) + cx = ce; +#pragma omp atomic compare acq_rel + if (cx < ce) + cx = ce; +#pragma omp atomic compare acq_rel + if (ce > cx) + cx = ce; +#pragma omp atomic compare acq_rel + if (ce < cx) + cx = ce; + +#pragma omp atomic compare acq_rel + cx = cx == ce ? cd : cx; +#pragma omp atomic compare acq_rel + cx = ce == cx ? 
cd : cx; +#pragma omp atomic compare acq_rel + if (cx == ce) + cx = cd; +#pragma omp atomic compare acq_rel + if (ce == cx) + cx = cd; + +#pragma omp atomic compare acq_rel + ucx = ucx > uce ? uce : ucx; +#pragma omp atomic compare acq_rel + ucx = ucx < uce ? uce : ucx; +#pragma omp atomic compare acq_rel + ucx = uce > ucx ? uce : ucx; +#pragma omp atomic compare acq_rel + ucx = uce < ucx ? uce : ucx; +#pragma omp atomic compare acq_rel + if (ucx > uce) + ucx = uce; +#pragma omp atomic compare acq_rel + if (ucx < uce) + ucx = uce; +#pragma omp atomic compare acq_rel + if (uce > ucx) + ucx = uce; +#pragma omp atomic compare acq_rel + if (uce < ucx) + ucx = uce; + +#pragma omp atomic compare acq_rel + ucx = ucx == uce ? ucd : ucx; +#pragma omp atomic compare acq_rel + ucx = uce == ucx ? ucd : ucx; +#pragma omp atomic compare acq_rel + if (ucx == uce) + ucx = ucd; +#pragma omp atomic compare acq_rel + if (uce == ucx) + ucx = ucd; + +#pragma omp atomic compare acquire + cx = cx > ce ? ce : cx; +#pragma omp atomic compare acquire + cx = cx < ce ? ce : cx; +#pragma omp atomic compare acquire + cx = ce > cx ? ce : cx; +#pragma omp atomic compare acquire + cx = ce < cx ? ce : cx; +#pragma omp atomic compare acquire + if (cx > ce) + cx = ce; +#pragma omp atomic compare acquire + if (cx < ce) + cx = ce; +#pragma omp atomic compare acquire + if (ce > cx) + cx = ce; +#pragma omp atomic compare acquire + if (ce < cx) + cx = ce; + +#pragma omp atomic compare acquire + cx = cx == ce ? cd : cx; +#pragma omp atomic compare acquire + cx = ce == cx ? cd : cx; +#pragma omp atomic compare acquire + if (cx == ce) + cx = cd; +#pragma omp atomic compare acquire + if (ce == cx) + cx = cd; + +#pragma omp atomic compare acquire + ucx = ucx > uce ? uce : ucx; +#pragma omp atomic compare acquire + ucx = ucx < uce ? uce : ucx; +#pragma omp atomic compare acquire + ucx = uce > ucx ? uce : ucx; +#pragma omp atomic compare acquire + ucx = uce < ucx ? 
uce : ucx; +#pragma omp atomic compare acquire + if (ucx > uce) + ucx = uce; +#pragma omp atomic compare acquire + if (ucx < uce) + ucx = uce; +#pragma omp atomic compare acquire + if (uce > ucx) + ucx = uce; +#pragma omp atomic compare acquire + if (uce < ucx) + ucx = uce; + +#pragma omp atomic compare acquire + ucx = ucx == uce ? ucd : ucx; +#pragma omp atomic compare acquire + ucx = uce == ucx ? ucd : ucx; +#pragma omp atomic compare acquire + if (ucx == uce) + ucx = ucd; +#pragma omp atomic compare acquire + if (uce == ucx) + ucx = ucd; + +#pragma omp atomic compare relaxed + cx = cx > ce ? ce : cx; +#pragma omp atomic compare relaxed + cx = cx < ce ? ce : cx; +#pragma omp atomic compare relaxed + cx = ce > cx ? ce : cx; +#pragma omp atomic compare relaxed + cx = ce < cx ? ce : cx; +#pragma omp atomic compare relaxed + if (cx > ce) + cx = ce; +#pragma omp atomic compare relaxed + if (cx < ce) + cx = ce; +#pragma omp atomic compare relaxed + if (ce > cx) + cx = ce; +#pragma omp atomic compare relaxed + if (ce < cx) + cx = ce; + +#pragma omp atomic compare relaxed + cx = cx == ce ? cd : cx; +#pragma omp atomic compare relaxed + cx = ce == cx ? cd : cx; +#pragma omp atomic compare relaxed + if (cx == ce) + cx = cd; +#pragma omp atomic compare relaxed + if (ce == cx) + cx = cd; + +#pragma omp atomic compare relaxed + ucx = ucx > uce ? uce : ucx; +#pragma omp atomic compare relaxed + ucx = ucx < uce ? uce : ucx; +#pragma omp atomic compare relaxed + ucx = uce > ucx ? uce : ucx; +#pragma omp atomic compare relaxed + ucx = uce < ucx ? uce : ucx; +#pragma omp atomic compare relaxed + if (ucx > uce) + ucx = uce; +#pragma omp atomic compare relaxed + if (ucx < uce) + ucx = uce; +#pragma omp atomic compare relaxed + if (uce > ucx) + ucx = uce; +#pragma omp atomic compare relaxed + if (uce < ucx) + ucx = uce; + +#pragma omp atomic compare relaxed + ucx = ucx == uce ? ucd : ucx; +#pragma omp atomic compare relaxed + ucx = uce == ucx ? 
ucd : ucx; +#pragma omp atomic compare relaxed + if (ucx == uce) + ucx = ucd; +#pragma omp atomic compare relaxed + if (uce == ucx) + ucx = ucd; + +#pragma omp atomic compare release + cx = cx > ce ? ce : cx; +#pragma omp atomic compare release + cx = cx < ce ? ce : cx; +#pragma omp atomic compare release + cx = ce > cx ? ce : cx; +#pragma omp atomic compare release + cx = ce < cx ? ce : cx; +#pragma omp atomic compare release + if (cx > ce) + cx = ce; +#pragma omp atomic compare release + if (cx < ce) + cx = ce; +#pragma omp atomic compare release + if (ce > cx) + cx = ce; +#pragma omp atomic compare release + if (ce < cx) + cx = ce; + +#pragma omp atomic compare release + cx = cx == ce ? cd : cx; +#pragma omp atomic compare release + cx = ce == cx ? cd : cx; +#pragma omp atomic compare release + if (cx == ce) + cx = cd; +#pragma omp atomic compare release + if (ce == cx) + cx = cd; + +#pragma omp atomic compare release + ucx = ucx > uce ? uce : ucx; +#pragma omp atomic compare release + ucx = ucx < uce ? uce : ucx; +#pragma omp atomic compare release + ucx = uce > ucx ? uce : ucx; +#pragma omp atomic compare release + ucx = uce < ucx ? uce : ucx; +#pragma omp atomic compare release + if (ucx > uce) + ucx = uce; +#pragma omp atomic compare release + if (ucx < uce) + ucx = uce; +#pragma omp atomic compare release + if (uce > ucx) + ucx = uce; +#pragma omp atomic compare release + if (uce < ucx) + ucx = uce; + +#pragma omp atomic compare release + ucx = ucx == uce ? ucd : ucx; +#pragma omp atomic compare release + ucx = uce == ucx ? ucd : ucx; +#pragma omp atomic compare release + if (ucx == uce) + ucx = ucd; +#pragma omp atomic compare release + if (uce == ucx) + ucx = ucd; + +#pragma omp atomic compare seq_cst + cx = cx > ce ? ce : cx; +#pragma omp atomic compare seq_cst + cx = cx < ce ? ce : cx; +#pragma omp atomic compare seq_cst + cx = ce > cx ? ce : cx; +#pragma omp atomic compare seq_cst + cx = ce < cx ? 
ce : cx; +#pragma omp atomic compare seq_cst + if (cx > ce) + cx = ce; +#pragma omp atomic compare seq_cst + if (cx < ce) + cx = ce; +#pragma omp atomic compare seq_cst + if (ce > cx) + cx = ce; +#pragma omp atomic compare seq_cst + if (ce < cx) + cx = ce; + +#pragma omp atomic compare seq_cst + cx = cx == ce ? cd : cx; +#pragma omp atomic compare seq_cst + cx = ce == cx ? cd : cx; +#pragma omp atomic compare seq_cst + if (cx == ce) + cx = cd; +#pragma omp atomic compare seq_cst + if (ce == cx) + cx = cd; + +#pragma omp atomic compare seq_cst + ucx = ucx > uce ? uce : ucx; +#pragma omp atomic compare seq_cst + ucx = ucx < uce ? uce : ucx; +#pragma omp atomic compare seq_cst + ucx = uce > ucx ? uce : ucx; +#pragma omp atomic compare seq_cst + ucx = uce < ucx ? uce : ucx; +#pragma omp atomic compare seq_cst + if (ucx > uce) + ucx = uce; +#pragma omp atomic compare seq_cst + if (ucx < uce) + ucx = uce; +#pragma omp atomic compare seq_cst + if (uce > ucx) + ucx = uce; +#pragma omp atomic compare seq_cst + if (uce < ucx) + ucx = uce; + +#pragma omp atomic compare seq_cst + ucx = ucx == uce ? ucd : ucx; +#pragma omp atomic compare seq_cst + ucx = uce == ucx ? ucd : ucx; +#pragma omp atomic compare seq_cst + if (ucx == uce) + ucx = ucd; +#pragma omp atomic compare seq_cst + if (uce == ucx) + ucx = ucd; + +#pragma omp atomic compare + sx = sx > se ? se : sx; +#pragma omp atomic compare + sx = sx < se ? se : sx; +#pragma omp atomic compare + sx = se > sx ? se : sx; +#pragma omp atomic compare + sx = se < sx ? se : sx; +#pragma omp atomic compare + if (sx > se) + sx = se; +#pragma omp atomic compare + if (sx < se) + sx = se; +#pragma omp atomic compare + if (se > sx) + sx = se; +#pragma omp atomic compare + if (se < sx) + sx = se; + +#pragma omp atomic compare + sx = sx == se ? sd : sx; +#pragma omp atomic compare + sx = se == sx ? 
sd : sx; +#pragma omp atomic compare + if (sx == se) + sx = sd; +#pragma omp atomic compare + if (se == sx) + sx = sd; + +#pragma omp atomic compare + usx = usx > use ? use : usx; +#pragma omp atomic compare + usx = usx < use ? use : usx; +#pragma omp atomic compare + usx = use > usx ? use : usx; +#pragma omp atomic compare + usx = use < usx ? use : usx; +#pragma omp atomic compare + if (usx > use) + usx = use; +#pragma omp atomic compare + if (usx < use) + usx = use; +#pragma omp atomic compare + if (use > usx) + usx = use; +#pragma omp atomic compare + if (use < usx) + usx = use; + +#pragma omp atomic compare + usx = usx == use ? usd : usx; +#pragma omp atomic compare + usx = use == usx ? usd : usx; +#pragma omp atomic compare + if (usx == use) + usx = usd; +#pragma omp atomic compare + if (use == usx) + usx = usd; + +#pragma omp atomic compare acq_rel + sx = sx > se ? se : sx; +#pragma omp atomic compare acq_rel + sx = sx < se ? se : sx; +#pragma omp atomic compare acq_rel + sx = se > sx ? se : sx; +#pragma omp atomic compare acq_rel + sx = se < sx ? se : sx; +#pragma omp atomic compare acq_rel + if (sx > se) + sx = se; +#pragma omp atomic compare acq_rel + if (sx < se) + sx = se; +#pragma omp atomic compare acq_rel + if (se > sx) + sx = se; +#pragma omp atomic compare acq_rel + if (se < sx) + sx = se; + +#pragma omp atomic compare acq_rel + sx = sx == se ? sd : sx; +#pragma omp atomic compare acq_rel + sx = se == sx ? sd : sx; +#pragma omp atomic compare acq_rel + if (sx == se) + sx = sd; +#pragma omp atomic compare acq_rel + if (se == sx) + sx = sd; + +#pragma omp atomic compare acq_rel + usx = usx > use ? use : usx; +#pragma omp atomic compare acq_rel + usx = usx < use ? use : usx; +#pragma omp atomic compare acq_rel + usx = use > usx ? use : usx; +#pragma omp atomic compare acq_rel + usx = use < usx ? 
use : usx; +#pragma omp atomic compare acq_rel + if (usx > use) + usx = use; +#pragma omp atomic compare acq_rel + if (usx < use) + usx = use; +#pragma omp atomic compare acq_rel + if (use > usx) + usx = use; +#pragma omp atomic compare acq_rel + if (use < usx) + usx = use; + +#pragma omp atomic compare acq_rel + usx = usx == use ? usd : usx; +#pragma omp atomic compare acq_rel + usx = use == usx ? usd : usx; +#pragma omp atomic compare acq_rel + if (usx == use) + usx = usd; +#pragma omp atomic compare acq_rel + if (use == usx) + usx = usd; + +#pragma omp atomic compare acquire + sx = sx > se ? se : sx; +#pragma omp atomic compare acquire + sx = sx < se ? se : sx; +#pragma omp atomic compare acquire + sx = se > sx ? se : sx; +#pragma omp atomic compare acquire + sx = se < sx ? se : sx; +#pragma omp atomic compare acquire + if (sx > se) + sx = se; +#pragma omp atomic compare acquire + if (sx < se) + sx = se; +#pragma omp atomic compare acquire + if (se > sx) + sx = se; +#pragma omp atomic compare acquire + if (se < sx) + sx = se; + +#pragma omp atomic compare acquire + sx = sx == se ? sd : sx; +#pragma omp atomic compare acquire + sx = se == sx ? sd : sx; +#pragma omp atomic compare acquire + if (sx == se) + sx = sd; +#pragma omp atomic compare acquire + if (se == sx) + sx = sd; + +#pragma omp atomic compare acquire + usx = usx > use ? use : usx; +#pragma omp atomic compare acquire + usx = usx < use ? use : usx; +#pragma omp atomic compare acquire + usx = use > usx ? use : usx; +#pragma omp atomic compare acquire + usx = use < usx ? use : usx; +#pragma omp atomic compare acquire + if (usx > use) + usx = use; +#pragma omp atomic compare acquire + if (usx < use) + usx = use; +#pragma omp atomic compare acquire + if (use > usx) + usx = use; +#pragma omp atomic compare acquire + if (use < usx) + usx = use; + +#pragma omp atomic compare acquire + usx = usx == use ? usd : usx; +#pragma omp atomic compare acquire + usx = use == usx ? 
usd : usx; +#pragma omp atomic compare acquire + if (usx == use) + usx = usd; +#pragma omp atomic compare acquire + if (use == usx) + usx = usd; + +#pragma omp atomic compare relaxed + sx = sx > se ? se : sx; +#pragma omp atomic compare relaxed + sx = sx < se ? se : sx; +#pragma omp atomic compare relaxed + sx = se > sx ? se : sx; +#pragma omp atomic compare relaxed + sx = se < sx ? se : sx; +#pragma omp atomic compare relaxed + if (sx > se) + sx = se; +#pragma omp atomic compare relaxed + if (sx < se) + sx = se; +#pragma omp atomic compare relaxed + if (se > sx) + sx = se; +#pragma omp atomic compare relaxed + if (se < sx) + sx = se; + +#pragma omp atomic compare relaxed + sx = sx == se ? sd : sx; +#pragma omp atomic compare relaxed + sx = se == sx ? sd : sx; +#pragma omp atomic compare relaxed + if (sx == se) + sx = sd; +#pragma omp atomic compare relaxed + if (se == sx) + sx = sd; + +#pragma omp atomic compare relaxed + usx = usx > use ? use : usx; +#pragma omp atomic compare relaxed + usx = usx < use ? use : usx; +#pragma omp atomic compare relaxed + usx = use > usx ? use : usx; +#pragma omp atomic compare relaxed + usx = use < usx ? use : usx; +#pragma omp atomic compare relaxed + if (usx > use) + usx = use; +#pragma omp atomic compare relaxed + if (usx < use) + usx = use; +#pragma omp atomic compare relaxed + if (use > usx) + usx = use; +#pragma omp atomic compare relaxed + if (use < usx) + usx = use; + +#pragma omp atomic compare relaxed + usx = usx == use ? usd : usx; +#pragma omp atomic compare relaxed + usx = use == usx ? usd : usx; +#pragma omp atomic compare relaxed + if (usx == use) + usx = usd; +#pragma omp atomic compare relaxed + if (use == usx) + usx = usd; + +#pragma omp atomic compare release + sx = sx > se ? se : sx; +#pragma omp atomic compare release + sx = sx < se ? se : sx; +#pragma omp atomic compare release + sx = se > sx ? se : sx; +#pragma omp atomic compare release + sx = se < sx ? 
se : sx; +#pragma omp atomic compare release + if (sx > se) + sx = se; +#pragma omp atomic compare release + if (sx < se) + sx = se; +#pragma omp atomic compare release + if (se > sx) + sx = se; +#pragma omp atomic compare release + if (se < sx) + sx = se; + +#pragma omp atomic compare release + sx = sx == se ? sd : sx; +#pragma omp atomic compare release + sx = se == sx ? sd : sx; +#pragma omp atomic compare release + if (sx == se) + sx = sd; +#pragma omp atomic compare release + if (se == sx) + sx = sd; + +#pragma omp atomic compare release + usx = usx > use ? use : usx; +#pragma omp atomic compare release + usx = usx < use ? use : usx; +#pragma omp atomic compare release + usx = use > usx ? use : usx; +#pragma omp atomic compare release + usx = use < usx ? use : usx; +#pragma omp atomic compare release + if (usx > use) + usx = use; +#pragma omp atomic compare release + if (usx < use) + usx = use; +#pragma omp atomic compare release + if (use > usx) + usx = use; +#pragma omp atomic compare release + if (use < usx) + usx = use; + +#pragma omp atomic compare release + usx = usx == use ? usd : usx; +#pragma omp atomic compare release + usx = use == usx ? usd : usx; +#pragma omp atomic compare release + if (usx == use) + usx = usd; +#pragma omp atomic compare release + if (use == usx) + usx = usd; + +#pragma omp atomic compare seq_cst + sx = sx > se ? se : sx; +#pragma omp atomic compare seq_cst + sx = sx < se ? se : sx; +#pragma omp atomic compare seq_cst + sx = se > sx ? se : sx; +#pragma omp atomic compare seq_cst + sx = se < sx ? se : sx; +#pragma omp atomic compare seq_cst + if (sx > se) + sx = se; +#pragma omp atomic compare seq_cst + if (sx < se) + sx = se; +#pragma omp atomic compare seq_cst + if (se > sx) + sx = se; +#pragma omp atomic compare seq_cst + if (se < sx) + sx = se; + +#pragma omp atomic compare seq_cst + sx = sx == se ? sd : sx; +#pragma omp atomic compare seq_cst + sx = se == sx ? 
sd : sx; +#pragma omp atomic compare seq_cst + if (sx == se) + sx = sd; +#pragma omp atomic compare seq_cst + if (se == sx) + sx = sd; + +#pragma omp atomic compare seq_cst + usx = usx > use ? use : usx; +#pragma omp atomic compare seq_cst + usx = usx < use ? use : usx; +#pragma omp atomic compare seq_cst + usx = use > usx ? use : usx; +#pragma omp atomic compare seq_cst + usx = use < usx ? use : usx; +#pragma omp atomic compare seq_cst + if (usx > use) + usx = use; +#pragma omp atomic compare seq_cst + if (usx < use) + usx = use; +#pragma omp atomic compare seq_cst + if (use > usx) + usx = use; +#pragma omp atomic compare seq_cst + if (use < usx) + usx = use; + +#pragma omp atomic compare seq_cst + usx = usx == use ? usd : usx; +#pragma omp atomic compare seq_cst + usx = use == usx ? usd : usx; +#pragma omp atomic compare seq_cst + if (usx == use) + usx = usd; +#pragma omp atomic compare seq_cst + if (use == usx) + usx = usd; + +#pragma omp atomic compare + ix = ix > ie ? ie : ix; +#pragma omp atomic compare + ix = ix < ie ? ie : ix; +#pragma omp atomic compare + ix = ie > ix ? ie : ix; +#pragma omp atomic compare + ix = ie < ix ? ie : ix; +#pragma omp atomic compare + if (ix > ie) + ix = ie; +#pragma omp atomic compare + if (ix < ie) + ix = ie; +#pragma omp atomic compare + if (ie > ix) + ix = ie; +#pragma omp atomic compare + if (ie < ix) + ix = ie; + +#pragma omp atomic compare + ix = ix == ie ? id : ix; +#pragma omp atomic compare + ix = ie == ix ? id : ix; +#pragma omp atomic compare + if (ix == ie) + ix = id; +#pragma omp atomic compare + if (ie == ix) + ix = id; + +#pragma omp atomic compare + uix = uix > uie ? uie : uix; +#pragma omp atomic compare + uix = uix < uie ? uie : uix; +#pragma omp atomic compare + uix = uie > uix ? uie : uix; +#pragma omp atomic compare + uix = uie < uix ? 
uie : uix; +#pragma omp atomic compare + if (uix > uie) + uix = uie; +#pragma omp atomic compare + if (uix < uie) + uix = uie; +#pragma omp atomic compare + if (uie > uix) + uix = uie; +#pragma omp atomic compare + if (uie < uix) + uix = uie; + +#pragma omp atomic compare + uix = uix == uie ? uid : uix; +#pragma omp atomic compare + uix = uie == uix ? uid : uix; +#pragma omp atomic compare + if (uix == uie) + uix = uid; +#pragma omp atomic compare + if (uie == uix) + uix = uid; + +#pragma omp atomic compare acq_rel + ix = ix > ie ? ie : ix; +#pragma omp atomic compare acq_rel + ix = ix < ie ? ie : ix; +#pragma omp atomic compare acq_rel + ix = ie > ix ? ie : ix; +#pragma omp atomic compare acq_rel + ix = ie < ix ? ie : ix; +#pragma omp atomic compare acq_rel + if (ix > ie) + ix = ie; +#pragma omp atomic compare acq_rel + if (ix < ie) + ix = ie; +#pragma omp atomic compare acq_rel + if (ie > ix) + ix = ie; +#pragma omp atomic compare acq_rel + if (ie < ix) + ix = ie; + +#pragma omp atomic compare acq_rel + ix = ix == ie ? id : ix; +#pragma omp atomic compare acq_rel + ix = ie == ix ? id : ix; +#pragma omp atomic compare acq_rel + if (ix == ie) + ix = id; +#pragma omp atomic compare acq_rel + if (ie == ix) + ix = id; + +#pragma omp atomic compare acq_rel + uix = uix > uie ? uie : uix; +#pragma omp atomic compare acq_rel + uix = uix < uie ? uie : uix; +#pragma omp atomic compare acq_rel + uix = uie > uix ? uie : uix; +#pragma omp atomic compare acq_rel + uix = uie < uix ? uie : uix; +#pragma omp atomic compare acq_rel + if (uix > uie) + uix = uie; +#pragma omp atomic compare acq_rel + if (uix < uie) + uix = uie; +#pragma omp atomic compare acq_rel + if (uie > uix) + uix = uie; +#pragma omp atomic compare acq_rel + if (uie < uix) + uix = uie; + +#pragma omp atomic compare acq_rel + uix = uix == uie ? uid : uix; +#pragma omp atomic compare acq_rel + uix = uie == uix ? 
uid : uix; +#pragma omp atomic compare acq_rel + if (uix == uie) + uix = uid; +#pragma omp atomic compare acq_rel + if (uie == uix) + uix = uid; + +#pragma omp atomic compare acquire + ix = ix > ie ? ie : ix; +#pragma omp atomic compare acquire + ix = ix < ie ? ie : ix; +#pragma omp atomic compare acquire + ix = ie > ix ? ie : ix; +#pragma omp atomic compare acquire + ix = ie < ix ? ie : ix; +#pragma omp atomic compare acquire + if (ix > ie) + ix = ie; +#pragma omp atomic compare acquire + if (ix < ie) + ix = ie; +#pragma omp atomic compare acquire + if (ie > ix) + ix = ie; +#pragma omp atomic compare acquire + if (ie < ix) + ix = ie; + +#pragma omp atomic compare acquire + ix = ix == ie ? id : ix; +#pragma omp atomic compare acquire + ix = ie == ix ? id : ix; +#pragma omp atomic compare acquire + if (ix == ie) + ix = id; +#pragma omp atomic compare acquire + if (ie == ix) + ix = id; + +#pragma omp atomic compare acquire + uix = uix > uie ? uie : uix; +#pragma omp atomic compare acquire + uix = uix < uie ? uie : uix; +#pragma omp atomic compare acquire + uix = uie > uix ? uie : uix; +#pragma omp atomic compare acquire + uix = uie < uix ? uie : uix; +#pragma omp atomic compare acquire + if (uix > uie) + uix = uie; +#pragma omp atomic compare acquire + if (uix < uie) + uix = uie; +#pragma omp atomic compare acquire + if (uie > uix) + uix = uie; +#pragma omp atomic compare acquire + if (uie < uix) + uix = uie; + +#pragma omp atomic compare acquire + uix = uix == uie ? uid : uix; +#pragma omp atomic compare acquire + uix = uie == uix ? uid : uix; +#pragma omp atomic compare acquire + if (uix == uie) + uix = uid; +#pragma omp atomic compare acquire + if (uie == uix) + uix = uid; + +#pragma omp atomic compare relaxed + ix = ix > ie ? ie : ix; +#pragma omp atomic compare relaxed + ix = ix < ie ? ie : ix; +#pragma omp atomic compare relaxed + ix = ie > ix ? ie : ix; +#pragma omp atomic compare relaxed + ix = ie < ix ? 
ie : ix; +#pragma omp atomic compare relaxed + if (ix > ie) + ix = ie; +#pragma omp atomic compare relaxed + if (ix < ie) + ix = ie; +#pragma omp atomic compare relaxed + if (ie > ix) + ix = ie; +#pragma omp atomic compare relaxed + if (ie < ix) + ix = ie; + +#pragma omp atomic compare relaxed + ix = ix == ie ? id : ix; +#pragma omp atomic compare relaxed + ix = ie == ix ? id : ix; +#pragma omp atomic compare relaxed + if (ix == ie) + ix = id; +#pragma omp atomic compare relaxed + if (ie == ix) + ix = id; + +#pragma omp atomic compare relaxed + uix = uix > uie ? uie : uix; +#pragma omp atomic compare relaxed + uix = uix < uie ? uie : uix; +#pragma omp atomic compare relaxed + uix = uie > uix ? uie : uix; +#pragma omp atomic compare relaxed + uix = uie < uix ? uie : uix; +#pragma omp atomic compare relaxed + if (uix > uie) + uix = uie; +#pragma omp atomic compare relaxed + if (uix < uie) + uix = uie; +#pragma omp atomic compare relaxed + if (uie > uix) + uix = uie; +#pragma omp atomic compare relaxed + if (uie < uix) + uix = uie; + +#pragma omp atomic compare relaxed + uix = uix == uie ? uid : uix; +#pragma omp atomic compare relaxed + uix = uie == uix ? uid : uix; +#pragma omp atomic compare relaxed + if (uix == uie) + uix = uid; +#pragma omp atomic compare relaxed + if (uie == uix) + uix = uid; + +#pragma omp atomic compare release + ix = ix > ie ? ie : ix; +#pragma omp atomic compare release + ix = ix < ie ? ie : ix; +#pragma omp atomic compare release + ix = ie > ix ? ie : ix; +#pragma omp atomic compare release + ix = ie < ix ? ie : ix; +#pragma omp atomic compare release + if (ix > ie) + ix = ie; +#pragma omp atomic compare release + if (ix < ie) + ix = ie; +#pragma omp atomic compare release + if (ie > ix) + ix = ie; +#pragma omp atomic compare release + if (ie < ix) + ix = ie; + +#pragma omp atomic compare release + ix = ix == ie ? id : ix; +#pragma omp atomic compare release + ix = ie == ix ? 
id : ix; +#pragma omp atomic compare release + if (ix == ie) + ix = id; +#pragma omp atomic compare release + if (ie == ix) + ix = id; + +#pragma omp atomic compare release + uix = uix > uie ? uie : uix; +#pragma omp atomic compare release + uix = uix < uie ? uie : uix; +#pragma omp atomic compare release + uix = uie > uix ? uie : uix; +#pragma omp atomic compare release + uix = uie < uix ? uie : uix; +#pragma omp atomic compare release + if (uix > uie) + uix = uie; +#pragma omp atomic compare release + if (uix < uie) + uix = uie; +#pragma omp atomic compare release + if (uie > uix) + uix = uie; +#pragma omp atomic compare release + if (uie < uix) + uix = uie; + +#pragma omp atomic compare release + uix = uix == uie ? uid : uix; +#pragma omp atomic compare release + uix = uie == uix ? uid : uix; +#pragma omp atomic compare release + if (uix == uie) + uix = uid; +#pragma omp atomic compare release + if (uie == uix) + uix = uid; + +#pragma omp atomic compare seq_cst + ix = ix > ie ? ie : ix; +#pragma omp atomic compare seq_cst + ix = ix < ie ? ie : ix; +#pragma omp atomic compare seq_cst + ix = ie > ix ? ie : ix; +#pragma omp atomic compare seq_cst + ix = ie < ix ? ie : ix; +#pragma omp atomic compare seq_cst + if (ix > ie) + ix = ie; +#pragma omp atomic compare seq_cst + if (ix < ie) + ix = ie; +#pragma omp atomic compare seq_cst + if (ie > ix) + ix = ie; +#pragma omp atomic compare seq_cst + if (ie < ix) + ix = ie; + +#pragma omp atomic compare seq_cst + ix = ix == ie ? id : ix; +#pragma omp atomic compare seq_cst + ix = ie == ix ? id : ix; +#pragma omp atomic compare seq_cst + if (ix == ie) + ix = id; +#pragma omp atomic compare seq_cst + if (ie == ix) + ix = id; + +#pragma omp atomic compare seq_cst + uix = uix > uie ? uie : uix; +#pragma omp atomic compare seq_cst + uix = uix < uie ? uie : uix; +#pragma omp atomic compare seq_cst + uix = uie > uix ? uie : uix; +#pragma omp atomic compare seq_cst + uix = uie < uix ? 
uie : uix; +#pragma omp atomic compare seq_cst + if (uix > uie) + uix = uie; +#pragma omp atomic compare seq_cst + if (uix < uie) + uix = uie; +#pragma omp atomic compare seq_cst + if (uie > uix) + uix = uie; +#pragma omp atomic compare seq_cst + if (uie < uix) + uix = uie; + +#pragma omp atomic compare seq_cst + uix = uix == uie ? uid : uix; +#pragma omp atomic compare seq_cst + uix = uie == uix ? uid : uix; +#pragma omp atomic compare seq_cst + if (uix == uie) + uix = uid; +#pragma omp atomic compare seq_cst + if (uie == uix) + uix = uid; + +#pragma omp atomic compare + lx = lx > le ? le : lx; +#pragma omp atomic compare + lx = lx < le ? le : lx; +#pragma omp atomic compare + lx = le > lx ? le : lx; +#pragma omp atomic compare + lx = le < lx ? le : lx; +#pragma omp atomic compare + if (lx > le) + lx = le; +#pragma omp atomic compare + if (lx < le) + lx = le; +#pragma omp atomic compare + if (le > lx) + lx = le; +#pragma omp atomic compare + if (le < lx) + lx = le; + +#pragma omp atomic compare + lx = lx == le ? ld : lx; +#pragma omp atomic compare + lx = le == lx ? ld : lx; +#pragma omp atomic compare + if (lx == le) + lx = ld; +#pragma omp atomic compare + if (le == lx) + lx = ld; + +#pragma omp atomic compare + ulx = ulx > ule ? ule : ulx; +#pragma omp atomic compare + ulx = ulx < ule ? ule : ulx; +#pragma omp atomic compare + ulx = ule > ulx ? ule : ulx; +#pragma omp atomic compare + ulx = ule < ulx ? ule : ulx; +#pragma omp atomic compare + if (ulx > ule) + ulx = ule; +#pragma omp atomic compare + if (ulx < ule) + ulx = ule; +#pragma omp atomic compare + if (ule > ulx) + ulx = ule; +#pragma omp atomic compare + if (ule < ulx) + ulx = ule; + +#pragma omp atomic compare + ulx = ulx == ule ? uld : ulx; +#pragma omp atomic compare + ulx = ule == ulx ? uld : ulx; +#pragma omp atomic compare + if (ulx == ule) + ulx = uld; +#pragma omp atomic compare + if (ule == ulx) + ulx = uld; + +#pragma omp atomic compare acq_rel + lx = lx > le ? 
le : lx; +#pragma omp atomic compare acq_rel + lx = lx < le ? le : lx; +#pragma omp atomic compare acq_rel + lx = le > lx ? le : lx; +#pragma omp atomic compare acq_rel + lx = le < lx ? le : lx; +#pragma omp atomic compare acq_rel + if (lx > le) + lx = le; +#pragma omp atomic compare acq_rel + if (lx < le) + lx = le; +#pragma omp atomic compare acq_rel + if (le > lx) + lx = le; +#pragma omp atomic compare acq_rel + if (le < lx) + lx = le; + +#pragma omp atomic compare acq_rel + lx = lx == le ? ld : lx; +#pragma omp atomic compare acq_rel + lx = le == lx ? ld : lx; +#pragma omp atomic compare acq_rel + if (lx == le) + lx = ld; +#pragma omp atomic compare acq_rel + if (le == lx) + lx = ld; + +#pragma omp atomic compare acq_rel + ulx = ulx > ule ? ule : ulx; +#pragma omp atomic compare acq_rel + ulx = ulx < ule ? ule : ulx; +#pragma omp atomic compare acq_rel + ulx = ule > ulx ? ule : ulx; +#pragma omp atomic compare acq_rel + ulx = ule < ulx ? ule : ulx; +#pragma omp atomic compare acq_rel + if (ulx > ule) + ulx = ule; +#pragma omp atomic compare acq_rel + if (ulx < ule) + ulx = ule; +#pragma omp atomic compare acq_rel + if (ule > ulx) + ulx = ule; +#pragma omp atomic compare acq_rel + if (ule < ulx) + ulx = ule; + +#pragma omp atomic compare acq_rel + ulx = ulx == ule ? uld : ulx; +#pragma omp atomic compare acq_rel + ulx = ule == ulx ? uld : ulx; +#pragma omp atomic compare acq_rel + if (ulx == ule) + ulx = uld; +#pragma omp atomic compare acq_rel + if (ule == ulx) + ulx = uld; + +#pragma omp atomic compare acquire + lx = lx > le ? le : lx; +#pragma omp atomic compare acquire + lx = lx < le ? le : lx; +#pragma omp atomic compare acquire + lx = le > lx ? le : lx; +#pragma omp atomic compare acquire + lx = le < lx ? 
le : lx; +#pragma omp atomic compare acquire + if (lx > le) + lx = le; +#pragma omp atomic compare acquire + if (lx < le) + lx = le; +#pragma omp atomic compare acquire + if (le > lx) + lx = le; +#pragma omp atomic compare acquire + if (le < lx) + lx = le; + +#pragma omp atomic compare acquire + lx = lx == le ? ld : lx; +#pragma omp atomic compare acquire + lx = le == lx ? ld : lx; +#pragma omp atomic compare acquire + if (lx == le) + lx = ld; +#pragma omp atomic compare acquire + if (le == lx) + lx = ld; + +#pragma omp atomic compare acquire + ulx = ulx > ule ? ule : ulx; +#pragma omp atomic compare acquire + ulx = ulx < ule ? ule : ulx; +#pragma omp atomic compare acquire + ulx = ule > ulx ? ule : ulx; +#pragma omp atomic compare acquire + ulx = ule < ulx ? ule : ulx; +#pragma omp atomic compare acquire + if (ulx > ule) + ulx = ule; +#pragma omp atomic compare acquire + if (ulx < ule) + ulx = ule; +#pragma omp atomic compare acquire + if (ule > ulx) + ulx = ule; +#pragma omp atomic compare acquire + if (ule < ulx) + ulx = ule; + +#pragma omp atomic compare acquire + ulx = ulx == ule ? uld : ulx; +#pragma omp atomic compare acquire + ulx = ule == ulx ? uld : ulx; +#pragma omp atomic compare acquire + if (ulx == ule) + ulx = uld; +#pragma omp atomic compare acquire + if (ule == ulx) + ulx = uld; + +#pragma omp atomic compare relaxed + lx = lx > le ? le : lx; +#pragma omp atomic compare relaxed + lx = lx < le ? le : lx; +#pragma omp atomic compare relaxed + lx = le > lx ? le : lx; +#pragma omp atomic compare relaxed + lx = le < lx ? le : lx; +#pragma omp atomic compare relaxed + if (lx > le) + lx = le; +#pragma omp atomic compare relaxed + if (lx < le) + lx = le; +#pragma omp atomic compare relaxed + if (le > lx) + lx = le; +#pragma omp atomic compare relaxed + if (le < lx) + lx = le; + +#pragma omp atomic compare relaxed + lx = lx == le ? ld : lx; +#pragma omp atomic compare relaxed + lx = le == lx ? 
ld : lx; +#pragma omp atomic compare relaxed + if (lx == le) + lx = ld; +#pragma omp atomic compare relaxed + if (le == lx) + lx = ld; + +#pragma omp atomic compare relaxed + ulx = ulx > ule ? ule : ulx; +#pragma omp atomic compare relaxed + ulx = ulx < ule ? ule : ulx; +#pragma omp atomic compare relaxed + ulx = ule > ulx ? ule : ulx; +#pragma omp atomic compare relaxed + ulx = ule < ulx ? ule : ulx; +#pragma omp atomic compare relaxed + if (ulx > ule) + ulx = ule; +#pragma omp atomic compare relaxed + if (ulx < ule) + ulx = ule; +#pragma omp atomic compare relaxed + if (ule > ulx) + ulx = ule; +#pragma omp atomic compare relaxed + if (ule < ulx) + ulx = ule; + +#pragma omp atomic compare relaxed + ulx = ulx == ule ? uld : ulx; +#pragma omp atomic compare relaxed + ulx = ule == ulx ? uld : ulx; +#pragma omp atomic compare relaxed + if (ulx == ule) + ulx = uld; +#pragma omp atomic compare relaxed + if (ule == ulx) + ulx = uld; + +#pragma omp atomic compare release + lx = lx > le ? le : lx; +#pragma omp atomic compare release + lx = lx < le ? le : lx; +#pragma omp atomic compare release + lx = le > lx ? le : lx; +#pragma omp atomic compare release + lx = le < lx ? le : lx; +#pragma omp atomic compare release + if (lx > le) + lx = le; +#pragma omp atomic compare release + if (lx < le) + lx = le; +#pragma omp atomic compare release + if (le > lx) + lx = le; +#pragma omp atomic compare release + if (le < lx) + lx = le; + +#pragma omp atomic compare release + lx = lx == le ? ld : lx; +#pragma omp atomic compare release + lx = le == lx ? ld : lx; +#pragma omp atomic compare release + if (lx == le) + lx = ld; +#pragma omp atomic compare release + if (le == lx) + lx = ld; + +#pragma omp atomic compare release + ulx = ulx > ule ? ule : ulx; +#pragma omp atomic compare release + ulx = ulx < ule ? ule : ulx; +#pragma omp atomic compare release + ulx = ule > ulx ? ule : ulx; +#pragma omp atomic compare release + ulx = ule < ulx ? 
ule : ulx; +#pragma omp atomic compare release + if (ulx > ule) + ulx = ule; +#pragma omp atomic compare release + if (ulx < ule) + ulx = ule; +#pragma omp atomic compare release + if (ule > ulx) + ulx = ule; +#pragma omp atomic compare release + if (ule < ulx) + ulx = ule; + +#pragma omp atomic compare release + ulx = ulx == ule ? uld : ulx; +#pragma omp atomic compare release + ulx = ule == ulx ? uld : ulx; +#pragma omp atomic compare release + if (ulx == ule) + ulx = uld; +#pragma omp atomic compare release + if (ule == ulx) + ulx = uld; + +#pragma omp atomic compare seq_cst + lx = lx > le ? le : lx; +#pragma omp atomic compare seq_cst + lx = lx < le ? le : lx; +#pragma omp atomic compare seq_cst + lx = le > lx ? le : lx; +#pragma omp atomic compare seq_cst + lx = le < lx ? le : lx; +#pragma omp atomic compare seq_cst + if (lx > le) + lx = le; +#pragma omp atomic compare seq_cst + if (lx < le) + lx = le; +#pragma omp atomic compare seq_cst + if (le > lx) + lx = le; +#pragma omp atomic compare seq_cst + if (le < lx) + lx = le; + +#pragma omp atomic compare seq_cst + lx = lx == le ? ld : lx; +#pragma omp atomic compare seq_cst + lx = le == lx ? ld : lx; +#pragma omp atomic compare seq_cst + if (lx == le) + lx = ld; +#pragma omp atomic compare seq_cst + if (le == lx) + lx = ld; + +#pragma omp atomic compare seq_cst + ulx = ulx > ule ? ule : ulx; +#pragma omp atomic compare seq_cst + ulx = ulx < ule ? ule : ulx; +#pragma omp atomic compare seq_cst + ulx = ule > ulx ? ule : ulx; +#pragma omp atomic compare seq_cst + ulx = ule < ulx ? ule : ulx; +#pragma omp atomic compare seq_cst + if (ulx > ule) + ulx = ule; +#pragma omp atomic compare seq_cst + if (ulx < ule) + ulx = ule; +#pragma omp atomic compare seq_cst + if (ule > ulx) + ulx = ule; +#pragma omp atomic compare seq_cst + if (ule < ulx) + ulx = ule; + +#pragma omp atomic compare seq_cst + ulx = ulx == ule ? uld : ulx; +#pragma omp atomic compare seq_cst + ulx = ule == ulx ? 
uld : ulx; +#pragma omp atomic compare seq_cst + if (ulx == ule) + ulx = uld; +#pragma omp atomic compare seq_cst + if (ule == ulx) + ulx = uld; + +#pragma omp atomic compare + llx = llx > lle ? lle : llx; +#pragma omp atomic compare + llx = llx < lle ? lle : llx; +#pragma omp atomic compare + llx = lle > llx ? lle : llx; +#pragma omp atomic compare + llx = lle < llx ? lle : llx; +#pragma omp atomic compare + if (llx > lle) + llx = lle; +#pragma omp atomic compare + if (llx < lle) + llx = lle; +#pragma omp atomic compare + if (lle > llx) + llx = lle; +#pragma omp atomic compare + if (lle < llx) + llx = lle; + +#pragma omp atomic compare + llx = llx == lle ? lld : llx; +#pragma omp atomic compare + llx = lle == llx ? lld : llx; +#pragma omp atomic compare + if (llx == lle) + llx = lld; +#pragma omp atomic compare + if (lle == llx) + llx = lld; + +#pragma omp atomic compare + ullx = ullx > ulle ? ulle : ullx; +#pragma omp atomic compare + ullx = ullx < ulle ? ulle : ullx; +#pragma omp atomic compare + ullx = ulle > ullx ? ulle : ullx; +#pragma omp atomic compare + ullx = ulle < ullx ? ulle : ullx; +#pragma omp atomic compare + if (ullx > ulle) + ullx = ulle; +#pragma omp atomic compare + if (ullx < ulle) + ullx = ulle; +#pragma omp atomic compare + if (ulle > ullx) + ullx = ulle; +#pragma omp atomic compare + if (ulle < ullx) + ullx = ulle; + +#pragma omp atomic compare + ullx = ullx == ulle ? ulld : ullx; +#pragma omp atomic compare + ullx = ulle == ullx ? ulld : ullx; +#pragma omp atomic compare + if (ullx == ulle) + ullx = ulld; +#pragma omp atomic compare + if (ulle == ullx) + ullx = ulld; + +#pragma omp atomic compare acq_rel + llx = llx > lle ? lle : llx; +#pragma omp atomic compare acq_rel + llx = llx < lle ? lle : llx; +#pragma omp atomic compare acq_rel + llx = lle > llx ? lle : llx; +#pragma omp atomic compare acq_rel + llx = lle < llx ? 
lle : llx; +#pragma omp atomic compare acq_rel + if (llx > lle) + llx = lle; +#pragma omp atomic compare acq_rel + if (llx < lle) + llx = lle; +#pragma omp atomic compare acq_rel + if (lle > llx) + llx = lle; +#pragma omp atomic compare acq_rel + if (lle < llx) + llx = lle; + +#pragma omp atomic compare acq_rel + llx = llx == lle ? lld : llx; +#pragma omp atomic compare acq_rel + llx = lle == llx ? lld : llx; +#pragma omp atomic compare acq_rel + if (llx == lle) + llx = lld; +#pragma omp atomic compare acq_rel + if (lle == llx) + llx = lld; + +#pragma omp atomic compare acq_rel + ullx = ullx > ulle ? ulle : ullx; +#pragma omp atomic compare acq_rel + ullx = ullx < ulle ? ulle : ullx; +#pragma omp atomic compare acq_rel + ullx = ulle > ullx ? ulle : ullx; +#pragma omp atomic compare acq_rel + ullx = ulle < ullx ? ulle : ullx; +#pragma omp atomic compare acq_rel + if (ullx > ulle) + ullx = ulle; +#pragma omp atomic compare acq_rel + if (ullx < ulle) + ullx = ulle; +#pragma omp atomic compare acq_rel + if (ulle > ullx) + ullx = ulle; +#pragma omp atomic compare acq_rel + if (ulle < ullx) + ullx = ulle; + +#pragma omp atomic compare acq_rel + ullx = ullx == ulle ? ulld : ullx; +#pragma omp atomic compare acq_rel + ullx = ulle == ullx ? ulld : ullx; +#pragma omp atomic compare acq_rel + if (ullx == ulle) + ullx = ulld; +#pragma omp atomic compare acq_rel + if (ulle == ullx) + ullx = ulld; + +#pragma omp atomic compare acquire + llx = llx > lle ? lle : llx; +#pragma omp atomic compare acquire + llx = llx < lle ? lle : llx; +#pragma omp atomic compare acquire + llx = lle > llx ? lle : llx; +#pragma omp atomic compare acquire + llx = lle < llx ? lle : llx; +#pragma omp atomic compare acquire + if (llx > lle) + llx = lle; +#pragma omp atomic compare acquire + if (llx < lle) + llx = lle; +#pragma omp atomic compare acquire + if (lle > llx) + llx = lle; +#pragma omp atomic compare acquire + if (lle < llx) + llx = lle; + +#pragma omp atomic compare acquire + llx = llx == lle ? 
lld : llx; +#pragma omp atomic compare acquire + llx = lle == llx ? lld : llx; +#pragma omp atomic compare acquire + if (llx == lle) + llx = lld; +#pragma omp atomic compare acquire + if (lle == llx) + llx = lld; + +#pragma omp atomic compare acquire + ullx = ullx > ulle ? ulle : ullx; +#pragma omp atomic compare acquire + ullx = ullx < ulle ? ulle : ullx; +#pragma omp atomic compare acquire + ullx = ulle > ullx ? ulle : ullx; +#pragma omp atomic compare acquire + ullx = ulle < ullx ? ulle : ullx; +#pragma omp atomic compare acquire + if (ullx > ulle) + ullx = ulle; +#pragma omp atomic compare acquire + if (ullx < ulle) + ullx = ulle; +#pragma omp atomic compare acquire + if (ulle > ullx) + ullx = ulle; +#pragma omp atomic compare acquire + if (ulle < ullx) + ullx = ulle; + +#pragma omp atomic compare acquire + ullx = ullx == ulle ? ulld : ullx; +#pragma omp atomic compare acquire + ullx = ulle == ullx ? ulld : ullx; +#pragma omp atomic compare acquire + if (ullx == ulle) + ullx = ulld; +#pragma omp atomic compare acquire + if (ulle == ullx) + ullx = ulld; + +#pragma omp atomic compare relaxed + llx = llx > lle ? lle : llx; +#pragma omp atomic compare relaxed + llx = llx < lle ? lle : llx; +#pragma omp atomic compare relaxed + llx = lle > llx ? lle : llx; +#pragma omp atomic compare relaxed + llx = lle < llx ? lle : llx; +#pragma omp atomic compare relaxed + if (llx > lle) + llx = lle; +#pragma omp atomic compare relaxed + if (llx < lle) + llx = lle; +#pragma omp atomic compare relaxed + if (lle > llx) + llx = lle; +#pragma omp atomic compare relaxed + if (lle < llx) + llx = lle; + +#pragma omp atomic compare relaxed + llx = llx == lle ? lld : llx; +#pragma omp atomic compare relaxed + llx = lle == llx ? lld : llx; +#pragma omp atomic compare relaxed + if (llx == lle) + llx = lld; +#pragma omp atomic compare relaxed + if (lle == llx) + llx = lld; + +#pragma omp atomic compare relaxed + ullx = ullx > ulle ? 
ulle : ullx; +#pragma omp atomic compare relaxed + ullx = ullx < ulle ? ulle : ullx; +#pragma omp atomic compare relaxed + ullx = ulle > ullx ? ulle : ullx; +#pragma omp atomic compare relaxed + ullx = ulle < ullx ? ulle : ullx; +#pragma omp atomic compare relaxed + if (ullx > ulle) + ullx = ulle; +#pragma omp atomic compare relaxed + if (ullx < ulle) + ullx = ulle; +#pragma omp atomic compare relaxed + if (ulle > ullx) + ullx = ulle; +#pragma omp atomic compare relaxed + if (ulle < ullx) + ullx = ulle; + +#pragma omp atomic compare relaxed + ullx = ullx == ulle ? ulld : ullx; +#pragma omp atomic compare relaxed + ullx = ulle == ullx ? ulld : ullx; +#pragma omp atomic compare relaxed + if (ullx == ulle) + ullx = ulld; +#pragma omp atomic compare relaxed + if (ulle == ullx) + ullx = ulld; + +#pragma omp atomic compare release + llx = llx > lle ? lle : llx; +#pragma omp atomic compare release + llx = llx < lle ? lle : llx; +#pragma omp atomic compare release + llx = lle > llx ? lle : llx; +#pragma omp atomic compare release + llx = lle < llx ? lle : llx; +#pragma omp atomic compare release + if (llx > lle) + llx = lle; +#pragma omp atomic compare release + if (llx < lle) + llx = lle; +#pragma omp atomic compare release + if (lle > llx) + llx = lle; +#pragma omp atomic compare release + if (lle < llx) + llx = lle; + +#pragma omp atomic compare release + llx = llx == lle ? lld : llx; +#pragma omp atomic compare release + llx = lle == llx ? lld : llx; +#pragma omp atomic compare release + if (llx == lle) + llx = lld; +#pragma omp atomic compare release + if (lle == llx) + llx = lld; + +#pragma omp atomic compare release + ullx = ullx > ulle ? ulle : ullx; +#pragma omp atomic compare release + ullx = ullx < ulle ? ulle : ullx; +#pragma omp atomic compare release + ullx = ulle > ullx ? ulle : ullx; +#pragma omp atomic compare release + ullx = ulle < ullx ? 
ulle : ullx; +#pragma omp atomic compare release + if (ullx > ulle) + ullx = ulle; +#pragma omp atomic compare release + if (ullx < ulle) + ullx = ulle; +#pragma omp atomic compare release + if (ulle > ullx) + ullx = ulle; +#pragma omp atomic compare release + if (ulle < ullx) + ullx = ulle; + +#pragma omp atomic compare release + ullx = ullx == ulle ? ulld : ullx; +#pragma omp atomic compare release + ullx = ulle == ullx ? ulld : ullx; +#pragma omp atomic compare release + if (ullx == ulle) + ullx = ulld; +#pragma omp atomic compare release + if (ulle == ullx) + ullx = ulld; + +#pragma omp atomic compare seq_cst + llx = llx > lle ? lle : llx; +#pragma omp atomic compare seq_cst + llx = llx < lle ? lle : llx; +#pragma omp atomic compare seq_cst + llx = lle > llx ? lle : llx; +#pragma omp atomic compare seq_cst + llx = lle < llx ? lle : llx; +#pragma omp atomic compare seq_cst + if (llx > lle) + llx = lle; +#pragma omp atomic compare seq_cst + if (llx < lle) + llx = lle; +#pragma omp atomic compare seq_cst + if (lle > llx) + llx = lle; +#pragma omp atomic compare seq_cst + if (lle < llx) + llx = lle; + +#pragma omp atomic compare seq_cst + llx = llx == lle ? lld : llx; +#pragma omp atomic compare seq_cst + llx = lle == llx ? lld : llx; +#pragma omp atomic compare seq_cst + if (llx == lle) + llx = lld; +#pragma omp atomic compare seq_cst + if (lle == llx) + llx = lld; + +#pragma omp atomic compare seq_cst + ullx = ullx > ulle ? ulle : ullx; +#pragma omp atomic compare seq_cst + ullx = ullx < ulle ? ulle : ullx; +#pragma omp atomic compare seq_cst + ullx = ulle > ullx ? ulle : ullx; +#pragma omp atomic compare seq_cst + ullx = ulle < ullx ? 
ulle : ullx; +#pragma omp atomic compare seq_cst + if (ullx > ulle) + ullx = ulle; +#pragma omp atomic compare seq_cst + if (ullx < ulle) + ullx = ulle; +#pragma omp atomic compare seq_cst + if (ulle > ullx) + ullx = ulle; +#pragma omp atomic compare seq_cst + if (ulle < ullx) + ullx = ulle; + +#pragma omp atomic compare seq_cst + ullx = ullx == ulle ? ulld : ullx; +#pragma omp atomic compare seq_cst + ullx = ulle == ullx ? ulld : ullx; +#pragma omp atomic compare seq_cst + if (ullx == ulle) + ullx = ulld; +#pragma omp atomic compare seq_cst + if (ulle == ullx) + ullx = ulld; +} + +#endif +// CHECK-LABEL: @foo( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CX:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[CE:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[CD:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[UCX:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[UCE:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[UCD:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[SX:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[SE:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[SD:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[USX:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[USE:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[USD:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[IX:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[IE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[ID:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[UIX:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[UIE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[UID:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[LX:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[LE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[LD:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[ULX:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[ULE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[ULD:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[LLX:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[LLE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[LLD:%.*]] = alloca i64, align 8 +// CHECK-NEXT: 
[[ULLX:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[ULLE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[ULLD:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP0]] monotonic, align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP2]] monotonic, align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP4]] monotonic, align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP7:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP6]] monotonic, align 1 +// CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP9:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP8]] monotonic, align 1 +// CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP11:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP10]] monotonic, align 1 +// CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP13:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP12]] monotonic, align 1 +// CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP15:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP14]] monotonic, align 1 +// CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP18:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP16]], i8 [[TMP17]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP21:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP19]], i8 [[TMP20]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP24:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP22]], i8 [[TMP23]] 
monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP27:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP25]], i8 [[TMP26]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP29:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP28]] monotonic, align 1 +// CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP31:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP30]] monotonic, align 1 +// CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP33:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP32]] monotonic, align 1 +// CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP35:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP34]] monotonic, align 1 +// CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP37:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP36]] monotonic, align 1 +// CHECK-NEXT: [[TMP38:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP39:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP38]] monotonic, align 1 +// CHECK-NEXT: [[TMP40:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP41:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP40]] monotonic, align 1 +// CHECK-NEXT: [[TMP42:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP43:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP42]] monotonic, align 1 +// CHECK-NEXT: [[TMP44:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP45:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP46:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP44]], i8 [[TMP45]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP47:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP48:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP49:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP47]], i8 [[TMP48]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP50:%.*]] = load i8, 
i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP51:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP52:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP50]], i8 [[TMP51]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP53:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP54:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP55:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP53]], i8 [[TMP54]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP56:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP57:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP56]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK-NEXT: [[TMP58:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP59:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP58]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP60:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP61:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP60]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP62:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP63:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP62]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP64:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP65:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP64]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP66:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP67:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP66]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP68:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP69:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP68]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP70:%.*]] = 
load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP71:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP70]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP72:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP73:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP74:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP72]], i8 [[TMP73]] acq_rel acquire, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP75:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP76:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP77:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP75]], i8 [[TMP76]] acq_rel acquire, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP78:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP79:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP80:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP78]], i8 [[TMP79]] acq_rel acquire, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP81:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP82:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP83:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP81]], i8 [[TMP82]] acq_rel acquire, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP84:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP85:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP84]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP86:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP87:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP86]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP88:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP89:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP88]] acq_rel, align 1 +// CHECK-NEXT: call void 
@__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP90:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP91:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP90]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP92:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP93:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP92]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP94:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP95:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP94]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP96:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP97:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP96]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP98:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP99:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP98]] acq_rel, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP100:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP101:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP102:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP100]], i8 [[TMP101]] acq_rel acquire, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP103:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP104:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP105:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP103]], i8 [[TMP104]] acq_rel acquire, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP106:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP107:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP108:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP106]], i8 [[TMP107]] acq_rel acquire, align 1 +// 
CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP109:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP110:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP111:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP109]], i8 [[TMP110]] acq_rel acquire, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP112:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP113:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP112]] acquire, align 1 +// CHECK-NEXT: [[TMP114:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP115:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP114]] acquire, align 1 +// CHECK-NEXT: [[TMP116:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP117:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP116]] acquire, align 1 +// CHECK-NEXT: [[TMP118:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP119:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP118]] acquire, align 1 +// CHECK-NEXT: [[TMP120:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP121:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP120]] acquire, align 1 +// CHECK-NEXT: [[TMP122:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP123:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP122]] acquire, align 1 +// CHECK-NEXT: [[TMP124:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP125:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP124]] acquire, align 1 +// CHECK-NEXT: [[TMP126:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP127:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP126]] acquire, align 1 +// CHECK-NEXT: [[TMP128:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP129:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP130:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP128]], i8 [[TMP129]] acquire acquire, align 1 +// CHECK-NEXT: [[TMP131:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP132:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP133:%.*]] = cmpxchg i8* [[CX]], 
i8 [[TMP131]], i8 [[TMP132]] acquire acquire, align 1 +// CHECK-NEXT: [[TMP134:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP135:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP136:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP134]], i8 [[TMP135]] acquire acquire, align 1 +// CHECK-NEXT: [[TMP137:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP138:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP139:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP137]], i8 [[TMP138]] acquire acquire, align 1 +// CHECK-NEXT: [[TMP140:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP141:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP140]] acquire, align 1 +// CHECK-NEXT: [[TMP142:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP143:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP142]] acquire, align 1 +// CHECK-NEXT: [[TMP144:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP145:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP144]] acquire, align 1 +// CHECK-NEXT: [[TMP146:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP147:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP146]] acquire, align 1 +// CHECK-NEXT: [[TMP148:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP149:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP148]] acquire, align 1 +// CHECK-NEXT: [[TMP150:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP151:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP150]] acquire, align 1 +// CHECK-NEXT: [[TMP152:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP153:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP152]] acquire, align 1 +// CHECK-NEXT: [[TMP154:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP155:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP154]] acquire, align 1 +// CHECK-NEXT: [[TMP156:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP157:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP158:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP156]], i8 [[TMP157]] acquire acquire, align 1 +// 
CHECK-NEXT: [[TMP159:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP160:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP161:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP159]], i8 [[TMP160]] acquire acquire, align 1 +// CHECK-NEXT: [[TMP162:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP163:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP164:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP162]], i8 [[TMP163]] acquire acquire, align 1 +// CHECK-NEXT: [[TMP165:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP166:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP167:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP165]], i8 [[TMP166]] acquire acquire, align 1 +// CHECK-NEXT: [[TMP168:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP169:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP168]] monotonic, align 1 +// CHECK-NEXT: [[TMP170:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP171:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP170]] monotonic, align 1 +// CHECK-NEXT: [[TMP172:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP173:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP172]] monotonic, align 1 +// CHECK-NEXT: [[TMP174:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP175:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP174]] monotonic, align 1 +// CHECK-NEXT: [[TMP176:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP177:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP176]] monotonic, align 1 +// CHECK-NEXT: [[TMP178:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP179:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP178]] monotonic, align 1 +// CHECK-NEXT: [[TMP180:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP181:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP180]] monotonic, align 1 +// CHECK-NEXT: [[TMP182:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP183:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP182]] monotonic, align 1 +// CHECK-NEXT: [[TMP184:%.*]] = load i8, i8* [[CE]], align 1 
+// CHECK-NEXT: [[TMP185:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP186:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP184]], i8 [[TMP185]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP187:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP188:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP189:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP187]], i8 [[TMP188]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP190:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP191:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP192:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP190]], i8 [[TMP191]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP193:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP194:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP195:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP193]], i8 [[TMP194]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP196:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP197:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP196]] monotonic, align 1 +// CHECK-NEXT: [[TMP198:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP199:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP198]] monotonic, align 1 +// CHECK-NEXT: [[TMP200:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP201:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP200]] monotonic, align 1 +// CHECK-NEXT: [[TMP202:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP203:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP202]] monotonic, align 1 +// CHECK-NEXT: [[TMP204:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP205:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP204]] monotonic, align 1 +// CHECK-NEXT: [[TMP206:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP207:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP206]] monotonic, align 1 +// CHECK-NEXT: [[TMP208:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP209:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP208]] monotonic, align 1 +// CHECK-NEXT: 
[[TMP210:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP211:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP210]] monotonic, align 1 +// CHECK-NEXT: [[TMP212:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP213:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP214:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP212]], i8 [[TMP213]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP215:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP216:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP217:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP215]], i8 [[TMP216]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP218:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP219:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP220:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP218]], i8 [[TMP219]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP221:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP222:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP223:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP221]], i8 [[TMP222]] monotonic monotonic, align 1 +// CHECK-NEXT: [[TMP224:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP225:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP224]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP226:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP227:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP226]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP228:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP229:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP228]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP230:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP231:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP230]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: 
[[TMP232:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP233:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP232]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP234:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP235:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP234]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP236:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP237:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP236]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP238:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP239:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP238]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP240:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP241:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP242:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP240]], i8 [[TMP241]] release monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP243:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP244:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP245:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP243]], i8 [[TMP244]] release monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP246:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP247:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP248:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP246]], i8 [[TMP247]] release monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP249:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP250:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP251:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP249]], i8 [[TMP250]] release 
monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP252:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP253:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP252]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP254:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP255:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP254]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP256:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP257:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP256]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP258:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP259:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP258]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP260:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP261:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP260]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP262:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP263:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP262]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP264:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP265:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP264]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP266:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP267:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP266]] release, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP268:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP269:%.*]] = load i8, i8* [[UCD]], 
align 1 +// CHECK-NEXT: [[TMP270:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP268]], i8 [[TMP269]] release monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP271:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP272:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP273:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP271]], i8 [[TMP272]] release monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP274:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP275:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP276:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP274]], i8 [[TMP275]] release monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP277:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP278:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP279:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP277]], i8 [[TMP278]] release monotonic, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP280:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP281:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP280]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP282:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP283:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP282]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP284:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP285:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP284]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP286:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP287:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP286]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// 
CHECK-NEXT: [[TMP288:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP289:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP288]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP290:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP291:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP290]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP292:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP293:%.*]] = atomicrmw umax i8* [[CX]], i8 [[TMP292]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP294:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP295:%.*]] = atomicrmw umin i8* [[CX]], i8 [[TMP294]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP296:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP297:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP298:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP296]], i8 [[TMP297]] seq_cst seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP299:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP300:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP301:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP299]], i8 [[TMP300]] seq_cst seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP302:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP303:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP304:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP302]], i8 [[TMP303]] seq_cst seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP305:%.*]] = load i8, i8* [[CE]], align 1 +// CHECK-NEXT: [[TMP306:%.*]] = load i8, i8* [[CD]], align 1 +// CHECK-NEXT: [[TMP307:%.*]] = cmpxchg i8* [[CX]], i8 [[TMP305]], i8 [[TMP306]] seq_cst 
seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP308:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP309:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP308]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP310:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP311:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP310]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP312:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP313:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP312]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP314:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP315:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP314]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP316:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP317:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP316]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP318:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP319:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP318]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP320:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP321:%.*]] = atomicrmw umax i8* [[UCX]], i8 [[TMP320]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP322:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP323:%.*]] = atomicrmw umin i8* [[UCX]], i8 [[TMP322]] seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP324:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP325:%.*]] = load i8, i8* [[UCD]], 
align 1 +// CHECK-NEXT: [[TMP326:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP324]], i8 [[TMP325]] seq_cst seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP327:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP328:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP329:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP327]], i8 [[TMP328]] seq_cst seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP330:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP331:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP332:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP330]], i8 [[TMP331]] seq_cst seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP333:%.*]] = load i8, i8* [[UCE]], align 1 +// CHECK-NEXT: [[TMP334:%.*]] = load i8, i8* [[UCD]], align 1 +// CHECK-NEXT: [[TMP335:%.*]] = cmpxchg i8* [[UCX]], i8 [[TMP333]], i8 [[TMP334]] seq_cst seq_cst, align 1 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP336:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP337:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP336]] monotonic, align 2 +// CHECK-NEXT: [[TMP338:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP339:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP338]] monotonic, align 2 +// CHECK-NEXT: [[TMP340:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP341:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP340]] monotonic, align 2 +// CHECK-NEXT: [[TMP342:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP343:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP342]] monotonic, align 2 +// CHECK-NEXT: [[TMP344:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP345:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP344]] monotonic, align 2 +// CHECK-NEXT: [[TMP346:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP347:%.*]] = atomicrmw 
umax i16* [[SX]], i16 [[TMP346]] monotonic, align 2 +// CHECK-NEXT: [[TMP348:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP349:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP348]] monotonic, align 2 +// CHECK-NEXT: [[TMP350:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP351:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP350]] monotonic, align 2 +// CHECK-NEXT: [[TMP352:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP353:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP354:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP352]], i16 [[TMP353]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP355:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP356:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP357:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP355]], i16 [[TMP356]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP358:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP359:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP360:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP358]], i16 [[TMP359]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP361:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP362:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP363:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP361]], i16 [[TMP362]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP364:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP365:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP364]] monotonic, align 2 +// CHECK-NEXT: [[TMP366:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP367:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP366]] monotonic, align 2 +// CHECK-NEXT: [[TMP368:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP369:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP368]] monotonic, align 2 +// CHECK-NEXT: [[TMP370:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP371:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP370]] monotonic, align 2 +// CHECK-NEXT: 
[[TMP372:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP373:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP372]] monotonic, align 2 +// CHECK-NEXT: [[TMP374:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP375:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP374]] monotonic, align 2 +// CHECK-NEXT: [[TMP376:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP377:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP376]] monotonic, align 2 +// CHECK-NEXT: [[TMP378:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP379:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP378]] monotonic, align 2 +// CHECK-NEXT: [[TMP380:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP381:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP382:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP380]], i16 [[TMP381]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP383:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP384:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP385:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP383]], i16 [[TMP384]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP386:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP387:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP388:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP386]], i16 [[TMP387]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP389:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP390:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP391:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP389]], i16 [[TMP390]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP392:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP393:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP392]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP394:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP395:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP394]] acq_rel, align 2 +// 
CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP396:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP397:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP396]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP398:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP399:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP398]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP400:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP401:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP400]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP402:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP403:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP402]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP404:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP405:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP404]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP406:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP407:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP406]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP408:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP409:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP410:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP408]], i16 [[TMP409]] acq_rel acquire, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP411:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP412:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP413:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP411]], i16 [[TMP412]] acq_rel acquire, align 2 +// CHECK-NEXT: call void 
@__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP414:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP415:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP416:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP414]], i16 [[TMP415]] acq_rel acquire, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP417:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP418:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP419:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP417]], i16 [[TMP418]] acq_rel acquire, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP420:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP421:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP420]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP422:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP423:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP422]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP424:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP425:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP424]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP426:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP427:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP426]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP428:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP429:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP428]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP430:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP431:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP430]] acq_rel, align 2 +// CHECK-NEXT: call void 
@__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP432:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP433:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP432]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP434:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP435:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP434]] acq_rel, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP436:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP437:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP438:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP436]], i16 [[TMP437]] acq_rel acquire, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP439:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP440:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP441:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP439]], i16 [[TMP440]] acq_rel acquire, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP442:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP443:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP444:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP442]], i16 [[TMP443]] acq_rel acquire, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP445:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP446:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP447:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP445]], i16 [[TMP446]] acq_rel acquire, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP448:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP449:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP448]] acquire, align 2 +// CHECK-NEXT: [[TMP450:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP451:%.*]] = 
atomicrmw umax i16* [[SX]], i16 [[TMP450]] acquire, align 2 +// CHECK-NEXT: [[TMP452:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP453:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP452]] acquire, align 2 +// CHECK-NEXT: [[TMP454:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP455:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP454]] acquire, align 2 +// CHECK-NEXT: [[TMP456:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP457:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP456]] acquire, align 2 +// CHECK-NEXT: [[TMP458:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP459:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP458]] acquire, align 2 +// CHECK-NEXT: [[TMP460:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP461:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP460]] acquire, align 2 +// CHECK-NEXT: [[TMP462:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP463:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP462]] acquire, align 2 +// CHECK-NEXT: [[TMP464:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP465:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP466:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP464]], i16 [[TMP465]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP467:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP468:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP469:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP467]], i16 [[TMP468]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP470:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP471:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP472:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP470]], i16 [[TMP471]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP473:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP474:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP475:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP473]], i16 [[TMP474]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP476:%.*]] = load i16, 
i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP477:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP476]] acquire, align 2 +// CHECK-NEXT: [[TMP478:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP479:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP478]] acquire, align 2 +// CHECK-NEXT: [[TMP480:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP481:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP480]] acquire, align 2 +// CHECK-NEXT: [[TMP482:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP483:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP482]] acquire, align 2 +// CHECK-NEXT: [[TMP484:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP485:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP484]] acquire, align 2 +// CHECK-NEXT: [[TMP486:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP487:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP486]] acquire, align 2 +// CHECK-NEXT: [[TMP488:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP489:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP488]] acquire, align 2 +// CHECK-NEXT: [[TMP490:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP491:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP490]] acquire, align 2 +// CHECK-NEXT: [[TMP492:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP493:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP494:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP492]], i16 [[TMP493]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP495:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP496:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP497:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP495]], i16 [[TMP496]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP498:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP499:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP500:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP498]], i16 [[TMP499]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP501:%.*]] = load i16, i16* 
[[USE]], align 2 +// CHECK-NEXT: [[TMP502:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP503:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP501]], i16 [[TMP502]] acquire acquire, align 2 +// CHECK-NEXT: [[TMP504:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP505:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP504]] monotonic, align 2 +// CHECK-NEXT: [[TMP506:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP507:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP506]] monotonic, align 2 +// CHECK-NEXT: [[TMP508:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP509:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP508]] monotonic, align 2 +// CHECK-NEXT: [[TMP510:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP511:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP510]] monotonic, align 2 +// CHECK-NEXT: [[TMP512:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP513:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP512]] monotonic, align 2 +// CHECK-NEXT: [[TMP514:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP515:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP514]] monotonic, align 2 +// CHECK-NEXT: [[TMP516:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP517:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP516]] monotonic, align 2 +// CHECK-NEXT: [[TMP518:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP519:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP518]] monotonic, align 2 +// CHECK-NEXT: [[TMP520:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP521:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP522:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP520]], i16 [[TMP521]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP523:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP524:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP525:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP523]], i16 [[TMP524]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP526:%.*]] = load i16, i16* 
[[SE]], align 2 +// CHECK-NEXT: [[TMP527:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP528:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP526]], i16 [[TMP527]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP529:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP530:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP531:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP529]], i16 [[TMP530]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP532:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP533:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP532]] monotonic, align 2 +// CHECK-NEXT: [[TMP534:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP535:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP534]] monotonic, align 2 +// CHECK-NEXT: [[TMP536:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP537:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP536]] monotonic, align 2 +// CHECK-NEXT: [[TMP538:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP539:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP538]] monotonic, align 2 +// CHECK-NEXT: [[TMP540:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP541:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP540]] monotonic, align 2 +// CHECK-NEXT: [[TMP542:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP543:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP542]] monotonic, align 2 +// CHECK-NEXT: [[TMP544:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP545:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP544]] monotonic, align 2 +// CHECK-NEXT: [[TMP546:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP547:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP546]] monotonic, align 2 +// CHECK-NEXT: [[TMP548:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP549:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP550:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP548]], i16 [[TMP549]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP551:%.*]] 
= load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP552:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP553:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP551]], i16 [[TMP552]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP554:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP555:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP556:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP554]], i16 [[TMP555]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP557:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP558:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP559:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP557]], i16 [[TMP558]] monotonic monotonic, align 2 +// CHECK-NEXT: [[TMP560:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP561:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP560]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP562:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP563:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP562]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP564:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP565:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP564]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP566:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP567:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP566]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP568:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP569:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP568]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP570:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP571:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP570]] release, align 2 +// 
CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP572:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP573:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP572]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP574:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP575:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP574]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP576:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP577:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP578:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP576]], i16 [[TMP577]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP579:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP580:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP581:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP579]], i16 [[TMP580]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP582:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP583:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP584:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP582]], i16 [[TMP583]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP585:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP586:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP587:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP585]], i16 [[TMP586]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP588:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP589:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP588]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: 
[[TMP590:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP591:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP590]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP592:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP593:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP592]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP594:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP595:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP594]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP596:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP597:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP596]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP598:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP599:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP598]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP600:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP601:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP600]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP602:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP603:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP602]] release, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP604:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP605:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP606:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP604]], i16 [[TMP605]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP607:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP608:%.*]] = load i16, 
i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP609:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP607]], i16 [[TMP608]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP610:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP611:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP612:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP610]], i16 [[TMP611]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP613:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP614:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP615:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP613]], i16 [[TMP614]] release monotonic, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP616:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP617:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP616]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP618:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP619:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP618]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP620:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP621:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP620]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP622:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP623:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP622]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP624:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP625:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP624]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP626:%.*]] = load 
i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP627:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP626]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP628:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP629:%.*]] = atomicrmw umax i16* [[SX]], i16 [[TMP628]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP630:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP631:%.*]] = atomicrmw umin i16* [[SX]], i16 [[TMP630]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP632:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP633:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP634:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP632]], i16 [[TMP633]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP635:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP636:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP637:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP635]], i16 [[TMP636]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP638:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP639:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP640:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP638]], i16 [[TMP639]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP641:%.*]] = load i16, i16* [[SE]], align 2 +// CHECK-NEXT: [[TMP642:%.*]] = load i16, i16* [[SD]], align 2 +// CHECK-NEXT: [[TMP643:%.*]] = cmpxchg i16* [[SX]], i16 [[TMP641]], i16 [[TMP642]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP644:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP645:%.*]] = atomicrmw umin i16* [[USX]], i16 
[[TMP644]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP646:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP647:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP646]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP648:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP649:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP648]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP650:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP651:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP650]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP652:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP653:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP652]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP654:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP655:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP654]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP656:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP657:%.*]] = atomicrmw umax i16* [[USX]], i16 [[TMP656]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP658:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP659:%.*]] = atomicrmw umin i16* [[USX]], i16 [[TMP658]] seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP660:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP661:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP662:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP660]], i16 [[TMP661]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* 
@[[GLOB1]]) +// CHECK-NEXT: [[TMP663:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP664:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP665:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP663]], i16 [[TMP664]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP666:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP667:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP668:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP666]], i16 [[TMP667]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP669:%.*]] = load i16, i16* [[USE]], align 2 +// CHECK-NEXT: [[TMP670:%.*]] = load i16, i16* [[USD]], align 2 +// CHECK-NEXT: [[TMP671:%.*]] = cmpxchg i16* [[USX]], i16 [[TMP669]], i16 [[TMP670]] seq_cst seq_cst, align 2 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP672:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP673:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP672]] monotonic, align 4 +// CHECK-NEXT: [[TMP674:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP675:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP674]] monotonic, align 4 +// CHECK-NEXT: [[TMP676:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP677:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP676]] monotonic, align 4 +// CHECK-NEXT: [[TMP678:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP679:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP678]] monotonic, align 4 +// CHECK-NEXT: [[TMP680:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP681:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP680]] monotonic, align 4 +// CHECK-NEXT: [[TMP682:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP683:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP682]] monotonic, align 4 +// CHECK-NEXT: [[TMP684:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP685:%.*]] = 
atomicrmw umax i32* [[IX]], i32 [[TMP684]] monotonic, align 4 +// CHECK-NEXT: [[TMP686:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP687:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP686]] monotonic, align 4 +// CHECK-NEXT: [[TMP688:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP689:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP690:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP688]], i32 [[TMP689]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP691:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP692:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP693:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP691]], i32 [[TMP692]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP694:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP695:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP696:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP694]], i32 [[TMP695]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP697:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP698:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP699:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP697]], i32 [[TMP698]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP700:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP701:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP700]] monotonic, align 4 +// CHECK-NEXT: [[TMP702:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP703:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP702]] monotonic, align 4 +// CHECK-NEXT: [[TMP704:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP705:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP704]] monotonic, align 4 +// CHECK-NEXT: [[TMP706:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP707:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP706]] monotonic, align 4 +// CHECK-NEXT: [[TMP708:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP709:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP708]] monotonic, align 4 +// 
CHECK-NEXT: [[TMP710:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP711:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP710]] monotonic, align 4 +// CHECK-NEXT: [[TMP712:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP713:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP712]] monotonic, align 4 +// CHECK-NEXT: [[TMP714:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP715:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP714]] monotonic, align 4 +// CHECK-NEXT: [[TMP716:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP717:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP718:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP716]], i32 [[TMP717]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP719:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP720:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP721:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP719]], i32 [[TMP720]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP722:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP723:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP724:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP722]], i32 [[TMP723]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP725:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP726:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP727:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP725]], i32 [[TMP726]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP728:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP729:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP728]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP730:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP731:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP730]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP732:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: 
[[TMP733:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP732]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP734:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP735:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP734]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP736:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP737:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP736]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP738:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP739:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP738]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP740:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP741:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP740]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP742:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP743:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP742]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP744:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP745:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP746:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP744]], i32 [[TMP745]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP747:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP748:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP749:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP747]], i32 [[TMP748]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP750:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP751:%.*]] = load i32, 
i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP752:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP750]], i32 [[TMP751]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP753:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP754:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP755:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP753]], i32 [[TMP754]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP756:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP757:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP756]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP758:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP759:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP758]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP760:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP761:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP760]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP762:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP763:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP762]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP764:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP765:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP764]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP766:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP767:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP766]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP768:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP769:%.*]] = atomicrmw umax i32* 
[[UIX]], i32 [[TMP768]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP770:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP771:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP770]] acq_rel, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP772:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP773:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP774:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP772]], i32 [[TMP773]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP775:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP776:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP777:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP775]], i32 [[TMP776]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP778:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP779:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP780:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP778]], i32 [[TMP779]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP781:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP782:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP783:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP781]], i32 [[TMP782]] acq_rel acquire, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP784:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP785:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP784]] acquire, align 4 +// CHECK-NEXT: [[TMP786:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP787:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP786]] acquire, align 4 +// CHECK-NEXT: [[TMP788:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP789:%.*]] = atomicrmw 
umax i32* [[IX]], i32 [[TMP788]] acquire, align 4 +// CHECK-NEXT: [[TMP790:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP791:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP790]] acquire, align 4 +// CHECK-NEXT: [[TMP792:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP793:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP792]] acquire, align 4 +// CHECK-NEXT: [[TMP794:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP795:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP794]] acquire, align 4 +// CHECK-NEXT: [[TMP796:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP797:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP796]] acquire, align 4 +// CHECK-NEXT: [[TMP798:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP799:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP798]] acquire, align 4 +// CHECK-NEXT: [[TMP800:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP801:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP802:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP800]], i32 [[TMP801]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP803:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP804:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP805:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP803]], i32 [[TMP804]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP806:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP807:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP808:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP806]], i32 [[TMP807]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP809:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP810:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP811:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP809]], i32 [[TMP810]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP812:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP813:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP812]] acquire, align 4 +// CHECK-NEXT: [[TMP814:%.*]] = load i32, i32* 
[[UIE]], align 4 +// CHECK-NEXT: [[TMP815:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP814]] acquire, align 4 +// CHECK-NEXT: [[TMP816:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP817:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP816]] acquire, align 4 +// CHECK-NEXT: [[TMP818:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP819:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP818]] acquire, align 4 +// CHECK-NEXT: [[TMP820:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP821:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP820]] acquire, align 4 +// CHECK-NEXT: [[TMP822:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP823:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP822]] acquire, align 4 +// CHECK-NEXT: [[TMP824:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP825:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP824]] acquire, align 4 +// CHECK-NEXT: [[TMP826:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP827:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP826]] acquire, align 4 +// CHECK-NEXT: [[TMP828:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP829:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP830:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP828]], i32 [[TMP829]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP831:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP832:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP833:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP831]], i32 [[TMP832]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP834:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP835:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP836:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP834]], i32 [[TMP835]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP837:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP838:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP839:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP837]], i32 
[[TMP838]] acquire acquire, align 4 +// CHECK-NEXT: [[TMP840:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP841:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP840]] monotonic, align 4 +// CHECK-NEXT: [[TMP842:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP843:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP842]] monotonic, align 4 +// CHECK-NEXT: [[TMP844:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP845:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP844]] monotonic, align 4 +// CHECK-NEXT: [[TMP846:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP847:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP846]] monotonic, align 4 +// CHECK-NEXT: [[TMP848:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP849:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP848]] monotonic, align 4 +// CHECK-NEXT: [[TMP850:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP851:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP850]] monotonic, align 4 +// CHECK-NEXT: [[TMP852:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP853:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP852]] monotonic, align 4 +// CHECK-NEXT: [[TMP854:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP855:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP854]] monotonic, align 4 +// CHECK-NEXT: [[TMP856:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP857:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP858:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP856]], i32 [[TMP857]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP859:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP860:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP861:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP859]], i32 [[TMP860]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP862:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP863:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP864:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP862]], i32 
[[TMP863]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP865:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP866:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP867:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP865]], i32 [[TMP866]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP868:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP869:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP868]] monotonic, align 4 +// CHECK-NEXT: [[TMP870:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP871:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP870]] monotonic, align 4 +// CHECK-NEXT: [[TMP872:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP873:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP872]] monotonic, align 4 +// CHECK-NEXT: [[TMP874:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP875:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP874]] monotonic, align 4 +// CHECK-NEXT: [[TMP876:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP877:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP876]] monotonic, align 4 +// CHECK-NEXT: [[TMP878:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP879:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP878]] monotonic, align 4 +// CHECK-NEXT: [[TMP880:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP881:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP880]] monotonic, align 4 +// CHECK-NEXT: [[TMP882:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP883:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP882]] monotonic, align 4 +// CHECK-NEXT: [[TMP884:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP885:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP886:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP884]], i32 [[TMP885]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP887:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP888:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP889:%.*]] = cmpxchg i32* [[UIX]], 
i32 [[TMP887]], i32 [[TMP888]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP890:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP891:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP892:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP890]], i32 [[TMP891]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP893:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP894:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP895:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP893]], i32 [[TMP894]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP896:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP897:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP896]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP898:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP899:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP898]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP900:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP901:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP900]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP902:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP903:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP902]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP904:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP905:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP904]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP906:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP907:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP906]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP908:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: 
[[TMP909:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP908]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP910:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP911:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP910]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP912:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP913:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP914:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP912]], i32 [[TMP913]] release monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP915:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP916:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP917:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP915]], i32 [[TMP916]] release monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP918:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP919:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP920:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP918]], i32 [[TMP919]] release monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP921:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP922:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP923:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP921]], i32 [[TMP922]] release monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP924:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP925:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP924]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP926:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP927:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP926]] release, align 4 +// 
CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP928:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP929:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP928]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP930:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP931:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP930]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP932:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP933:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP932]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP934:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP935:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP934]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP936:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP937:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP936]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP938:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP939:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP938]] release, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP940:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP941:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP942:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP940]], i32 [[TMP941]] release monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP943:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP944:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP945:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP943]], i32 [[TMP944]] release monotonic, align 4 +// 
CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP946:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP947:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP948:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP946]], i32 [[TMP947]] release monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP949:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP950:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP951:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP949]], i32 [[TMP950]] release monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP952:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP953:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP952]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP954:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP955:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP954]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP956:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP957:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP956]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP958:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP959:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP958]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP960:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP961:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP960]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP962:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP963:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP962]] seq_cst, align 4 +// CHECK-NEXT: call 
void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP964:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP965:%.*]] = atomicrmw umax i32* [[IX]], i32 [[TMP964]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP966:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP967:%.*]] = atomicrmw umin i32* [[IX]], i32 [[TMP966]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP968:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP969:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP970:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP968]], i32 [[TMP969]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP971:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP972:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP973:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP971]], i32 [[TMP972]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP974:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP975:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP976:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP974]], i32 [[TMP975]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP977:%.*]] = load i32, i32* [[IE]], align 4 +// CHECK-NEXT: [[TMP978:%.*]] = load i32, i32* [[ID]], align 4 +// CHECK-NEXT: [[TMP979:%.*]] = cmpxchg i32* [[IX]], i32 [[TMP977]], i32 [[TMP978]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP980:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP981:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP980]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP982:%.*]] = load i32, 
i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP983:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP982]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP984:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP985:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP984]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP986:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP987:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP986]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP988:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP989:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP988]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP990:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP991:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP990]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP992:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP993:%.*]] = atomicrmw umax i32* [[UIX]], i32 [[TMP992]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP994:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP995:%.*]] = atomicrmw umin i32* [[UIX]], i32 [[TMP994]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP996:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP997:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP998:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP996]], i32 [[TMP997]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP999:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP1000:%.*]] = load i32, i32* [[UID]], align 4 +// 
CHECK-NEXT: [[TMP1001:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP999]], i32 [[TMP1000]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1002:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP1003:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP1004:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP1002]], i32 [[TMP1003]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1005:%.*]] = load i32, i32* [[UIE]], align 4 +// CHECK-NEXT: [[TMP1006:%.*]] = load i32, i32* [[UID]], align 4 +// CHECK-NEXT: [[TMP1007:%.*]] = cmpxchg i32* [[UIX]], i32 [[TMP1005]], i32 [[TMP1006]] seq_cst seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1008:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1009:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1008]] monotonic, align 8 +// CHECK-NEXT: [[TMP1010:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1011:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1010]] monotonic, align 8 +// CHECK-NEXT: [[TMP1012:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1013:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1012]] monotonic, align 8 +// CHECK-NEXT: [[TMP1014:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1015:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1014]] monotonic, align 8 +// CHECK-NEXT: [[TMP1016:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1017:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1016]] monotonic, align 8 +// CHECK-NEXT: [[TMP1018:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1019:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1018]] monotonic, align 8 +// CHECK-NEXT: [[TMP1020:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1021:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1020]] monotonic, align 8 +// CHECK-NEXT: [[TMP1022:%.*]] = load i64, i64* [[LE]], 
align 8 +// CHECK-NEXT: [[TMP1023:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1022]] monotonic, align 8 +// CHECK-NEXT: [[TMP1024:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1025:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1026:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1024]], i64 [[TMP1025]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1027:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1028:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1029:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1027]], i64 [[TMP1028]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1030:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1031:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1032:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1030]], i64 [[TMP1031]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1033:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1034:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1035:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1033]], i64 [[TMP1034]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1036:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1037:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1036]] monotonic, align 8 +// CHECK-NEXT: [[TMP1038:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1039:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1038]] monotonic, align 8 +// CHECK-NEXT: [[TMP1040:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1041:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1040]] monotonic, align 8 +// CHECK-NEXT: [[TMP1042:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1043:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1042]] monotonic, align 8 +// CHECK-NEXT: [[TMP1044:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1045:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1044]] monotonic, align 8 +// CHECK-NEXT: [[TMP1046:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: 
[[TMP1047:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1046]] monotonic, align 8 +// CHECK-NEXT: [[TMP1048:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1049:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1048]] monotonic, align 8 +// CHECK-NEXT: [[TMP1050:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1051:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1050]] monotonic, align 8 +// CHECK-NEXT: [[TMP1052:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1053:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1054:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1052]], i64 [[TMP1053]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1055:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1056:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1057:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1055]], i64 [[TMP1056]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1058:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1059:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1060:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1058]], i64 [[TMP1059]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1061:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1062:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1063:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1061]], i64 [[TMP1062]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1064:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1065:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1064]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1066:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1067:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1066]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1068:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1069:%.*]] = atomicrmw umax i64* [[LX]], i64 
[[TMP1068]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1070:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1071:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1070]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1072:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1073:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1072]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1074:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1075:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1074]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1076:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1077:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1076]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1078:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1079:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1078]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1080:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1081:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1082:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1080]], i64 [[TMP1081]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1083:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1084:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1085:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1083]], i64 [[TMP1084]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1086:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1087:%.*]] = load i64, i64* [[LD]], align 8 
+// CHECK-NEXT: [[TMP1088:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1086]], i64 [[TMP1087]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1089:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1090:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1091:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1089]], i64 [[TMP1090]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1092:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1093:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1092]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1094:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1095:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1094]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1096:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1097:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1096]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1098:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1099:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1098]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1100:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1101:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1100]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1102:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1103:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1102]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1104:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1105:%.*]] = atomicrmw umax i64* 
[[ULX]], i64 [[TMP1104]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1106:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1107:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1106]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1108:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1109:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1110:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1108]], i64 [[TMP1109]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1111:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1112:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1113:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1111]], i64 [[TMP1112]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1114:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1115:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1116:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1114]], i64 [[TMP1115]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1117:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1118:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1119:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1117]], i64 [[TMP1118]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1120:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1121:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1120]] acquire, align 8 +// CHECK-NEXT: [[TMP1122:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1123:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1122]] acquire, align 8 +// CHECK-NEXT: [[TMP1124:%.*]] = load i64, i64* [[LE]], align 8 +// 
CHECK-NEXT: [[TMP1125:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1124]] acquire, align 8 +// CHECK-NEXT: [[TMP1126:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1127:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1126]] acquire, align 8 +// CHECK-NEXT: [[TMP1128:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1129:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1128]] acquire, align 8 +// CHECK-NEXT: [[TMP1130:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1131:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1130]] acquire, align 8 +// CHECK-NEXT: [[TMP1132:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1133:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1132]] acquire, align 8 +// CHECK-NEXT: [[TMP1134:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1135:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1134]] acquire, align 8 +// CHECK-NEXT: [[TMP1136:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1137:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1138:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1136]], i64 [[TMP1137]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1139:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1140:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1141:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1139]], i64 [[TMP1140]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1142:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1143:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1144:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1142]], i64 [[TMP1143]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1145:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1146:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1147:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1145]], i64 [[TMP1146]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1148:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1149:%.*]] = atomicrmw umin i64* [[ULX]], i64 
[[TMP1148]] acquire, align 8 +// CHECK-NEXT: [[TMP1150:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1151:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1150]] acquire, align 8 +// CHECK-NEXT: [[TMP1152:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1153:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1152]] acquire, align 8 +// CHECK-NEXT: [[TMP1154:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1155:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1154]] acquire, align 8 +// CHECK-NEXT: [[TMP1156:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1157:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1156]] acquire, align 8 +// CHECK-NEXT: [[TMP1158:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1159:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1158]] acquire, align 8 +// CHECK-NEXT: [[TMP1160:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1161:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1160]] acquire, align 8 +// CHECK-NEXT: [[TMP1162:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1163:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1162]] acquire, align 8 +// CHECK-NEXT: [[TMP1164:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1165:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1166:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1164]], i64 [[TMP1165]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1167:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1168:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1169:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1167]], i64 [[TMP1168]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1170:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1171:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1172:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1170]], i64 [[TMP1171]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1173:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1174:%.*]] 
= load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1175:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1173]], i64 [[TMP1174]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1176:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1177:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1176]] monotonic, align 8 +// CHECK-NEXT: [[TMP1178:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1179:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1178]] monotonic, align 8 +// CHECK-NEXT: [[TMP1180:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1181:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1180]] monotonic, align 8 +// CHECK-NEXT: [[TMP1182:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1183:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1182]] monotonic, align 8 +// CHECK-NEXT: [[TMP1184:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1185:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1184]] monotonic, align 8 +// CHECK-NEXT: [[TMP1186:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1187:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1186]] monotonic, align 8 +// CHECK-NEXT: [[TMP1188:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1189:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1188]] monotonic, align 8 +// CHECK-NEXT: [[TMP1190:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1191:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1190]] monotonic, align 8 +// CHECK-NEXT: [[TMP1192:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1193:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1194:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1192]], i64 [[TMP1193]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1195:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1196:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1197:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1195]], i64 [[TMP1196]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1198:%.*]] = load i64, i64* [[LE]], 
align 8 +// CHECK-NEXT: [[TMP1199:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1200:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1198]], i64 [[TMP1199]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1201:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1202:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1203:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1201]], i64 [[TMP1202]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1204:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1205:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1204]] monotonic, align 8 +// CHECK-NEXT: [[TMP1206:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1207:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1206]] monotonic, align 8 +// CHECK-NEXT: [[TMP1208:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1209:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1208]] monotonic, align 8 +// CHECK-NEXT: [[TMP1210:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1211:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1210]] monotonic, align 8 +// CHECK-NEXT: [[TMP1212:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1213:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1212]] monotonic, align 8 +// CHECK-NEXT: [[TMP1214:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1215:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1214]] monotonic, align 8 +// CHECK-NEXT: [[TMP1216:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1217:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1216]] monotonic, align 8 +// CHECK-NEXT: [[TMP1218:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1219:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1218]] monotonic, align 8 +// CHECK-NEXT: [[TMP1220:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1221:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1222:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1220]], i64 [[TMP1221]] monotonic monotonic, align 8 
+// CHECK-NEXT: [[TMP1223:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1224:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1225:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1223]], i64 [[TMP1224]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1226:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1227:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1228:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1226]], i64 [[TMP1227]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1229:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1230:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1231:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1229]], i64 [[TMP1230]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1232:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1233:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1232]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1234:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1235:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1234]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1236:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1237:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1236]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1238:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1239:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1238]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1240:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1241:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1240]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1242:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1243:%.*]] = 
atomicrmw umax i64* [[LX]], i64 [[TMP1242]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1244:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1245:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1244]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1246:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1247:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1246]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1248:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1249:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1250:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1248]], i64 [[TMP1249]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1251:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1252:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1253:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1251]], i64 [[TMP1252]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1254:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1255:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1256:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1254]], i64 [[TMP1255]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1257:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1258:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1259:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1257]], i64 [[TMP1258]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1260:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1261:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1260]] release, 
align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1262:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1263:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1262]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1264:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1265:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1264]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1266:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1267:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1266]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1268:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1269:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1268]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1270:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1271:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1270]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1272:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1273:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1272]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1274:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1275:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1274]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1276:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1277:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1278:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1276]], i64 [[TMP1277]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* 
@[[GLOB1]]) +// CHECK-NEXT: [[TMP1279:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1280:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1281:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1279]], i64 [[TMP1280]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1282:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1283:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1284:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1282]], i64 [[TMP1283]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1285:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1286:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1287:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1285]], i64 [[TMP1286]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1288:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1289:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1288]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1290:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1291:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1290]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1292:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1293:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1292]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1294:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1295:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1294]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1296:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1297:%.*]] = atomicrmw umin i64* 
[[LX]], i64 [[TMP1296]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1298:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1299:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1298]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1300:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1301:%.*]] = atomicrmw umax i64* [[LX]], i64 [[TMP1300]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1302:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1303:%.*]] = atomicrmw umin i64* [[LX]], i64 [[TMP1302]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1304:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1305:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1306:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1304]], i64 [[TMP1305]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1307:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1308:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1309:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1307]], i64 [[TMP1308]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1310:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1311:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1312:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1310]], i64 [[TMP1311]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1313:%.*]] = load i64, i64* [[LE]], align 8 +// CHECK-NEXT: [[TMP1314:%.*]] = load i64, i64* [[LD]], align 8 +// CHECK-NEXT: [[TMP1315:%.*]] = cmpxchg i64* [[LX]], i64 [[TMP1313]], i64 [[TMP1314]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call 
void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1316:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1317:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1316]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1318:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1319:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1318]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1320:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1321:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1320]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1322:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1323:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1322]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1324:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1325:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1324]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1326:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1327:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1326]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1328:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1329:%.*]] = atomicrmw umax i64* [[ULX]], i64 [[TMP1328]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1330:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1331:%.*]] = atomicrmw umin i64* [[ULX]], i64 [[TMP1330]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1332:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1333:%.*]] = 
load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1334:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1332]], i64 [[TMP1333]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1335:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1336:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1337:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1335]], i64 [[TMP1336]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1338:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1339:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1340:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1338]], i64 [[TMP1339]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1341:%.*]] = load i64, i64* [[ULE]], align 8 +// CHECK-NEXT: [[TMP1342:%.*]] = load i64, i64* [[ULD]], align 8 +// CHECK-NEXT: [[TMP1343:%.*]] = cmpxchg i64* [[ULX]], i64 [[TMP1341]], i64 [[TMP1342]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1344:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1345:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1344]] monotonic, align 8 +// CHECK-NEXT: [[TMP1346:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1347:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1346]] monotonic, align 8 +// CHECK-NEXT: [[TMP1348:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1349:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1348]] monotonic, align 8 +// CHECK-NEXT: [[TMP1350:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1351:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1350]] monotonic, align 8 +// CHECK-NEXT: [[TMP1352:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1353:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1352]] monotonic, align 8 +// CHECK-NEXT: 
[[TMP1354:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1355:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1354]] monotonic, align 8 +// CHECK-NEXT: [[TMP1356:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1357:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1356]] monotonic, align 8 +// CHECK-NEXT: [[TMP1358:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1359:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1358]] monotonic, align 8 +// CHECK-NEXT: [[TMP1360:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1361:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1362:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1360]], i64 [[TMP1361]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1363:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1364:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1365:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1363]], i64 [[TMP1364]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1366:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1367:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1368:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1366]], i64 [[TMP1367]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1369:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1370:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1371:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1369]], i64 [[TMP1370]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1372:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1373:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1372]] monotonic, align 8 +// CHECK-NEXT: [[TMP1374:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1375:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1374]] monotonic, align 8 +// CHECK-NEXT: [[TMP1376:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1377:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1376]] monotonic, align 8 +// CHECK-NEXT: 
[[TMP1378:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1379:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1378]] monotonic, align 8 +// CHECK-NEXT: [[TMP1380:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1381:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1380]] monotonic, align 8 +// CHECK-NEXT: [[TMP1382:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1383:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1382]] monotonic, align 8 +// CHECK-NEXT: [[TMP1384:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1385:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1384]] monotonic, align 8 +// CHECK-NEXT: [[TMP1386:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1387:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1386]] monotonic, align 8 +// CHECK-NEXT: [[TMP1388:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1389:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1390:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1388]], i64 [[TMP1389]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1391:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1392:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1393:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1391]], i64 [[TMP1392]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1394:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1395:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1396:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1394]], i64 [[TMP1395]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1397:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1398:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1399:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1397]], i64 [[TMP1398]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1400:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1401:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1400]] acq_rel, align 8 +// 
CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1402:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1403:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1402]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1404:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1405:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1404]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1406:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1407:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1406]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1408:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1409:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1408]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1410:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1411:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1410]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1412:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1413:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1412]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1414:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1415:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1414]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1416:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1417:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1418:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1416]], i64 [[TMP1417]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) 
+// CHECK-NEXT: [[TMP1419:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1420:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1421:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1419]], i64 [[TMP1420]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1422:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1423:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1424:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1422]], i64 [[TMP1423]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1425:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1426:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1427:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1425]], i64 [[TMP1426]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1428:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1429:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1428]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1430:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1431:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1430]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1432:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1433:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1432]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1434:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1435:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1434]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1436:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1437:%.*]] = atomicrmw umin i64* 
[[ULLX]], i64 [[TMP1436]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1438:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1439:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1438]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1440:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1441:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1440]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1442:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1443:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1442]] acq_rel, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1444:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1445:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1446:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1444]], i64 [[TMP1445]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1447:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1448:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1449:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1447]], i64 [[TMP1448]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1450:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1451:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1452:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1450]], i64 [[TMP1451]] acq_rel acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1453:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1454:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1455:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1453]], i64 [[TMP1454]] acq_rel 
acquire, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1456:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1457:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1456]] acquire, align 8 +// CHECK-NEXT: [[TMP1458:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1459:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1458]] acquire, align 8 +// CHECK-NEXT: [[TMP1460:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1461:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1460]] acquire, align 8 +// CHECK-NEXT: [[TMP1462:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1463:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1462]] acquire, align 8 +// CHECK-NEXT: [[TMP1464:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1465:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1464]] acquire, align 8 +// CHECK-NEXT: [[TMP1466:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1467:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1466]] acquire, align 8 +// CHECK-NEXT: [[TMP1468:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1469:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1468]] acquire, align 8 +// CHECK-NEXT: [[TMP1470:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1471:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1470]] acquire, align 8 +// CHECK-NEXT: [[TMP1472:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1473:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1474:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1472]], i64 [[TMP1473]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1475:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1476:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1477:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1475]], i64 [[TMP1476]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1478:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1479:%.*]] = load i64, i64* [[LLD]], 
align 8 +// CHECK-NEXT: [[TMP1480:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1478]], i64 [[TMP1479]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1481:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1482:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1483:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1481]], i64 [[TMP1482]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1484:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1485:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1484]] acquire, align 8 +// CHECK-NEXT: [[TMP1486:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1487:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1486]] acquire, align 8 +// CHECK-NEXT: [[TMP1488:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1489:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1488]] acquire, align 8 +// CHECK-NEXT: [[TMP1490:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1491:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1490]] acquire, align 8 +// CHECK-NEXT: [[TMP1492:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1493:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1492]] acquire, align 8 +// CHECK-NEXT: [[TMP1494:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1495:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1494]] acquire, align 8 +// CHECK-NEXT: [[TMP1496:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1497:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1496]] acquire, align 8 +// CHECK-NEXT: [[TMP1498:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1499:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1498]] acquire, align 8 +// CHECK-NEXT: [[TMP1500:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1501:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1502:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1500]], i64 [[TMP1501]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1503:%.*]] = load i64, i64* [[ULLE]], align 8 +// 
CHECK-NEXT: [[TMP1504:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1505:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1503]], i64 [[TMP1504]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1506:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1507:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1508:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1506]], i64 [[TMP1507]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1509:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1510:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1511:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1509]], i64 [[TMP1510]] acquire acquire, align 8 +// CHECK-NEXT: [[TMP1512:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1513:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1512]] monotonic, align 8 +// CHECK-NEXT: [[TMP1514:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1515:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1514]] monotonic, align 8 +// CHECK-NEXT: [[TMP1516:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1517:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1516]] monotonic, align 8 +// CHECK-NEXT: [[TMP1518:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1519:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1518]] monotonic, align 8 +// CHECK-NEXT: [[TMP1520:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1521:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1520]] monotonic, align 8 +// CHECK-NEXT: [[TMP1522:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1523:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1522]] monotonic, align 8 +// CHECK-NEXT: [[TMP1524:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1525:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1524]] monotonic, align 8 +// CHECK-NEXT: [[TMP1526:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1527:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1526]] monotonic, align 8 +// 
CHECK-NEXT: [[TMP1528:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1529:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1530:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1528]], i64 [[TMP1529]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1531:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1532:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1533:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1531]], i64 [[TMP1532]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1534:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1535:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1536:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1534]], i64 [[TMP1535]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1537:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1538:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1539:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1537]], i64 [[TMP1538]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1540:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1541:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1540]] monotonic, align 8 +// CHECK-NEXT: [[TMP1542:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1543:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1542]] monotonic, align 8 +// CHECK-NEXT: [[TMP1544:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1545:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1544]] monotonic, align 8 +// CHECK-NEXT: [[TMP1546:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1547:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1546]] monotonic, align 8 +// CHECK-NEXT: [[TMP1548:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1549:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1548]] monotonic, align 8 +// CHECK-NEXT: [[TMP1550:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1551:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1550]] monotonic, align 8 +// 
CHECK-NEXT: [[TMP1552:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1553:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1552]] monotonic, align 8 +// CHECK-NEXT: [[TMP1554:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1555:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1554]] monotonic, align 8 +// CHECK-NEXT: [[TMP1556:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1557:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1558:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1556]], i64 [[TMP1557]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1559:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1560:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1561:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1559]], i64 [[TMP1560]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1562:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1563:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1564:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1562]], i64 [[TMP1563]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1565:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1566:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1567:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1565]], i64 [[TMP1566]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP1568:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1569:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1568]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1570:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1571:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1570]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1572:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1573:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1572]] release, align 8 +// CHECK-NEXT: call void 
@__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1574:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1575:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1574]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1576:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1577:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1576]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1578:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1579:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1578]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1580:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1581:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1580]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1582:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1583:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1582]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1584:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1585:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1586:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1584]], i64 [[TMP1585]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1587:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1588:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1589:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1587]], i64 [[TMP1588]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1590:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1591:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1592:%.*]] = 
cmpxchg i64* [[LLX]], i64 [[TMP1590]], i64 [[TMP1591]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1593:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1594:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1595:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1593]], i64 [[TMP1594]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1596:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1597:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1596]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1598:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1599:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1598]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1600:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1601:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1600]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1602:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1603:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1602]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1604:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1605:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1604]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1606:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1607:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1606]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1608:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1609:%.*]] = atomicrmw umax i64* [[ULLX]], 
i64 [[TMP1608]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1610:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1611:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1610]] release, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1612:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1613:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1614:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1612]], i64 [[TMP1613]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1615:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1616:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1617:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1615]], i64 [[TMP1616]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1618:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1619:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1620:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1618]], i64 [[TMP1619]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1621:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1622:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1623:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1621]], i64 [[TMP1622]] release monotonic, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1624:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1625:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1624]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1626:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1627:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1626]] seq_cst, 
align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1628:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1629:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1628]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1630:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1631:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1630]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1632:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1633:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1632]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1634:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1635:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1634]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1636:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1637:%.*]] = atomicrmw umax i64* [[LLX]], i64 [[TMP1636]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1638:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1639:%.*]] = atomicrmw umin i64* [[LLX]], i64 [[TMP1638]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1640:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1641:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1642:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1640]], i64 [[TMP1641]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1643:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1644:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1645:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1643]], i64 [[TMP1644]] 
seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1646:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1647:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1648:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1646]], i64 [[TMP1647]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1649:%.*]] = load i64, i64* [[LLE]], align 8 +// CHECK-NEXT: [[TMP1650:%.*]] = load i64, i64* [[LLD]], align 8 +// CHECK-NEXT: [[TMP1651:%.*]] = cmpxchg i64* [[LLX]], i64 [[TMP1649]], i64 [[TMP1650]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1652:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1653:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1652]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1654:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1655:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1654]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1656:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1657:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1656]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1658:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1659:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1658]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1660:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1661:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1660]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1662:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1663:%.*]] = atomicrmw 
umax i64* [[ULLX]], i64 [[TMP1662]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1664:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1665:%.*]] = atomicrmw umax i64* [[ULLX]], i64 [[TMP1664]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1666:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1667:%.*]] = atomicrmw umin i64* [[ULLX]], i64 [[TMP1666]] seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1668:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1669:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1670:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1668]], i64 [[TMP1669]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1671:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1672:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1673:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1671]], i64 [[TMP1672]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1674:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1675:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1676:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1674]], i64 [[TMP1675]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: [[TMP1677:%.*]] = load i64, i64* [[ULLE]], align 8 +// CHECK-NEXT: [[TMP1678:%.*]] = load i64, i64* [[ULLD]], align 8 +// CHECK-NEXT: [[TMP1679:%.*]] = cmpxchg i64* [[ULLX]], i64 [[TMP1677]], i64 [[TMP1678]] seq_cst seq_cst, align 8 +// CHECK-NEXT: call void @__kmpc_flush(%struct.ident_t* @[[GLOB1]]) +// CHECK-NEXT: ret void From 88d66f6ed1e5a3a9370a3181b7307fe65590e3ac Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 22 Feb 2022 
10:07:58 -0800 Subject: [PATCH 518/748] [ELF] Move duplicate symbol check after input file parsing https://discourse.llvm.org/t/parallel-input-file-parsing/60164 To decouple symbol initialization and section initialization, `Defined::section` assignment should be postponed after input file parsing. To avoid spurious duplicate definition error due to two definitions in COMDAT groups of the same signature, we should postpone the duplicate symbol check. The function is called postScan instead of a more specific name like checkDuplicateSymbols, because we may merge Symbol::mergeProperties into postScan. It is placed after compileBitcodeFiles to apply to ET_REL files produced by LTO. This causes minor diagnostic regression for skipLinkedOutput configurations: ld.lld --thinlto-index-only a.bc b.o (bitcode definition prevails) won't detect duplicate symbol error. I think this is an acceptable compromise. The important cases where (a) both files are bitcode or (b) --thinlto-index-only is unused are still detected. Reviewed By: ikudrin Differential Revision: https://reviews.llvm.org/D119908 --- lld/ELF/Driver.cpp | 32 +++++++++++ lld/ELF/InputFiles.cpp | 57 +++++++++++++++++--- lld/ELF/InputFiles.h | 4 ++ lld/ELF/SymbolTable.cpp | 10 ++++ lld/ELF/SymbolTable.h | 1 + lld/ELF/Symbols.cpp | 25 ++++----- lld/ELF/Symbols.h | 4 ++ lld/test/ELF/invalid/symtab-sh-info-dup.test | 2 +- lld/test/ELF/lto/duplicated.ll | 7 +-- lld/test/ELF/vs-diagnostics-duplicate.s | 2 +- 10 files changed, 116 insertions(+), 28 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 4910a7d5a1633..cfc99dd115dc1 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2118,6 +2118,8 @@ static void redirectSymbols(ArrayRef wrapped) { map.try_emplace(sym, sym2); // If both foo@v1 and foo@@v1 are defined and non-weak, report a duplicate // definition error. + if (sym->isDefined()) + sym2->checkDuplicate(cast(*sym)); sym2->resolve(*sym); // Eliminate foo@v1 from the symbol table. 
sym->symbolKind = Symbol::PlaceholderKind; @@ -2223,6 +2225,25 @@ static uint32_t getAndFeatures() { return ret; } +static void postParseObjectFile(ELFFileBase *file) { + switch (config->ekind) { + case ELF32LEKind: + cast>(file)->postParse(); + break; + case ELF32BEKind: + cast>(file)->postParse(); + break; + case ELF64LEKind: + cast>(file)->postParse(); + break; + case ELF64BEKind: + cast>(file)->postParse(); + break; + default: + llvm_unreachable(""); + } +} + // Do actual linking. Note that when this function is called, // all linker scripts have already been parsed. void LinkerDriver::link(opt::InputArgList &args) { @@ -2340,6 +2361,11 @@ void LinkerDriver::link(opt::InputArgList &args) { for (auto *s : lto::LTO::getRuntimeLibcallSymbols()) handleLibcall(s); + // No more lazy bitcode can be extracted at this point. Do post parse work + // like checking duplicate symbols. + parallelForEach(objectFiles, postParseObjectFile); + parallelForEach(bitcodeFiles, [](BitcodeFile *file) { file->postParse(); }); + // Return if there were name resolution errors. if (errorCount()) return; @@ -2393,6 +2419,7 @@ void LinkerDriver::link(opt::InputArgList &args) { // // With this the symbol table should be complete. After this, no new names // except a few linker-synthesized ones will be added to the symbol table. + const size_t numObjsBeforeLTO = objectFiles.size(); invokeELFT(compileBitcodeFiles, skipLinkedOutput); // Symbol resolution finished. Report backward reference problems. @@ -2404,6 +2431,11 @@ void LinkerDriver::link(opt::InputArgList &args) { if (skipLinkedOutput) return; + // compileBitcodeFiles may have produced lto.tmp object files. After this, no + // more file will be added. + auto newObjectFiles = makeArrayRef(objectFiles).slice(numObjsBeforeLTO); + parallelForEach(newObjectFiles, postParseObjectFile); + // Handle --exclude-libs again because lto.tmp may reference additional // libcalls symbols defined in an excluded archive. 
This may override // versionId set by scanVersionScript(). diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index e4509c1f78880..e4de1a3462823 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1149,6 +1149,33 @@ void ObjFile::initializeSymbols(const object::ELFFile &obj) { } } +// Called after all ObjFile::parse is called for all ObjFiles. This checks +// duplicate symbols and may do symbol property merge in the future. +template void ObjFile::postParse() { + ArrayRef eSyms = this->getELFSyms(); + for (size_t i = firstGlobal, end = eSyms.size(); i != end; ++i) { + const Elf_Sym &eSym = eSyms[i]; + const Symbol &sym = *symbols[i]; + // !sym.file allows a symbol assignment redefines a symbol without an error. + if (sym.file == this || !sym.file || !sym.isDefined() || + eSym.st_shndx == SHN_UNDEF || eSym.st_shndx == SHN_COMMON || + eSym.getBinding() == STB_WEAK) + continue; + uint32_t secIdx = eSym.st_shndx; + if (LLVM_UNLIKELY(secIdx == SHN_XINDEX)) + secIdx = cantFail(getExtendedSymbolTableIndex(eSym, i, shndxTable)); + else if (secIdx >= SHN_LORESERVE) + secIdx = 0; + if (sections[secIdx] == &InputSection::discarded) + continue; + // Allow absolute symbols with the same value for GNU ld compatibility. + if (!cast(sym).section && !sections[secIdx] && + cast(sym).value == eSym.st_value) + continue; + reportDuplicate(sym, this, sections[secIdx], eSym.st_value); + } +} + // The handling of tentative definitions (COMMON symbols) in archives is murky. // A tentative definition will be promoted to a global definition if there are // no non-tentative definitions to dominate it. 
When we hold a tentative @@ -1617,7 +1644,6 @@ createBitcodeSymbol(Symbol *&sym, const std::vector &keptComdats, } template void BitcodeFile::parse() { - std::vector keptComdats; for (std::pair s : obj->getComdatTable()) { keptComdats.push_back( s.second == Comdat::NoDeduplicate || @@ -1646,6 +1672,20 @@ void BitcodeFile::parseLazy() { } } +void BitcodeFile::postParse() { + for (auto it : llvm::enumerate(obj->symbols())) { + const Symbol &sym = *symbols[it.index()]; + const auto &objSym = it.value(); + if (sym.file == this || !sym.isDefined() || objSym.isUndefined() || + objSym.isCommon() || objSym.isWeak()) + continue; + int c = objSym.getComdatIndex(); + if (c != -1 && !keptComdats[c]) + continue; + reportDuplicate(sym, this, nullptr, 0); + } +} + void BinaryFile::parse() { ArrayRef data = arrayRefFromStringRef(mb.getBuffer()); auto *section = make(this, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, @@ -1663,12 +1703,15 @@ void BinaryFile::parse() { llvm::StringSaver &saver = lld::saver(); - symtab->addSymbol(Defined{nullptr, saver.save(s + "_start"), STB_GLOBAL, - STV_DEFAULT, STT_OBJECT, 0, 0, section}); - symtab->addSymbol(Defined{nullptr, saver.save(s + "_end"), STB_GLOBAL, - STV_DEFAULT, STT_OBJECT, data.size(), 0, section}); - symtab->addSymbol(Defined{nullptr, saver.save(s + "_size"), STB_GLOBAL, - STV_DEFAULT, STT_OBJECT, data.size(), 0, nullptr}); + symtab->addAndCheckDuplicate(Defined{nullptr, saver.save(s + "_start"), + STB_GLOBAL, STV_DEFAULT, STT_OBJECT, 0, + 0, section}); + symtab->addAndCheckDuplicate(Defined{nullptr, saver.save(s + "_end"), + STB_GLOBAL, STV_DEFAULT, STT_OBJECT, + data.size(), 0, section}); + symtab->addAndCheckDuplicate(Defined{nullptr, saver.save(s + "_size"), + STB_GLOBAL, STV_DEFAULT, STT_OBJECT, + data.size(), 0, nullptr}); } InputFile *elf::createObjectFile(MemoryBufferRef mb, StringRef archiveName, diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index bbd3072c54d5f..e2aba8e6250cf 100644 --- a/lld/ELF/InputFiles.h +++ 
b/lld/ELF/InputFiles.h @@ -273,6 +273,8 @@ template class ObjFile : public ELFFileBase { // Get cached DWARF information. DWARFCache *getDwarf(); + void postParse(); + private: void initializeSections(bool ignoreComdats, const llvm::object::ELFFile &obj); @@ -315,7 +317,9 @@ class BitcodeFile : public InputFile { static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; } template void parse(); void parseLazy(); + void postParse(); std::unique_ptr obj; + std::vector keptComdats; }; // .so file. diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index 2cfc5fcd19b3b..bb948218a2364 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -102,6 +102,16 @@ Symbol *SymbolTable::addSymbol(const Symbol &newSym) { return sym; } +// This variant of addSymbol is used by BinaryFile::parse to check duplicate +// symbol errors. +Symbol *SymbolTable::addAndCheckDuplicate(const Defined &newSym) { + Symbol *sym = insert(newSym.getName()); + if (sym->isDefined()) + sym->checkDuplicate(newSym); + sym->resolve(newSym); + return sym; +} + Symbol *SymbolTable::find(StringRef name) { auto it = symMap.find(CachedHashStringRef(name)); if (it == symMap.end()) diff --git a/lld/ELF/SymbolTable.h b/lld/ELF/SymbolTable.h index 60bd10a0b168a..e55daffe8ba46 100644 --- a/lld/ELF/SymbolTable.h +++ b/lld/ELF/SymbolTable.h @@ -39,6 +39,7 @@ class SymbolTable { Symbol *insert(StringRef name); Symbol *addSymbol(const Symbol &newSym); + Symbol *addAndCheckDuplicate(const Defined &newSym); void scanVersionScript(); diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index 972c70d53af8e..9b231ba46a561 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -572,21 +572,11 @@ int Symbol::compare(const Symbol *other) const { return -1; } - auto *oldSym = cast(this); - auto *newSym = cast(other); - - if (isa_and_nonnull(other->file)) - return 0; - - if (!oldSym->section && !newSym->section && oldSym->value == newSym->value && - newSym->binding == 
STB_GLOBAL) - return -1; - return 0; } -static void reportDuplicate(const Symbol &sym, InputFile *newFile, - InputSectionBase *errSec, uint64_t errOffset) { +void elf::reportDuplicate(const Symbol &sym, InputFile *newFile, + InputSectionBase *errSec, uint64_t errOffset) { if (config->allowMultipleDefinition) return; const Defined *d = cast(&sym); @@ -619,6 +609,13 @@ static void reportDuplicate(const Symbol &sym, InputFile *newFile, error(msg); } +void Symbol::checkDuplicate(const Defined &other) const { + if (compare(&other) == 0) + reportDuplicate(*this, other.file, + dyn_cast_or_null(other.section), + other.value); +} + void Symbol::resolveCommon(const CommonSymbol &other) { int cmp = compare(&other); if (cmp < 0) @@ -653,10 +650,6 @@ void Symbol::resolveDefined(const Defined &other) { int cmp = compare(&other); if (cmp > 0) replace(other); - else if (cmp == 0) - reportDuplicate(*this, other.file, - dyn_cast_or_null(other.section), - other.value); } template diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index 6cd8370317124..dd245660d13d5 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -213,6 +213,8 @@ class Symbol { // non-lazy object causes a runtime error. 
void extract() const; + void checkDuplicate(const Defined &other) const; + private: void resolveUndefined(const Undefined &other); void resolveCommon(const CommonSymbol &other); @@ -569,6 +571,8 @@ template Defined *makeDefined(T &&...args) { Defined(std::forward(args)...); } +void reportDuplicate(const Symbol &sym, InputFile *newFile, + InputSectionBase *errSec, uint64_t errOffset); void maybeWarnUnorderableSymbol(const Symbol *sym); bool computeIsPreemptible(const Symbol &sym); void reportBackrefs(); diff --git a/lld/test/ELF/invalid/symtab-sh-info-dup.test b/lld/test/ELF/invalid/symtab-sh-info-dup.test index 36c7af0d66c6f..8b0833d67bbb2 100644 --- a/lld/test/ELF/invalid/symtab-sh-info-dup.test +++ b/lld/test/ELF/invalid/symtab-sh-info-dup.test @@ -9,11 +9,11 @@ # RUN: not ld.lld %t.o %t.o -o /dev/null 2>&1 | FileCheck %s # CHECK: error: {{.*}}.o: STB_LOCAL symbol (2) found at index >= .symtab's sh_info (1) +# CHECK-NEXT: error: {{.*}}.o: STB_LOCAL symbol (2) found at index >= .symtab's sh_info (1) # CHECK-NEXT: error: duplicate symbol: _start # CHECK-NEXT: >>> defined at {{.*}}.o:(.text+0x0) # CHECK-NEXT: >>> defined at {{.*}}.o:(.text+0x0) # CHECK-EMPTY: -# CHECK-NEXT: error: {{.*}}.o: STB_LOCAL symbol (2) found at index >= .symtab's sh_info (1) # RUN: ld.lld --noinhibit-exec %t.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN # WARN: warning: {{.*}}.o: STB_LOCAL symbol (2) found at index >= .symtab's sh_info (1) diff --git a/lld/test/ELF/lto/duplicated.ll b/lld/test/ELF/lto/duplicated.ll index 60bdb4455ee6b..6b09260edf09b 100644 --- a/lld/test/ELF/lto/duplicated.ll +++ b/lld/test/ELF/lto/duplicated.ll @@ -8,17 +8,18 @@ ;; --thinlto-index-only skips some passes. Test the error is present. 
; RUN: not ld.lld %t/a.bc %t/a.bc --thinlto-index-only -o /dev/null 2>&1 | FileCheck %s ; RUN: not ld.lld %t/b.o %t/a.bc --lto-emit-asm -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK2 +; RUN: not ld.lld %t/a.bc %t/b.o --thinlto-index-only -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK2 ;; --undefined-glob g extracts %t/c.bc which causes a duplicate symbol error. -; RUN: not ld.lld %t/a.bc --start-lib %t/c.bc --undefined-glob g --thinlto-index-only -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK +; RUN: not ld.lld %t/a.bc --start-lib %t/c.bc --undefined-glob g --thinlto-index-only -o /dev/null 2>&1 | FileCheck %s ; CHECK: duplicate symbol: f ; CHECK-NEXT: >>> defined in {{.*}}.bc ; CHECK-NEXT: >>> defined in {{.*}}.bc ; CHECK2: duplicate symbol: f -; CHECK2-NEXT: >>> defined in {{.*}}.o -; CHECK2-NEXT: >>> defined in {{.*}}.bc +; CHECK2-NEXT: >>> defined in {{.*}} +; CHECK2-NEXT: >>> defined in {{.*}} ;--- a.ll target triple = "x86_64-unknown-linux-gnu" diff --git a/lld/test/ELF/vs-diagnostics-duplicate.s b/lld/test/ELF/vs-diagnostics-duplicate.s index cf4637fded9d8..397e14ac68cb2 100644 --- a/lld/test/ELF/vs-diagnostics-duplicate.s +++ b/lld/test/ELF/vs-diagnostics-duplicate.s @@ -2,7 +2,7 @@ // RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1.o // RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %S/Inputs/vs-diagnostics-duplicate2.s -o %t2.o // RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %S/Inputs/vs-diagnostics-duplicate3.s -o %t3.o -// RUN: not ld.lld --vs-diagnostics %t1.o %t2.o %t3.o -o /dev/null 2>&1 | FileCheck %s +// RUN: not ld.lld --vs-diagnostics --threads=1 %t1.o %t2.o %t3.o -o /dev/null 2>&1 | FileCheck %s // Case 1. The source locations are unknown for both symbols. 
// CHECK: {{.*}}ld.lld{{.*}}: error: duplicate symbol: foo From 026a43f6cf9f9fe3fb3fcf7065393ebc979afdef Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Tue, 22 Feb 2022 19:08:51 +0100 Subject: [PATCH 519/748] [flang] Update PFTBuilder This patch update the PFTBuilder to be able to lower the construct present in semantics. This is a building block for other lowering patches that will be posted soon. This patch is part of the upstreaming effort from fir-dev branch. Reviewed By: PeteSteinfeld, schweitz Differential Revision: https://reviews.llvm.org/D120336 Co-authored-by: Jean Perier Co-authored-by: V Donaldson --- flang/include/flang/Lower/HostAssociations.h | 68 ++ flang/include/flang/Lower/IntervalSet.h | 109 ++++ flang/include/flang/Lower/PFTBuilder.h | 205 +++--- flang/include/flang/Lower/PFTDefs.h | 1 + flang/lib/Lower/PFTBuilder.cpp | 621 +++++++++++++------ flang/test/Lower/pre-fir-tree01.f90 | 6 +- flang/test/Lower/pre-fir-tree02.f90 | 7 +- flang/test/Lower/pre-fir-tree05.f90 | 7 +- 8 files changed, 741 insertions(+), 283 deletions(-) create mode 100644 flang/include/flang/Lower/HostAssociations.h create mode 100644 flang/include/flang/Lower/IntervalSet.h diff --git a/flang/include/flang/Lower/HostAssociations.h b/flang/include/flang/Lower/HostAssociations.h new file mode 100644 index 0000000000000..c091dbc3339a4 --- /dev/null +++ b/flang/include/flang/Lower/HostAssociations.h @@ -0,0 +1,68 @@ +//===-- Lower/HostAssociations.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_HOSTASSOCIATIONS_H +#define FORTRAN_LOWER_HOSTASSOCIATIONS_H + +#include "mlir/IR/Location.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "llvm/ADT/SetVector.h" + +namespace Fortran { +namespace semantics { +class Symbol; +} + +namespace lower { +class AbstractConverter; +class SymMap; + +/// Internal procedures in Fortran may access variables declared in the host +/// procedure directly. We bundle these variables together in a tuple and pass +/// them as an extra argument. +class HostAssociations { +public: + /// Returns true iff there are no host associations. + bool empty() const { return symbols.empty(); } + + /// Adds a set of Symbols that will be the host associated bindings for this + /// host procedure. + void addSymbolsToBind( + const llvm::SetVector &s) { + assert(empty() && "symbol set must be initially empty"); + symbols = s; + } + + /// Code gen the FIR for the local bindings for the host associated symbols + /// for the host (parent) procedure using `builder`. + void hostProcedureBindings(AbstractConverter &converter, SymMap &symMap); + + /// Code gen the FIR for the local bindings for the host associated symbols + /// for an internal (child) procedure using `builder`. + void internalProcedureBindings(AbstractConverter &converter, SymMap &symMap); + + /// Return the type of the extra argument to add to each internal procedure. + mlir::Type getArgumentType(AbstractConverter &convert); + + /// Is \p symbol host associated ? + bool isAssociated(const Fortran::semantics::Symbol &symbol) const { + return symbols.contains(&symbol); + } + +private: + /// Canonical vector of host associated symbols. + llvm::SetVector symbols; + + /// The type of the extra argument to be added to each internal procedure. 
+ mlir::Type argType; +}; +} // namespace lower +} // namespace Fortran + +#endif // FORTRAN_LOWER_HOSTASSOCIATIONS_H diff --git a/flang/include/flang/Lower/IntervalSet.h b/flang/include/flang/Lower/IntervalSet.h new file mode 100644 index 0000000000000..3d7a36e30b570 --- /dev/null +++ b/flang/include/flang/Lower/IntervalSet.h @@ -0,0 +1,109 @@ +//===-- IntervalSet.h -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_INTERVALSET_H +#define FORTRAN_LOWER_INTERVALSET_H + +#include +#include + +namespace Fortran::lower { + +//===----------------------------------------------------------------------===// +// Interval set +//===----------------------------------------------------------------------===// + +/// Interval set to keep track of intervals, merging them when they overlap one +/// another. Used to refine the pseudo-offset ranges of the front-end symbols +/// into groups of aliasing variables. +struct IntervalSet { + using MAP = std::map; + using Iterator = MAP::const_iterator; + + // Handles the merging of overlapping intervals correctly, efficiently. 
+ void merge(std::size_t lo, std::size_t up) { + assert(lo <= up); + if (empty()) { + m.insert({lo, up}); + return; + } + auto i = m.lower_bound(lo); + // i->first >= lo + if (i == begin()) { + if (up < i->first) { + // [lo..up] < i->first + m.insert({lo, up}); + return; + } + // up >= i->first + if (i->second > up) + up = i->second; + fuse(lo, up, i); + return; + } + auto i1 = i; + if (i == end() || i->first > lo) + i = std::prev(i); + // i->first <= lo + if (i->second >= up) { + // i->first <= lo && up <= i->second, keep i + return; + } + // i->second < up + if (i->second < lo) { + if (i1 == end() || i1->first > up) { + // i < [lo..up] < i1 + m.insert({lo, up}); + return; + } + // i < [lo..up], i1->first <= up --> [lo..up] union [i1..?] + i = i1; + } else { + // i->first <= lo, lo <= i->second --> [i->first..up] union [i..?] + lo = i->first; + } + fuse(lo, up, i); + } + + Iterator find(std::size_t pt) const { + auto i = m.lower_bound(pt); + if (i != end() && i->first == pt) + return i; + if (i == begin()) + return end(); + i = std::prev(i); + if (i->second < pt) + return end(); + return i; + } + + Iterator begin() const { return m.begin(); } + Iterator end() const { return m.end(); } + bool empty() const { return m.empty(); } + std::size_t size() const { return m.size(); } + +private: + // Find and fuse overlapping sets. + void fuse(std::size_t lo, std::size_t up, Iterator i) { + auto j = m.upper_bound(up); + // up < j->first + std::size_t cu = std::prev(j)->second; + // cu < j->first + if (cu > up) + up = cu; + m.erase(i, j); + // merge [i .. 
j) with [i->first, max(up, cu)] + m.insert({lo, up}); + } + + MAP m{}; +}; + +} // namespace Fortran::lower + +#endif // FORTRAN_LOWER_INTERVALSET_H diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h index 0e625bf86b99c..1d4788451a42c 100644 --- a/flang/include/flang/Lower/PFTBuilder.h +++ b/flang/include/flang/Lower/PFTBuilder.h @@ -19,9 +19,11 @@ #include "flang/Common/reference.h" #include "flang/Common/template.h" +#include "flang/Lower/HostAssociations.h" #include "flang/Lower/PFTDefs.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/attr.h" +#include "flang/Semantics/scope.h" #include "flang/Semantics/symbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -62,7 +64,7 @@ class ReferenceVariantBase { } template constexpr BaseType *getIf() const { - auto *ptr = std::get_if>(&u); + const Ref *ptr = std::get_if>(&u); return ptr ? &ptr->get() : nullptr; } template @@ -106,8 +108,7 @@ using ActionStmts = std::tuple< parser::ComputedGotoStmt, parser::ForallStmt, parser::ArithmeticIfStmt, parser::AssignStmt, parser::AssignedGotoStmt, parser::PauseStmt>; -using OtherStmts = - std::tuple; +using OtherStmts = std::tuple; using ConstructStmts = std::tuple< parser::AssociateStmt, parser::EndAssociateStmt, parser::BlockStmt, @@ -134,7 +135,11 @@ using Constructs = using Directives = std::tuple; + parser::OpenACCDeclarativeConstruct, parser::OpenMPConstruct, + parser::OpenMPDeclarativeConstruct, parser::OmpEndLoopDirective>; + +using DeclConstructs = std::tuple; template static constexpr bool isActionStmt{common::HasMember}; @@ -154,6 +159,9 @@ static constexpr bool isConstruct{common::HasMember}; template static constexpr bool isDirective{common::HasMember}; +template +static constexpr bool isDeclConstruct{common::HasMember}; + template static constexpr bool isIntermediateConstructStmt{common::HasMember< A, std::tuple static constexpr bool isNopConstructStmt{common::HasMember< - A, 
std::tuple>}; + A, std::tuple>}; + +template +static constexpr bool isExecutableDirective{common::HasMember< + A, std::tuple>}; template static constexpr bool isFunctionLike{common::HasMember< @@ -244,6 +256,11 @@ struct Evaluation : EvaluationVariant { return pft::isNopConstructStmt>; }}); } + constexpr bool isExecutableDirective() const { + return visit(common::visitors{[](auto &r) { + return pft::isExecutableDirective>; + }}); + } /// Return the predicate: "This is a non-initial, non-terminal construct /// statement." For an IfConstruct, this is ElseIfStmt and ElseStmt. @@ -295,11 +312,12 @@ struct Evaluation : EvaluationVariant { // FIR generation looks primarily at PFT ActionStmt and ConstructStmt leaf // nodes. Members such as lexicalSuccessor and block are applicable only - // to these nodes. The controlSuccessor member is used for nonlexical - // successors, such as linking to a GOTO target. For multiway branches, - // it is set to the first target. Successor and exit links always target - // statements. An internal Construct node has a constructExit link that - // applies to exits from anywhere within the construct. + // to these nodes, plus some directives. The controlSuccessor member is + // used for nonlexical successors, such as linking to a GOTO target. For + // multiway branches, it is set to the first target. Successor and exit + // links always target statements or directives. An internal Construct + // node has a constructExit link that applies to exits from anywhere within + // the construct. // // An unstructured construct is one that contains some form of goto. 
This // is indicated by the isUnstructured member flag, which may be set on a @@ -327,8 +345,8 @@ struct Evaluation : EvaluationVariant { std::optional label{}; std::unique_ptr evaluationList; // nested evaluations Evaluation *parentConstruct{nullptr}; // set for nodes below the top level - Evaluation *lexicalSuccessor{nullptr}; // set for ActionStmt, ConstructStmt - Evaluation *controlSuccessor{nullptr}; // set for some statements + Evaluation *lexicalSuccessor{nullptr}; // set for leaf nodes, some directives + Evaluation *controlSuccessor{nullptr}; // set for some leaf nodes Evaluation *constructExit{nullptr}; // set for constructs bool isNewBlock{false}; // evaluation begins a new basic block bool isUnstructured{false}; // evaluation has unstructured control flow @@ -354,13 +372,6 @@ struct ProgramUnit : ProgramVariant { PftNode parent; }; -/// Helper to get location from FunctionLikeUnit/ModuleLikeUnit begin/end -/// statements. -template -static parser::CharBlock stmtSourceLoc(const T &stmt) { - return stmt.visit(common::visitors{[](const auto &x) { return x.source; }}); -} - /// A variable captures an object to be created per the declaration part of a /// function like unit. /// @@ -386,9 +397,6 @@ struct Variable { const semantics::Symbol *symbol{}; bool isGlobal() const { return global; } - bool isDeclaration() const { - return !symbol || symbol != &symbol->GetUltimate(); - } int depth{}; bool global{}; @@ -399,32 +407,45 @@ struct Variable { std::size_t aliasOffset{}; }; + /// pair using Interval = std::tuple; /// An interval of storage is a contiguous block of memory to be allocated or /// mapped onto another variable. Aliasing variables will be pointers into /// interval stores and may overlap each other. 
struct AggregateStore { - AggregateStore(Interval &&interval, const Fortran::semantics::Scope &scope, - bool isDeclaration = false) - : interval{std::move(interval)}, scope{&scope}, isDecl{isDeclaration} {} - AggregateStore(Interval &&interval, const Fortran::semantics::Scope &scope, - const llvm::SmallVector &vars, - bool isDeclaration = false) - : interval{std::move(interval)}, scope{&scope}, vars{vars}, - isDecl{isDeclaration} {} - - bool isGlobal() const { return vars.size() > 0; } - bool isDeclaration() const { return isDecl; } + AggregateStore(Interval &&interval, + const Fortran::semantics::Symbol &namingSym, + bool isGlobal = false) + : interval{std::move(interval)}, namingSymbol{&namingSym}, + isGlobalAggregate{isGlobal} {} + AggregateStore(const semantics::Symbol &initialValueSym, + const semantics::Symbol &namingSym, bool isGlobal = false) + : interval{initialValueSym.offset(), initialValueSym.size()}, + namingSymbol{&namingSym}, initialValueSymbol{&initialValueSym}, + isGlobalAggregate{isGlobal} {}; + + bool isGlobal() const { return isGlobalAggregate; } /// Get offset of the aggregate inside its scope. std::size_t getOffset() const { return std::get<0>(interval); } - + /// Returns symbols holding the aggregate initial value if any. + const semantics::Symbol *getInitialValueSymbol() const { + return initialValueSymbol; + } + /// Returns the symbol that gives its name to the aggregate. + const semantics::Symbol &getNamingSymbol() const { return *namingSymbol; } + /// Scope to which the aggregates belongs to. + const semantics::Scope &getOwningScope() const { + return getNamingSymbol().owner(); + } + /// of the aggregate in its scope. Interval interval{}; - /// scope in which the interval is. - const Fortran::semantics::Scope *scope; - llvm::SmallVector vars{}; - /// Is this a declaration of a storage defined in another scope ? - bool isDecl; + /// Symbol that gives its name to the aggregate. Always set by constructor. 
+ const semantics::Symbol *namingSymbol; + /// Compiler generated symbol with the aggregate initial value if any. + const semantics::Symbol *initialValueSymbol = nullptr; + /// Is this a global aggregate ? + bool isGlobalAggregate; }; explicit Variable(const Fortran::semantics::Symbol &sym, bool global = false, @@ -463,31 +484,32 @@ struct Variable { return std::visit([](const auto &x) { return x.isGlobal(); }, var); } - /// Is this a declaration of a variable owned by another scope ? - bool isDeclaration() const { - return std::visit([](const auto &x) { return x.isDeclaration(); }, var); + /// Is this a module variable ? + bool isModuleVariable() const { + const semantics::Scope *scope = getOwningScope(); + return scope && scope->IsModule(); } const Fortran::semantics::Scope *getOwningScope() const { return std::visit( common::visitors{ [](const Nominal &x) { return &x.symbol->GetUltimate().owner(); }, - [](const AggregateStore &agg) { return agg.scope; }}, + [](const AggregateStore &agg) { return &agg.getOwningScope(); }}, var); } bool isHeapAlloc() const { - if (const auto *s = std::get_if(&var)) + if (auto *s = std::get_if(&var)) return s->heapAlloc; return false; } bool isPointer() const { - if (const auto *s = std::get_if(&var)) + if (auto *s = std::get_if(&var)) return s->pointer; return false; } bool isTarget() const { - if (const auto *s = std::get_if(&var)) + if (auto *s = std::get_if(&var)) return s->target; return false; } @@ -495,7 +517,7 @@ struct Variable { /// An alias(er) is a variable that is part of a EQUIVALENCE that is allocated /// locally on the stack. bool isAlias() const { - if (const auto *s = std::get_if(&var)) + if (auto *s = std::get_if(&var)) return s->aliaser; return false; } @@ -534,7 +556,7 @@ struct Variable { /// The depth is recorded for nominal variables as a debugging aid. 
int getDepth() const { - if (const auto *s = std::get_if(&var)) + if (auto *s = std::get_if(&var)) return s->depth; return 0; } @@ -574,17 +596,6 @@ struct FunctionLikeUnit : public ProgramUnit { FunctionLikeUnit(FunctionLikeUnit &&) = default; FunctionLikeUnit(const FunctionLikeUnit &) = delete; - /// Return true iff this function like unit is Fortran recursive (actually - /// meaning it's reentrant). - bool isRecursive() const { - if (isMainProgram()) - return false; - const auto &sym = getSubprogramSymbol(); - return sym.attrs().test(semantics::Attr::RECURSIVE) || - (!sym.attrs().test(semantics::Attr::NON_RECURSIVE) && - defaultRecursiveFunctionSetting()); - } - std::vector getOrderedSymbolTable() { return varList[0]; } bool isMainProgram() const { @@ -592,13 +603,7 @@ struct FunctionLikeUnit : public ProgramUnit { } /// Get the starting source location for this function like unit - parser::CharBlock getStartingSourceLoc() { - if (beginStmt) - return stmtSourceLoc(*beginStmt); - if (!evaluationList.empty()) - return evaluationList.front().position; - return stmtSourceLoc(endStmt); - } + parser::CharBlock getStartingSourceLoc() const; void setActiveEntry(int entryIndex) { assert(entryIndex >= 0 && entryIndex < (int)entryPointList.size() && @@ -610,7 +615,7 @@ struct FunctionLikeUnit : public ProgramUnit { /// This should not be called if the FunctionLikeUnit is the main program /// since anonymous main programs do not have a symbol. const semantics::Symbol &getSubprogramSymbol() const { - const auto *symbol = entryPointList[activeEntry].first; + const semantics::Symbol *symbol = entryPointList[activeEntry].first; if (!symbol) llvm::report_fatal_error( "not inside a procedure; do not call on main program."); @@ -623,11 +628,27 @@ struct FunctionLikeUnit : public ProgramUnit { return entryPointList[activeEntry].second; } - /// Helper to get location from FunctionLikeUnit begin/end statements. 
- static parser::CharBlock stmtSourceLoc(const FunctionStatement &stmt) { - return stmt.visit(common::visitors{[](const auto &x) { return x.source; }}); + //===--------------------------------------------------------------------===// + // Host associations + //===--------------------------------------------------------------------===// + + void setHostAssociatedSymbols( + const llvm::SetVector &symbols) { + hostAssociations.addSymbolsToBind(symbols); } + /// Return the host associations, if any, from the parent (host) procedure. + /// Crashes if the parent is not a procedure. + HostAssociations &parentHostAssoc(); + + /// Return true iff the parent is a procedure and the parent has a non-empty + /// set of host associations. + bool parentHasHostAssoc(); + + /// Return the host associations for this function like unit. The list of host + /// associations are kept in the host procedure. + HostAssociations &getHostAssoc() { return hostAssociations; } + LLVM_DUMP_METHOD void dump() const; /// Anonymous programs do not have a begin statement @@ -647,13 +668,14 @@ struct FunctionLikeUnit : public ProgramUnit { /// Current index into entryPointList. Index 0 is the primary entry point. int activeEntry = 0; /// Dummy arguments that are not universal across entry points. - llvm::SmallVector nonUniversalDummyArguments; + llvm::SmallVector nonUniversalDummyArguments; /// Primary result for function subprograms with alternate entries. This /// is one of the largest result values, not necessarily the first one. const semantics::Symbol *primaryResult{nullptr}; /// Terminal basic block (if any) mlir::Block *finalBlock{}; std::vector> varList; + HostAssociations hostAssociations; }; /// Module-like units contain a list of function-like units. @@ -675,9 +697,16 @@ struct ModuleLikeUnit : public ProgramUnit { std::vector getOrderedSymbolTable() { return varList[0]; } + /// Get the starting source location for this module like unit. 
+ parser::CharBlock getStartingSourceLoc() const; + + /// Get the module scope. + const Fortran::semantics::Scope &getScope() const; + ModuleStatement beginStmt; ModuleStatement endStmt; std::list nestedFunctions; + EvaluationList evaluationList; std::vector> varList; }; @@ -722,6 +751,33 @@ struct Program { std::list units; }; +/// Return the list of variables that appears in the specification expressions +/// of a function result. +std::vector +buildFuncResultDependencyList(const Fortran::semantics::Symbol &); + +/// Helper to get location from FunctionLikeUnit/ModuleLikeUnit begin/end +/// statements. +template +static parser::CharBlock stmtSourceLoc(const T &stmt) { + return stmt.visit(common::visitors{[](const auto &x) { return x.source; }}); +} + +/// Get the first PFT ancestor node that has type ParentType. +template +ParentType *getAncestor(A &node) { + if (auto *seekedParent = node.parent.template getIf()) + return seekedParent; + return node.parent.visit(common::visitors{ + [](Program &p) -> ParentType * { return nullptr; }, + [](auto &p) -> ParentType * { return getAncestor(p); }}); +} + +/// Call the provided \p callBack on all symbols that are referenced inside \p +/// funit. +void visitAllSymbols(const FunctionLikeUnit &funit, + std::function callBack); + } // namespace Fortran::lower::pft namespace Fortran::lower { @@ -739,7 +795,6 @@ createPFT(const parser::Program &root, /// Dumper for displaying a PFT. 
void dumpPFT(llvm::raw_ostream &outputStream, const pft::Program &pft); - } // namespace Fortran::lower #endif // FORTRAN_LOWER_PFTBUILDER_H diff --git a/flang/include/flang/Lower/PFTDefs.h b/flang/include/flang/Lower/PFTDefs.h index 4dc31756ea4af..194f1020da57c 100644 --- a/flang/include/flang/Lower/PFTDefs.h +++ b/flang/include/flang/Lower/PFTDefs.h @@ -42,6 +42,7 @@ class Reference; namespace lower { bool definedInCommonBlock(const semantics::Symbol &sym); +bool symbolIsGlobal(const semantics::Symbol &sym); bool defaultRecursiveFunctionSetting(); namespace pft { diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp index b0bd5bec1694e..c6a5ceb7b044d 100644 --- a/flang/lib/Lower/PFTBuilder.cpp +++ b/flang/lib/Lower/PFTBuilder.cpp @@ -1,4 +1,4 @@ -//===-- PFTBuilder.cc -----------------------------------------------------===// +//===-- PFTBuilder.cpp ----------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/PFTBuilder.h" -#include "IntervalSet.h" +#include "flang/Lower/IntervalSet.h" #include "flang/Lower/Support/Utils.h" #include "flang/Parser/dump-parse-tree.h" #include "flang/Parser/parse-tree-visitor.h" @@ -160,6 +160,8 @@ class PFTBuilder { exitFunction(); } else if constexpr (lower::pft::isConstruct || lower::pft::isDirective) { + if constexpr (lower::pft::isDeclConstruct) + return; exitConstructOrDirective(); } } @@ -221,14 +223,17 @@ class PFTBuilder { /// Initialize a new module-like unit and make it the builder's focus. 
template bool enterModule(const A &func) { - auto &unit = + Fortran::lower::pft::ModuleLikeUnit &unit = addUnit(lower::pft::ModuleLikeUnit{func, pftParentStack.back()}); functionList = &unit.nestedFunctions; + pushEvaluationList(&unit.evaluationList); pftParentStack.emplace_back(unit); return true; } void exitModule() { + if (!evaluationListStack.empty()) + popEvaluationList(); pftParentStack.pop_back(); resetFunctionState(); } @@ -240,6 +245,11 @@ class PFTBuilder { if (evaluationListStack.empty()) return; auto evaluationList = evaluationListStack.back(); + if (evaluationList->empty() && + pftParentStack.back().getIf()) { + popEvaluationList(); + return; + } if (evaluationList->empty() || !evaluationList->back().isEndStmt()) { const auto &endStmt = pftParentStack.back().get().endStmt; @@ -274,8 +284,9 @@ class PFTBuilder { bool enterFunction(const A &func, const semantics::SemanticsContext &semanticsContext) { endFunctionBody(); // enclosing host subprogram body, if any - auto &unit = addFunction(lower::pft::FunctionLikeUnit{ - func, pftParentStack.back(), semanticsContext}); + Fortran::lower::pft::FunctionLikeUnit &unit = + addFunction(lower::pft::FunctionLikeUnit{func, pftParentStack.back(), + semanticsContext}); labelEvaluationMap = &unit.labelEvaluationMap; assignSymbolLabelMap = &unit.assignSymbolLabelMap; functionList = &unit.nestedFunctions; @@ -296,20 +307,38 @@ class PFTBuilder { resetFunctionState(); } - /// Initialize a new construct and make it the builder's focus. + /// Initialize a new construct or directive and make it the builder's focus. 
template - bool enterConstructOrDirective(const A &construct) { - auto &eval = - addEvaluation(lower::pft::Evaluation{construct, pftParentStack.back()}); + bool enterConstructOrDirective(const A &constructOrDirective) { + Fortran::lower::pft::Evaluation &eval = addEvaluation( + lower::pft::Evaluation{constructOrDirective, pftParentStack.back()}); eval.evaluationList.reset(new lower::pft::EvaluationList); pushEvaluationList(eval.evaluationList.get()); pftParentStack.emplace_back(eval); constructAndDirectiveStack.emplace_back(&eval); + if constexpr (lower::pft::isDeclConstruct) { + popEvaluationList(); + pftParentStack.pop_back(); + constructAndDirectiveStack.pop_back(); + popEvaluationList(); + } return true; } void exitConstructOrDirective() { rewriteIfGotos(); + auto *eval = constructAndDirectiveStack.back(); + if (eval->isExecutableDirective()) { + // A construct at the end of an (unstructured) OpenACC or OpenMP + // construct region must have an exit target inside the region. + Fortran::lower::pft::EvaluationList &evaluationList = + *eval->evaluationList; + if (!evaluationList.empty() && evaluationList.back().isConstruct()) { + static const parser::ContinueStmt exitTarget{}; + addEvaluation( + lower::pft::Evaluation{exitTarget, pftParentStack.back(), {}, {}}); + } + } popEvaluationList(); pftParentStack.pop_back(); constructAndDirectiveStack.pop_back(); @@ -372,7 +401,8 @@ class PFTBuilder { auto &entryPointList = eval.getOwningProcedure()->entryPointList; evaluationListStack.back()->emplace_back(std::move(eval)); lower::pft::Evaluation *p = &evaluationListStack.back()->back(); - if (p->isActionStmt() || p->isConstructStmt() || p->isEndStmt()) { + if (p->isActionStmt() || p->isConstructStmt() || p->isEndStmt() || + p->isExecutableDirective()) { if (lastLexicalEvaluation) { lastLexicalEvaluation->lexicalSuccessor = p; p->printIndex = lastLexicalEvaluation->printIndex + 1; @@ -380,13 +410,14 @@ class PFTBuilder { p->printIndex = 1; } lastLexicalEvaluation = p; - 
for (auto entryIndex = entryPointList.size() - 1; + for (std::size_t entryIndex = entryPointList.size() - 1; entryIndex && !entryPointList[entryIndex].second->lexicalSuccessor; --entryIndex) // Link to the entry's first executable statement. entryPointList[entryIndex].second->lexicalSuccessor = p; } else if (const auto *entryStmt = p->getIf()) { - const auto *sym = std::get(entryStmt->t).symbol; + const semantics::Symbol *sym = + std::get(entryStmt->t).symbol; assert(sym->has() && "entry must be a subprogram"); entryPointList.push_back(std::pair{sym, p}); @@ -410,8 +441,9 @@ class PFTBuilder { evaluationListStack.pop_back(); } - /// Rewrite IfConstructs containing a GotoStmt to eliminate an unstructured - /// branch and a trivial basic block. The pre-branch-analysis code: + /// Rewrite IfConstructs containing a GotoStmt or CycleStmt to eliminate an + /// unstructured branch and a trivial basic block. The pre-branch-analysis + /// code: /// /// <> /// 1 If[Then]Stmt: if(cond) goto L @@ -433,8 +465,8 @@ class PFTBuilder { /// 6 Statement: L ... /// /// The If[Then]Stmt condition is implicitly negated. It is not modified - /// in the PFT. It must be negated when generating FIR. The GotoStmt is - /// deleted. + /// in the PFT. It must be negated when generating FIR. The GotoStmt or + /// CycleStmt is deleted. /// /// The transformation is only valid for forward branch targets at the same /// construct nesting level as the IfConstruct. The result must not violate @@ -449,56 +481,86 @@ class PFTBuilder { /// not significant, but could be changed. 
/// void rewriteIfGotos() { - using T = struct { + auto &evaluationList = *evaluationListStack.back(); + if (!evaluationList.size()) + return; + struct T { lower::pft::EvaluationList::iterator ifConstructIt; parser::Label ifTargetLabel; + bool isCycleStmt = false; }; - llvm::SmallVector ifExpansionStack; - auto &evaluationList = *evaluationListStack.back(); + llvm::SmallVector ifCandidateStack; + const auto *doStmt = + evaluationList.begin()->getIf(); + std::string doName = doStmt ? getConstructName(*doStmt) : std::string{}; for (auto it = evaluationList.begin(), end = evaluationList.end(); it != end; ++it) { auto &eval = *it; if (eval.isA()) { - ifExpansionStack.clear(); + ifCandidateStack.clear(); continue; } auto firstStmt = [](lower::pft::Evaluation *e) { return e->isConstruct() ? &*e->evaluationList->begin() : e; }; - auto &targetEval = *firstStmt(&eval); - if (targetEval.label) { - while (!ifExpansionStack.empty() && - ifExpansionStack.back().ifTargetLabel == *targetEval.label) { - auto ifConstructIt = ifExpansionStack.back().ifConstructIt; - auto successorIt = std::next(ifConstructIt); + const Fortran::lower::pft::Evaluation &targetEval = *firstStmt(&eval); + bool targetEvalIsEndDoStmt = targetEval.isA(); + auto branchTargetMatch = [&]() { + if (const parser::Label targetLabel = + ifCandidateStack.back().ifTargetLabel) + if (targetLabel == *targetEval.label) + return true; // goto target match + if (targetEvalIsEndDoStmt && ifCandidateStack.back().isCycleStmt) + return true; // cycle target match + return false; + }; + if (targetEval.label || targetEvalIsEndDoStmt) { + while (!ifCandidateStack.empty() && branchTargetMatch()) { + lower::pft::EvaluationList::iterator ifConstructIt = + ifCandidateStack.back().ifConstructIt; + lower::pft::EvaluationList::iterator successorIt = + std::next(ifConstructIt); if (successorIt != it) { - auto &ifBodyList = *ifConstructIt->evaluationList; - auto gotoStmtIt = std::next(ifBodyList.begin()); - assert(gotoStmtIt->isA() && 
"expected GotoStmt"); - ifBodyList.erase(gotoStmtIt); - auto &ifStmt = *ifBodyList.begin(); + Fortran::lower::pft::EvaluationList &ifBodyList = + *ifConstructIt->evaluationList; + lower::pft::EvaluationList::iterator branchStmtIt = + std::next(ifBodyList.begin()); + assert((branchStmtIt->isA() || + branchStmtIt->isA()) && + "expected goto or cycle statement"); + ifBodyList.erase(branchStmtIt); + lower::pft::Evaluation &ifStmt = *ifBodyList.begin(); ifStmt.negateCondition = true; ifStmt.lexicalSuccessor = firstStmt(&*successorIt); - auto endIfStmtIt = std::prev(ifBodyList.end()); + lower::pft::EvaluationList::iterator endIfStmtIt = + std::prev(ifBodyList.end()); std::prev(it)->lexicalSuccessor = &*endIfStmtIt; endIfStmtIt->lexicalSuccessor = firstStmt(&*it); ifBodyList.splice(endIfStmtIt, evaluationList, successorIt, it); for (; successorIt != endIfStmtIt; ++successorIt) successorIt->parentConstruct = &*ifConstructIt; } - ifExpansionStack.pop_back(); + ifCandidateStack.pop_back(); } } if (eval.isA() && eval.evaluationList->size() == 3) { - if (auto *gotoStmt = std::next(eval.evaluationList->begin()) - ->getIf()) - ifExpansionStack.push_back({it, gotoStmt->v}); + const auto bodyEval = std::next(eval.evaluationList->begin()); + if (const auto *gotoStmt = bodyEval->getIf()) { + ifCandidateStack.push_back({it, gotoStmt->v}); + } else if (doStmt) { + if (const auto *cycleStmt = bodyEval->getIf()) { + std::string cycleName = getConstructName(*cycleStmt); + if (cycleName.empty() || cycleName == doName) + // This candidate will match doStmt's EndDoStmt. + ifCandidateStack.push_back({it, {}, true}); + } + } } } } - /// Mark I/O statement ERR, EOR, and END specifier branch targets. - /// Mark an I/O statement with an assigned format as unstructured. + /// Mark IO statement ERR, EOR, and END specifier branch targets. + /// Mark an IO statement with an assigned format as unstructured. 
template void analyzeIoBranches(lower::pft::Evaluation &eval, const A &stmt) { auto analyzeFormatSpec = [&](const parser::Format &format) { @@ -566,8 +628,8 @@ class PFTBuilder { // If this is a branch into the body of a construct (usually illegal, // but allowed in some legacy cases), then the targetEvaluation and its // ancestors must be marked as unstructured. - auto *sourceConstruct = sourceEvaluation.parentConstruct; - auto *targetConstruct = targetEvaluation.parentConstruct; + lower::pft::Evaluation *sourceConstruct = sourceEvaluation.parentConstruct; + lower::pft::Evaluation *targetConstruct = targetEvaluation.parentConstruct; if (targetConstruct && &targetConstruct->getFirstNestedEvaluation() == &targetEvaluation) // A branch to an initial constructStmt is a branch to the construct. @@ -575,9 +637,18 @@ class PFTBuilder { if (targetConstruct) { while (sourceConstruct && sourceConstruct != targetConstruct) sourceConstruct = sourceConstruct->parentConstruct; - if (sourceConstruct != targetConstruct) - for (auto *eval = &targetEvaluation; eval; eval = eval->parentConstruct) + if (sourceConstruct != targetConstruct) // branch into a construct body + for (lower::pft::Evaluation *eval = &targetEvaluation; eval; + eval = eval->parentConstruct) { eval->isUnstructured = true; + // If the branch is a backward branch into an already analyzed + // DO or IF construct, mark the construct exit as a new block. + // For a forward branch, the isUnstructured flag will cause this + // to be done when the construct is analyzed. 
+ if (eval->constructExit && (eval->isA() || + eval->isA())) + eval->constructExit->isNewBlock = true; + } } } void markBranchTarget(lower::pft::Evaluation &sourceEvaluation, @@ -615,18 +686,18 @@ class PFTBuilder { parser::MaskedElsewhereStmt, parser::NonLabelDoStmt, parser::SelectCaseStmt, parser::SelectRankCaseStmt, parser::TypeGuardStmt, parser::WhereConstructStmt>; - if constexpr (common::HasMember) { if (auto name = std::get>(stmt.t)) return name->ToString(); } - // These statements have several std::optional + // These statements have multiple std::optional elements. if constexpr (std::is_same_v || std::is_same_v) { if (auto name = std::get<0>(stmt.t)) return name->ToString(); } + return {}; } @@ -648,7 +719,7 @@ class PFTBuilder { lower::pft::Evaluation *lastConstructStmtEvaluation{}; for (auto &eval : evaluationList) { eval.visit(common::visitors{ - // Action statements (except I/O statements) + // Action statements (except IO statements) [&](const parser::CallStmt &s) { // Look for alternate return specifiers. const auto &args = @@ -726,6 +797,11 @@ class PFTBuilder { markSuccessorAsNewBlock(eval); }, + // The first executable statement after an EntryStmt is a new block. + [&](const parser::EntryStmt &) { + eval.lexicalSuccessor->isNewBlock = true; + }, + // Construct statements [&](const parser::AssociateStmt &s) { insertConstructName(s, parentConstruct); @@ -861,7 +937,7 @@ class PFTBuilder { eval.isUnstructured = true; }, - // Default - Common analysis for I/O statements; otherwise nop. + // Default - Common analysis for IO statements; otherwise nop. [&](const auto &stmt) { using A = std::decay_t; using IoStmts = std::tuple< @@ -901,7 +977,8 @@ class PFTBuilder { /// also find one of the largest function results, since a single result /// container holds the result for all entries. 
void processEntryPoints() { - auto *unit = evaluationListStack.back()->front().getOwningProcedure(); + lower::pft::Evaluation *initialEval = &evaluationListStack.back()->front(); + lower::pft::FunctionLikeUnit *unit = initialEval->getOwningProcedure(); int entryCount = unit->entryPointList.size(); if (entryCount == 1) return; @@ -910,7 +987,7 @@ class PFTBuilder { unit->setActiveEntry(entryIndex); const auto &details = unit->getSubprogramSymbol().get(); - for (auto *arg : details.dummyArgs()) { + for (semantics::Symbol *arg : details.dummyArgs()) { if (!arg) continue; // alternate return specifier (no actual argument) const auto iter = dummyCountMap.find(arg); @@ -920,7 +997,7 @@ class PFTBuilder { ++iter->second; } if (details.isFunction()) { - const auto *resultSym = &details.result(); + const semantics::Symbol *resultSym = &details.result(); assert(resultSym && "missing result symbol"); if (!unit->primaryResult || unit->primaryResult->size() < resultSym->size()) @@ -931,6 +1008,13 @@ class PFTBuilder { for (auto arg : dummyCountMap) if (arg.second < entryCount) unit->nonUniversalDummyArguments.push_back(arg.first); + // The first executable statement in the subprogram is preceded by a + // branch to the entry point, so it starts a new block. + if (initialEval->hasNestedEvaluations()) + initialEval = &initialEval->getFirstNestedEvaluation(); + else if (initialEval->isA()) + initialEval = initialEval->lexicalSuccessor; + initialEval->isNewBlock = true; } std::unique_ptr pgm; @@ -985,33 +1069,32 @@ class PFTDumper { const lower::pft::Evaluation &eval, const std::string &indentString, int indent = 1) { llvm::StringRef name = evaluationName(eval); - std::string bang = eval.isUnstructured ? "!" 
: ""; - if (eval.isConstruct() || eval.isDirective()) { - outputStream << indentString << "<<" << name << bang << ">>"; - if (eval.constructExit) - outputStream << " -> " << eval.constructExit->printIndex; - outputStream << '\n'; - dumpEvaluationList(outputStream, *eval.evaluationList, indent + 1); - outputStream << indentString << "<>\n"; - return; - } + llvm::StringRef newBlock = eval.isNewBlock ? "^" : ""; + llvm::StringRef bang = eval.isUnstructured ? "!" : ""; outputStream << indentString; if (eval.printIndex) outputStream << eval.printIndex << ' '; - if (eval.isNewBlock) - outputStream << '^'; - outputStream << name << bang; - if (eval.isActionStmt() || eval.isConstructStmt()) { - if (eval.negateCondition) - outputStream << " [negate]"; - if (eval.controlSuccessor) - outputStream << " -> " << eval.controlSuccessor->printIndex; - } else if (eval.isA() && eval.lexicalSuccessor) { + if (eval.hasNestedEvaluations()) + outputStream << "<<" << newBlock << name << bang << ">>"; + else + outputStream << newBlock << name << bang; + if (eval.negateCondition) + outputStream << " [negate]"; + if (eval.constructExit) + outputStream << " -> " << eval.constructExit->printIndex; + else if (eval.controlSuccessor) + outputStream << " -> " << eval.controlSuccessor->printIndex; + else if (eval.isA() && eval.lexicalSuccessor) outputStream << " -> " << eval.lexicalSuccessor->printIndex; - } if (!eval.position.empty()) outputStream << ": " << eval.position.ToString(); + else if (auto *dir = eval.getIf()) + outputStream << ": !" 
<< dir->source.ToString(); outputStream << '\n'; + if (eval.hasNestedEvaluations()) { + dumpEvaluationList(outputStream, *eval.evaluationList, indent + 1); + outputStream << indentString << "<>\n"; + } } void dumpEvaluation(llvm::raw_ostream &ostream, @@ -1024,7 +1107,7 @@ class PFTDumper { int indent = 1) { static const auto white = " ++"s; auto indentString = white.substr(0, indent * 2); - for (const auto &eval : evaluationList) + for (const lower::pft::Evaluation &eval : evaluationList) dumpEvaluation(outputStream, eval, indentString, indent); } @@ -1069,7 +1152,8 @@ class PFTDumper { dumpEvaluationList(outputStream, functionLikeUnit.evaluationList); if (!functionLikeUnit.nestedFunctions.empty()) { outputStream << "\nContains\n"; - for (auto &func : functionLikeUnit.nestedFunctions) + for (const lower::pft::FunctionLikeUnit &func : + functionLikeUnit.nestedFunctions) dumpFunctionLikeUnit(outputStream, func); outputStream << "End Contains\n"; } @@ -1081,7 +1165,8 @@ class PFTDumper { outputStream << getNodeIndex(moduleLikeUnit) << " "; outputStream << "ModuleLike: "; outputStream << "\nContains\n"; - for (auto &func : moduleLikeUnit.nestedFunctions) + for (const lower::pft::FunctionLikeUnit &func : + moduleLikeUnit.nestedFunctions) dumpFunctionLikeUnit(outputStream, func); outputStream << "End Contains\nEnd ModuleLike\n\n"; } @@ -1122,6 +1207,7 @@ getFunctionStmt(const T &func) { std::get>(func.t)}; return result; } + template static lower::pft::ModuleLikeUnit::ModuleStatement getModuleStmt(const T &mod) { lower::pft::ModuleLikeUnit::ModuleStatement result{ @@ -1179,12 +1265,39 @@ bool Fortran::lower::definedInCommonBlock(const semantics::Symbol &sym) { return semantics::FindCommonBlockContaining(sym); } +static bool isReEntrant(const Fortran::semantics::Scope &scope) { + if (scope.kind() == Fortran::semantics::Scope::Kind::MainProgram) + return false; + if (scope.kind() == Fortran::semantics::Scope::Kind::Subprogram) { + const Fortran::semantics::Symbol *sym = 
scope.symbol(); + assert(sym && "Subprogram scope must have a symbol"); + return sym->attrs().test(semantics::Attr::RECURSIVE) || + (!sym->attrs().test(semantics::Attr::NON_RECURSIVE) && + Fortran::lower::defaultRecursiveFunctionSetting()); + } + if (scope.kind() == Fortran::semantics::Scope::Kind::Module) + return false; + return true; +} + /// Is the symbol `sym` a global? -static bool symbolIsGlobal(const semantics::Symbol &sym) { - if (const auto *details = sym.detailsIf()) +bool Fortran::lower::symbolIsGlobal(const semantics::Symbol &sym) { + if (const auto *details = sym.detailsIf()) { if (details->init()) return true; - return semantics::IsSaved(sym) || lower::definedInCommonBlock(sym); + if (!isReEntrant(sym.owner())) { + // Turn array and character of non re-entrant programs (like the main + // program) into global memory. + if (const Fortran::semantics::DeclTypeSpec *symTy = sym.GetType()) + if (symTy->category() == semantics::DeclTypeSpec::Character) + if (auto e = symTy->characterTypeSpec().length().GetExplicit()) + return true; + if (!details->shape().empty() || !details->coshape().empty()) + return true; + } + } + return semantics::IsSaved(sym) || lower::definedInCommonBlock(sym) || + semantics::IsNamedConstant(sym); } namespace { @@ -1194,91 +1307,86 @@ namespace { /// symbol table, which is sorted by name. struct SymbolDependenceDepth { explicit SymbolDependenceDepth( - std::vector> &vars, bool reentrant) - : vars{vars}, reentrant{reentrant} {} + std::vector> &vars) + : vars{vars} {} void analyzeAliasesInCurrentScope(const semantics::Scope &scope) { + // FIXME: When this function is called on the scope of an internal + // procedure whose parent contains an EQUIVALENCE set and the internal + // procedure uses variables from that EQUIVALENCE set, we end up creating + // an AggregateStore for those variables unnecessarily. 
+ // + /// If this is a function nested in a module no host associated + /// symbol are added to the function scope for module symbols used in this + /// scope. As a result, alias analysis in parent module scopes must be + /// preformed here. + const semantics::Scope *parentScope = &scope; + while (!parentScope->IsGlobal()) { + parentScope = &parentScope->parent(); + if (parentScope->IsModule()) + analyzeAliases(*parentScope); + } for (const auto &iter : scope) { - const auto &ultimate = iter.second.get().GetUltimate(); + const semantics::Symbol &ultimate = iter.second.get().GetUltimate(); if (skipSymbol(ultimate)) continue; - bool isDeclaration = scope != ultimate.owner(); - analyzeAliases(ultimate.owner(), isDeclaration); + analyzeAliases(ultimate.owner()); } // add all aggregate stores to the front of the work list adjustSize(1); // The copy in the loop matters, 'stores' will still be used. - for (auto st : stores) { + for (auto st : stores) vars[0].emplace_back(std::move(st)); - } } + + // Compute the offset of the last byte that resides in the symbol. + inline static std::size_t offsetWidth(const Fortran::semantics::Symbol &sym) { + std::size_t width = sym.offset(); + if (std::size_t size = sym.size()) + width += size - 1; + return width; + } + // Analyze the equivalence sets. This analysis need not be performed when the // scope has no equivalence sets. - void analyzeAliases(const semantics::Scope &scope, bool isDeclaration) { + void analyzeAliases(const semantics::Scope &scope) { if (scope.equivalenceSets().empty()) return; - if (scopeAnlyzedForAliases.find(&scope) != scopeAnlyzedForAliases.end()) + // Don't analyze a scope if it has already been analyzed. + if (analyzedScopes.find(&scope) != analyzedScopes.end()) return; - scopeAnlyzedForAliases.insert(&scope); - Fortran::lower::IntervalSet intervals; - llvm::DenseMap> - aliasSets; - llvm::DenseMap setIsGlobal; - - // 1. Construct the intervals. 
Determine each entity's interval, merging - // overlapping intervals into aggregates. - for (const auto &pair : scope) { - const auto &sym = pair.second.get(); - if (skipSymbol(sym)) - continue; - LLVM_DEBUG(llvm::dbgs() << "symbol: " << sym << '\n'); - intervals.merge(sym.offset(), sym.offset() + sym.size() - 1); - } - - // 2. Compute alias sets. Adds each entity to a set for the interval it - // appears to be mapped into. - for (const auto &pair : scope) { - const auto &sym = pair.second.get(); - if (skipSymbol(sym)) - continue; - auto iter = intervals.find(sym.offset()); - if (iter != intervals.end()) { - LLVM_DEBUG(llvm::dbgs() - << "symbol: " << toStringRef(sym.name()) << " on [" - << iter->first << ".." << iter->second << "]\n"); - aliasSets[iter->first].push_back(&sym); - if (symbolIsGlobal(sym)) - setIsGlobal.insert({iter->first, &sym}); - } - } - // 3. For each alias set with more than 1 member, add an Interval to the - // stores. The Interval will be lowered into a single memory allocation, - // with the co-located, overlapping variables mapped into that memory range. - for (const auto &pair : aliasSets) { - if (pair.second.size() > 1) { - // Set contains more than 1 aliasing variable. - // 1. Mark the symbols as aliasing for lowering. - for (auto *sym : pair.second) - aliasSyms.insert(sym); - auto gvarIter = setIsGlobal.find(pair.first); - auto iter = intervals.find(pair.first); - auto ibgn = iter->first; - auto ilen = iter->second - ibgn + 1; - // 2. Add an Interval to the list of stores allocated for this unit. - lower::pft::Variable::Interval interval(ibgn, ilen); - if (gvarIter != setIsGlobal.end()) { - LLVM_DEBUG(llvm::dbgs() - << "interval [" << ibgn << ".." 
<< ibgn + ilen - << ") added as global " << *gvarIter->second << '\n'); - stores.emplace_back(std::move(interval), scope, pair.second, - isDeclaration); + analyzedScopes.insert(&scope); + std::list> aggregates = + Fortran::semantics::GetStorageAssociations(scope); + for (std::list aggregate : aggregates) { + const Fortran::semantics::Symbol *aggregateSym = nullptr; + bool isGlobal = false; + const semantics::Symbol &first = *aggregate.front(); + std::size_t start = first.offset(); + std::size_t end = first.offset() + first.size(); + const Fortran::semantics::Symbol *namingSym = nullptr; + for (semantics::SymbolRef symRef : aggregate) { + const semantics::Symbol &sym = *symRef; + aliasSyms.insert(&sym); + if (sym.test(Fortran::semantics::Symbol::Flag::CompilerCreated)) { + aggregateSym = &sym; } else { - LLVM_DEBUG(llvm::dbgs() << "interval [" << ibgn << ".." << ibgn + ilen - << ") added\n"); - stores.emplace_back(std::move(interval), scope, isDeclaration); + isGlobal |= lower::symbolIsGlobal(sym); + start = std::min(sym.offset(), start); + end = std::max(sym.offset() + sym.size(), end); + if (!namingSym || (sym.name() < namingSym->name())) + namingSym = &sym; } } + assert(namingSym && "must contain at least one user symbol"); + if (!aggregateSym) { + stores.emplace_back( + Fortran::lower::pft::Variable::Interval{start, end - start}, + *namingSym, isGlobal); + } else { + stores.emplace_back(*aggregateSym, *namingSym, isGlobal); + } } } @@ -1293,7 +1401,14 @@ struct SymbolDependenceDepth { // TODO: add declaration? 
return 0; } - auto ultimate = sym.GetUltimate(); + semantics::Symbol ultimate = sym.GetUltimate(); + if (const auto *details = + ultimate.detailsIf()) { + // handle namelist group symbols + for (const semantics::SymbolRef &s : details->objects()) + analyze(s); + return 0; + } if (!ultimate.has() && !ultimate.has()) return 0; @@ -1302,21 +1417,22 @@ struct SymbolDependenceDepth { llvm_unreachable("not yet implemented - derived type analysis"); // Symbol must be something lowering will have to allocate. - bool global = semantics::IsSaved(sym); int depth = 0; - const auto *symTy = sym.GetType(); + const semantics::DeclTypeSpec *symTy = sym.GetType(); assert(symTy && "symbol must have a type"); - // check CHARACTER's length - if (symTy->category() == semantics::DeclTypeSpec::Character) - if (auto e = symTy->characterTypeSpec().length().GetExplicit()) { - // turn variable into a global if this unit is not reentrant - global = global || !reentrant; - for (const auto &s : evaluate::CollectSymbols(*e)) - depth = std::max(analyze(s) + 1, depth); - } - + // Analyze symbols appearing in object entity specification expression. This + // ensures these symbols will be instantiated before the current one. + // This is not done for object entities that are host associated because + // they must be instantiated from the value of the host symbols (the + // specification expressions should not be re-evaluated). 
if (const auto *details = sym.detailsIf()) { + // check CHARACTER's length + if (symTy->category() == semantics::DeclTypeSpec::Character) + if (auto e = symTy->characterTypeSpec().length().GetExplicit()) + for (const auto &s : evaluate::CollectSymbols(*e)) + depth = std::max(analyze(s) + 1, depth); + auto doExplicit = [&](const auto &bound) { if (bound.isExplicit()) { semantics::SomeExpr e{*bound.GetExplicit()}; @@ -1325,28 +1441,22 @@ struct SymbolDependenceDepth { } }; // handle any symbols in array bound declarations - if (!details->shape().empty()) - global = global || !reentrant; - for (const auto &subs : details->shape()) { + for (const semantics::ShapeSpec &subs : details->shape()) { doExplicit(subs.lbound()); doExplicit(subs.ubound()); } // handle any symbols in coarray bound declarations - if (!details->coshape().empty()) - global = global || !reentrant; - for (const auto &subs : details->coshape()) { + for (const semantics::ShapeSpec &subs : details->coshape()) { doExplicit(subs.lbound()); doExplicit(subs.ubound()); } // handle any symbols in initialization expressions - if (auto e = details->init()) { - // A PARAMETER may not be marked as implicitly SAVE, so set the flag. - global = true; + if (auto e = details->init()) for (const auto &s : evaluate::CollectSymbols(*e)) depth = std::max(analyze(s) + 1, depth); - } } adjustSize(depth + 1); + bool global = lower::symbolIsGlobal(sym); vars[depth].emplace_back(sym, global, depth); if (semantics::IsAllocatable(sym)) vars[depth].back().setHeapAlloc(); @@ -1357,7 +1467,7 @@ struct SymbolDependenceDepth { // If there are alias sets, then link the participating variables to their // aggregate stores when constructing the new variable on the list. 
- if (auto *store = findStoreIfAlias(sym)) { + if (lower::pft::Variable::AggregateStore *store = findStoreIfAlias(sym)) { vars[depth].back().setAlias(store->getOffset()); } return depth; @@ -1373,26 +1483,31 @@ struct SymbolDependenceDepth { Fortran::lower::pft::Variable::AggregateStore * findStoreIfAlias(const Fortran::evaluate::Symbol &sym) { - const auto &ultimate = sym.GetUltimate(); - const auto &scope = ultimate.owner(); + const semantics::Symbol &ultimate = sym.GetUltimate(); + const semantics::Scope &scope = ultimate.owner(); // Expect the total number of EQUIVALENCE sets to be small for a typical // Fortran program. if (aliasSyms.find(&ultimate) != aliasSyms.end()) { LLVM_DEBUG(llvm::dbgs() << "symbol: " << ultimate << '\n'); LLVM_DEBUG(llvm::dbgs() << "scope: " << scope << '\n'); - auto off = ultimate.offset(); - for (auto &v : stores) { - if (v.scope == &scope) { - auto bot = std::get<0>(v.interval); - if (off >= bot && off < bot + std::get<1>(v.interval)) + std::size_t off = ultimate.offset(); + std::size_t symSize = ultimate.size(); + for (lower::pft::Variable::AggregateStore &v : stores) { + if (&v.getOwningScope() == &scope) { + auto intervalOff = std::get<0>(v.interval); + auto intervalSize = std::get<1>(v.interval); + if (off >= intervalOff && off < intervalOff + intervalSize) + return &v; + // Zero sized symbol in zero sized equivalence. + if (off == intervalOff && symSize == 0) return &v; } } // clang-format off LLVM_DEBUG( llvm::dbgs() << "looking for " << off << "\n{\n"; - for (auto v : stores) { - llvm::dbgs() << " in scope: " << v.scope << "\n"; + for (lower::pft::Variable::AggregateStore &v : stores) { + llvm::dbgs() << " in scope: " << &v.getOwningScope() << "\n"; llvm::dbgs() << " i = [" << std::get<0>(v.interval) << ".." << std::get<0>(v.interval) + std::get<1>(v.interval) << "]\n"; @@ -1407,8 +1522,11 @@ struct SymbolDependenceDepth { private: /// Skip symbol in alias analysis. 
bool skipSymbol(const semantics::Symbol &sym) { + // Common block equivalences are largely managed by the front end. + // Compiler generated symbols ('.' names) cannot be equivalenced. + // FIXME: Equivalence code generation may need to be revisited. return !sym.has() || - lower::definedInCommonBlock(sym); + lower::definedInCommonBlock(sym) || sym.name()[0] == '.'; } // Make sure the table is of appropriate size. @@ -1420,23 +1538,26 @@ struct SymbolDependenceDepth { llvm::SmallSet seen; std::vector> &vars; llvm::SmallSet aliasSyms; - llvm::SmallSet scopeAnlyzedForAliases; + /// Set of Scope that have been analyzed for aliases. + llvm::SmallSet analyzedScopes; std::vector stores; - bool reentrant; }; } // namespace static void processSymbolTable( const semantics::Scope &scope, - std::vector> &varList, - bool reentrant) { - SymbolDependenceDepth sdd{varList, reentrant}; + std::vector> &varList) { + SymbolDependenceDepth sdd{varList}; sdd.analyzeAliasesInCurrentScope(scope); for (const auto &iter : scope) sdd.analyze(iter.second.get()); sdd.finalize(); } +//===----------------------------------------------------------------------===// +// FunctionLikeUnit implementation +//===----------------------------------------------------------------------===// + Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit( const parser::MainProgram &func, const lower::pft::PftNode &parent, const semantics::SemanticsContext &semanticsContext) @@ -1447,14 +1568,14 @@ Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit( std::get>>(func.t); if (programStmt.has_value()) { beginStmt = FunctionStatement(programStmt.value()); - const auto *symbol = getSymbol(*beginStmt); + const semantics::Symbol *symbol = getSymbol(*beginStmt); entryPointList[0].first = symbol; - processSymbolTable(*symbol->scope(), varList, isRecursive()); + processSymbolTable(*symbol->scope(), varList); } else { processSymbolTable( semanticsContext.FindScope( std::get>(func.t).source), - varList, isRecursive()); + 
varList); } } @@ -1464,9 +1585,9 @@ Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit( : ProgramUnit{func, parent}, beginStmt{getFunctionStmt(func)}, endStmt{getFunctionStmt(func)} { - const auto *symbol = getSymbol(*beginStmt); + const semantics::Symbol *symbol = getSymbol(*beginStmt); entryPointList[0].first = symbol; - processSymbolTable(*symbol->scope(), varList, isRecursive()); + processSymbolTable(*symbol->scope(), varList); } Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit( @@ -1475,9 +1596,9 @@ Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit( : ProgramUnit{func, parent}, beginStmt{getFunctionStmt(func)}, endStmt{getFunctionStmt(func)} { - const auto *symbol = getSymbol(*beginStmt); + const semantics::Symbol *symbol = getSymbol(*beginStmt); entryPointList[0].first = symbol; - processSymbolTable(*symbol->scope(), varList, isRecursive()); + processSymbolTable(*symbol->scope(), varList); } Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit( @@ -1486,17 +1607,43 @@ Fortran::lower::pft::FunctionLikeUnit::FunctionLikeUnit( : ProgramUnit{func, parent}, beginStmt{getFunctionStmt(func)}, endStmt{getFunctionStmt(func)} { - const auto *symbol = getSymbol(*beginStmt); + const semantics::Symbol *symbol = getSymbol(*beginStmt); entryPointList[0].first = symbol; - processSymbolTable(*symbol->scope(), varList, isRecursive()); + processSymbolTable(*symbol->scope(), varList); +} + +Fortran::lower::HostAssociations & +Fortran::lower::pft::FunctionLikeUnit::parentHostAssoc() { + if (auto *par = parent.getIf()) + return par->hostAssociations; + llvm::report_fatal_error("parent is not a function"); } +bool Fortran::lower::pft::FunctionLikeUnit::parentHasHostAssoc() { + if (auto *par = parent.getIf()) + return !par->hostAssociations.empty(); + return false; +} + +parser::CharBlock +Fortran::lower::pft::FunctionLikeUnit::getStartingSourceLoc() const { + if (beginStmt) + return stmtSourceLoc(*beginStmt); + if (!evaluationList.empty()) + return 
evaluationList.front().position; + return stmtSourceLoc(endStmt); +} + +//===----------------------------------------------------------------------===// +// ModuleLikeUnit implementation +//===----------------------------------------------------------------------===// + Fortran::lower::pft::ModuleLikeUnit::ModuleLikeUnit( const parser::Module &m, const lower::pft::PftNode &parent) : ProgramUnit{m, parent}, beginStmt{getModuleStmt(m)}, endStmt{getModuleStmt(m)} { - const auto *symbol = getSymbol(beginStmt); - processSymbolTable(*symbol->scope(), varList, /*reentrant=*/false); + const semantics::Symbol *symbol = getSymbol(beginStmt); + processSymbolTable(*symbol->scope(), varList); } Fortran::lower::pft::ModuleLikeUnit::ModuleLikeUnit( @@ -1504,9 +1651,25 @@ Fortran::lower::pft::ModuleLikeUnit::ModuleLikeUnit( : ProgramUnit{m, parent}, beginStmt{getModuleStmt( m)}, endStmt{getModuleStmt(m)} { - const auto *symbol = getSymbol(beginStmt); - processSymbolTable(*symbol->scope(), varList, /*reentrant=*/false); + const semantics::Symbol *symbol = getSymbol(beginStmt); + processSymbolTable(*symbol->scope(), varList); +} + +parser::CharBlock +Fortran::lower::pft::ModuleLikeUnit::getStartingSourceLoc() const { + return stmtSourceLoc(beginStmt); } +const Fortran::semantics::Scope & +Fortran::lower::pft::ModuleLikeUnit::getScope() const { + const Fortran::semantics::Symbol *symbol = getSymbol(beginStmt); + assert(symbol && symbol->scope() && + "Module statement must have a symbol with a scope"); + return *symbol->scope(); +} + +//===----------------------------------------------------------------------===// +// BlockDataUnit implementation +//===----------------------------------------------------------------------===// Fortran::lower::pft::BlockDataUnit::BlockDataUnit( const parser::BlockData &bd, const lower::pft::PftNode &parent, @@ -1562,14 +1725,11 @@ void Fortran::lower::pft::Variable::dump() const { } else if (auto *s = std::get_if(&var)) { llvm::errs() << "interval[" << 
std::get<0>(s->interval) << ", " << std::get<1>(s->interval) << "]:"; + llvm::errs() << " name: " << toStringRef(s->getNamingSymbol().name()); if (s->isGlobal()) llvm::errs() << ", global"; - if (s->vars.size()) { - llvm::errs() << ", vars: {"; - llvm::interleaveComma(s->vars, llvm::errs(), - [](auto *y) { llvm::errs() << *y; }); - llvm::errs() << '}'; - } + if (s->initialValueSymbol) + llvm::errs() << ", initial value: {" << *s->initialValueSymbol << "}"; } else { llvm_unreachable("not a Variable"); } @@ -1588,3 +1748,70 @@ void Fortran::lower::pft::ModuleLikeUnit::dump() const { void Fortran::lower::pft::BlockDataUnit::dump() const { llvm::errs() << "block data {\n" << symTab << "\n}\n"; } + +std::vector +Fortran::lower::pft::buildFuncResultDependencyList( + const Fortran::semantics::Symbol &symbol) { + std::vector> variableList; + SymbolDependenceDepth sdd(variableList); + sdd.analyzeAliasesInCurrentScope(symbol.owner()); + sdd.analyze(symbol); + sdd.finalize(); + // Remove the pft::variable for the result itself, only its dependencies + // should be returned in the list. + assert(!variableList[0].empty() && "must at least contain the result"); + assert(&variableList[0].back().getSymbol() == &symbol && + "result sym should be last"); + variableList[0].pop_back(); + return variableList[0]; +} + +namespace { +/// Helper class to find all the symbols referenced in a FunctionLikeUnit. +/// It defines a parse tree visitor doing a deep visit in all nodes with +/// symbols (including evaluate::Expr). 
+struct SymbolVisitor { + template + bool Pre(const A &x) { + if constexpr (Fortran::parser::HasTypedExpr::value) + if (const auto *expr = Fortran::semantics::GetExpr(x)) + visitExpr(*expr); + return true; + } + + bool Pre(const Fortran::parser::Name &name) { + if (const semantics::Symbol *symbol = name.symbol) + visitSymbol(*symbol); + return false; + } + + void visitExpr(const Fortran::lower::SomeExpr &expr) { + for (const semantics::Symbol &symbol : + Fortran::evaluate::CollectSymbols(expr)) + visitSymbol(symbol); + } + + void visitSymbol(const Fortran::semantics::Symbol &symbol) { + callBack(symbol); + // Visit statement function body since it will be inlined in lowering. + if (const auto *subprogramDetails = + symbol.detailsIf()) + if (const auto &maybeExpr = subprogramDetails->stmtFunction()) + visitExpr(*maybeExpr); + } + + template + constexpr void Post(const A &) {} + + const std::function &callBack; +}; +} // namespace + +void Fortran::lower::pft::visitAllSymbols( + const Fortran::lower::pft::FunctionLikeUnit &funit, + const std::function callBack) { + SymbolVisitor visitor{callBack}; + funit.visit([&](const auto &functionParserNode) { + parser::Walk(functionParserNode, visitor); + }); +} diff --git a/flang/test/Lower/pre-fir-tree01.f90 b/flang/test/Lower/pre-fir-tree01.f90 index ba26510d58f03..0af8eef28fc53 100644 --- a/flang/test/Lower/pre-fir-tree01.f90 +++ b/flang/test/Lower/pre-fir-tree01.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -fdebug-pre-fir-tree %s | FileCheck %s +! RUN: bbc -pft-test -o %t %s | FileCheck %s ! Test structure of the Pre-FIR tree @@ -132,14 +132,12 @@ function bar() ! Test top level directives !DIR$ INTEGER=64 ! CHECK: CompilerDirective: -! CHECK: End CompilerDirective ! Test nested directive ! CHECK: Subroutine test_directive subroutine test_directive() !DIR$ INTEGER=64 - ! CHECK: <> - ! CHECK: <> + ! CHECK: CompilerDirective: end subroutine ! 
CHECK: EndSubroutine diff --git a/flang/test/Lower/pre-fir-tree02.f90 b/flang/test/Lower/pre-fir-tree02.f90 index 5692505a9bdb7..7cc55df4c0bb8 100644 --- a/flang/test/Lower/pre-fir-tree02.f90 +++ b/flang/test/Lower/pre-fir-tree02.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -fdebug-pre-fir-tree %s | FileCheck %s +! RUN: bbc -pft-test -o %t %s | FileCheck %s ! Test Pre-FIR Tree captures all the intended nodes from the parse-tree ! Coarray and OpenMP related nodes are tested in other files. @@ -212,8 +212,7 @@ function bar(x) ! CHECK: Subroutine sub subroutine sub(a) real(4):: a - ! CompilerDirective - ! CHECK: <> + ! CompilerDirective: !DIR$ IGNORE_TKR a end subroutine @@ -254,7 +253,7 @@ subroutine iostmts(filename, a, b, c) read(10, *) length ! CHECK: RewindStmt rewind 10 - ! CHECK: NamelistStmt + ! CHECK-NOT: NamelistStmt namelist /nlist/ a, b, c ! CHECK: WriteStmt write(10, NML=nlist) diff --git a/flang/test/Lower/pre-fir-tree05.f90 b/flang/test/Lower/pre-fir-tree05.f90 index 0e4576cf7c14d..aeca3ab79ac9f 100644 --- a/flang/test/Lower/pre-fir-tree05.f90 +++ b/flang/test/Lower/pre-fir-tree05.f90 @@ -24,14 +24,15 @@ subroutine foo() ! CHECK-NEXT: EndDoStmt ! CHECK-NEXT: <> end do + ! CHECK-NEXT: ContinueStmt !$acc end parallel - ! CHECK-NEXT: <> + ! CHECK-NEXT: <> ! CHECK-NEXT: <> ! CHECK-NEXT: EndSubroutineStmt end subroutine ! CHECK-NEXT: End Subroutine foo -! CHECK: Subroutine foo +! CHECK: Subroutine foo2 subroutine foo2() ! CHECK-NEXT: <> !$acc parallel loop @@ -41,9 +42,9 @@ subroutine foo2() ! CHECK-NEXT: EndDoStmt ! CHECK-NEXT: <> end do + ! CHECK-NEXT: ContinueStmt !$acc end parallel loop ! CHECK-NEXT: <> ! CHECK-NEXT: EndSubroutineStmt end subroutine ! 
CHECK-NEXT: End Subroutine foo2 - From 0539a26d91a1b7c74022fa9cf33bd7faca87544d Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 10:13:14 -0800 Subject: [PATCH 520/748] [SLP] Schedule only sub-graph of vectorizable instructions SLP currently schedules all instructions within a scheduling window which stretches from the first instruction potentially vectorized to the last. This window can include a very large number of unrelated instructions which are not being considered for vectorization. This change switches the code to only schedule the sub-graph consisting of the instructions being vectorized and their transitive users. This has the effect of greatly reducing the amount of work performed in large basic blocks, and thus greatly improves compile time on degenerate examples. To understand the effects, I added some statistics (not planned for upstream contribution). Here's an illustration from my motivating example: Before this patch: 704357 SLP - Number of calcDeps actions 699021 SLP - Number of schedule calls 5598 SLP - Number of ReSchedule actions 59 SLP - Number of ReScheduleOnFail actions 10084 SLP - Number of schedule resets 8523 SLP - Number of vector instructions generated After this patch: 102895 SLP - Number of calcDeps actions 161916 SLP - Number of schedule calls 5637 SLP - Number of ReSchedule actions 55 SLP - Number of ReScheduleOnFail actions 10083 SLP - Number of schedule resets 8403 SLP - Number of vector instructions generated I do want to highlight that there is a small difference in number of generated vector instructions. This example is hitting the bailout due to maximum window size, and the change in scheduling is slightly perturbing when and how we hit it. This can be seen in the RescheduleOnFail counter change. Given that, I think we can safely ignore. The downside of this change can be seen in the large test diff. We group all vectorizable instructions together at the bottom of the scheduling region. 
This means that vector instructions can move quite far from their original point in code. While maybe undesirable, I don't see this as being a major problem as this pass is not intended to be a general scheduling pass. For context, it's worth noting that the pre-scheduling that SLP does while building the vector tree is exactly the sub-graph scheduling implemented by this patch. Differential Revision: https://reviews.llvm.org/D118538 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 25 +- .../SLPVectorizer/AArch64/64-bit-vector.ll | 4 +- .../SLPVectorizer/AArch64/commute.ll | 4 +- .../SLPVectorizer/AArch64/gather-reduce.ll | 8 +- .../SLPVectorizer/AArch64/horizontal.ll | 6 +- .../SLPVectorizer/AArch64/loadi8.ll | 2 +- .../SLPVectorizer/AArch64/matmul.ll | 68 +-- .../AArch64/memory-runtime-checks.ll | 12 +- .../SLPVectorizer/AArch64/sdiv-pow2.ll | 4 +- .../AArch64/slp-and-reduction.ll | 2 +- .../SLPVectorizer/AArch64/slp-or-reduction.ll | 2 +- .../AArch64/slp-xor-reduction.ll | 2 +- .../SLPVectorizer/AArch64/spillcost-di.ll | 10 +- .../SLPVectorizer/AArch64/spillcost-order.ll | 4 +- .../AArch64/transpose-inseltpoison.ll | 24 +- .../SLPVectorizer/AArch64/transpose.ll | 24 +- .../SLPVectorizer/AArch64/tsc-s352.ll | 4 +- .../Transforms/SLPVectorizer/AArch64/widen.ll | 22 +- .../SLPVectorizer/AMDGPU/packed-math.ll | 4 +- .../Transforms/SLPVectorizer/NVPTX/v2f16.ll | 2 +- .../SLPVectorizer/SystemZ/pr34619.ll | 2 +- .../Transforms/SLPVectorizer/X86/PR32086.ll | 18 +- .../Transforms/SLPVectorizer/X86/PR39774.ll | 6 +- .../Transforms/SLPVectorizer/X86/addsub.ll | 26 +- .../Transforms/SLPVectorizer/X86/align.ll | 6 +- .../Transforms/SLPVectorizer/X86/arith-abs.ll | 192 +++--- .../SLPVectorizer/X86/arith-add-ssat.ll | 216 +++---- .../SLPVectorizer/X86/arith-add-usat.ll | 132 ++-- .../Transforms/SLPVectorizer/X86/arith-add.ll | 272 ++++----- .../Transforms/SLPVectorizer/X86/arith-div.ll | 192 +++--- .../Transforms/SLPVectorizer/X86/arith-fix.ll | 568 +++++++++--------- 
.../Transforms/SLPVectorizer/X86/arith-mul.ll | 328 +++++----- .../SLPVectorizer/X86/arith-smax.ll | 244 ++++---- .../SLPVectorizer/X86/arith-smin.ll | 244 ++++---- .../SLPVectorizer/X86/arith-sub-ssat.ll | 216 +++---- .../SLPVectorizer/X86/arith-sub-usat.ll | 132 ++-- .../Transforms/SLPVectorizer/X86/arith-sub.ll | 272 ++++----- .../SLPVectorizer/X86/arith-umax.ll | 244 ++++---- .../SLPVectorizer/X86/arith-umin.ll | 244 ++++---- .../SLPVectorizer/X86/bitreverse.ll | 32 +- .../Transforms/SLPVectorizer/X86/broadcast.ll | 18 +- .../Transforms/SLPVectorizer/X86/bswap.ll | 16 +- .../X86/combined-stores-chains.ll | 24 +- .../SLPVectorizer/X86/consecutive-access.ll | 10 +- .../SLPVectorizer/X86/continue_vectorizing.ll | 4 +- .../X86/crash_exceed_scheduling.ll | 18 +- .../SLPVectorizer/X86/crash_mandeltext.ll | 2 +- .../SLPVectorizer/X86/crash_smallpt.ll | 14 +- llvm/test/Transforms/SLPVectorizer/X86/cse.ll | 28 +- .../test/Transforms/SLPVectorizer/X86/ctlz.ll | 48 +- .../Transforms/SLPVectorizer/X86/ctpop.ll | 32 +- .../test/Transforms/SLPVectorizer/X86/cttz.ll | 48 +- .../Transforms/SLPVectorizer/X86/diamond.ll | 6 +- .../SLPVectorizer/X86/diamond_broadcast.ll | 2 +- .../X86/diamond_broadcast_extra_shuffle.ll | 6 +- .../SLPVectorizer/X86/different-vec-widths.ll | 36 +- .../SLPVectorizer/X86/dot-product.ll | 40 +- .../SLPVectorizer/X86/extract_in_tree_user.ll | 12 +- .../test/Transforms/SLPVectorizer/X86/fabs.ll | 72 +-- .../Transforms/SLPVectorizer/X86/fcopysign.ll | 104 ++-- llvm/test/Transforms/SLPVectorizer/X86/fma.ll | 32 +- .../Transforms/SLPVectorizer/X86/fmaxnum.ll | 104 ++-- .../Transforms/SLPVectorizer/X86/fminnum.ll | 104 ++-- .../Transforms/SLPVectorizer/X86/fmuladd.ll | 128 ++-- .../SLPVectorizer/X86/fptosi-inseltpoison.ll | 32 +- .../Transforms/SLPVectorizer/X86/fptosi.ll | 32 +- .../Transforms/SLPVectorizer/X86/fptoui.ll | 32 +- .../Transforms/SLPVectorizer/X86/fround.ll | 440 +++++++------- .../Transforms/SLPVectorizer/X86/funclet.ll | 4 +- 
llvm/test/Transforms/SLPVectorizer/X86/gep.ll | 10 +- .../SLPVectorizer/X86/horizontal-list.ll | 8 +- .../SLPVectorizer/X86/horizontal-minmax.ll | 6 +- .../SLPVectorizer/X86/horizontal.ll | 4 +- .../SLPVectorizer/X86/insert-after-bundle.ll | 40 +- ...nsert-element-build-vector-inseltpoison.ll | 36 +- .../X86/insert-element-build-vector.ll | 36 +- .../SLPVectorizer/X86/insert-shuffle.ll | 10 +- .../SLPVectorizer/X86/insertvalue.ll | 16 +- .../SLPVectorizer/X86/inst_size_bug.ll | 2 +- .../X86/intrinsic_with_scalar_param.ll | 6 +- .../X86/jumbled-load-shuffle-placement.ll | 16 +- .../SLPVectorizer/X86/jumbled-load.ll | 20 +- .../SLPVectorizer/X86/jumbled_store_crash.ll | 48 +- .../X86/load-merge-inseltpoison.ll | 18 +- .../SLPVectorizer/X86/load-merge.ll | 18 +- .../Transforms/SLPVectorizer/X86/lookahead.ll | 24 +- .../Transforms/SLPVectorizer/X86/metadata.ll | 2 +- .../SLPVectorizer/X86/multi_block.ll | 8 +- .../SLPVectorizer/X86/phi_overalignedtype.ll | 6 +- .../Transforms/SLPVectorizer/X86/powof2div.ll | 8 +- .../Transforms/SLPVectorizer/X86/powof2mul.ll | 34 +- .../Transforms/SLPVectorizer/X86/pr35497.ll | 46 +- .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 438 +++++++------- .../Transforms/SLPVectorizer/X86/pr47629.ll | 438 +++++++------- .../SLPVectorizer/X86/remark_horcost.ll | 16 +- .../X86/reorder_diamond_match.ll | 54 +- .../Transforms/SLPVectorizer/X86/resched.ll | 18 +- .../Transforms/SLPVectorizer/X86/return.ll | 2 +- .../X86/reuse-extracts-in-wider-vect.ll | 4 +- .../SLPVectorizer/X86/schedule_budget.ll | 4 +- .../SLPVectorizer/X86/scheduling.ll | 16 +- .../SLPVectorizer/X86/shift-ashr.ll | 180 +++--- .../SLPVectorizer/X86/shift-lshr.ll | 208 +++---- .../Transforms/SLPVectorizer/X86/shift-shl.ll | 180 +++--- .../SLPVectorizer/X86/shrink_after_reorder.ll | 8 +- .../SLPVectorizer/X86/simple-loop.ll | 14 +- .../Transforms/SLPVectorizer/X86/simplebb.ll | 10 +- .../SLPVectorizer/X86/sitofp-inseltpoison.ll | 232 +++---- 
.../Transforms/SLPVectorizer/X86/sitofp.ll | 232 +++---- .../SLPVectorizer/X86/split-load8_2-unord.ll | 36 +- .../test/Transforms/SLPVectorizer/X86/sqrt.ll | 72 +-- .../SLPVectorizer/X86/store-jumbled.ll | 10 +- .../SLPVectorizer/X86/stores-non-ordered.ll | 16 +- .../SLPVectorizer/X86/stores_vectorize.ll | 16 +- .../Transforms/SLPVectorizer/X86/tiny-tree.ll | 10 +- .../Transforms/SLPVectorizer/X86/uitofp.ll | 260 ++++---- .../X86/vectorize-reorder-alt-shuffle.ll | 8 +- .../X86/vectorize-reordered-list.ll | 4 +- .../X86/vectorize-widest-phis.ll | 20 +- .../SLPVectorizer/int_sideeffect.ll | 4 +- 120 files changed, 4101 insertions(+), 4098 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4a731107f46de..99da014e0cc3d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2727,7 +2727,8 @@ class BoUpSLP { } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { - if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { + if (MemoryDepSD->hasValidDependencies() && + MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. 
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; @@ -2782,7 +2783,8 @@ class BoUpSLP { void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { - if (SD->isSchedulingEntity() && SD->isReady()) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies() && + SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() << "SLP: initially in ready list: " << *SD << "\n"); @@ -7872,6 +7874,11 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); + // A key point - if we got here, pre-scheduling was able to find a valid + // scheduling of the sub-graph of the scheduling window which consists + // of all vector bundles and their transitive users. As such, we do not + // need to reschedule anything *outside of* that subgraph. + BS->resetSchedule(); // For the real scheduling we use a more sophisticated ready-list: it is @@ -7884,21 +7891,19 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { }; std::set ReadyInsts; - // Ensure that all dependency data is updated and fill the ready-list with - // initial instructions. + // Ensure that all dependency data is updated (for nodes in the sub-graph) + // and fill the ready-list with initial instructions. 
int Idx = 0; - int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) { assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; - if (SD->isSchedulingEntity()) { + + if (SD->isSchedulingEntity() && SD->isPartOfBundle()) BS->calculateDependencies(SD, false, this); - NumToSchedule++; - } }); } BS->initialFillReadyList(ReadyInsts); @@ -7921,9 +7926,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } BS->schedule(picked, ReadyInsts); - NumToSchedule--; } - assert(NumToSchedule == 0 && "could not schedule all instructions"); // Check that we didn't break any of our invariants. #ifdef EXPENSIVE_CHECKS diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll index 10883987aa758..531fe4fb815a1 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll @@ -9,11 +9,11 @@ define void @f(float* %r, float* %w) { ; CHECK-LABEL: @f( ; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0 ; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1 +; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0 +; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[R0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], [[TMP2]] -; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0 -; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, 
float* [[W]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[W0]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll index 4bd0754bf9594..7d38eb60d1f03 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll @@ -12,9 +12,9 @@ define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] @@ -57,9 +57,9 @@ define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* 
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll index 536f72a73684e..ec7b03af83f8b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -36,6 +36,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -85,7 +86,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -111,6 +111,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, 
[[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -160,7 +161,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -297,6 +297,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -346,7 +347,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: 
[[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -372,6 +372,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -421,7 +422,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll index 4d0e0dfd69f14..4be767ce01e84 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -37,9 +37,9 @@ define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias noca ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] @@ -163,9 +163,9 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 2 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 2 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 3 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_017]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] @@ -274,10 +274,10 @@ define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noali ; CHECK-NEXT: 
[[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 6 ; CHECK-NEXT: [[ARRAYIDX63:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 6 ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 7 +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1_044]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2_045]] to <8 x i8>* ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll index f4b027086265f..c71da72317c9e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -18,6 +18,7 @@ define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonl ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> @@ -32,7 +33,6 @@ define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonl ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> [[TMP11]] ; CHECK-NEXT: 
[[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8> -; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[DST]] to <4 x i8>* ; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll index 97ae874e6c9a2..c68c7732bbeac 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll @@ -17,58 +17,58 @@ define void @wrap_mul4(double* nocapture %Out, [2 x double]* nocapture readonly ; CHECK-NEXT: [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8 ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1 +; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2 +; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2 +; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3 +; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3 +; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0 +; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8 +; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 +; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 +; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1 
; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2 -; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2 -; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]] -; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* -; CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]] -; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = 
getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0 -; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8 -; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 -; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]] -; CHECK-NEXT: [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[OUT]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP29]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[OUT]] to <2 x double>* ; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2 ; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 3 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[TMP30]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x 
double>* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP4]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[TMP16]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP9]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x double> [[TMP15]], [[TMP18]] +; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[TMP20]], align 8 ; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4 ; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 5 -; CHECK-NEXT: [[TMP31:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP25]], <2 x double>* [[TMP31]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[TEMP10]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP2]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x double> [[TMP24]], double [[TEMP11]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP7]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP23]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[TMP28]], align 8 ; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6 ; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr 
inbounds double, double* [[OUT]], i64 7 +; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[TMP14]], [[TMP22]] +; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[TMP17]], [[TMP25]] +; CHECK-NEXT: [[TMP31:%.*]] = fadd <2 x double> [[TMP29]], [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[TMP32]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP31]], <2 x double>* [[TMP32]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll index d6bb9bda4bba8..aa61c095ecc7a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -326,10 +326,10 @@ define void @no_version(i32* nocapture %dst, i32* nocapture readonly %src) { ; CHECK-LABEL: @no_version( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i32> [[TMP1]], -; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void @@ -902,12 +902,7 @@ define i32 @block_partly_vectorized_without_versioning(%struct.spam* readonly %a ; CHECK-NEXT: [[A_GEP_14:%.*]] = getelementptr i8, i8* [[A]], i64 14 ; CHECK-NEXT: [[B_GEP_14:%.*]] = getelementptr i8, i8* [[B]], i64 14 ; CHECK-NEXT: [[A_GEP_15:%.*]] = getelementptr 
i8, i8* [[A]], i64 15 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[B_GEP_15:%.*]] = getelementptr i8, i8* [[B]], i64 15 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[R_GEP_0:%.*]] = getelementptr i8, i8* [[ARG1]], i64 0 ; CHECK-NEXT: [[R_GEP_1:%.*]] = getelementptr i8, i8* [[ARG1]], i64 1 ; CHECK-NEXT: [[R_GEP_2:%.*]] = getelementptr i8, i8* [[ARG1]], i64 2 @@ -924,6 +919,11 @@ define i32 @block_partly_vectorized_without_versioning(%struct.spam* readonly %a ; CHECK-NEXT: [[R_GEP_13:%.*]] = getelementptr i8, i8* [[ARG1]], i64 13 ; CHECK-NEXT: [[R_GEP_14:%.*]] = getelementptr i8, i8* [[ARG1]], i64 14 ; CHECK-NEXT: [[R_GEP_15:%.*]] = getelementptr i8, i8* [[ARG1]], i64 15 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[R_GEP_0]] to <16 x i8>* ; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 1 ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 15 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll index dfd322013399f..a631a9732a203 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll @@ -13,14 +13,14 @@ define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], 
i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll index 2f7076dd25ca3..3f5d5147fd508 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll @@ -21,9 +21,9 @@ define i8 @reduce_and(%struct.buf* %a, %struct.buf* %b) { ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = 
load <8 x i8>, <8 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll index 53126ee407e98..f0cc2fc8d4022 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll @@ -21,9 +21,9 @@ define i8 @reduce_or(%struct.buf* %a, %struct.buf* %b) { ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll index 2c59e57cec56a..0136453298b79 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll @@ 
-21,9 +21,9 @@ define i8 @reduce_xor(%struct.buf* %a, %struct.buf* %b) { ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll index 39f2f885bc26b..05b1bf2a45b14 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll @@ -15,13 +15,13 @@ define void @patatino(i64 %n, i64 %i, %struct.S* %p) !dbg !7 { ; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]] ; CHECK-NEXT: [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG29:![0-9]+]] +; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], 
i32 0, !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG31:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg [[DBG26]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA29:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]] -; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG34:![0-9]+]] -; CHECK-NEXT: [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG35:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA32:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg [[DBG36:![0-9]+]] -; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA29]] +; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA32]] ; CHECK-NEXT: ret void, !dbg [[DBG37:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll index 8e0ca4b293841..96b3aa2509c4d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll @@ -15,15 +15,15 @@ define void @test(i64* %ptr, i64* noalias %res) { ; CHECK-NEXT: [[CALL_I_I:%.*]] = call i32* @get_ptr() ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 2 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 1 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3 +; CHECK-NEXT: [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] 
to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 2 -; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[GEP_1]] to <2 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[RES]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 8 ; CHECK-NEXT: [[C:%.*]] = call i1 @cond() diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll index b1da97e0f96ab..24c69fb667272 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -135,13 +135,13 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 +; 
CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] @@ -171,12 +171,12 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP6]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll 
b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll index c670673e95936..962f8ca42bf86 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -135,13 +135,13 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] @@ -171,12 +171,12 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP3:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP6]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll index 2b634ae718277..b5f4dad4e712d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll @@ -33,9 +33,9 @@ define i32 @s352() { ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x 
float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX6]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] @@ -48,9 +48,9 @@ define i32 @s352() { ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP9]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>* ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4 -; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>* ; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP11]], [[TMP13]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll index 3129ba79dafaa..4821ae9c64fd3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll @@ -25,14 +25,6 @@ define void @PR50256(i8* %a, i16* %b, i32 %n) { ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 14 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = 
getelementptr inbounds i8, i8* [[A]], i64 15 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A]] to <8 x i8>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARRAYIDX_8]] to <8 x i8>* -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16> -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw <8 x i16> [[TMP5]], -; CHECK-NEXT: [[TMP8:%.*]] = shl nuw <8 x i16> [[TMP6]], ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 3 @@ -48,10 +40,18 @@ define void @PR50256(i8* %a, i16* %b, i32 %n) { ; CHECK-NEXT: [[ARRAYIDX3_13:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 13 ; CHECK-NEXT: [[ARRAYIDX3_14:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 14 ; CHECK-NEXT: [[ARRAYIDX3_15:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 15 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[B]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP9]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A]] to <8 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i16> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[B]] to <8 x i16>* +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX_8]] to <8 x i8>* +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw <8 x i16> [[TMP8]], +; CHECK-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* [[TMP5]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3_8]] to <8 x 
i16>* -; CHECK-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP10]], align 2 +; CHECK-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2 ; CHECK-NEXT: ret void ; %arrayidx.1 = getelementptr inbounds i8, i8* %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll index 4cc004ed9862a..7ab2df33692ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -191,10 +191,10 @@ define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, ha ; GCN-LABEL: @test1_fabs_scalar_fma_v2f16( ; GCN-NEXT: [[I1:%.*]] = load half, half addrspace(3)* [[B:%.*]], align 2 ; GCN-NEXT: [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]]) -; GCN-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 ; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1 ; GCN-NEXT: [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 ; GCN-NEXT: [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* ; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2 ; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll index c45a900ccc555..09ffb51df081b 100644 --- a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll +++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll @@ -14,11 +14,11 @@ define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %a ; CHECK-NEXT: 
[[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half* ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>* ; CHECK-NEXT: store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll index d334197ad9c7b..4812df4cb5e87 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -14,13 +14,13 @@ define void @foo() local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1 ; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 +; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; 
CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], -; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll index 741dbcec392e8..12f5470dacd07 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -4,12 +4,12 @@ define void @i64_simplified(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplified( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void @@ -33,12 +33,12 @@ define void @i64_simplified(i64* noalias %st, i64* noalias %ld) { define void @i64_simplifiedi_reversed(i64* noalias %st, i64* 
noalias %ld) { ; CHECK-LABEL: @i64_simplifiedi_reversed( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void @@ -62,12 +62,12 @@ define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) { define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplifiedi_extract( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], 
<2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index 7668747a75ace..bc8abf3a92eec 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -54,9 +54,11 @@ define void @Test(i32) { ; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 ; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 +; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 +; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]]) ; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] ; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] @@ -88,9 +90,7 @@ define void @Test(i32) { ; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] ; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] ; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP2]] -; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]] -; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 ; FORCE_REDUCTION-NEXT: 
[[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0 ; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 ; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[VAL_41]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index ebbbefc9f81f2..1e2c9e8402376 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -302,21 +302,21 @@ define void @reorder_alt_subTree() #0 { define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) { ; CHECK-LABEL: @reorder_alt_rightsubTree( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[D]] to <2 x double>* ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>* -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = 
fadd <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[A]] to <2 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[B]] to <2 x double>* +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fsub <2 x double> [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8 ; CHECK-NEXT: ret void ; %1 = load double, double* %a diff --git a/llvm/test/Transforms/SLPVectorizer/X86/align.ll b/llvm/test/Transforms/SLPVectorizer/X86/align.ll index a4ddfa1989d9a..90091b5af8ada 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/align.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/align.ll @@ -12,9 +12,9 @@ define void @test1(double* %a, double* %b, double* %c) { ; CHECK-NEXT: [[STORE1:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 1 ; CHECK-NEXT: [[STORE2:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 2 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x 
double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] @@ -48,11 +48,11 @@ define void @test2(float * %a, float * %b) { ; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1 ; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 ; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 ; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 ; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll index fc78797f5f7d3..9794ca999acad 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll @@ -28,40 +28,40 @@ declare i8 @llvm.abs.i8 (i8, i1) define void @abs_v8i64() { ; SSE-LABEL: @abs_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load 
<2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false) -; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP5]], i1 false) +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* 
@c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP7]], i1 false) ; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false) -; SLM-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) +; SLM-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @c64 
to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) +; SLM-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP5]], i1 false) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP7]], i1 false) ; SLM-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP2]], i1 false) -; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), 
align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP3]], i1 false) ; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -101,40 +101,40 @@ define void @abs_v8i64() { define void @abs_v16i32() { ; SSE-LABEL: @abs_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 false) -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE-NEXT: store 
<4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP5]], i1 false) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP7]], i1 false) ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 
x i32> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 false) -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false) +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP5]], i1 false) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP7]], i1 false) ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 
x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false) -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP3]], i1 false) ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -198,40 +198,40 @@ define void @abs_v16i32() { define void @abs_v32i16() { ; SSE-LABEL: @abs_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 
x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP4]], i1 false) -; SSE-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP5]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = call <8 x i16> 
@llvm.abs.v8i16(<8 x i16> [[TMP7]], i1 false) ; SSE-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP4]], i1 false) -; SLM-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) +; SLM-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = call <8 x 
i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) +; SLM-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP5]], i1 false) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP7]], i1 false) ; SLM-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP2]], i1 false) -; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP1]], i1 false) +; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; 
AVX-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP3]], i1 false) ; AVX-NEXT: store <16 x i16> [[TMP4]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -343,40 +343,40 @@ define void @abs_v32i16() { define void @abs_v64i8() { ; SSE-LABEL: @abs_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP4]], i1 false) -; SSE-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], 
[64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP5]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP7]], i1 false) ; SSE-NEXT: store <16 x i8> [[TMP8]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) -; SLM-NEXT: 
[[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP4]], i1 false) -; SLM-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) +; SLM-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) +; SLM-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP5]], i1 false) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP7]], i1 false) ; SLM-NEXT: store <16 x i8> [[TMP8]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 
x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP2]], i1 false) -; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP1]], i1 false) +; AVX-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP3]], i1 false) ; AVX-NEXT: store <32 x i8> [[TMP4]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll index 66154522c327d..a2bbec2cd3a7f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -98,12 +98,12 @@ define void @add_v8i64() { ; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> 
@llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -190,50 +190,50 @@ define void @add_v8i64() { define void @add_v16i32() { ; SSE-LABEL: @add_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load 
<4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], 
<4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: 
[[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* 
@b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], 
[16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -314,50 +314,50 @@ define void @add_v16i32() { define void @add_v32i16() { ; SSE-LABEL: @add_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = 
load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 
8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* 
bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 
2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* 
@b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -502,50 +502,50 @@ define void @add_v32i16() { define void @add_v64i8() { ; SSE-LABEL: @add_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] 
= load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; 
SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, 
i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], 
[64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x 
i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll index fceb1cfc69017..203270d73c759 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -63,12 +63,12 @@ define void @add_v8i64() { ; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, 
i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,31 +117,31 @@ define void @add_v8i64() { define void @add_v16i32() { ; SSE-LABEL: @add_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, 
i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @add_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 
-; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -222,31 +222,31 @@ define void @add_v16i32() { define void @add_v32i16() { ; SSE-LABEL: @add_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 
x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @add_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; 
AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -391,31 +391,31 @@ define void @add_v32i16() { define void @add_v64i8() { ; SSE-LABEL: @add_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x 
i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr 
inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @add_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x 
i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll index 085e9adeaeef6..316c323f44f09 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -25,50 +25,50 @@ define void @add_v8i64() { ; SSE-LABEL: @add_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, 
i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x 
i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <2 x i64> 
[[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; 
AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,50 +117,50 @@ define void @add_v8i64() { define void @add_v16i32() { ; SSE-LABEL: @add_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast 
(i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* 
bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], 
[[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], 
[16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x 
i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -241,50 +241,50 @@ define void @add_v16i32() { define void @add_v32i16() { ; SSE-LABEL: @add_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 
x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x 
i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <8 x i16> 
[[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to 
<16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -429,50 +429,50 @@ define void @add_v32i16() { define void @add_v64i8() { ; SSE-LABEL: @add_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x 
i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr 
inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 
16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x 
i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]] +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = add <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll index a1386e30170e4..0cb304d7859a7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll @@ -25,40 +25,40 @@ define void @sdiv_v16i32_uniformconst() { ; SSE-LABEL: @sdiv_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 
x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sdiv <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sdiv_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = sdiv 
<4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = sdiv <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sdiv_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 
to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = sdiv <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = sdiv <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = sdiv <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = sdiv <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -122,40 +122,40 @@ define void @sdiv_v16i32_uniformconst() { define void @srem_v16i32_uniformconst() { ; SSE-LABEL: @srem_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: 
store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = srem <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = srem <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @srem_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 
x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = srem <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = srem <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* 
bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @srem_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = srem <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = srem <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = srem <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = srem <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -219,40 +219,40 @@ define void @srem_v16i32_uniformconst() { define void @udiv_v16i32_uniformconst() { ; SSE-LABEL: @udiv_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] 
= udiv <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = udiv <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = udiv <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @udiv_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: 
[[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = udiv <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = udiv <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; 
SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @udiv_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = udiv <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = udiv <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = udiv <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = udiv <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -316,40 +316,40 @@ define void @udiv_v16i32_uniformconst() { define void @urem_v16i32_uniformconst() { ; SSE-LABEL: @urem_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to 
<4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = urem <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = urem <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr 
inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @urem_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = urem <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = urem <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, 
i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @urem_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = urem <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = urem <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = urem <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = urem <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll index acd9979a6fbbd..e2c2c9a1ff32b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll @@ -28,69 +28,69 @@ 
declare i8 @llvm.smul.fix.i8 (i8 , i8 , i32) define void @smul_v8i64() { ; SSE-LABEL: @smul_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> 
[[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x 
i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smul_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x 
i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds 
([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @smul_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], 
i32 3) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @smul_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX2-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void ; @@ -103,12 +103,12 @@ define void @smul_v8i64() { ; ; AVX256BW-LABEL: @smul_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX256BW-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP6:%.*]] = call 
<4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void ; @@ -150,20 +150,20 @@ define void @smul_v8i64() { define void @smul_v16i32() { ; SSE-LABEL: @smul_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3) -; 
SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds 
([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -236,12 +236,12 @@ define void @smul_v16i32() { ; ; AVX-LABEL: @smul_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], i32 3) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast 
(i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]], i32 3) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -322,50 +322,50 @@ define void @smul_v16i32() { define void @smul_v32i16() { ; SSE-LABEL: @smul_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> 
[[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smul_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SLM-NEXT: 
[[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call 
<8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smul_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], i32 3) +; AVX-NEXT: 
store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]], i32 3) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -510,50 +510,50 @@ define void @smul_v32i16() { define void @smul_v64i8() { ; SSE-LABEL: @smul_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: 
[[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x 
i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smul_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* 
bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), 
align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smul_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]], i32 3) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), 
align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], i32 3) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]], i32 3) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -831,69 +831,69 @@ declare i8 @llvm.umul.fix.i8 (i8 , i8 , i32) define void @umul_v8i64() { ; SSE-LABEL: @umul_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* 
bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x 
i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umul_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 
8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @umul_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr 
inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> 
[[TMP5]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @umul_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 
x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX2-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void ; @@ -906,12 +906,12 @@ define void @umul_v8i64() { ; ; AVX256BW-LABEL: @umul_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 
x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX256BW-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void ; @@ -953,20 +953,20 @@ define void @umul_v8i64() { define void @umul_v16i32() { ; SSE-LABEL: @umul_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds 
([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 
3) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -1039,12 +1039,12 @@ define void @umul_v16i32() { ; ; AVX-LABEL: @umul_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: 
[[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], i32 3) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]], i32 3) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -1125,50 +1125,50 @@ define void @umul_v16i32() { define void @umul_v32i16() { ; SSE-LABEL: @umul_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast 
([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 
8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umul_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x 
i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], 
[32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umul_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; 
AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], i32 3) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]], i32 3) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -1313,50 +1313,50 @@ define void @umul_v32i16() { define void @umul_v64i8() { ; SSE-LABEL: @umul_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; 
SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umul_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 
x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> 
[[TMP2]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umul_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x 
i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]], i32 3) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], i32 3) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]], i32 3) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll index c38794dae9253..053fa937f917c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -95,31 +95,31 @@ define void @mul_v8i64() { ; ; 
AVX128-LABEL: @mul_v8i64( ; AVX128-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP9:%.*]] = mul <2 x i64> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX128-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX128-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; 
AVX128-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP9:%.*]] = mul <2 x i64> [[TMP7]], [[TMP8]] +; AVX128-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[TMP10]], [[TMP11]] ; AVX128-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v8i64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP2:%.*]] = load 
<4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP3:%.*]] = mul <4 x i64> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256-NEXT: ret void ; @@ -168,69 +168,69 @@ define void @mul_v8i64() { define void @mul_v16i32() { ; SSE-LABEL: @mul_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; 
SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; 
SSE-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast 
([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; 
SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX128-LABEL: @mul_v16i32( ; AVX128-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), 
align 4 -; AVX128-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; AVX128-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX128-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x 
i32>*), align 4 +; AVX128-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]] +; AVX128-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]] ; AVX128-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v16i32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <8 x i32> 
[[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX256-NEXT: ret void ; @@ -311,69 +311,69 @@ define void @mul_v16i32() { define void @mul_v32i16() { ; SSE-LABEL: @mul_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] -; 
SSE-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x 
i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <8 x 
i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 
x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX128-LABEL: @mul_v32i16( ; AVX128-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; AVX128-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 
x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; AVX128-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]] +; AVX128-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], 
[[TMP11]] ; AVX128-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v32i16( ; AVX256-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP3:%.*]] = mul <16 x i16> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX256-NEXT: ret void ; @@ -518,69 +518,69 @@ define void @mul_v32i16() { define void @mul_v64i8() { ; SSE-LABEL: @mul_v64i8( ; 
SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> 
[[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = 
load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* 
bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX128-LABEL: @mul_v64i8( ; AVX128-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to 
<16 x i8>*), align 1 -; AVX128-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; AVX128-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; AVX128-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* 
bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; AVX128-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] +; AVX128-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; AVX128-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v64i8( ; AVX256-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP5:%.*]] = mul <32 x i8> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <32 x i8> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), 
align 1 +; AVX256-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP3:%.*]] = mul <32 x i8> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP6:%.*]] = mul <32 x i8> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll index a773ab657caa1..d71a3cb239ef9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll @@ -63,31 +63,31 @@ define void @smax_v8i64() { ; ; SLM-LABEL: @smax_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], 
[8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 
x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @smax_v8i64() { define void @smax_v16i32() { ; SSE-LABEL: @smax_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast 
(i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x 
i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smax_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, 
i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], 
[16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP2]], <8 x i32> 
[[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @smax_v16i32() { define void @smax_v32i16() { ; SSE-LABEL: @smax_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: 
[[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x 
i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smax_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* 
getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], 
<8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: 
[[TMP6:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @smax_v32i16() { define void @smax_v64i8() { ; SSE-LABEL: @smax_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 
x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smax_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast 
(i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP2]], <32 x i8> 
[[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll index 1ead7570ca3c1..af2c560d12a98 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll @@ -63,31 +63,31 @@ define void @smin_v8i64() { ; ; SLM-LABEL: @smin_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast 
([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> 
@llvm.smin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x 
i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @smin_v8i64() { define void @smin_v16i32() { ; SSE-LABEL: @smin_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], 
<4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smin_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 
to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> 
@llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x 
i32> @llvm.smin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @smin_v16i32() { define void @smin_v32i16() { ; SSE-LABEL: @smin_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = 
load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> 
[[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smin_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = 
load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), 
align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 
16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @smin_v32i16() { define void @smin_v64i8() { ; SSE-LABEL: @smin_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 
x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> 
@llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smin_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: 
[[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; 
SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 
x i8> @llvm.smin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll index 88f18cba2b2ee..d25eccd882079 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -98,12 +98,12 @@ define void @sub_v8i64() { ; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x 
i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -190,50 +190,50 @@ define void @sub_v8i64() { define void @sub_v16i32() { ; SSE-LABEL: @sub_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 
x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), 
align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr 
inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = 
load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: 
[[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -314,50 +314,50 @@ define void @sub_v16i32() { define void @sub_v32i16() { ; SSE-LABEL: @sub_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x 
i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x 
i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 
0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* 
getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x 
i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -502,50 +502,50 @@ define void @sub_v32i16() { define void @sub_v64i8() { ; SSE-LABEL: @sub_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load 
<16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; 
SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, 
i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll index afe17947cc3af..79bba89de18a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -63,12 +63,12 @@ define void @sub_v8i64() { ; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] 
= load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,31 +117,31 @@ define void @sub_v8i64() { define void @sub_v16i32() { ; SSE-LABEL: @sub_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: 
[[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast 
([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sub_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x 
i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -222,31 +222,31 @@ define void @sub_v16i32() { define void @sub_v32i16() { ; SSE-LABEL: @sub_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = 
load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* 
@c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sub_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* 
bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -391,31 +391,31 @@ define void @sub_v32i16() { define void @sub_v64i8() { ; SSE-LABEL: @sub_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 
1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* 
@c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sub_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x 
i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll index 11537e3965af4..a98c1234d3611 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -25,50 +25,50 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: 
[[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load 
<2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* 
@a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) 
to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP2]], 
[[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,50 +117,50 @@ define void @sub_v8i64() { define void @sub_v16i32() { ; SSE-LABEL: @sub_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x 
i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 
+; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; 
SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 
4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]] 
; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -241,50 +241,50 @@ define void @sub_v16i32() { define void @sub_v32i16() { ; SSE-LABEL: @sub_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> 
[[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) 
to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to 
<8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load 
<16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -429,50 +429,50 @@ define void @sub_v32i16() { define void @sub_v64i8() { ; SSE-LABEL: @sub_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 
-; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = sub 
<16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] 
= load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = sub <16 x i8> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 
x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]] +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = sub <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = sub <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x 
i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = sub <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll index 5202e41fd770b..a481971db382d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll @@ -63,31 +63,31 @@ define void @umax_v8i64() { ; ; SLM-LABEL: @umax_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast 
(i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP1]], <4 x i64> 
[[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @umax_v8i64() { define void @umax_v16i32() { ; SSE-LABEL: @umax_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: 
[[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x 
i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umax_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, 
<4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> 
@llvm.umax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @umax_v16i32() { define void @umax_v32i16() { ; SSE-LABEL: @umax_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umax_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: 
[[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x 
i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x 
i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @umax_v32i16() { define void @umax_v64i8() { ; SSE-LABEL: @umax_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x 
i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, 
<16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umax_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: 
[[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: 
[[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> 
@llvm.umax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll index bfa6c1f590af2..bbed2195a4f37 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll @@ -63,31 +63,31 @@ define void @umin_v8i64() { ; ; SLM-LABEL: @umin_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x 
i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, 
<2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: 
[[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @umin_v8i64() { define void @umin_v16i32() { ; SSE-LABEL: @umin_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), 
align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, 
i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umin_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* 
bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = 
load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 
x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @umin_v16i32() { define void @umin_v32i16() { ; SSE-LABEL: @umin_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x 
i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umin_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; 
SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) 
to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; 
AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @umin_v32i16() { define void @umin_v64i8() { ; SSE-LABEL: @umin_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] 
= load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = 
load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umin_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 
1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; 
SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), 
align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll index 721a98bb46059..7aa4d6728472b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll @@ -40,10 +40,10 @@ define void @bitreverse_2i64() #0 { define void @bitreverse_4i64() #0 { ; SSE-LABEL: @bitreverse_4i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP2]]) -; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* 
@dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP3]]) ; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; @@ -99,10 +99,10 @@ define void @bitreverse_4i32() #0 { define void @bitreverse_8i32() #0 { ; SSE-LABEL: @bitreverse_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP2]]) -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP3]]) ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; @@ -182,10 +182,10 @@ define void @bitreverse_8i16() #0 { define void @bitreverse_16i16() #0 { ; SSE-LABEL: @bitreverse_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 
x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP2]]) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP3]]) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -313,10 +313,10 @@ define void @bitreverse_16i8() #0 { define void @bitreverse_32i8() #0 { ; SSE-LABEL: @bitreverse_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]]) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) 
to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP3]]) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll index 03717ad13d82f..6ec2cf3738117 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll @@ -17,15 +17,15 @@ define void @bcast_vals(i64 *%A, i64 *%B, i64 *%S) { ; CHECK-NEXT: [[B0:%.*]] = load i64, i64* [[B:%.*]], align 8 ; CHECK-NEXT: [[V1:%.*]] = sub i64 [[A0]], 1 ; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1 +; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2 +; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1 -; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2 -; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8 ; 
CHECK-NEXT: ret void @@ -66,11 +66,15 @@ define void @bcast_vals2(i16 *%A, i16 *%B, i16 *%C, i16 *%D, i16 *%E, i32 *%S) { ; CHECK-LABEL: @bcast_vals2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8 +; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1 +; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2 +; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8 ; CHECK-NEXT: [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8 -; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2 @@ -79,10 +83,6 @@ define void @bcast_vals2(i16 *%A, i16 *%B, i16 *%C, i16 *%D, i16 *%E, i32 *%S) { ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1 -; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2 -; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8 ; CHECK-NEXT: ret void diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll index bdb384f6fa9c5..92a9e431c6e1c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll @@ -103,10 +103,10 @@ define void @bswap_4i32() #0 { define void @bswap_8i32() #0 { ; SSE-LABEL: @bswap_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP2]]) -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]]) +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP3]]) ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; @@ -180,10 +180,10 @@ define void @bswap_8i16() #0 { define void @bswap_16i16() #0 { ; SSE-LABEL: @bswap_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> 
@llvm.bswap.v8i16(<8 x i16> [[TMP2]]) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]]) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP3]]) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll index 2fdef624d48f5..6685fe260150c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll @@ -23,21 +23,21 @@ define void @foo(i8* %v0, i8* readonly %v1) { ; CHECK-NEXT: [[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9 ; CHECK-NEXT: [[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 10 ; CHECK-NEXT: [[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 11 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[T142]] to <2 x i64>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[T212]] to <2 x i64>* ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[T222]] to <2 x i64>* ; CHECK-NEXT: 
[[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i64> [[TMP4]], -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP6]], -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[T212]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[T292]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i64> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>* +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[T14]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], +; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[T21]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret void ; %t0 = bitcast i8* %v0 to i32* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll index 8f57fe6866bd4..737d8d1082118 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll @@ -21,9 +21,9 @@ define void @foo_3double(i32 %u) #0 { ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1 ; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast 
double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] @@ -85,9 +85,9 @@ define void @foo_2double(i32 %u) #0 { ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1 ; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] @@ -139,9 +139,9 @@ define void @foo_4float(i32 %u) #0 { ; CHECK-NEXT: [[ADD37:%.*]] = add nsw i32 [[MUL]], 3 ; CHECK-NEXT: [[IDXPROM38:%.*]] = sext i32 [[ADD37]] to i64 ; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM38]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x 
float], [2000 x float]* @D, i32 0, i64 [[IDXPROM38]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] @@ -295,9 +295,9 @@ define void @foo_2double_non_power_of_2(i32 %u) #0 { ; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7 ; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD7]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] @@ -343,9 +343,9 @@ define void @foo_2double_non_power_of_2_zext(i32 %u) #0 { ; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7 ; CHECK-NEXT: [[IDXPROM12:%.*]] = zext i32 [[ADD7]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] 
= load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll index 75248e1d7fa5c..bb60df259f17d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll @@ -9,13 +9,13 @@ define void @test1(double* %a, double* %b, double* %c, double* %d) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll index de371d8895c7d..7b6e6ca3c61af 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -4,20 +4,12 @@ define void @exceed(double %0, double %1) { ; 
CHECK-LABEL: @exceed( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef @@ -27,13 +19,21 @@ define void @exceed(double %0, double %1) { ; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double 
[[TMP7]], undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 ; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll index e3ff057355537..e6878e140a2f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -93,12 +93,12 @@ for.end48: ; preds = %for.end44 define void @zot(%struct.hoge* %arg) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef ; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll index 9c8fbf8a2ed90..723c12d9b05b6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -36,13 +36,13 @@ define void @main() #0 { ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> , double [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable @@ -114,6 +114,7 @@ define void @_Z8radianceRK3RayiPt() #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: +; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds 
[[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double undef, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]] @@ -122,7 +123,6 @@ define void @_Z8radianceRK3RayiPt() #0 { ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]] -; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: br label [[RETURN:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll index a6d6dc2c1a5b4..de6a2c169338f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -16,21 +16,21 @@ define i32 @test(double* nocapture %G) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[G:%.*]], i64 5 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> 
[[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: ret i32 undef @@ -133,24 +133,24 @@ define i32 @test2(double* nocapture %G, i32 %k) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = 
insertelement <2 x double> [[TMP10]], double [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], ; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 ; CHECK-NEXT: br label [[TMP24:%.*]] ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP17:%.*]] = load double, double* [[TMP16]], align 8 ; CHECK-NEXT: [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[TMP20]], double [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], ; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP22]], <2 x double>* [[TMP23]], align 8 ; CHECK-NEXT: br label [[TMP24]] ; CHECK: 24: ; CHECK-NEXT: ret i32 undef @@ -267,10 +267,10 @@ define i32 @partial_mrg(double* nocapture %A, i32 %n) { ; CHECK: if.end: ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds double, double* [[A]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; 
CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4 ; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll index 490263a396986..210ebe84d4f22 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -168,10 +168,10 @@ define void @ctlz_8i32() #0 { ; ; SSE42-LABEL: @ctlz_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 false) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -245,10 +245,10 @@ define void @ctlz_8i16() #0 { define void @ctlz_16i16() #0 { ; SSE-LABEL: @ctlz_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 false) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 false) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -370,10 +370,10 @@ define void @ctlz_16i8() #0 { define void @ctlz_32i8() #0 { ; SSE-LABEL: @ctlz_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x 
i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 false) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; @@ -630,10 +630,10 @@ define void @ctlz_undef_8i32() #0 { ; ; SSE42-LABEL: @ctlz_undef_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 true) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 true) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -707,10 +707,10 @@ define void 
@ctlz_undef_8i16() #0 { define void @ctlz_undef_16i16() #0 { ; SSE-LABEL: @ctlz_undef_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 true) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 true) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -832,10 +832,10 @@ define void @ctlz_undef_16i8() #0 { define void @ctlz_undef_32i8() #0 { ; SSE-LABEL: @ctlz_undef_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x 
i8> [[TMP1]], i1 true) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 true) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll index 973512922ea03..d87fa110531b0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -63,10 +63,10 @@ define void @ctpop_2i64() #0 { define void @ctpop_4i64() #0 { ; SSE2-LABEL: @ctpop_4i64( ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 -; SSE2-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) -; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]]) -; SSE2-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) +; SSE2-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP3]]) ; SSE2-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, 
i64 0, i64 2) to <2 x i64>*), align 4 ; SSE2-NEXT: ret void ; @@ -182,10 +182,10 @@ define void @ctpop_4i32() #0 { define void @ctpop_8i32() #0 { ; SSE2-LABEL: @ctpop_8i32( ; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE2-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) -; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]]) -; SSE2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) +; SSE2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP3]]) ; SSE2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE2-NEXT: ret void ; @@ -313,10 +313,10 @@ define void @ctpop_8i16() #0 { define void @ctpop_16i16() #0 { ; SSE-LABEL: @ctpop_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP2]]) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x 
i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]]) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP3]]) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -438,10 +438,10 @@ define void @ctpop_16i8() #0 { define void @ctpop_32i8() #0 { ; SSE-LABEL: @ctpop_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP2]]) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]]) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP3]]) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll index 7cbc4b1b88530..bd584fc79c63c 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -168,10 +168,10 @@ define void @cttz_8i32() #0 { ; ; SSE42-LABEL: @cttz_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP3]], i1 false) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -245,10 +245,10 @@ define void @cttz_8i16() #0 { define void @cttz_16i16() #0 { ; SSE-LABEL: @cttz_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 false) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x 
i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP3]], i1 false) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -370,10 +370,10 @@ define void @cttz_16i8() #0 { define void @cttz_32i8() #0 { ; SSE-LABEL: @cttz_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 false) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP3]], i1 false) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; @@ -630,10 +630,10 @@ define void @cttz_undef_8i32() #0 { ; ; 
SSE42-LABEL: @cttz_undef_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP3]], i1 true) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -707,10 +707,10 @@ define void @cttz_undef_8i16() #0 { define void @cttz_undef_16i16() #0 { ; SSE-LABEL: @cttz_undef_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 true) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true) +; SSE-NEXT: store 
<8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP3]], i1 true) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -832,10 +832,10 @@ define void @cttz_undef_16i8() #0 { define void @cttz_undef_32i8() #0 { ; SSE-LABEL: @cttz_undef_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 true) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP3]], i1 true) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll index 554170236184b..545aaf3cb411f 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -21,12 +21,12 @@ define i32 @foo(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i3 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret i32 0 @@ -72,12 +72,12 @@ define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: 
[[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 @@ -116,12 +116,12 @@ define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll index 830b882dac096..63ba0bc6af7f7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll @@ -7,10 +7,10 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], 
i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll index 01b75f4f9806e..c4d3bbe297c51 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll @@ -7,10 +7,10 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 @@ -37,10 +37,10 @@ define i32 @diamond_broadcast2(i32* noalias nocapture %B, i32* noalias nocapture ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* 
[[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 @@ -67,10 +67,10 @@ define i32 @diamond_broadcast3(i32* noalias nocapture %B, i32* noalias nocapture ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll b/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll index c27fad02c077b..849a8365094b5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll @@ -26,19 
+26,19 @@ define void @PR28457(double* noalias nocapture align 32 %q, double* noalias noca ; SSE-NEXT: [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5 ; SSE-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>* ; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 -; SSE-NEXT: [[TMP3:%.*]] = bitcast double* [[P2]] to <2 x double>* -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; SSE-NEXT: [[TMP5:%.*]] = bitcast double* [[P4]] to <2 x double>* +; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = bitcast double* [[Q0]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 +; SSE-NEXT: [[TMP5:%.*]] = bitcast double* [[P2]] to <2 x double>* ; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 -; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP2]], -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], -; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP6]], -; SSE-NEXT: [[TMP10:%.*]] = bitcast double* [[Q0]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP10]], align 8 -; SSE-NEXT: [[TMP11:%.*]] = bitcast double* [[Q2]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP11]], align 8 +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], +; SSE-NEXT: [[TMP8:%.*]] = bitcast double* [[Q2]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; SSE-NEXT: [[TMP9:%.*]] = bitcast double* [[P4]] to <2 x double>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8 +; SSE-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], ; SSE-NEXT: [[TMP12:%.*]] = bitcast double* [[Q4]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP12]], align 8 +; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @PR28457( @@ 
-56,14 +56,14 @@ define void @PR28457(double* noalias nocapture align 32 %q, double* noalias noca ; AVX-NEXT: [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5 ; AVX-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <4 x double>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8 -; AVX-NEXT: [[TMP3:%.*]] = bitcast double* [[P4]] to <2 x double>* -; AVX-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; AVX-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP2]], -; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], -; AVX-NEXT: [[TMP7:%.*]] = bitcast double* [[Q0]] to <4 x double>* -; AVX-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP7]], align 8 +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = bitcast double* [[Q0]] to <4 x double>* +; AVX-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 +; AVX-NEXT: [[TMP5:%.*]] = bitcast double* [[P4]] to <2 x double>* +; AVX-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 +; AVX-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], ; AVX-NEXT: [[TMP8:%.*]] = bitcast double* [[Q4]] to <2 x double>* -; AVX-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP8]], align 8 +; AVX-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; AVX-NEXT: ret void ; %p0 = getelementptr inbounds double, double* %p, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index a77c322218b9b..22f716e4a36ea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -20,14 +20,14 @@ define double @dot4f64(double* dereferenceable(32) %ptrx, double* dereferenceabl ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, 
<2 x double>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[PTRX2]] to <2 x double>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[PTRY2]] to <2 x double>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PTRX2]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[PTRY2]] to <2 x double>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 ; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP13]] @@ -71,14 +71,14 @@ define float @dot4f32(float* dereferenceable(16) %ptrx, float* dereferenceable(1 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[PTRX2]] to <2 x float>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[PTRY2]] to <2 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = 
load <2 x float>, <2 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PTRX2]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[PTRY2]] to <2 x float>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 ; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP13]] @@ -202,11 +202,11 @@ define double @dot3f64(double* dereferenceable(32) %ptrx, double* dereferenceabl ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 ; 
CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP6]] @@ -240,11 +240,11 @@ define float @dot3f32(float* dereferenceable(16) %ptrx, float* dereferenceable(1 ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load float, float* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, float* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP6]] @@ -278,11 +278,11 @@ define double @dot3f64_fast(double* dereferenceable(32) %ptrx, double* dereferen ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP6]] @@ -316,11 +316,11 @@ define float @dot3f32_fast(float* dereferenceable(16) 
%ptrx, float* dereferencea ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load float, float* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, float* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll index 38b2b97a23cd0..cc736eb807394 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -10,11 +10,11 @@ define i32 @fn1() { ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> poison, i64* [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 
8 @@ -41,18 +41,18 @@ define void @fn2(i32* %a, i32* %b, float* %c) { ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: ret void @@ -103,11 +103,11 @@ define void @externally_used_ptrs() { ; CHECK-LABEL: @externally_used_ptrs( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; 
CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> poison, i64* [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll index 19898df286711..8c2e864a6cb21 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll @@ -39,10 +39,10 @@ define void @fabs_2f64() #0 { define void @fabs_4f64() #0 { ; SSE-LABEL: @fabs_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x 
double> [[TMP3]]) ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -70,25 +70,25 @@ define void @fabs_4f64() #0 { define void @fabs_8f64() #0 { ; SSE-LABEL: @fabs_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: 
[[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP7]]) ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fabs_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP2]]) -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <4 x double> 
@llvm.fabs.v4f64(<4 x double> [[TMP1]]) +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP3]]) ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -150,10 +150,10 @@ define void @fabs_4f32() #0 { define void @fabs_8f32() #0 { ; SSE-LABEL: @fabs_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]]) ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -193,25 +193,25 @@ define void @fabs_8f32() #0 { 
define void @fabs_16f32() #0 { ; SSE-LABEL: @fabs_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast 
(float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]]) ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fabs_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP2]]) -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]]) +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: 
[[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP3]]) ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll index c8948e85a4608..dc2f382b747ba 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll @@ -46,12 +46,12 @@ define void @fcopysign_2f64() #0 { define void @fcopysign_4f64() #0 { ; SSE-LABEL: @fcopysign_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x 
double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) ; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -84,31 +84,31 @@ define void @fcopysign_4f64() #0 { define void @fcopysign_8f64() #0 { ; SSE-LABEL: @fcopysign_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x 
double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]]) -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = 
load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]]) +; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]]) ; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fcopysign_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> 
@llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]]) -; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]]) +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]]) ; AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -184,12 +184,12 @@ define void @fcopysign_4f32() #0 { define void @fcopysign_8f32() #0 { ; SSE-LABEL: @fcopysign_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* 
@srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) ; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -238,31 +238,31 @@ define void @fcopysign_8f32() #0 { define void @fcopysign_16f32() #0 { ; SSE-LABEL: @fcopysign_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x 
float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]]) -; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x 
float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]]) +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]]) ; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fcopysign_16f32( ; AVX256-NEXT: 
[[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]]) -; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) +; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]]) ; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fma.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll index 5c9f93d278533..9fa8c55c9fd9c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fma.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll @@ -160,14 +160,14 @@ define void @fma_8f64() #0 { ; ; FMA256-LABEL: @fma_8f64( ; FMA256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]]) -; FMA256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]]) -; FMA256-NEXT: store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]]) +; FMA256-NEXT: store <4 x double> [[TMP4]], <4 x 
double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP7:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[TMP7]]) ; FMA256-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; FMA256-NEXT: ret void ; @@ -458,14 +458,14 @@ define void @fma_16f32() #0 { ; ; FMA256-LABEL: @fma_16f32( ; FMA256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x 
float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]]) -; FMA256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]]) -; FMA256-NEXT: store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]]) +; FMA256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP5]], <8 x float> [[TMP6]], <8 x float> [[TMP7]]) ; FMA256-NEXT: store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; FMA256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll index 8136f2cb2dfec..187f1a467ad51 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll @@ -44,12 +44,12 @@ define void @fmaxnum_2f64() #0 { define void @fmaxnum_4f64() 
#0 { ; SSE-LABEL: @fmaxnum_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) ; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -82,31 +82,31 @@ 
define void @fmaxnum_4f64() #0 { define void @fmaxnum_8f64() #0 { ; SSE-LABEL: @fmaxnum_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]]) -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x 
double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]]) +; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x 
double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]]) ; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmaxnum_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]]) -; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]]) +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] 
= load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]]) ; AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -182,12 +182,12 @@ define void @fmaxnum_4f32() #0 { define void @fmaxnum_8f32() #0 { ; SSE-LABEL: @fmaxnum_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x 
float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) ; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -236,31 +236,31 @@ define void @fmaxnum_8f32() #0 { define void @fmaxnum_16f32() #0 { ; SSE-LABEL: @fmaxnum_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds 
([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]]) -; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast 
(float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]]) +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]]) ; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmaxnum_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call 
<8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]]) -; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) +; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]]) ; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll index 470dc8290eee8..2435832f87a39 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll @@ -44,12 +44,12 @@ define void @fminnum_2f64() #0 { define void @fminnum_4f64() #0 { ; SSE-LABEL: @fminnum_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 
-; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) ; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -82,31 +82,31 @@ define void @fminnum_4f64() #0 { define void @fminnum_8f64() #0 { ; SSE-LABEL: @fminnum_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* 
bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]]) -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: 
[[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]]) +; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]]) ; SSE-NEXT: store <2 x 
double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fminnum_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]]) -; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]]) +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]]) ; 
AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -182,12 +182,12 @@ define void @fminnum_4f32() #0 { define void @fminnum_8f32() #0 { ; SSE-LABEL: @fminnum_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 
x float> [[TMP5]]) ; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -236,31 +236,31 @@ define void @fminnum_8f32() #0 { define void @fminnum_16f32() #0 { ; SSE-LABEL: @fminnum_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> 
@llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]]) -; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]]) +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* 
bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]]) ; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fminnum_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]]) -; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) +; AVX256-NEXT: store <8 x 
float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]]) ; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll index f7b8087134f38..e192be3955fe4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll @@ -49,14 +49,14 @@ define void @fmuladd_2f64() #0 { define void @fmuladd_4f64() #0 { ; SSE-LABEL: @fmuladd_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, 
i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x double> [[TMP6]]) -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x double> [[TMP7]]) ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -94,37 +94,37 @@ define void @fmuladd_4f64() #0 { define void @fmuladd_8f64() #0 { ; SSE-LABEL: @fmuladd_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), 
align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 ; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4 -; 
SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x double> [[TMP7]]) +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 ; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP12:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]], <2 x double> [[TMP9]]) -; SSE-NEXT: [[TMP14:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x double> [[TMP10]]) -; SSE-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]], <2 x double> [[TMP11]]) -; SSE-NEXT: [[TMP16:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]], <2 x double> [[TMP12]]) -; SSE-NEXT: store <2 x double> [[TMP13]], <2 x double>* 
bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP15]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]]) +; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP16:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]]) ; SSE-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmuladd_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* 
bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]]) -; AVX256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]]) -; AVX256-NEXT: store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]]) +; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP7:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 +; 
AVX256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[TMP7]]) ; AVX256-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -214,14 +214,14 @@ define void @fmuladd_4f32() #0 { define void @fmuladd_8f32() #0 { ; SSE-LABEL: @fmuladd_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x float> [[TMP6]]) -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 
x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]]) ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -279,37 +279,37 @@ define void @fmuladd_8f32() #0 { define void @fmuladd_16f32() #0 { ; SSE-LABEL: @fmuladd_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x 
float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]]) +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) 
to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 ; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP12:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]], <4 x float> [[TMP9]]) -; SSE-NEXT: [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]]) -; SSE-NEXT: [[TMP15:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]]) -; SSE-NEXT: [[TMP16:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]], <4 x float> [[TMP12]]) -; SSE-NEXT: store <4 x float> [[TMP13]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP15]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]) +; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) 
to <4 x float>*), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP16:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]]) ; SSE-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmuladd_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]]) -; AVX256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]]) -; AVX256-NEXT: store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x 
float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]]) +; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP5]], <8 x float> [[TMP6]], <8 x float> [[TMP7]]) ; AVX256-NEXT: store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll index ed071f90ad76f..aa4cd49e4649e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll @@ -82,10 +82,10 @@ define void @fptosi_8f64_8i64() #0 { ; ; AVX256DQ-LABEL: @fptosi_8f64_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* 
getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -119,10 +119,10 @@ define void @fptosi_8f64_8i64() #0 { define void @fptosi_8f64_8i32() #0 { ; SSE-LABEL: @fptosi_8f64_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 
8 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -315,10 +315,10 @@ define void @fptosi_8f32_8i64() #0 { ; ; AVX256DQ-LABEL: @fptosi_8f32_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -352,10 +352,10 @@ define void @fptosi_8f32_8i64() #0 { define void @fptosi_8f32_8i32() #0 { ; SSE-LABEL: @fptosi_8f32_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] 
to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll index 8d46e40fc5a9e..3490fa909486e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -82,10 +82,10 @@ define void @fptosi_8f64_8i64() #0 { ; ; AVX256DQ-LABEL: @fptosi_8f64_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x 
double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -119,10 +119,10 @@ define void @fptosi_8f64_8i64() #0 { define void @fptosi_8f64_8i32() #0 { ; SSE-LABEL: @fptosi_8f64_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -315,10 +315,10 @@ define void @fptosi_8f32_8i64() #0 { ; ; AVX256DQ-LABEL: @fptosi_8f32_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 
-; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -352,10 +352,10 @@ define void @fptosi_8f32_8i64() #0 { define void @fptosi_8f32_8i32() #0 { ; SSE-LABEL: @fptosi_8f32_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* 
bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll index 6842b8a0ef1b4..b3698f260f360 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -82,10 +82,10 @@ define void @fptoui_8f64_8i64() #0 { ; ; AVX256DQ-LABEL: @fptoui_8f64_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -119,10 +119,10 @@ define void @fptoui_8f64_8i64() #0 { define void @fptoui_8f64_8i32() #0 { ; SSE-LABEL: @fptoui_8f64_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast 
(double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -315,10 +315,10 @@ define void @fptoui_8f32_8i64() #0 { ; ; AVX256DQ-LABEL: @fptoui_8f32_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256DQ-NEXT: [[TMP4:%.*]] = 
fptoui <4 x float> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -352,10 +352,10 @@ define void @fptoui_8f32_8i64() #0 { define void @fptoui_8f32_8i32() #0 { ; SSE-LABEL: @fptoui_8f32_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fround.ll b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll index 21a36e6227bbe..8089f70d342a8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/fround.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll @@ -74,10 +74,10 @@ define void @ceil_4f64() #0 { ; ; SSE41-LABEL: @ceil_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x 
double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -132,34 +132,34 @@ define void @ceil_8f64() #0 { ; ; SSE41-LABEL: @ceil_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> 
[[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x 
double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @ceil_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ceil_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]]) -; 
AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -245,10 +245,10 @@ define void @floor_4f64() #0 { ; ; SSE41-LABEL: @floor_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast 
(double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -303,34 +303,34 @@ define void @floor_8f64() #0 { ; ; SSE41-LABEL: @floor_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* 
getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @floor_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: 
store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @floor_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -416,10 +416,10 @@ define void @nearbyint_4f64() #0 { ; ; SSE41-LABEL: @nearbyint_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x 
double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -474,34 +474,34 @@ define void @nearbyint_8f64() #0 { ; ; SSE41-LABEL: @nearbyint_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: 
[[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* 
bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @nearbyint_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @nearbyint_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* 
@src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -587,10 +587,10 @@ define void @rint_4f64() #0 { ; ; SSE41-LABEL: @rint_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr 
inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -645,34 +645,34 @@ define void @rint_8f64() #0 { ; ; SSE41-LABEL: @rint_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> 
@llvm.rint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @rint_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x 
double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @rint_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x 
double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -758,10 +758,10 @@ define void @trunc_4f64() #0 { ; ; SSE41-LABEL: @trunc_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -816,34 +816,34 @@ define void @trunc_8f64() #0 { ; ; SSE41-LABEL: @trunc_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; 
SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* 
getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @trunc_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @trunc_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -953,10 +953,10 @@ define void @ceil_8f32() #0 { ; ; SSE41-LABEL: @ceil_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; 
SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1047,34 +1047,34 @@ define void @ceil_16f32() #0 { ; ; SSE41-LABEL: @ceil_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; 
SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @ceil_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x 
float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ceil_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), 
align 4 ; AVX2-NEXT: ret void ; @@ -1208,10 +1208,10 @@ define void @floor_8f32() #0 { ; ; SSE41-LABEL: @floor_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1302,34 +1302,34 @@ define void @floor_16f32() #0 { ; ; SSE41-LABEL: @floor_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* 
getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: 
[[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @floor_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @floor_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to 
<8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; @@ -1463,10 +1463,10 @@ define void @nearbyint_8f32() #0 { ; ; SSE41-LABEL: @nearbyint_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 
x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1557,34 +1557,34 @@ define void @nearbyint_16f32() #0 { ; ; SSE41-LABEL: @nearbyint_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> 
[[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @nearbyint_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x 
float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @nearbyint_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; @@ -1718,10 +1718,10 
@@ define void @rint_8f32() #0 { ; ; SSE41-LABEL: @rint_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1812,34 +1812,34 @@ define void @rint_16f32() #0 { ; ; SSE41-LABEL: @rint_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 
12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds 
([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @rint_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @rint_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x 
float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; @@ -1973,10 +1973,10 @@ define void @trunc_8f32() #0 { ; ; SSE41-LABEL: @trunc_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) ; 
SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -2067,34 +2067,34 @@ define void @trunc_16f32() #0 { ; ; SSE41-LABEL: @trunc_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* 
bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @trunc_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> 
[[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @trunc_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll index ae24e92d9e515..c1a4b88b015bf 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll @@ -13,14 +13,14 @@ define void @test1(double* %a, double* %b, double* %c) #0 personality i32 (...)* ; CHECK: catch: ; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] [i8* null, i32 64, i8* null] ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B]] to <2 x double>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP6]]) [ "funclet"(token [[TMP1]]) ] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: catchret from [[TMP1]] to label [[TRY_CONT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll index f04fff807379d..d9e1a25c76e47 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll @@ -14,12 +14,12 @@ define void @foo1 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* 
[[X:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y]], i64 0, i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>* -; CHECK-NEXT: store <2 x i32*> [[TMP6]], <2 x i32*>* [[TMP8]], align 8 +; CHECK-NEXT: store <2 x i32*> [[TMP7]], <2 x i32*>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %y, i64 0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 58e40177f54e2..2bfd90c09fe0d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -101,11 +101,11 @@ define float @bazz() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 +; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 
x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 -; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] @@ -117,11 +117,11 @@ define float @bazz() { ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 +; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 -; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index eb317de222f2a..49b55933d989e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -110,20 +110,20 @@ define i32 @maxi8_store_in(i32) { ; SSE-NEXT: ret i32 [[TMP23]] ; ; AVX-LABEL: @maxi8_store_in( -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX-NEXT: 
store i32 0, i32* @var, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; AVX-NEXT: ret i32 [[TMP3]] ; ; AVX2-LABEL: @maxi8_store_in( -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX2-NEXT: store i32 0, i32* @var, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; AVX2-NEXT: ret i32 [[TMP3]] ; ; THRESH-LABEL: @maxi8_store_in( -; THRESH-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; THRESH-NEXT: store i32 0, i32* @var, align 8 +; THRESH-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; THRESH-NEXT: ret i32 [[TMP3]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll index fd1544634d9dd..8f19a93258611 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -782,10 +782,10 @@ define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i ; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]] ; STORE-NEXT: [[ADD1135:%.*]] = or i64 [[MUL]], 2 ; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]] -; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>* -; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; STORE-NEXT: [[ADD1736:%.*]] = or i64 [[MUL]], 3 ; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]] 
+; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>* +; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll index 7be473214244d..b0fbc6f21433a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll @@ -51,15 +51,16 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2 ; SSE-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2 ; SSE-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3 +; SSE-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3 +; SSE-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3 +; SSE-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3 +; SSE-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3 ; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[C_ADDR_0352]] to <4 x i8>* ; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; SSE-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3 ; SSE-NEXT: [[TMP6:%.*]] = bitcast i8* [[D_ADDR_0353]] to <4 x i8>* ; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 -; SSE-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3 ; SSE-NEXT: [[TMP8:%.*]] = bitcast i8* [[A_ADDR_0355]] to <4 x i8>* ; SSE-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; 
SSE-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3 ; SSE-NEXT: [[TMP10:%.*]] = bitcast i8* [[B_ADDR_0351]] to <4 x i8>* ; SSE-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 ; SSE-NEXT: [[TMP12:%.*]] = icmp ult <4 x i8> [[TMP5]], [[TMP7]] @@ -67,7 +68,6 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32> ; SSE-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[SHUFFLE]] ; SSE-NEXT: [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3 ; SSE-NEXT: [[TMP17:%.*]] = bitcast i8* [[E_ADDR_0354]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP16]], <4 x i8>* [[TMP17]], align 1 ; SSE-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4 @@ -86,15 +86,16 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6 ; SSE-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6 ; SSE-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7 +; SSE-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7 +; SSE-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7 +; SSE-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7 +; SSE-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7 ; SSE-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX45]] to <4 x i8>* ; SSE-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 -; SSE-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7 ; SSE-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX47]] to <4 x i8>* ; SSE-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; SSE-NEXT: 
[[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7 ; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX49]] to <4 x i8>* ; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1 -; SSE-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7 ; SSE-NEXT: [[TMP24:%.*]] = bitcast i8* [[ARRAYIDX52]] to <4 x i8>* ; SSE-NEXT: [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1 ; SSE-NEXT: [[TMP26:%.*]] = icmp ult <4 x i8> [[TMP19]], [[TMP21]] @@ -102,7 +103,6 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> ; SSE-NEXT: [[TMP29:%.*]] = mul <4 x i32> [[TMP28]], [[SHUFFLE1]] ; SSE-NEXT: [[TMP30:%.*]] = trunc <4 x i32> [[TMP29]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7 ; SSE-NEXT: [[TMP31:%.*]] = bitcast i8* [[ARRAYIDX56]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP30]], <4 x i8>* [[TMP31]], align 1 ; SSE-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8 @@ -121,15 +121,16 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10 ; SSE-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10 ; SSE-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11 +; SSE-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11 +; SSE-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11 +; SSE-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11 +; SSE-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11 ; SSE-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX93]] to <4 x i8>* ; SSE-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1 -; 
SSE-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11 ; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX95]] to <4 x i8>* ; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1 -; SSE-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11 ; SSE-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX97]] to <4 x i8>* ; SSE-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1 -; SSE-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11 ; SSE-NEXT: [[TMP38:%.*]] = bitcast i8* [[ARRAYIDX100]] to <4 x i8>* ; SSE-NEXT: [[TMP39:%.*]] = load <4 x i8>, <4 x i8>* [[TMP38]], align 1 ; SSE-NEXT: [[TMP40:%.*]] = icmp ult <4 x i8> [[TMP33]], [[TMP35]] @@ -137,7 +138,6 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[TMP42:%.*]] = zext <4 x i8> [[TMP41]] to <4 x i32> ; SSE-NEXT: [[TMP43:%.*]] = mul <4 x i32> [[TMP42]], [[SHUFFLE2]] ; SSE-NEXT: [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11 ; SSE-NEXT: [[TMP45:%.*]] = bitcast i8* [[ARRAYIDX104]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP44]], <4 x i8>* [[TMP45]], align 1 ; SSE-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12 @@ -156,15 +156,16 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14 ; SSE-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14 ; SSE-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15 +; SSE-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 +; SSE-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 +; SSE-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* 
[[B_ADDR_0351]], i64 15 +; SSE-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; SSE-NEXT: [[TMP46:%.*]] = bitcast i8* [[ARRAYIDX141]] to <4 x i8>* ; SSE-NEXT: [[TMP47:%.*]] = load <4 x i8>, <4 x i8>* [[TMP46]], align 1 -; SSE-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 ; SSE-NEXT: [[TMP48:%.*]] = bitcast i8* [[ARRAYIDX143]] to <4 x i8>* ; SSE-NEXT: [[TMP49:%.*]] = load <4 x i8>, <4 x i8>* [[TMP48]], align 1 -; SSE-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 ; SSE-NEXT: [[TMP50:%.*]] = bitcast i8* [[ARRAYIDX145]] to <4 x i8>* ; SSE-NEXT: [[TMP51:%.*]] = load <4 x i8>, <4 x i8>* [[TMP50]], align 1 -; SSE-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 ; SSE-NEXT: [[TMP52:%.*]] = bitcast i8* [[ARRAYIDX148]] to <4 x i8>* ; SSE-NEXT: [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1 ; SSE-NEXT: [[TMP54:%.*]] = icmp ult <4 x i8> [[TMP47]], [[TMP49]] @@ -172,7 +173,6 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; SSE-NEXT: [[TMP56:%.*]] = zext <4 x i8> [[TMP55]] to <4 x i32> ; SSE-NEXT: [[TMP57:%.*]] = mul <4 x i32> [[TMP56]], [[SHUFFLE3]] ; SSE-NEXT: [[TMP58:%.*]] = trunc <4 x i32> [[TMP57]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; SSE-NEXT: [[TMP59:%.*]] = bitcast i8* [[ARRAYIDX152]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP58]], <4 x i8>* [[TMP59]], align 1 ; SSE-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1 @@ -269,15 +269,16 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; AVX512-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14 ; AVX512-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14 ; AVX512-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15 +; AVX512-NEXT: 
[[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 +; AVX512-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 +; AVX512-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 +; AVX512-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>* ; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>* ; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1 -; AVX512-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>* ; AVX512-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1 -; AVX512-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>* ; AVX512-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1 ; AVX512-NEXT: [[TMP9:%.*]] = icmp ult <16 x i8> [[TMP2]], [[TMP4]] @@ -285,7 +286,6 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon ; AVX512-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> ; AVX512-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP11]], [[SHUFFLE]] ; AVX512-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8> -; AVX512-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; AVX512-NEXT: [[TMP14:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>* ; AVX512-NEXT: store <16 x i8> [[TMP13]], <16 x i8>* [[TMP14]], align 1 ; AVX512-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll index d9b6207997236..b97d6018c3ddb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -292,20 +292,20 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer ; 
CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP18]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] @@ -451,14 +451,14 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 ; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 ; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: ret <4 x float> 
[[TMP11]] +; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 +; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1 +; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 +; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1 +; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 +; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1 +; MINTREESIZE-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP17]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll index 2b28765d82a04..73c4187efa8f0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -327,20 +327,20 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement 
<2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP18]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] @@ -486,14 +486,14 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 ; MINTREESIZE-NEXT: [[TMP9:%.*]] = 
insertelement <2 x float> poison, float [[TMP8]], i32 0 ; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: ret <4 x float> [[TMP11]] +; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 +; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1 +; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 +; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1 +; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 +; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1 +; MINTREESIZE-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP17]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll index 9e4645e8eb038..fd9e826904f52 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll @@ -9,12 +9,12 @@ define { <2 x float>, <2 x float> } @foo(%struct.sw* %v) { ; CHECK-NEXT: 
[[TMP0:%.*]] = load float, float* undef, align 4 ; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_SW:%.*]], %struct.sw* [[V:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_SW]], %struct.sw* [[V]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[X]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* undef, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* undef, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[X]] to <2 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 16 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], poison diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll index ed58f407628cd..ef7e827cd62a7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll @@ -7,14 +7,14 @@ define void @julia_2xdouble([2 x double]* sret([2 x double]), [2 x double]*, [2 ; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[PX1:%.*]] = 
getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1 +; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1 +; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4 -; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>* ; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]] @@ -58,16 +58,16 @@ define void @julia_4xfloat([4 x float]* sret([4 x float]), [4 x float]*, [4 x fl ; CHECK-NEXT: [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2 ; CHECK-NEXT: [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2 ; CHECK-NEXT: [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x 
float>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1 ; CHECK-NEXT: [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2 ; CHECK-NEXT: [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>* ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll index bde72f647604e..c087239f37b2e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -8,9 +8,9 @@ define void @inst_size(i64* %a, <2 x i64> %b) { ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 ; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 3 +; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 4 -; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt 
<4 x i64> zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll index 950bbcb7d5dd6..7b2b50de7ca46 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll @@ -8,12 +8,12 @@ define void @vec_powi_f32(float* %a, float* %c, i32 %P) { ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]]) ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll index 820d85eb309d1..6a2abcea80b4c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll @@ -22,16 +22,16 @@ define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void @@ -78,16 +78,16 @@ define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, 
i32* [[A]], i64 13 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll index 11e313bdbe6fb..409ebe579a7e5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -9,20 +9,20 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* -; CHECK-NEXT: 
[[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4 @@ -67,6 +67,10 @@ define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias noca ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_3:%.*]] = 
getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 +; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 +; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 @@ -78,10 +82,6 @@ define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias noca ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] -; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 -; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 -; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll index 33ad7c4c4ec0f..6721b046c504c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -17,36 +17,36 @@ define dso_local void @j() local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* 
[[TMP0]], i64 12 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 -; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 -; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 -; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 -; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 -; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14 ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 -; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP1]] to float +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* 
[[ARRAYIDX]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> , [[TMP8]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: store float [[TMP10]], float* @g, align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 2 +; CHECK-NEXT: store float [[TMP12]], float* @c, align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 0 +; CHECK-NEXT: store float [[TMP13]], float* @d, align 4 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 3 +; CHECK-NEXT: store float [[TMP14]], float* @e, align 4 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 1 +; CHECK-NEXT: store float [[TMP15]], float* @f, align 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP11]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> 
[[TMP21]] to <4 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll index 3d68b9b8fa065..c6e3b1088ef25 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -148,17 +148,17 @@ define void @PR43578_prefer128(i32* %r, i64* %p, i64* %q) #0 { ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2 -; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]] -; CHECK-NEXT: 
[[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 ; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll index 2683ec1e1d722..c9ec8c8df6b2d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -148,17 +148,17 @@ define void @PR43578_prefer128(i32* %r, i64* %p, i64* %q) #0 { ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2 -; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] 
= sub nsw <2 x i64> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 ; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index ba3bd26d38610..df0ad7d02c681 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -237,6 +237,8 @@ define void @lookahead_external_uses(double* %A, double *%B, double *%C, double ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 @@ -254,8 +256,6 @@ define void @lookahead_external_uses(double* %A, double *%B, double *%C, double ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: 
[[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 @@ -329,6 +329,8 @@ define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, do ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 @@ -346,8 +348,6 @@ define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, do ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 @@ -416,6 +416,8 @@ define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) { ; CHECK-LABEL: @lookahead_crash( ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds 
double, double* [[A]], i64 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]]) @@ -423,8 +425,6 @@ define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void @@ -457,6 +457,8 @@ define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x doubl ; CHECK-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 ; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 ; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 +; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] @@ -464,8 +466,6 @@ define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x doubl ; 
CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] -; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: ret void @@ -591,8 +591,6 @@ define void @ChecksExtractScores_different_vectors(double* %storeArray, double* ; CHECK-LABEL: @ChecksExtractScores_different_vectors( ; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 ; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 ; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 ; CHECK-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 @@ -601,6 +599,10 @@ define void @ChecksExtractScores_different_vectors(double* %storeArray, double* ; CHECK-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 ; CHECK-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 ; CHECK-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 +; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* 
[[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] @@ -609,8 +611,6 @@ define void @ChecksExtractScores_different_vectors(double* %storeArray, double* ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]] -; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll index 5bfd4f0f5fcf7..f043e34542de0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll @@ -34,12 +34,12 @@ entry: define void @test2(double* %a, double* %b, i8* %e) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]], !fpmath !5 -; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x 
double>* [[TMP5]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll index 1b224cb4109c1..f39abd1cead22 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll @@ -28,11 +28,11 @@ define i32 @bar(double* nocapture %A, i32 %d) { ; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 (...) @foo() ; CHECK-NEXT: br label [[TMP7]] ; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8 -; CHECK-NEXT: [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], i64 8 +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP10:%.*]] = fpext <2 x float> [[TMP9]] to <2 x double> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP9]] to <2 x double>* +; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP8]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 ; CHECK-NEXT: ret i32 undef ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll index 17cd58d410c3d..9616940c1982d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll @@ -9,16 +9,16 @@ target triple = "i386-apple-macosx10.9.0" define void @test(double* %i1, double* %i2, double* %o) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[I1_0:%.*]] = load double, double* [[I1:%.*]], align 16 -; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, double* [[I1]], i64 1 +; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, double* [[I1:%.*]], i64 1 +; CHECK-NEXT: [[I1_0:%.*]] = 
load double, double* [[I1]], align 16 ; CHECK-NEXT: [[I1_1:%.*]] = load double, double* [[I1_GEP1]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1_0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1 ; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[I2_GEP0:%.*]] = getelementptr inbounds double, double* [[I2:%.*]], i64 0 -; CHECK-NEXT: [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1 +; CHECK-NEXT: [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16 ; CHECK-NEXT: [[I2_1:%.*]] = load double, double* [[I2_GEP1]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I2_0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I2_1]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll index a7abe86c93fb8..0a3e501ef14c9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -13,14 +13,14 @@ define void @powof2div_uniform(i32* noalias nocapture %a, i32* noalias nocapture ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; 
CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -101,14 +101,14 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapt ; AVX-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; AVX-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; AVX-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; AVX-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; AVX-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; AVX-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; AVX-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; AVX-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; AVX-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; AVX-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], -; AVX-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; AVX-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll index 385d323801a99..dd83351dbd29f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll @@ -13,14 +13,14 @@ define void @powof2mul_uniform(i32* noalias nocapture %a, i32* noalias nocapture ; 
CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -68,14 +68,14 @@ define void @negpowof2mul_uniform(i32* noalias nocapture %a, i32* noalias nocapt ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: 
[[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -123,14 +123,14 @@ define void @powof2mul_nonuniform(i32* noalias nocapture %a, i32* noalias nocapt ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -178,14 +178,14 @@ define void @negpowof2mul_nonuniform(i32* noalias nocapture %a, i32* noalias noc ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; 
CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -278,16 +278,16 @@ define void @PR51436(i64* nocapture %a) { ; AVX-NEXT: [[GEP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 7 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>* ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8 -; AVX-NEXT: [[TMP2:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>* -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 -; AVX-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[TMP1]], -; AVX-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], -; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], -; AVX-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], -; AVX-NEXT: [[TMP8:%.*]] = bitcast i64* [[A]] to <4 x i64>* -; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* [[TMP8]], align 8 +; AVX-NEXT: [[TMP2:%.*]] = mul <4 x i64> [[TMP1]], +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = bitcast i64* [[A]] to <4 x i64>* +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* [[TMP4]], align 8 +; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>* +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8 +; AVX-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = add <4 x i64> 
[[TMP7]], ; AVX-NEXT: [[TMP9:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>* -; AVX-NEXT: store <4 x i64> [[TMP7]], <4 x i64>* [[TMP9]], align 8 +; AVX-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* [[TMP9]], align 8 ; AVX-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll index f36d8cbe1f3f5..f3cdf5650b12c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -35,14 +35,14 @@ define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; AVX-NEXT: store i64 [[OR_1]], i64* undef, align 8 ; AVX-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; AVX-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 +; AVX-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; AVX-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 +; AVX-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* ; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; AVX-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 ; AVX-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] -; AVX-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 -; AVX-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 ; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* ; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 ; AVX-NEXT: ret void @@ 
-72,22 +72,22 @@ define void @pr35497() local_unnamed_addr #0 { ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 ; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 ; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 ; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 -; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], -; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], +; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], ; 
SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]] +; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; SSE-NEXT: ret void @@ -98,22 +98,22 @@ define void @pr35497() local_unnamed_addr #0 { ; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef ; AVX-NEXT: store i64 [[ADD]], i64* undef, align 1 ; AVX-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; AVX-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; AVX-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; AVX-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], -; AVX-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], -; AVX-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; AVX-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] 
to <2 x i64>* +; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 +; AVX-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], ; AVX-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]] +; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] ; AVX-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* ; AVX-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; AVX-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index f6dd7526e6e76..b7741a5edf59e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -68,16 +68,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x 
i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -86,16 +86,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] 
= insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -200,28 +200,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* 
[[TMP1]], i64 9 -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], 
align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -229,28 +229,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, 
!tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = 
load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -303,19 +303,19 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 ; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 ; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* 
[[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], ; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 ; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 ; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 @@ -511,26 +511,26 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re ; ; AVX512VL-LABEL: @gather_load_4( ; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> ; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr 
inbounds i32, i32* [[T1]], i64 9 +; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 9 ; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 ; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* ; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] @@ -586,109 +586,109 @@ define void 
@gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP24:%.*]] = 
insertelement <4 x float> [[TMP23]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] -; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3 +; SSE-NEXT: 
[[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3 +; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] ; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] -; 
SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x 
float> [[TMP45]], float [[TMP39]], i64 1 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2 ; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 ; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>* ; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] 
-; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> 
[[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: 
[[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> 
[[TMP39]], float [[TMP30]], i64 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* @@ -696,52 +696,52 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP13:%.*]] = load 
float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], 
i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement 
<8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index fd1c612a0696e..2d8bb0f4e0245 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -68,16 +68,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -86,16 +86,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa 
[[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -200,28 +200,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; 
AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -229,28 +229,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: 
[[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* 
[[TMP1]], i64 21 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -303,19 +303,19 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 ; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 ; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; 
AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], ; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 ; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 ; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 @@ -511,26 +511,26 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re ; 
; AVX512VL-LABEL: @gather_load_4( ; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> ; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 9 ; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 ; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP2:%.*]] = 
getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* ; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] @@ -586,109 +586,109 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] 
= insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] -; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; 
SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3 +; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] ; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 
4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = 
load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2 ; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 ; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>* ; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], 
align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP33:%.*]] = load float, float* 
[[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, 
float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> 
poison, float [[TMP18]], i64 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* @@ -696,52 +696,52 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; 
AVX2-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* 
[[TMP1]], i64 20 -; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds 
float, float* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP31:%.*]] = load 
float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll index 149303c4bdc46..6fc3921f5f8dc 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -20,22 +20,22 @@ define i32 @foo(i32* %diff) #0 { ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], 5 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP1]], 2 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP1]], 6 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 ; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 3 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP1]], 7 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP1]], 7 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> 
[[TMP12]], [[TMP9]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll index 5b00b2e044a57..a08ae378fd173 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll @@ -7,33 +7,33 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* undef, i64 5 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* undef, i64 6 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* undef, i64 7 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>* -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i8> [[TMP6]] to <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = shl nsw <4 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP11]], i32 0 -; 
CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i32> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1 -; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP21]] -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> [[TMP26]], <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 3 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP27]], <4 x i32>* [[TMP29]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, 
i64 0, i64 1, i64 3 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]] +; CHECK-NEXT: [[TMP13:%.*]] = shl nsw <4 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw <4 x i32> [[TMP14]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP24]], <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 16 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds i8, i8* undef, i64 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index d3da0c9572028..f2f12939894f2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -9,8 +9,6 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]] ; CHECK: if.then22.i: -; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 -; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 @@ -19,20 +17,23 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: 
[[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 +; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 +; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 +; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 +; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 +; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 +; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[SHUFFLE1]], -; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 -; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 -; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 
[[CONV31_I]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[CONV31_I]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], @@ -46,7 +47,6 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8> ; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]], -; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* ; CHECK-NEXT: store <16 x i8> [[TMP17]], <16 x i8>* [[TMP18]], align 1 ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/return.ll b/llvm/test/Transforms/SLPVectorizer/X86/return.ll index 4f7448f371028..6eb1bff85ada0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/return.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/return.ll @@ -44,9 +44,9 @@ define double @return2(double* nocapture readonly %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i32 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll index 769c752ecb067..5f08d13cbec21 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll @@ -16,12 +16,12 @@ define i32 @foo(i32 %0, i32* %1, float* %2) { ; CHECK-NEXT: br label [[T37:%.*]] ; CHECK: t37: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ] -; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> , [[TMP6]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 0 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 1 ; CHECK-NEXT: [[T31:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 2 ; CHECK-NEXT: [[T33:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> , [[TMP6]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[T21]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[SHUFFLE]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[T88:%.*]] = bitcast float* [[T9]] to <2 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll index aabd7260d4aae..3e4cfe6e05157 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll @@ -53,11 +53,11 @@ define void @test(float * %a, float * %b, float * %c, float * %d) { ; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1 ; CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds float, float* [[C]], i64 2 ; CHECK-NEXT: 
[[C3:%.*]] = getelementptr inbounds float, float* [[C]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 1 ; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, float* [[D]], i64 2 ; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds float, float* [[D]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[D]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll index 875fcbc52e6a8..a95f658eb411d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -19,22 +19,22 @@ define i32 @foo(i32* nocapture readonly %diff) #0 { ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], 5 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP1]], 2 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP1]], 6 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 ; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 3 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* 
[[DIFF]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP1]], 7 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP1]], 7 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll index d3aa91a0e76c9..5d41283995606 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -93,12 +93,12 @@ define void @ashr_v8i64() { ; ; AVX2-LABEL: 
@ashr_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP2]] +; AVX2-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP4]], [[TMP5]] ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void ; @@ -111,12 +111,12 @@ define void @ashr_v8i64() { ; ; XOP-LABEL: @ashr_v8i64( ; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast 
([8 x i64]* @b64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; XOP-NEXT: ret void ; @@ -158,31 +158,31 @@ define void @ashr_v8i64() { define void @ashr_v16i32() { ; SSE-LABEL: @ashr_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* 
bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), 
align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), 
align 4 +; AVX-NEXT: [[TMP3:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -195,12 +195,12 @@ define void @ashr_v16i32() { ; ; XOP-LABEL: @ashr_v16i32( ; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; 
XOP-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; XOP-NEXT: ret void ; @@ -274,31 +274,31 @@ define void @ashr_v16i32() { define void @ashr_v32i16() { ; SSE-LABEL: @ashr_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = ashr <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = ashr <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = ashr <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x 
i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = ashr <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = ashr <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 
24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = ashr <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -311,12 +311,12 @@ define void @ashr_v32i16() { ; ; XOP-LABEL: 
@ashr_v32i16( ; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; @@ -454,31 +454,31 @@ define void @ashr_v32i16() { define void @ashr_v64i8() { ; SSE-LABEL: @ashr_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 
1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] 
= load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: 
[[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -491,12 +491,12 @@ define void @ashr_v64i8() { ; ; XOP-LABEL: @ashr_v64i8( ; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = ashr <32 x i8> 
[[TMP1]], [[TMP2]] +; XOP-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll index ba373f8547af4..d86fd028d5648 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -23,31 +23,31 @@ define void @lshr_v8i64() { ; SSE-LABEL: @lshr_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 
4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = lshr <2 
x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* 
bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -60,12 +60,12 @@ define void @lshr_v8i64() { ; ; XOP-LABEL: @lshr_v8i64( ; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; XOP-NEXT: ret void ; @@ -107,31 
+107,31 @@ define void @lshr_v8i64() { define void @lshr_v16i32() { ; SSE-LABEL: @lshr_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast 
([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: 
[[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -144,12 +144,12 @@ define void @lshr_v16i32() { ; ; XOP-LABEL: @lshr_v16i32( ; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr 
inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; XOP-NEXT: ret void ; @@ -223,31 +223,31 @@ define void @lshr_v16i32() { define void @lshr_v32i16() { ; SSE-LABEL: @lshr_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast 
(i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = lshr <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = lshr <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], 
[32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = lshr <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = lshr <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store 
<16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -260,12 +260,12 @@ define void @lshr_v32i16() { ; ; XOP-LABEL: @lshr_v32i16( ; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i16>, 
<16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; @@ -403,31 +403,31 @@ define void @lshr_v32i16() { define void @lshr_v64i8() { ; SSE-LABEL: @lshr_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x 
i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; 
SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -440,12 +440,12 @@ define void @lshr_v64i8() { ; ; XOP-LABEL: @lshr_v64i8( ; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* 
bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll index 4ca73963881f3..8ebe430f1782d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -23,31 +23,31 @@ define void @shl_v8i64() { ; SSE-LABEL: @shl_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x 
i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; 
SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x 
i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = shl <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -60,12 +60,12 @@ define void @shl_v8i64() { ; ; XOP-LABEL: @shl_v8i64( ; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: 
[[TMP3:%.*]] = shl <4 x i64> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; XOP-NEXT: ret void ; @@ -174,12 +174,12 @@ define void @shl_v16i32() { ; ; AVX-LABEL: @shl_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = shl <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x 
i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -192,12 +192,12 @@ define void @shl_v16i32() { ; ; XOP-LABEL: @shl_v16i32( ; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = shl <8 x i32> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; 
XOP-NEXT: ret void ; @@ -271,31 +271,31 @@ define void @shl_v16i32() { define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = shl <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load 
<8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = shl <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = shl <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = shl <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = shl <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 
x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -308,12 +308,12 @@ define void @shl_v32i16() { ; ; XOP-LABEL: @shl_v32i16( ; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; 
XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; @@ -451,31 +451,31 @@ define void @shl_v32i16() { define void @shl_v64i8() { ; SSE-LABEL: @shl_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 
x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = shl <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast 
(i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x 
i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -488,12 +488,12 @@ define void @shl_v64i8() { ; ; XOP-LABEL: @shl_v64i8( ; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 
+; XOP-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll index 86d728cb7c4bd..ceb9f4bcf99fc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -6,19 +6,19 @@ define void @wombat(i32* %ptr, i32* %ptr1) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 ; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]], ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], poison ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> 
poison, <4 x i32> [[SHUFFLE1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> poison, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll index 72b1ff2609d64..9349d5d95be7e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll @@ -12,13 +12,13 @@ define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i6 ; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[TMP10:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[I_019]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10]] = add i64 [[I_019]], 1 ; CHECK-NEXT: 
[[EXITCOND:%.*]] = icmp eq i64 [[TMP10]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll index 2a18d50b38933..120f80547ef57 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -33,12 +33,12 @@ define void @test1(double* %a, double* %b, double* %c) { ; Simple 3-pair chain with loads and stores, obfuscated with bitcasts define void @test2(double* %a, double* %b, i8* %e) { ; CHECK-LABEL: @test2( +; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void @@ -62,11 +62,11 @@ define void @test2(double* %a, double* %b, i8* %e) { ; Don't vectorize volatile loads. 
define void @test_volatile_load(double* %a, double* %b, double* %c) { ; CHECK-LABEL: @test_volatile_load( -; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8 -; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A]], align 8 +; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B]], align 8 ; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll index 0188dd74f7a00..72eb65e45fb7c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -181,10 +181,10 @@ define void @sitofp_8i64_8f64() #0 { ; ; AVX256DQ-LABEL: @sitofp_8i64_8f64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x 
double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x double> ; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256DQ-NEXT: ret void ; @@ -234,10 +234,10 @@ define void @sitofp_2i32_2f64() #0 { define void @sitofp_4i32_4f64() #0 { ; SSE-LABEL: @sitofp_4i32_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -265,25 +265,25 @@ define void 
@sitofp_4i32_4f64() #0 { define void @sitofp_8i32_8f64() #0 { ; SSE-LABEL: @sitofp_8i32_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) 
to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* 
@dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -339,10 +339,10 @@ define void @sitofp_2i16_2f64() #0 { define void @sitofp_4i16_4f64() #0 { ; SSE-LABEL: @sitofp_4i16_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -370,25 +370,25 @@ define void @sitofp_4i16_4f64() #0 { define void @sitofp_8i16_8f64() #0 { ; SSE-LABEL: @sitofp_8i16_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 
x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] 
= sitofp <2 x i16> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -444,10 +444,10 @@ define void @sitofp_2i8_2f64() #0 { define void @sitofp_4i8_4f64() #0 { ; SSE-LABEL: @sitofp_4i8_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> 
[[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -475,25 +475,25 @@ define void @sitofp_4i8_4f64() #0 { define void @sitofp_8i8_8f64() #0 { ; SSE-LABEL: @sitofp_8i8_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x 
double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 
x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -578,10 +578,10 @@ define void @sitofp_4i64_4f32() #0 { define void @sitofp_8i64_8f32() #0 { ; SSE-LABEL: @sitofp_8i64_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -643,10 +643,10 @@ define void 
@sitofp_4i32_4f32() #0 { define void @sitofp_8i32_8f32() #0 { ; SSE-LABEL: @sitofp_8i32_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -686,25 +686,25 @@ define void @sitofp_8i32_8f32() #0 { define void @sitofp_16i32_16f32() #0 { ; SSE-LABEL: @sitofp_16i32_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> 
[[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds 
([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -790,10 +790,10 @@ define void @sitofp_4i16_4f32() #0 { define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] 
to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -833,25 +833,25 @@ define void @sitofp_8i16_8f32() #0 { define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x 
float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), 
align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -937,10 +937,10 @@ define void @sitofp_4i8_4f32() #0 { define void @sitofp_8i8_8f32() #0 { ; SSE-LABEL: @sitofp_8i8_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -980,25 +980,25 @@ define void @sitofp_8i8_8f32() #0 { define void @sitofp_16i8_16f32() #0 { ; SSE-LABEL: @sitofp_16i8_16f32( ; SSE-NEXT: 
[[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x 
i8>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll index 00e077b663b94..9f31fc2f3dc19 100644 
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -181,10 +181,10 @@ define void @sitofp_8i64_8f64() #0 { ; ; AVX256DQ-LABEL: @sitofp_8i64_8f64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x double> ; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256DQ-NEXT: ret void ; @@ -234,10 +234,10 @@ define void @sitofp_2i32_2f64() #0 { define void @sitofp_4i32_4f64() #0 { ; SSE-LABEL: @sitofp_4i32_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast 
([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -265,25 +265,25 @@ define void @sitofp_4i32_4f64() #0 { define void @sitofp_8i32_8f64() #0 { ; SSE-LABEL: @sitofp_8i32_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], 
<2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> 
[[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -339,10 +339,10 @@ define void @sitofp_2i16_2f64() #0 { define void @sitofp_4i16_4f64() #0 { ; SSE-LABEL: @sitofp_4i16_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x 
double>*), align 16 ; SSE-NEXT: ret void ; @@ -370,25 +370,25 @@ define void @sitofp_4i16_4f64() #0 { define void @sitofp_8i16_8f64() #0 { ; SSE-LABEL: @sitofp_8i16_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast 
(double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 
x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -444,10 +444,10 @@ define void @sitofp_2i8_2f64() #0 { define void @sitofp_4i8_4f64() #0 { ; SSE-LABEL: @sitofp_4i8_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -475,25 +475,25 @@ define void @sitofp_4i8_4f64() #0 { define void @sitofp_8i8_8f64() #0 { ; SSE-LABEL: @sitofp_8i8_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast 
(i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] 
= sitofp <2 x i8> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -578,10 +578,10 @@ define void @sitofp_4i64_4f32() #0 { define void @sitofp_8i64_8f32() #0 { ; SSE-LABEL: @sitofp_8i64_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 
x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -643,10 +643,10 @@ define void @sitofp_4i32_4f32() #0 { define void @sitofp_8i32_8f32() #0 { ; SSE-LABEL: @sitofp_8i32_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -686,25 +686,25 @@ define void @sitofp_8i32_8f32() #0 { define void 
@sitofp_16i32_16f32() #0 { ; SSE-LABEL: @sitofp_16i32_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] 
= load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void 
; @@ -790,10 +790,10 @@ define void @sitofp_4i16_4f32() #0 { define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -833,25 +833,25 @@ define void @sitofp_8i16_8f32() #0 { define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: 
[[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast 
(float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -937,10 +937,10 @@ define void @sitofp_4i8_4f32() #0 { define void @sitofp_8i8_8f32() #0 { ; SSE-LABEL: @sitofp_8i8_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x 
i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -980,25 +980,25 @@ define void @sitofp_8i8_8f32() #0 { define void @sitofp_16i8_16f32() #0 { ; SSE-LABEL: @sitofp_16i8_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: 
[[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x 
float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 148dbabdee715..bd1d24e1b45c8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -7,27 +7,28 @@ define dso_local void @_Z4testP1S(%struct.S* %p) local_unnamed_addr { ; CHECK-LABEL: @_Z4testP1S( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 1, i64 0 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 15 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 1 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 7 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 2 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 6 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 3 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 12 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 5 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 13 ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 6 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 7 +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 15 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 6 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 4 +; 
CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 12 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 13 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 5 @@ -41,7 +42,6 @@ define dso_local void @_Z4testP1S(%struct.S* %p) local_unnamed_addr { ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> [[TMP8]], i32* [[ARRAYIDX48]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef) ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret void @@ -128,15 +128,15 @@ define dso_local void @test_unordered_splits(%struct.S* nocapture %p) local_unna ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds 
[[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> @@ -199,21 +199,21 @@ define dso_local void @test_cost_splits(%struct.S* nocapture %p) local_unnamed_a ; CHECK-NEXT: [[G22:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 14 ; CHECK-NEXT: [[G23:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 15 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 0, i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = 
getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[G22]] to <2 x i32>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll index c8baf37637b4a..b25275ee1c879 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll @@ -37,10 +37,10 @@ define void @sqrt_2f64() #0 { define void @sqrt_4f64() #0 { ; SSE-LABEL: @sqrt_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x 
double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]]) ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -68,25 +68,25 @@ define void @sqrt_4f64() #0 { define void @sqrt_8f64() #0 { ; SSE-LABEL: @sqrt_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <2 
x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) 
to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP7]]) ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sqrt_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP2]]) -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]]) +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP3]]) ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -148,10 +148,10 @@ define void @sqrt_4f32() #0 { define void @sqrt_8f32() #0 { ; SSE-LABEL: @sqrt_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), 
align 4 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]]) ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -191,25 +191,25 @@ define void @sqrt_8f32() #0 { define void @sqrt_16f32() #0 { ; SSE-LABEL: @sqrt_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x 
float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP7]]) ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: 
@sqrt_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP2]]) -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]]) +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP3]]) ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll index 19f654e5a4f87..a5ae2a02a35f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll @@ -9,19 +9,19 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load 
<4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP6]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll index f773910d8c614..6a5e698371f04 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll @@ -4,26 +4,26 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) { ; CHECK-LABEL: @non-ordered-stores( ; CHECK-NEXT: [[IN_ADDR:%.*]] 
= getelementptr inbounds i32, i32* [[IN:%.*]], i64 0 -; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 -; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 +; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 +; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 +; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 +; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 +; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[LOAD_6:%.*]] = 
load i32, i32* [[GEP_4]], align 4 +; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll index 65d1fce9e1303..b24da6cf95d4f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll @@ -90,13 +90,13 @@ define void @store_reverse(i64* %p3) { ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 10 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP5]], align 8 @@ -148,9 +148,6 @@ define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) { ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = 
getelementptr inbounds i64, i64* [[P3]], i64 1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 @@ -167,6 +164,9 @@ define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) { ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: ret void @@ -230,9 +230,6 @@ define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) { ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 @@ -250,6 +247,9 @@ define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) { ; 
CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index 7232b20eafc49..e04d9297ac8fb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -11,9 +11,9 @@ define void @tiny_tree_fully_vectorizable(double* noalias nocapture %dst, double ; CHECK-NEXT: [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[SRC_ADDR_013]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[DST_ADDR_014]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]] @@ -62,9 +62,9 @@ define void @tiny_tree_fully_vectorizable2(float* noalias nocapture %dst, float* ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* 
[[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC_ADDR_021]] to <4 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] @@ -165,16 +165,16 @@ define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, fl ; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* 
[[SRC_ADDR_021]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll index b38bd8d6f88e5..69544d9e59827 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -39,10 +39,10 @@ define void @uitofp_2i64_2f64() #0 { define void @uitofp_4i64_4f64() #0 { ; SSE-LABEL: @uitofp_4i64_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* 
bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -70,25 +70,25 @@ define void @uitofp_4i64_4f64() #0 { define void @uitofp_8i64_8f64() #0 { ; SSE-LABEL: @uitofp_8i64_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> 
[[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i64> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i64> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i64_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = 
uitofp <4 x i64> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -171,10 +171,10 @@ define void @uitofp_2i32_2f64() #0 { define void @uitofp_4i32_4f64() #0 { ; SSE-LABEL: @uitofp_4i32_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -202,25 +202,25 @@ define void @uitofp_4i32_4f64() #0 { define void @uitofp_8i32_8f64() #0 { ; SSE-LABEL: @uitofp_8i32_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr 
inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i32> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i32> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; 
SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i32> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -276,10 +276,10 @@ define void @uitofp_2i16_2f64() #0 { define void @uitofp_4i16_4f64() #0 { ; SSE-LABEL: @uitofp_4i16_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 
0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -307,25 +307,25 @@ define void @uitofp_4i16_4f64() #0 { define void @uitofp_8i16_8f64() #0 { ; SSE-LABEL: @uitofp_8i16_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x 
double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i16> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x 
i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -381,10 +381,10 @@ define void @uitofp_2i8_2f64() #0 { define void @uitofp_4i8_4f64() #0 { ; SSE-LABEL: @uitofp_4i8_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: 
[[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -412,25 +412,25 @@ define void @uitofp_4i8_4f64() #0 { define void @uitofp_8i8_8f64() #0 { ; SSE-LABEL: @uitofp_8i8_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, 
i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i8> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), 
align 4 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -515,10 +515,10 @@ define void @uitofp_4i64_4f32() #0 { define void @uitofp_8i64_8f32() #0 { ; SSE-LABEL: @uitofp_8i64_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -580,10 +580,10 @@ define void @uitofp_4i32_4f32() #0 { define void @uitofp_8i32_8f32() #0 { ; SSE-LABEL: @uitofp_8i32_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> 
-; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -623,25 +623,25 @@ define void @uitofp_8i32_8f32() #0 { define void @uitofp_16i32_16f32() #0 { ; SSE-LABEL: @uitofp_16i32_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i32> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds 
([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i32> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 -; AVX256-NEXT: 
[[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i32> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -727,10 +727,10 @@ define void @uitofp_4i16_4f32() #0 { define void @uitofp_8i16_8f32() #0 { ; SSE-LABEL: @uitofp_8i16_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -770,25 +770,25 @@ define void @uitofp_8i16_8f32() #0 { define void @uitofp_16i16_16f32() #0 { ; SSE-LABEL: @uitofp_16i16_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float> +; 
SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i16> [[TMP3]] to <8 x float> ; 
AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -874,10 +874,10 @@ define void @uitofp_4i8_4f32() #0 { define void @uitofp_8i8_8f32() #0 { ; SSE-LABEL: @uitofp_8i8_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -917,25 +917,25 @@ define void @uitofp_8i8_8f32() #0 { define void @uitofp_16i8_16f32() #0 { ; SSE-LABEL: @uitofp_16i8_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load 
<4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i8> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 +; SSE-NEXT: 
[[TMP8:%.*]] = uitofp <4 x i8> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i8> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll index 95f7430fe7b21..08840ff36c0a2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll @@ -8,19 +8,19 @@ define void @foo(i8* %c, float* %d) { ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 1 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 2 ; 
CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 3 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 -1 +; CHECK-NEXT: [[ADD_PTR37:%.*]] = getelementptr inbounds float, float* [[D]], i64 -2 +; CHECK-NEXT: [[ADD_PTR45:%.*]] = getelementptr inbounds float, float* [[D]], i64 -3 +; CHECK-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, float* [[D]], i64 -4 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX4]] to <4 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 -1 -; CHECK-NEXT: [[ADD_PTR37:%.*]] = getelementptr inbounds float, float* [[D]], i64 -2 -; CHECK-NEXT: [[ADD_PTR45:%.*]] = getelementptr inbounds float, float* [[D]], i64 -3 ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP6]] to <4 x float> ; CHECK-NEXT: [[TMP8:%.*]] = fdiv <4 x float> [[TMP7]], poison -; CHECK-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, float* [[D]], i64 -4 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ADD_PTR53]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP9]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll index 2d87f4fa5bea5..d1846d4f4818f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll @@ -6,10 +6,10 @@ define void @test(double* %isec) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = 
getelementptr inbounds double, double* [[ISEC:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX10]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 3 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX10]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll index 87709a87b3692..8a2e04339843b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -17,18 +17,18 @@ define void @foo() { ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double -; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 +; 
CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float> ; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]] ; CHECK-NEXT: br label [[BB3]] diff --git a/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll index b54d67fb82011..779e8dee295fd 100644 --- a/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll +++ b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll @@ -12,9 +12,9 @@ define void @test_sideeffect(float* %p) { ; CHECK-NEXT: [[P2:%.*]] = getelementptr float, float* [[P]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr float, float* [[P]], i64 3 ; CHECK-NEXT: call void @llvm.sideeffect() +; CHECK-NEXT: call void @llvm.sideeffect() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: 
[[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: call void @llvm.sideeffect() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void @@ -45,9 +45,9 @@ define void @test_inaccessiblememonly(float* %p) { ; CHECK-NEXT: [[P2:%.*]] = getelementptr float, float* [[P]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr float, float* [[P]], i64 3 ; CHECK-NEXT: call void @foo() #[[ATTR1:[0-9]+]] +; CHECK-NEXT: call void @foo() #[[ATTR1]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: call void @foo() #[[ATTR1]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void From 1da213836b43e5c646a82a1e0e191aee7836e37e Mon Sep 17 00:00:00 2001 From: Mogball Date: Mon, 21 Feb 2022 00:28:21 +0000 Subject: [PATCH 521/748] [pdl] Remove `NoSideEffect` from all PDL ops This trait results in PDL ops being erroneously CSE'd. These ops are side-effect free in the rewriter but not in the matcher (where unused values aren't allowed anyways). These ops should have a more nuanced side-effect modeling, this is fixing a bug introduced by a previous change. 
Reviewed By: rriddle Differential Revision: https://reviews.llvm.org/D120222 --- mlir/include/mlir/Dialect/PDL/IR/PDLOps.td | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td index 0c35d0457d845..1d4264eeb9857 100644 --- a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td +++ b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td @@ -14,7 +14,6 @@ #define MLIR_DIALECT_PDL_IR_PDLOPS include "mlir/Dialect/PDL/IR/PDLTypes.td" -include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpAsmInterface.td" include "mlir/IR/SymbolInterfaces.td" @@ -122,7 +121,7 @@ def PDL_ApplyNativeRewriteOp // pdl::AttributeOp //===----------------------------------------------------------------------===// -def PDL_AttributeOp : PDL_Op<"attribute", [NoSideEffect]> { +def PDL_AttributeOp : PDL_Op<"attribute"> { let summary = "Define an input attribute in a pattern"; let description = [{ `pdl.attribute` operations capture named attribute edges into an operation. 
@@ -191,7 +190,7 @@ def PDL_EraseOp : PDL_Op<"erase", [HasParent<"pdl::RewriteOp">]> { //===----------------------------------------------------------------------===// def PDL_OperandOp - : PDL_Op<"operand", [HasParent<"pdl::PatternOp">, NoSideEffect]> { + : PDL_Op<"operand", [HasParent<"pdl::PatternOp">]> { let summary = "Define an external input operand in a pattern"; let description = [{ `pdl.operand` operations capture external operand edges into an operation @@ -230,7 +229,7 @@ def PDL_OperandOp //===----------------------------------------------------------------------===// def PDL_OperandsOp - : PDL_Op<"operands", [HasParent<"pdl::PatternOp">, NoSideEffect]> { + : PDL_Op<"operands", [HasParent<"pdl::PatternOp">]> { let summary = "Define a range of input operands in a pattern"; let description = [{ `pdl.operands` operations capture external operand range edges into an @@ -501,7 +500,7 @@ def PDL_ReplaceOp : PDL_Op<"replace", [ // pdl::ResultOp //===----------------------------------------------------------------------===// -def PDL_ResultOp : PDL_Op<"result", [NoSideEffect]> { +def PDL_ResultOp : PDL_Op<"result"> { let summary = "Extract a result from an operation"; let description = [{ `pdl.result` operations extract result edges from an operation node within @@ -533,7 +532,7 @@ def PDL_ResultOp : PDL_Op<"result", [NoSideEffect]> { // pdl::ResultsOp //===----------------------------------------------------------------------===// -def PDL_ResultsOp : PDL_Op<"results", [NoSideEffect]> { +def PDL_ResultsOp : PDL_Op<"results"> { let summary = "Extract a result group from an operation"; let description = [{ `pdl.results` operations extract a result group from an operation within a @@ -639,7 +638,7 @@ def PDL_RewriteOp : PDL_Op<"rewrite", [ // pdl::TypeOp //===----------------------------------------------------------------------===// -def PDL_TypeOp : PDL_Op<"type", [NoSideEffect]> { +def PDL_TypeOp : PDL_Op<"type"> { let summary = "Define a type handle within a 
pattern"; let description = [{ `pdl.type` operations capture result type constraints of `Attributes`, @@ -668,7 +667,7 @@ def PDL_TypeOp : PDL_Op<"type", [NoSideEffect]> { // pdl::TypesOp //===----------------------------------------------------------------------===// -def PDL_TypesOp : PDL_Op<"types", [NoSideEffect]> { +def PDL_TypesOp : PDL_Op<"types"> { let summary = "Define a range of type handles within a pattern"; let description = [{ `pdl.types` operations capture result type constraints of `Value`s, and From c5256412b76c6e42d21dd744a191b1c75861212d Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Mon, 21 Feb 2022 16:44:38 -0800 Subject: [PATCH 522/748] Updated reflection-dump.test for mpenum section With 1c1e2cce9a50ac9fe6b884b79925d71914cf5a30 a new swift5 reflection section for multi-payload enum mask information was added, which is called mpenum. This change simply adds a check to make sure dsymutil can dump out information in that section into the dSYM bundle. Differential Revision: https://reviews.llvm.org/D120291 --- .../dsymutil/Inputs/reflection_metadata.yaml | 38 ++++++++++++------- .../tools/dsymutil/X86/reflection-dump.test | 3 ++ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/llvm/test/tools/dsymutil/Inputs/reflection_metadata.yaml b/llvm/test/tools/dsymutil/Inputs/reflection_metadata.yaml index 964de15ce1ae2..2572a2b59012a 100644 --- a/llvm/test/tools/dsymutil/Inputs/reflection_metadata.yaml +++ b/llvm/test/tools/dsymutil/Inputs/reflection_metadata.yaml @@ -16,12 +16,12 @@ FileHeader: cpusubtype: 0x3 filetype: 0x1 ncmds: 8 - sizeofcmds: 3040 + sizeofcmds: 3120 flags: 0x2000 reserved: 0x0 LoadCommands: - cmd: LC_SEGMENT_64 - cmdsize: 2792 + cmdsize: 2872 segname: '' vmaddr: 0 vmsize: 21352 @@ -36,7 +36,7 @@ LoadCommands: segname: __TEXT addr: 0x0 size: 4571 - offset: 0xC00 + offset: 0xC50 align: 4 reloff: 0x5CF8 nreloc: 74 @@ -56,7 +56,7 @@ LoadCommands: segname: __TEXT addr: 0x11DC size: 117 - offset: 0x1DDC + 
offset: 0x1E2C align: 1 reloff: 0x5F48 nreloc: 22 @@ -77,7 +77,7 @@ LoadCommands: segname: __TEXT addr: 0x1254 size: 24 - offset: 0x1E54 + offset: 0x1EA4 align: 2 reloff: 0x5FF8 nreloc: 6 @@ -98,7 +98,7 @@ LoadCommands: segname: __TEXT addr: 0x17D8 size: 37 - offset: 0x23D8 + offset: 0x2428 align: 0 reloff: 0x0 nreloc: 0 @@ -110,7 +110,7 @@ LoadCommands: segname: __TEXT addr: 0x1800 size: 24 - offset: 0x2400 + offset: 0x2450 align: 2 reloff: 0x6530 nreloc: 8 @@ -131,7 +131,7 @@ LoadCommands: segname: __TEXT addr: 0x1818 size: 260 - offset: 0x2418 + offset: 0x2468 align: 2 reloff: 0x6570 nreloc: 60 @@ -152,7 +152,7 @@ LoadCommands: segname: __TEXT addr: 0x1AC8 size: 20 - offset: 0x26C8 + offset: 0x2718 align: 2 reloff: 0x67F8 nreloc: 2 @@ -173,7 +173,7 @@ LoadCommands: segname: __TEXT addr: 0x1AEC size: 10 - offset: 0x26EC + offset: 0x273C align: 2 reloff: 0x0 nreloc: 0 @@ -185,7 +185,7 @@ LoadCommands: segname: __TEXT addr: 0x1AF8 size: 10 - offset: 0x2710 + offset: 0x2760 align: 2 reloff: 0x0 nreloc: 0 @@ -197,7 +197,7 @@ LoadCommands: segname: __TEXT addr: 0x1B04 size: 10 - offset: 0x2734 + offset: 0x2784 align: 2 reloff: 0x0 nreloc: 0 @@ -205,11 +205,23 @@ LoadCommands: reserved1: 0x0 reserved2: 0x0 content: 61626364656667686970 + - sectname: __swift5_mpenum + segname: __TEXT + addr: 0x1B10 + size: 10 + offset: 0x27A8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x10000000 + reserved1: 0x0 + reserved2: 0x0 + content: 71727374757677787980 - sectname: __bss segname: __DATA addr: 0x3372 size: 2084 - offset: 0x51D0 + offset: 0x5220 align: 3 reloff: 0x0 nreloc: 0 diff --git a/llvm/test/tools/dsymutil/X86/reflection-dump.test b/llvm/test/tools/dsymutil/X86/reflection-dump.test index e94af7e54582f..12cf11ed75127 100644 --- a/llvm/test/tools/dsymutil/X86/reflection-dump.test +++ b/llvm/test/tools/dsymutil/X86/reflection-dump.test @@ -51,3 +51,6 @@ CHECK-NEXT: 10000e264 51525354 55565758 5960 QRSTUVWXY` CHECK: Contents of section __DWARF,__swift5_acfuncs: CHECK-NEXT: 
10000e270 61626364 65666768 6970 abcdefghip + +CHECK: Contents of section __DWARF,__swift5_mpenum: +CHECK-NEXT: 10000e27c 71727374 75767778 7980 qrstuvwxy. From d657c6893f9b987f23ddbb1eddf62cc3add77e28 Mon Sep 17 00:00:00 2001 From: Wouter van Oortmerssen Date: Mon, 14 Feb 2022 15:55:24 -0800 Subject: [PATCH 523/748] [WebAssembly] Allow .data shorthand for .section .data,"",@ --- llvm/lib/MC/MCParser/WasmAsmParser.cpp | 8 ++++++++ llvm/test/MC/WebAssembly/basic-assembly.s | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index 7fe2711efcd91..ea93af7c99cbc 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolWasm.h" @@ -52,6 +53,7 @@ class WasmAsmParser : public MCAsmParserExtension { this->MCAsmParserExtension::Initialize(*Parser); addDirectiveHandler<&WasmAsmParser::parseSectionDirectiveText>(".text"); + addDirectiveHandler<&WasmAsmParser::parseSectionDirectiveData>(".data"); addDirectiveHandler<&WasmAsmParser::parseSectionDirective>(".section"); addDirectiveHandler<&WasmAsmParser::parseDirectiveSize>(".size"); addDirectiveHandler<&WasmAsmParser::parseDirectiveType>(".type"); @@ -89,6 +91,12 @@ class WasmAsmParser : public MCAsmParserExtension { return false; } + bool parseSectionDirectiveData(StringRef, SMLoc) { + auto *S = getContext().getObjectFileInfo()->getDataSection(); + getStreamer().SwitchSection(S); + return false; + } + uint32_t parseSectionFlags(StringRef FlagStr, bool &Passive, bool &Group) { uint32_t flags = 0; for (char C : FlagStr) { diff --git a/llvm/test/MC/WebAssembly/basic-assembly.s b/llvm/test/MC/WebAssembly/basic-assembly.s index 
b86172ba9e24c..769cd7edfa8a3 100644 --- a/llvm/test/MC/WebAssembly/basic-assembly.s +++ b/llvm/test/MC/WebAssembly/basic-assembly.s @@ -137,6 +137,9 @@ test0: .int32 2000000000 .size .L.str, 28 + .data + .int8 73 + .section .init_array.42,"",@ .p2align 2 .int32 test0 @@ -272,6 +275,10 @@ empty_fref_table: # CHECK-NEXT: .int32 2000000000 # CHECK-NEXT: .size .L.str, 28 +# CHECK: .data +# CHECK-EMPTY: +# CHECK-NEXT: .int8 73 + # CHECK: .section .init_array.42,"",@ # CHECK-NEXT: .p2align 2 # CHECK-NEXT: .int32 test0 From 8612b11c866f23d2bb756782b8fa9e1dbe067f7c Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 10:27:00 -0800 Subject: [PATCH 524/748] [SLP] Use isInSchedulingRegion consistently [NFC] --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 99da014e0cc3d..bdc6a33eb2278 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2641,7 +2641,7 @@ class BoUpSLP { ScheduleData *getScheduleData(Value *V) { ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + if (SD && isInSchedulingRegion(SD)) return SD; return nullptr; } @@ -2652,7 +2652,7 @@ class BoUpSLP { auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + if (SD && isInSchedulingRegion(SD)) return SD; } return nullptr; @@ -2774,7 +2774,7 @@ class BoUpSLP { auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) + if (isInSchedulingRegion(P.second)) Action(P.second); } @@ -2876,8 +2876,8 @@ class BoUpSLP { /// The ID of the scheduling region. 
For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. - // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + /// Make sure that the initial SchedulingRegionID is greater than the + /// initial SchedulingRegionID in ScheduleData (which is 0). int SchedulingRegionID = 1; }; From 63eb963e58663541d6feb58f53a1bd4903e3dabf Mon Sep 17 00:00:00 2001 From: Mogball Date: Tue, 22 Feb 2022 18:26:52 +0000 Subject: [PATCH 525/748] [mlir][pdl] NFC re-add NoSideEffect to Result and Results Op --- mlir/include/mlir/Dialect/PDL/IR/PDLOps.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td index 1d4264eeb9857..b7329df48d524 100644 --- a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td +++ b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td @@ -500,7 +500,7 @@ def PDL_ReplaceOp : PDL_Op<"replace", [ // pdl::ResultOp //===----------------------------------------------------------------------===// -def PDL_ResultOp : PDL_Op<"result"> { +def PDL_ResultOp : PDL_Op<"result", [NoSideEffect]> { let summary = "Extract a result from an operation"; let description = [{ `pdl.result` operations extract result edges from an operation node within @@ -532,7 +532,7 @@ def PDL_ResultOp : PDL_Op<"result"> { // pdl::ResultsOp //===----------------------------------------------------------------------===// -def PDL_ResultsOp : PDL_Op<"results"> { +def PDL_ResultsOp : PDL_Op<"results", [NoSideEffect]> { let summary = "Extract a result group from an operation"; let description = [{ `pdl.results` operations extract a result group from an operation within a From ecb27004ecbc97f8f075e81cabf95adbc84062e2 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 22 Feb 2022 10:31:09 -0800 Subject: [PATCH 526/748] Revert "[AArch64] Alter mull shuffle(ext(..)) combine to work on 
buildvectors" This reverts commit 9fc1a0dcb79afb31470751651c30e843c12e9ca5. We have bisected a compiler crash to this revision and will provide a test case soon. --- .../Target/AArch64/AArch64ISelLowering.cpp | 74 ++++++++++++------- llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 6 +- .../AArch64/aarch64-matrix-umull-smull.ll | 49 ++++++------ 3 files changed, 77 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5678029be376e..58d91c3412a93 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13448,17 +13448,33 @@ static EVT calculatePreExtendType(SDValue Extend) { } } -/// Combines a buildvector(sext/zext) node pattern into sext/zext(buildvector) +/// Combines a dup(sext/zext) node pattern into sext/zext(dup) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt -static SDValue performBuildVectorExtendCombine(SDValue BV, SelectionDAG &DAG) { - EVT VT = BV.getValueType(); - if (BV.getOpcode() != ISD::BUILD_VECTOR) +static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, + SelectionDAG &DAG) { + ShuffleVectorSDNode *ShuffleNode = + dyn_cast(VectorShuffle.getNode()); + if (!ShuffleNode) + return SDValue(); + + // Ensuring the mask is zero before continuing + if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) + return SDValue(); + + SDValue InsertVectorElt = VectorShuffle.getOperand(0); + + if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + SDValue InsertLane = InsertVectorElt.getOperand(2); + ConstantSDNode *Constant = dyn_cast(InsertLane.getNode()); + // Ensures the insert is inserting into lane 0 + if (!Constant || Constant->getZExtValue() != 0) return SDValue(); - // Use the first item in the buildvector to get the size of the extend, and - // make sure it looks valid. 
- SDValue Extend = BV->getOperand(0); + SDValue Extend = InsertVectorElt.getOperand(1); unsigned ExtendOpcode = Extend.getOpcode(); + bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || ExtendOpcode == ISD::SIGN_EXTEND_INREG || ExtendOpcode == ISD::AssertSext; @@ -13468,28 +13484,30 @@ static SDValue performBuildVectorExtendCombine(SDValue BV, SelectionDAG &DAG) { // Restrict valid pre-extend data type EVT PreExtendType = calculatePreExtendType(Extend); - if (PreExtendType.getSizeInBits() != VT.getScalarSizeInBits() / 2) + if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && + PreExtendType != MVT::i32) return SDValue(); - // Make sure all other operands are equally extended - for (SDValue Op : drop_begin(BV->ops())) { - unsigned Opc = Op.getOpcode(); - bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || - Opc == ISD::AssertSext; - if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType) - return SDValue(); - } + EVT TargetType = VectorShuffle.getValueType(); + EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); + if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) + return SDValue(); + + SDLoc DL(VectorShuffle); + + SDValue InsertVectorNode = DAG.getNode( + InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), + DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), + DAG.getConstant(0, DL, MVT::i64)); + + std::vector ShuffleMask(TargetType.getVectorNumElements()); + + SDValue VectorShuffleNode = + DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, + DAG.getUNDEF(PreExtendVT), ShuffleMask); - EVT PreExtendVT = VT.changeVectorElementType(PreExtendType); - EVT PreExtendLegalType = - PreExtendType.getScalarSizeInBits() < 32 ? 
MVT::i32 : PreExtendType; - SDLoc DL(BV); - SmallVector NewOps; - for (SDValue Op : BV->ops()) - NewOps.push_back( - DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, PreExtendLegalType)); - SDValue NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps); - return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV); + return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + TargetType, VectorShuffleNode); } /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) @@ -13500,8 +13518,8 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) return SDValue(); - SDValue Op0 = performBuildVectorExtendCombine(Mul->getOperand(0), DAG); - SDValue Op1 = performBuildVectorExtendCombine(Mul->getOperand(1), DAG); + SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); // Neither operands have been changed, don't make any further changes if (!Op0 && !Op1) diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index 5a57e6e82dd2e..bc31d41a55f43 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -156,8 +156,10 @@ entry: define <8 x i16> @nonsplat_shuffleinsert(i8 %src, <8 x i8> %b) { ; CHECK-LABEL: nonsplat_shuffleinsert: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v1.8b, w0 -; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h ; CHECK-NEXT: ret entry: %in = sext i8 %src to i16 diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 12b451f509f73..4f999edf3d571 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ 
b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -201,22 +201,25 @@ define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i3 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: // %vector.ph ; CHECK-NEXT: and x10, x9, #0xfffffff0 -; CHECK-NEXT: dup v0.4h, w8 ; CHECK-NEXT: add x11, x2, #32 ; CHECK-NEXT: add x12, x0, #16 ; CHECK-NEXT: mov x13, x10 -; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: .LBB3_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q2, q3, [x12, #-16] +; CHECK-NEXT: ldp q1, q2, [x12, #-16] ; CHECK-NEXT: subs x13, x13, #16 ; CHECK-NEXT: add x12, x12, #32 -; CHECK-NEXT: smull2 v4.4s, v1.8h, v2.8h -; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h -; CHECK-NEXT: smull2 v5.4s, v1.8h, v3.8h -; CHECK-NEXT: smull v3.4s, v0.4h, v3.4h -; CHECK-NEXT: stp q2, q4, [x11, #-32] -; CHECK-NEXT: stp q3, q5, [x11], #64 +; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s +; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s +; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: stp q1, q3, [x11, #-32] +; CHECK-NEXT: stp q2, q4, [x11], #64 ; CHECK-NEXT: b.ne .LBB3_4 ; CHECK-NEXT: // %bb.5: // %middle.block ; CHECK-NEXT: cmp x10, x9 @@ -314,22 +317,25 @@ define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i3 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: // %vector.ph ; CHECK-NEXT: and x10, x9, #0xfffffff0 -; CHECK-NEXT: dup v0.4h, w8 ; CHECK-NEXT: add x11, x2, #32 ; CHECK-NEXT: add x12, x0, #16 ; CHECK-NEXT: mov x13, x10 -; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: .LBB4_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q2, q3, [x12, #-16] +; CHECK-NEXT: ldp q1, q2, [x12, #-16] ; CHECK-NEXT: subs x13, x13, #16 ; CHECK-NEXT: add x12, 
x12, #32 -; CHECK-NEXT: umull2 v4.4s, v1.8h, v2.8h -; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h -; CHECK-NEXT: umull2 v5.4s, v1.8h, v3.8h -; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h -; CHECK-NEXT: stp q2, q4, [x11, #-32] -; CHECK-NEXT: stp q3, q5, [x11], #64 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s +; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s +; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: stp q1, q3, [x11, #-32] +; CHECK-NEXT: stp q2, q4, [x11], #64 ; CHECK-NEXT: b.ne .LBB4_4 ; CHECK-NEXT: // %bb.5: // %middle.block ; CHECK-NEXT: cmp x10, x9 @@ -429,13 +435,12 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_4: // %vector.ph -; CHECK-NEXT: dup v2.8b, w9 ; CHECK-NEXT: and x11, x10, #0xfffffff0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov x12, x11 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: dup v2.8h, w9 ; CHECK-NEXT: .LBB5_5: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp d3, d4, [x8, #-8] From 9865c3f28aa812364584b55629eef9b52bb1230e Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 22 Feb 2022 10:46:49 -0800 Subject: [PATCH 527/748] Revert "[mlir][pdl] NFC re-add NoSideEffect to Result and Results Op" This reverts commit 63eb963e58663541d6feb58f53a1bd4903e3dabf. Breaks MLIR build. 
--- mlir/include/mlir/Dialect/PDL/IR/PDLOps.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td index b7329df48d524..1d4264eeb9857 100644 --- a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td +++ b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td @@ -500,7 +500,7 @@ def PDL_ReplaceOp : PDL_Op<"replace", [ // pdl::ResultOp //===----------------------------------------------------------------------===// -def PDL_ResultOp : PDL_Op<"result", [NoSideEffect]> { +def PDL_ResultOp : PDL_Op<"result"> { let summary = "Extract a result from an operation"; let description = [{ `pdl.result` operations extract result edges from an operation node within @@ -532,7 +532,7 @@ def PDL_ResultOp : PDL_Op<"result", [NoSideEffect]> { // pdl::ResultsOp //===----------------------------------------------------------------------===// -def PDL_ResultsOp : PDL_Op<"results", [NoSideEffect]> { +def PDL_ResultsOp : PDL_Op<"results"> { let summary = "Extract a result group from an operation"; let description = [{ `pdl.results` operations extract a result group from an operation within a From ef7b9824cd2241bb8a7647429fe73048b68d66ae Mon Sep 17 00:00:00 2001 From: Mogball Date: Tue, 22 Feb 2022 18:47:07 +0000 Subject: [PATCH 528/748] [mlir][pdl] NFC fix missing include --- mlir/include/mlir/Dialect/PDL/IR/PDLOps.td | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td index 1d4264eeb9857..206191a8ff78a 100644 --- a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td +++ b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td @@ -16,6 +16,7 @@ include "mlir/Dialect/PDL/IR/PDLTypes.td" include "mlir/IR/OpAsmInterface.td" include "mlir/IR/SymbolInterfaces.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // PDL Ops From 
de2cc2a00298a952aeee1883086e4c614259333a Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 22 Feb 2022 10:48:25 -0800 Subject: [PATCH 529/748] Reland "[mlir][pdl] NFC re-add NoSideEffect to Result and Results Op" This reverts commit 9865c3f28aa812364584b55629eef9b52bb1230e. Looks like our commits raced and Jeff fixed the build issue. --- mlir/include/mlir/Dialect/PDL/IR/PDLOps.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td index 206191a8ff78a..61fe1261e6dd9 100644 --- a/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td +++ b/mlir/include/mlir/Dialect/PDL/IR/PDLOps.td @@ -501,7 +501,7 @@ def PDL_ReplaceOp : PDL_Op<"replace", [ // pdl::ResultOp //===----------------------------------------------------------------------===// -def PDL_ResultOp : PDL_Op<"result"> { +def PDL_ResultOp : PDL_Op<"result", [NoSideEffect]> { let summary = "Extract a result from an operation"; let description = [{ `pdl.result` operations extract result edges from an operation node within @@ -533,7 +533,7 @@ def PDL_ResultOp : PDL_Op<"result"> { // pdl::ResultsOp //===----------------------------------------------------------------------===// -def PDL_ResultsOp : PDL_Op<"results"> { +def PDL_ResultsOp : PDL_Op<"results", [NoSideEffect]> { let summary = "Extract a result group from an operation"; let description = [{ `pdl.results` operations extract a result group from an operation within a From 621e2de138f70e175512c18d9f358666de93e838 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 9 Nov 2021 16:00:29 -0800 Subject: [PATCH 530/748] Add a (nonfunctional) -dyld_info flag to llvm-objdump. Darwin otool implements this flag as a one-stop solution for displaying bind and rebase info. As I am working on upstreaming chained fixup support this command will be useful to write testcases. 
Differential Revision: https://reviews.llvm.org/D113573 --- llvm/docs/CommandGuide/llvm-objdump.rst | 5 +++++ llvm/test/tools/llvm-objdump/MachO/dyld_info.test | 7 +++++++ llvm/tools/llvm-objdump/MachODump.cpp | 13 +++++++++++-- llvm/tools/llvm-objdump/MachODump.h | 1 + llvm/tools/llvm-objdump/ObjdumpOpts.td | 6 ++++++ llvm/tools/llvm-objdump/OtoolOpts.td | 1 - llvm/tools/llvm-objdump/llvm-objdump.cpp | 10 +++++----- 7 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/MachO/dyld_info.test diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst index cbc525fba8a61..5ee4b83f3fe5e 100644 --- a/llvm/docs/CommandGuide/llvm-objdump.rst +++ b/llvm/docs/CommandGuide/llvm-objdump.rst @@ -302,6 +302,11 @@ MACH-O ONLY OPTIONS AND COMMANDS Disassemble just the specified symbol's instructions. +.. option:: --dyld_info + + Print bind and rebase information used by dyld to resolve external + references in a final linked binary. + .. option:: --dylibs-used Display the shared libraries used for linked files. diff --git a/llvm/test/tools/llvm-objdump/MachO/dyld_info.test b/llvm/test/tools/llvm-objdump/MachO/dyld_info.test new file mode 100644 index 0000000000000..35642acb060b1 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/MachO/dyld_info.test @@ -0,0 +1,7 @@ +RUN: llvm-objdump --macho --dyld_info %p/Inputs/bind.macho-x86_64 \ +RUN: | FileCheck %s --match-full-lines --strict-whitespace \ +RUN: --implicit-check-not={{.}} + +CHECK:{{.*}}bind.macho-x86_64: +CHECK-NEXT:dyld information: +CHECK-NEXT:[not yet implemented]. 
diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index c785c6cfaa89a..f55e6314c0d26 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -81,6 +81,7 @@ bool objdump::DataInCode; bool objdump::FunctionStarts; bool objdump::LinkOptHints; bool objdump::InfoPlist; +bool objdump::DyldInfo; bool objdump::DylibsUsed; bool objdump::DylibId; bool objdump::Verbose; @@ -111,6 +112,7 @@ void objdump::parseMachOOptions(const llvm::opt::InputArgList &InputArgs) { FunctionStarts = InputArgs.hasArg(OBJDUMP_function_starts); LinkOptHints = InputArgs.hasArg(OBJDUMP_link_opt_hints); InfoPlist = InputArgs.hasArg(OBJDUMP_info_plist); + DyldInfo = InputArgs.hasArg(OBJDUMP_dyld_info); DylibsUsed = InputArgs.hasArg(OBJDUMP_dylibs_used); DylibId = InputArgs.hasArg(OBJDUMP_dylib_id); Verbose = !InputArgs.hasArg(OBJDUMP_non_verbose); @@ -1182,6 +1184,11 @@ static void PrintLinkOptHints(MachOObjectFile *O) { } } +static void PrintDyldInfo(MachOObjectFile *O) { + outs() << "dyld information:\n"; + outs() << "[not yet implemented].\n"; +} + static void PrintDylibs(MachOObjectFile *O, bool JustId) { unsigned Index = 0; for (const auto &Load : O->load_commands()) { @@ -1900,8 +1907,8 @@ static void ProcessMachO(StringRef Name, MachOObjectFile *MachOOF, // UniversalHeaders or ArchiveHeaders. 
if (Disassemble || Relocations || PrivateHeaders || ExportsTrie || Rebase || Bind || SymbolTable || LazyBind || WeakBind || IndirectSymbols || - DataInCode || FunctionStarts || LinkOptHints || DylibsUsed || DylibId || - Rpaths || ObjcMetaData || (!FilterSections.empty())) { + DataInCode || FunctionStarts || LinkOptHints || DyldInfo || DylibsUsed || + DylibId || Rpaths || ObjcMetaData || (!FilterSections.empty())) { if (LeadingHeaders) { outs() << Name; if (!ArchiveMemberName.empty()) @@ -1970,6 +1977,8 @@ static void ProcessMachO(StringRef Name, MachOObjectFile *MachOOF, DumpSectionContents(FileName, MachOOF, Verbose); if (InfoPlist) DumpInfoPlistSectionContents(FileName, MachOOF); + if (DyldInfo) + PrintDyldInfo(MachOOF); if (DylibsUsed) PrintDylibs(MachOOF, false); if (DylibId) diff --git a/llvm/tools/llvm-objdump/MachODump.h b/llvm/tools/llvm-objdump/MachODump.h index 7568062bd6b0e..12783e15b425c 100644 --- a/llvm/tools/llvm-objdump/MachODump.h +++ b/llvm/tools/llvm-objdump/MachODump.h @@ -36,6 +36,7 @@ void parseMachOOptions(const llvm::opt::InputArgList &InputArgs); extern bool Bind; extern bool DataInCode; extern std::string DisSymName; +extern bool DyldInfo; extern bool DylibId; extern bool DylibsUsed; extern bool ExportsTrie; diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td index 9f27a6cdf163f..5b35ada6e124a 100644 --- a/llvm/tools/llvm-objdump/ObjdumpOpts.td +++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td @@ -296,6 +296,12 @@ def info_plist : Flag<["--"], "info-plist">, "Mach-O objects (requires --macho)">, Group; +def dyld_info : Flag<["--"], "dyld_info">, + HelpText<"Print bind and rebase information used by dyld to resolve " + "external references in a final linked binary " + "(requires --macho)">, + Group; + def dylibs_used : Flag<["--"], "dylibs-used">, HelpText<"Print the shared libraries used for linked " "Mach-O files (requires --macho)">, diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td 
b/llvm/tools/llvm-objdump/OtoolOpts.td index 61ea701ed75dd..e8bef284c0e91 100644 --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -47,7 +47,6 @@ def X : Flag<["-"], "X">, HelpText<"omit leading addresses or headers">; // -addr_slide=arg // -function_offsets - // Obsolete and unsupported: def grp_obsolete : OptionGroup<"kind">, HelpText<"Obsolete and unsupported flags">; diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 4cb226b795255..1b9da92eb3b3b 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2760,11 +2760,11 @@ int main(int argc, char **argv) { !DynamicRelocations && !FileHeaders && !PrivateHeaders && !RawClangAST && !Relocations && !SectionHeaders && !SectionContents && !SymbolTable && !DynamicSymbolTable && !UnwindInfo && !FaultMapSection && - !(MachOOpt && - (Bind || DataInCode || DylibId || DylibsUsed || ExportsTrie || - FirstPrivateHeader || FunctionStarts || IndirectSymbols || InfoPlist || - LazyBind || LinkOptHints || ObjcMetaData || Rebase || Rpaths || - UniversalHeaders || WeakBind || !FilterSections.empty()))) { + !(MachOOpt && (Bind || DataInCode || DyldInfo || DylibId || DylibsUsed || + ExportsTrie || FirstPrivateHeader || FunctionStarts || + IndirectSymbols || InfoPlist || LazyBind || LinkOptHints || + ObjcMetaData || Rebase || Rpaths || UniversalHeaders || + WeakBind || !FilterSections.empty()))) { T->printHelp(ToolName); return 2; } From a3bfb01d94cc486280f2f13f5e86f4070bf3b450 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 10 Nov 2021 16:25:26 -0800 Subject: [PATCH 531/748] Add support for chained fixup load commands to MachOObjectFile This is part of a series of patches to upstream support for Mach-O chained fixups. This patch adds support for parsing the chained fixup load command and parsing the chained fixups header. 
It also puts into place the abstract interface that will be used to iterate over the fixups. Differential Revision: https://reviews.llvm.org/D113630 --- llvm/include/llvm/BinaryFormat/MachO.h | 26 ++- llvm/include/llvm/Object/MachO.h | 126 +++++++++++++ llvm/lib/Object/MachOObjectFile.cpp | 161 +++++++++++++++- .../Object/AArch64/chained-fixups-header.test | 11 ++ .../Object/Inputs/MachO/chained-fixups.yaml | 173 ++++++++++++++++++ .../tools/llvm-objdump/MachO/dyld_info.test | 3 +- llvm/tools/llvm-objdump/MachODump.cpp | 15 +- 7 files changed, 507 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Object/AArch64/chained-fixups-header.test create mode 100644 llvm/test/Object/Inputs/MachO/chained-fixups.yaml diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h index ce3a5c46e0d13..9850ba9c4d1cb 100644 --- a/llvm/include/llvm/BinaryFormat/MachO.h +++ b/llvm/include/llvm/BinaryFormat/MachO.h @@ -255,7 +255,8 @@ enum BindType { enum BindSpecialDylib { BIND_SPECIAL_DYLIB_SELF = 0, BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE = -1, - BIND_SPECIAL_DYLIB_FLAT_LOOKUP = -2 + BIND_SPECIAL_DYLIB_FLAT_LOOKUP = -2, + BIND_SPECIAL_DYLIB_WEAK_LOOKUP = -3 }; enum { @@ -1001,6 +1002,19 @@ struct nlist_64 { uint64_t n_value; }; +/// Structs for dyld chained fixups. +/// dyld_chained_fixups_header is the data pointed to by LC_DYLD_CHAINED_FIXUPS +/// load command. +struct dyld_chained_fixups_header { + uint32_t fixups_version; ///< 0 + uint32_t starts_offset; ///< Offset of dyld_chained_starts_in_image. + uint32_t imports_offset; ///< Offset of imports table in chain_data. + uint32_t symbols_offset; ///< Offset of symbol strings in chain_data. + uint32_t imports_count; ///< Number of imported symbol names. 
+ uint32_t imports_format; ///< DYLD_CHAINED_IMPORT* + uint32_t symbols_format; ///< 0 => uncompressed, 1 => zlib compressed +}; + // Byte order swapping functions for MachO structs inline void swapStruct(fat_header &mh) { @@ -2008,6 +2022,16 @@ union alignas(4) macho_load_command { }; LLVM_PACKED_END +inline void swapStruct(dyld_chained_fixups_header &C) { + sys::swapByteOrder(C.fixups_version); + sys::swapByteOrder(C.starts_offset); + sys::swapByteOrder(C.imports_offset); + sys::swapByteOrder(C.symbols_offset); + sys::swapByteOrder(C.imports_count); + sys::swapByteOrder(C.imports_format); + sys::swapByteOrder(C.symbols_format); +} + /* code signing attributes of a process */ enum CodeSignAttrs { diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 840faa7004289..3350e8215ff9f 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -260,6 +260,126 @@ class MachOBindEntry { }; using bind_iterator = content_iterator; +/// ChainedFixupTarget holds all the information about an external symbol +/// necessary to bind this binary to that symbol. These values are referenced +/// indirectly by chained fixup binds. This structure captures values from all +/// import and symbol formats. +/// +/// Be aware there are two notions of weak here: +/// WeakImport == true +/// The associated bind may be set to 0 if this symbol is missing from its +/// parent library. This is called a "weak import." +/// LibOrdinal == BIND_SPECIAL_DYLIB_WEAK_LOOKUP +/// This symbol may be coalesced with other libraries vending the same +/// symbol. E.g., C++'s "operator new". This is called a "weak bind." 
+struct ChainedFixupTarget { +public: + ChainedFixupTarget(int LibOrdinal, StringRef Symbol, uint64_t Addend, + bool WeakImport) + : LibOrdinal(LibOrdinal), SymbolName(Symbol), Addend(Addend), + WeakImport(WeakImport) {} + + int libOrdinal() { return LibOrdinal; } + StringRef symbolName() { return SymbolName; } + uint64_t addend() { return Addend; } + bool weakImport() { return WeakImport; } + bool weakBind() { + return LibOrdinal == MachO::BIND_SPECIAL_DYLIB_WEAK_LOOKUP; + } + +private: + int LibOrdinal; + StringRef SymbolName; + uint64_t Addend; + bool WeakImport; +}; + +/// MachOAbstractFixupEntry is an abstract class representing a fixup in a +/// MH_DYLDLINK file. Fixups generally represent rebases and binds. Binds also +/// subdivide into additional subtypes (weak, lazy, reexport). +/// +/// The two concrete subclasses of MachOAbstractFixupEntry are: +/// +/// MachORebaseBindEntry - for dyld opcode-based tables, including threaded- +/// rebase, where rebases are mixed in with other +/// bind opcodes. +/// MachOChainedFixupEntry - for pointer chains embedded in data pages. +class MachOAbstractFixupEntry { +public: + MachOAbstractFixupEntry(Error *Err, const MachOObjectFile *O); + + int32_t segmentIndex() const; + uint64_t segmentOffset() const; + uint64_t segmentAddress() const; + StringRef segmentName() const; + StringRef sectionName() const; + StringRef typeName() const; + StringRef symbolName() const; + uint32_t flags() const; + int64_t addend() const; + int ordinal() const; + + /// \return the location of this fixup as a VM Address. For the VM + /// Address this fixup is pointing to, use pointerValue(). + uint64_t address() const; + + /// \return the VM Address pointed to by this fixup. Use + /// pointerValue() to compare against other VM Addresses, such as + /// section addresses or segment vmaddrs. + uint64_t pointerValue() const { return PointerValue; } + + /// \return the raw "on-disk" representation of the fixup. 
For + /// Threaded rebases and Chained pointers these values are generally + /// encoded into various different pointer formats. This value is + /// exposed in API for tools that want to display and annotate the + /// raw bits. + uint64_t rawValue() const { return RawValue; } + + void moveNext(); + +protected: + Error *E; + const MachOObjectFile *O; + uint64_t SegmentOffset = 0; + int32_t SegmentIndex = -1; + StringRef SymbolName; + int32_t Ordinal = 0; + uint32_t Flags = 0; + int64_t Addend = 0; + uint64_t PointerValue = 0; + uint64_t RawValue = 0; + bool Done = false; + + void moveToFirst(); + void moveToEnd(); + + /// \return the vm address of the start of __TEXT segment. + uint64_t textAddress() const { return TextAddress; } + +private: + uint64_t TextAddress; +}; + +class MachOChainedFixupEntry : public MachOAbstractFixupEntry { +public: + enum class FixupKind { All, Bind, WeakBind, Rebase }; + + MachOChainedFixupEntry(Error *Err, const MachOObjectFile *O, FixupKind Kind, + bool Parse); + + bool operator==(const MachOChainedFixupEntry &) const; + + void moveNext(); + void moveToFirst(); + void moveToEnd(); + +private: + std::vector FixupTargets; + uint32_t FixupIndex = 0; + FixupKind Kind; +}; +using fixup_iterator = content_iterator; + class MachOObjectFile : public ObjectFile { public: struct LoadCommandInfo { @@ -402,6 +522,10 @@ class MachOObjectFile : public ObjectFile { /// For use iterating over all bind table entries. iterator_range bindTable(Error &Err); + /// For iterating over all chained fixups. + iterator_range + fixupTable(Error &Err, MachOChainedFixupEntry::FixupKind Kind); + /// For use iterating over all lazy bind table entries. 
iterator_range lazyBindTable(Error &Err); @@ -562,6 +686,7 @@ class MachOObjectFile : public ObjectFile { ArrayRef getDyldInfoBindOpcodes() const; ArrayRef getDyldInfoWeakBindOpcodes() const; ArrayRef getDyldInfoLazyBindOpcodes() const; + Expected> getDyldChainedFixupTargets() const; ArrayRef getDyldInfoExportsTrie() const; SmallVector getFunctionStarts() const; ArrayRef getUuid() const; @@ -691,6 +816,7 @@ class MachOObjectFile : public ObjectFile { const char *LinkOptHintsLoadCmd = nullptr; const char *DyldInfoLoadCmd = nullptr; const char *FuncStartsLoadCmd = nullptr; + const char *DyldChainedFixupsLoadCmd = nullptr; const char *UuidLoadCmd = nullptr; bool HasPageZeroSegment = false; }; diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 85f3824aedd84..051055173d3f1 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1380,6 +1380,11 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, if ((Err = checkDyldInfoCommand(*this, Load, I, &DyldInfoLoadCmd, "LC_DYLD_INFO_ONLY", Elements))) return; + } else if (Load.C.cmd == MachO::LC_DYLD_CHAINED_FIXUPS) { + if ((Err = checkLinkeditDataCommand( + *this, Load, I, &DyldChainedFixupsLoadCmd, + "LC_DYLD_CHAINED_FIXUPS", Elements, "chained fixups"))) + return; } else if (Load.C.cmd == MachO::LC_UUID) { if (Load.C.cmdsize != sizeof(MachO::uuid_command)) { Err = malformedError("LC_UUID command " + Twine(I) + " has incorrect " @@ -1595,9 +1600,9 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, return; // Note: LC_TWOLEVEL_HINTS is really obsolete and is not supported. 
} else if (Load.C.cmd == MachO::LC_TWOLEVEL_HINTS) { - if ((Err = checkTwoLevelHintsCommand(*this, Load, I, - &TwoLevelHintsLoadCmd, Elements))) - return; + if ((Err = checkTwoLevelHintsCommand(*this, Load, I, + &TwoLevelHintsLoadCmd, Elements))) + return; } else if (Load.C.cmd == MachO::LC_IDENT) { // Note: LC_IDENT is ignored. continue; @@ -3185,6 +3190,106 @@ iterator_range MachOObjectFile::exports(Error &Err) const { return exports(Err, getDyldInfoExportsTrie(), this); } +MachOAbstractFixupEntry::MachOAbstractFixupEntry(Error *E, + const MachOObjectFile *O) + : E(E), O(O) { + // Cache the vmaddress of __TEXT + for (const auto &Command : O->load_commands()) { + if (Command.C.cmd == MachO::LC_SEGMENT) { + MachO::segment_command SLC = O->getSegmentLoadCommand(Command); + if (StringRef(SLC.segname) == StringRef("__TEXT")) { + TextAddress = SLC.vmaddr; + break; + } + } else if (Command.C.cmd == MachO::LC_SEGMENT_64) { + MachO::segment_command_64 SLC_64 = O->getSegment64LoadCommand(Command); + if (StringRef(SLC_64.segname) == StringRef("__TEXT")) { + TextAddress = SLC_64.vmaddr; + break; + } + } + } +} + +int32_t MachOAbstractFixupEntry::segmentIndex() const { return SegmentIndex; } + +uint64_t MachOAbstractFixupEntry::segmentOffset() const { + return SegmentOffset; +} + +uint64_t MachOAbstractFixupEntry::segmentAddress() const { + return O->BindRebaseAddress(SegmentIndex, 0); +} + +StringRef MachOAbstractFixupEntry::segmentName() const { + return O->BindRebaseSegmentName(SegmentIndex); +} + +StringRef MachOAbstractFixupEntry::sectionName() const { + return O->BindRebaseSectionName(SegmentIndex, SegmentOffset); +} + +uint64_t MachOAbstractFixupEntry::address() const { + return O->BindRebaseAddress(SegmentIndex, SegmentOffset); +} + +StringRef MachOAbstractFixupEntry::symbolName() const { return SymbolName; } + +int64_t MachOAbstractFixupEntry::addend() const { return Addend; } + +uint32_t MachOAbstractFixupEntry::flags() const { return Flags; } + +int 
MachOAbstractFixupEntry::ordinal() const { return Ordinal; } + +StringRef MachOAbstractFixupEntry::typeName() const { return "unknown"; } + +void MachOAbstractFixupEntry::moveToFirst() { + SegmentOffset = 0; + SegmentIndex = -1; + Ordinal = 0; + Flags = 0; + Addend = 0; + Done = false; +} + +void MachOAbstractFixupEntry::moveToEnd() { Done = true; } + +MachOChainedFixupEntry::MachOChainedFixupEntry(Error *E, + const MachOObjectFile *O, + FixupKind Kind, bool Parse) + : MachOAbstractFixupEntry(E, O), Kind(Kind) { + ErrorAsOutParameter e(E); + if (Parse) { + if (auto FixupTargetsOrErr = O->getDyldChainedFixupTargets()) + FixupTargets = *FixupTargetsOrErr; + else { + *E = FixupTargetsOrErr.takeError(); + return; + } + } +} + +void MachOChainedFixupEntry::moveToFirst() { + MachOAbstractFixupEntry::moveToFirst(); + FixupIndex = 0; + moveNext(); +} + +void MachOChainedFixupEntry::moveToEnd() { + MachOAbstractFixupEntry::moveToEnd(); +} + +void MachOChainedFixupEntry::moveNext() { Done = true; } + +bool MachOChainedFixupEntry::operator==( + const MachOChainedFixupEntry &Other) const { + if (Done == Other.Done) + return true; + if ((FixupIndex == Other.FixupIndex)) + return true; + return false; +} + MachORebaseEntry::MachORebaseEntry(Error *E, const MachOObjectFile *O, ArrayRef Bytes, bool is64Bit) : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), @@ -4193,6 +4298,18 @@ iterator_range MachOObjectFile::weakBindTable(Error &Err) { MachOBindEntry::Kind::Weak); } +iterator_range +MachOObjectFile::fixupTable(Error &Err, + MachOChainedFixupEntry::FixupKind Kind) { + MachOChainedFixupEntry Start(&Err, this, Kind, true); + Start.moveToFirst(); + + MachOChainedFixupEntry Finish(&Err, this, Kind, false); + Finish.moveToEnd(); + + return make_range(fixup_iterator(Start), fixup_iterator(Finish)); +} + MachOObjectFile::load_command_iterator MachOObjectFile::begin_load_commands() const { return LoadCommands.begin(); @@ -4648,6 +4765,44 @@ ArrayRef 
MachOObjectFile::getDyldInfoLazyBindOpcodes() const { return makeArrayRef(Ptr, DyldInfo.lazy_bind_size); } +Expected> +MachOObjectFile::getDyldChainedFixupTargets() const { + // Load the dyld chained fixups load command. + if (!DyldChainedFixupsLoadCmd) + return std::vector(); + auto DyldChainedFixupsOrErr = getStructOrErr( + *this, DyldChainedFixupsLoadCmd); + if (!DyldChainedFixupsOrErr) + return DyldChainedFixupsOrErr.takeError(); + MachO::linkedit_data_command DyldChainedFixups = DyldChainedFixupsOrErr.get(); + + // If the load command is present but the data offset has been zeroed out, + // as is the case for dylib stubs, return an empty list of targets. + uint64_t CFHeaderOffset = DyldChainedFixups.dataoff; + std::vector Targets; + if (CFHeaderOffset == 0) + return Targets; + + // Load the dyld chained fixups header. + const char *CFHeaderPtr = getPtr(*this, CFHeaderOffset); + auto CFHeaderOrErr = + getStructOrErr(*this, CFHeaderPtr); + if (!CFHeaderOrErr) + return CFHeaderOrErr.takeError(); + MachO::dyld_chained_fixups_header CFHeader = CFHeaderOrErr.get(); + + // Reject unknown chained fixup formats. 
+ if (CFHeader.fixups_version != 0) + return malformedError(Twine("bad chained fixups: unknown version: ") + + Twine(CFHeader.fixups_version)); + if (CFHeader.imports_format < 1 || CFHeader.imports_format > 3) + return malformedError( + Twine("bad chained fixups: unknown imports format: ") + + Twine(CFHeader.imports_format)); + + return Targets; +} + ArrayRef MachOObjectFile::getDyldInfoExportsTrie() const { if (!DyldInfoLoadCmd) return None; diff --git a/llvm/test/Object/AArch64/chained-fixups-header.test b/llvm/test/Object/AArch64/chained-fixups-header.test new file mode 100644 index 0000000000000..b923ac8a8e7d3 --- /dev/null +++ b/llvm/test/Object/AArch64/chained-fixups-header.test @@ -0,0 +1,11 @@ +RUN: cat %p/../Inputs/MachO/chained-fixups.yaml \ +RUN: | sed 's/__LINKEDIT: 00000000/__LINKEDIT: AB000000/' \ +RUN: | yaml2obj | not llvm-objdump --macho --dyld_info - 2>&1 \ +RUN: | FileCheck %s --check-prefix=HEADER1 +HEADER1: truncated or malformed object (bad chained fixups: unknown version: 171) + +RUN: cat %p/../Inputs/MachO/chained-fixups.yaml \ +RUN: | sed 's/1000000010000000/1000000AB0000000/' \ +RUN: | yaml2obj | not llvm-objdump --macho --dyld_info - 2>&1 \ +RUN: | FileCheck %s --check-prefix=HEADER2 +HEADER2: truncated or malformed object (bad chained fixups: unknown imports format: 171) diff --git a/llvm/test/Object/Inputs/MachO/chained-fixups.yaml b/llvm/test/Object/Inputs/MachO/chained-fixups.yaml new file mode 100644 index 0000000000000..1079b6f8f816e --- /dev/null +++ b/llvm/test/Object/Inputs/MachO/chained-fixups.yaml @@ -0,0 +1,173 @@ +# This file was produced using: +# echo "int ext;" > a.c +# xcrun --sdk iphoneos clang -target arm64-apple-ios15.1 -o a.o a.c -c +# xcrun --sdk iphoneos clang -target arm64-apple-ios15.1 -dynamiclib a.o -o liba.dylib -install_name @executable_path/liba.dylib +# echo "extern int ext;" > b.c +# echo "int padding;" >> b.c +# echo "int *p = &ext + 4;" >> b.c +# xcrun --sdk iphoneos clang -target arm64-apple-ios15.1 -o 
b.o b.c -c +# xcrun --sdk iphoneos clang -target arm64-apple-ios15.1 -dynamiclib b.o -o libfixups.dylib -install_name @executable_path/libfixups.dylib -L. -la +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x6 + ncmds: 16 + sizeofcmds: 816 + flags: 0x100085 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + vmaddr: 0 + vmsize: 16384 + fileoff: 0 + filesize: 16384 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x4000 + size: 0 + offset: 0x4000 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '' + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __DATA + vmaddr: 16384 + vmsize: 16384 + fileoff: 16384 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 0 + Sections: + - sectname: __data + segname: __DATA + addr: 0x4000 + size: 8 + offset: 0x4000 + align: 3 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '0000001000000080' + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 32768 + vmsize: 16384 + fileoff: 32768 + filesize: 160 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 64 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: '@executable_path/libfixups.dylib' + ZeroPadBytes: 8 + - cmd: LC_DYLD_CHAINED_FIXUPS + cmdsize: 16 + dataoff: 32768 + datasize: 88 + - cmd: LC_DYLD_EXPORTS_TRIE + cmdsize: 16 + dataoff: 32856 + datasize: 16 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 32880 + nsyms: 2 + stroff: 32912 + strsize: 16 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 + nundefsym: 1 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + 
locreloff: 0 + nlocrel: 0 + - cmd: LC_UUID + cmdsize: 24 + uuid: 56F7BCE0-C1A7-38E3-A90D-742D8E3D5FA9 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 2 + minos: 983296 + sdk: 983552 + ntools: 1 + Tools: + - tool: 3 + version: 46596096 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 + - cmd: LC_ENCRYPTION_INFO_64 + cmdsize: 24 + cryptoff: 16384 + cryptsize: 0 + cryptid: 0 + pad: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 0 + compatibility_version: 0 + Content: '@executable_path/liba.dylib' + ZeroPadBytes: 5 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 85917696 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 + - cmd: LC_FUNCTION_STARTS + cmdsize: 16 + dataoff: 32872 + datasize: 8 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 32880 + datasize: 0 +__LINKEDIT: 0000000020000000480000004C000000010000000100000000000000000000000300000000000000100000000000000018000000004006000040000000000000000000000100000001020000005F6578740000000000000000015F700006040080800100000000000000000000000000020000000F02000000400000000000000500000001000001000000000000000020005F70005F65787400000000000000 +... diff --git a/llvm/test/tools/llvm-objdump/MachO/dyld_info.test b/llvm/test/tools/llvm-objdump/MachO/dyld_info.test index 35642acb060b1..2af76077cd757 100644 --- a/llvm/test/tools/llvm-objdump/MachO/dyld_info.test +++ b/llvm/test/tools/llvm-objdump/MachO/dyld_info.test @@ -3,5 +3,4 @@ RUN: | FileCheck %s --match-full-lines --strict-whitespace \ RUN: --implicit-check-not={{.}} CHECK:{{.*}}bind.macho-x86_64: -CHECK-NEXT:dyld information: -CHECK-NEXT:[not yet implemented]. 
+CHECK:dyld information: diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index f55e6314c0d26..88731e828598b 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -1184,9 +1184,20 @@ static void PrintLinkOptHints(MachOObjectFile *O) { } } +static void printMachOChainedFixups(object::MachOObjectFile *Obj, + MachOChainedFixupEntry::FixupKind Type) { + Error Err = Error::success(); + for (const object::MachOChainedFixupEntry &Entry : + Obj->fixupTable(Err, Type)) { + (void)Entry; + } + if (Err) + reportError(std::move(Err), Obj->getFileName()); +} + static void PrintDyldInfo(MachOObjectFile *O) { - outs() << "dyld information:\n"; - outs() << "[not yet implemented].\n"; + outs() << "dyld information:" << '\n'; + printMachOChainedFixups(O, MachOChainedFixupEntry::FixupKind::Bind); } static void PrintDylibs(MachOObjectFile *O, bool JustId) { From b1fc966d2e4176e640cda52c7148f565e93bedd0 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Tue, 22 Feb 2022 20:14:33 +0100 Subject: [PATCH 532/748] [Driver] Support Solaris/amd64 GetTls This is the driver part of D91605 , a workaround to allow direct calls to `__tls_get_addr` on Solaris/amd64. Tested on `amd64-pc-solaris2.11` and `sparcv9-sun-solaris2.11`. 
Differential Revision: https://reviews.llvm.org/D119829 --- clang/lib/Driver/ToolChains/Solaris.cpp | 14 ++++++- clang/test/Driver/solaris-ld-sanitizer.c | 51 ++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 clang/test/Driver/solaris-ld-sanitizer.c diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 24f18b92dd661..2d40598bfc1c0 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -14,6 +14,8 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" +#include "clang/Driver/SanitizerArgs.h" +#include "clang/Driver/ToolChain.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -145,8 +147,18 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-lgcc"); CmdArgs.push_back("-lm"); } - if (NeedsSanitizerDeps) + if (NeedsSanitizerDeps) { linkSanitizerRuntimeDeps(getToolChain(), CmdArgs); + + // Work around Solaris/amd64 ld bug when calling __tls_get_addr directly. + // However, ld -z relax=transtls is available since Solaris 11.2, but not + // in Illumos. + const SanitizerArgs &SA = getToolChain().getSanitizerArgs(Args); + if (getToolChain().getTriple().getArch() == llvm::Triple::x86_64 && + (SA.needsAsanRt() || SA.needsStatsRt() || + (SA.needsUbsanRt() && !SA.requiresMinimalRuntime()))) + CmdArgs.push_back("-zrelax=transtls"); + } } if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { diff --git a/clang/test/Driver/solaris-ld-sanitizer.c b/clang/test/Driver/solaris-ld-sanitizer.c new file mode 100644 index 0000000000000..caf01919d4671 --- /dev/null +++ b/clang/test/Driver/solaris-ld-sanitizer.c @@ -0,0 +1,51 @@ +/// General tests that the ld -z relax=transtls workaround is only applied +/// on Solaris/amd64. 
Note that we use sysroot to make these tests +/// independent of the host system. + +/// Check sparc-sun-solaris2.11, 32bit +// RUN: %clang --target=sparc-sun-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_sparc_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-SPARC32 %s +// CHECK-LD-SPARC32-NOT: -zrelax=transtls + +/// Check sparc-sun-solaris2.11, 32bit +// RUN: %clang -fsanitize=undefined --target=sparc-sun-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_sparc_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-SPARC32 %s +// CHECK-LD-SPARC32-NOT: -zrelax=transtls + +/// Check sparc-sun-solaris2.11, 64bit +// RUN: %clang -m64 --target=sparc-sun-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_sparc_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-SPARC64 %s +// CHECK-LD-SPARC64-NOT: -zrelax=transtls + +/// Check sparc-sun-solaris2.11, 64bit +// RUN: %clang -m64 -fsanitize=undefined --target=sparc-sun-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_sparc_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-SPARC64 %s +// CHECK-LD-SPARC64-NOT: -zrelax=transtls + +/// Check i386-pc-solaris2.11, 32bit +// RUN: %clang --target=i386-pc-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_x86_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-X32 %s +// CHECK-LD-X32-NOT: -zrelax=transtls + +/// Check i386-pc-solaris2.11, 32bit +// RUN: %clang -fsanitize=undefined --target=i386-pc-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_x86_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-X32 %s +// CHECK-LD-X32-NOT: -zrelax=transtls + +/// Check i386-pc-solaris2.11, 64bit +// RUN: %clang -m64 --target=i386-pc-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_x86_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-X64 %s +// 
CHECK-LD-X64-NOT: -zrelax=transtls + +/// Check i386-pc-solaris2.11, 64bit +// RUN: %clang -m64 -fsanitize=undefined --target=i386-pc-solaris2.11 %s -### 2>&1 \ +// RUN: --gcc-toolchain="" --sysroot=%S/Inputs/solaris_x86_tree \ +// RUN: | FileCheck --check-prefix=CHECK-LD-X64-UBSAN %s +// CHECK-LD-X64-UBSAN: -zrelax=transtls From cb8e9bea95b39e3aa7eff4091b4f721e22600878 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Tue, 22 Feb 2022 20:18:22 +0100 Subject: [PATCH 533/748] [sanitizer_common] Use GetStaticTlsBoundary on Solaris 11.4 This is a restricted alternative to D91605 which only works on Solaris 11.4 SRU 10+, but would break the build on Solaris 11.3 and Illumos which lack `dlpi_tls_modid`. Apart from that, the patch is trivial. One caveat is that the `sanitizer_common` and `asan` tests need to be linked explicitly with `ld -z relax=transtls` on Solaris/amd64 since the archives with calls to `__tls_get_addr` are linked in directly. Tested on `amd64-pc-solaris2.11`, `sparcv9-sun-solaris2.11`, and `x86_64-pc-linux-gnu`. Differential Revision: https://reviews.llvm.org/D120048 --- compiler-rt/cmake/config-ix.cmake | 13 +++++++++++++ compiler-rt/lib/asan/tests/CMakeLists.txt | 11 +++++++---- .../sanitizer_common/sanitizer_linux_libcdep.cpp | 10 +++------- .../lib/sanitizer_common/tests/CMakeLists.txt | 4 +++- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 4299a0589a7b7..3ad2d6a932dba 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -226,6 +226,19 @@ function(get_target_flags_for_arch arch out_var) endif() endfunction() +# Returns a list of architecture specific target ldflags in @out_var list. 
+function(get_target_link_flags_for_arch arch out_var) + list(FIND COMPILER_RT_SUPPORTED_ARCH ${arch} ARCH_INDEX) + if(ARCH_INDEX EQUAL -1) + message(FATAL_ERROR "Unsupported architecture: ${arch}") + else() + # Workaround for direct calls to __tls_get_addr on Solaris/amd64. + if(OS_NAME MATCHES "SunOS" AND ${arch} MATCHES x86_64) + set(${out_var} "-Wl,-z,relax=transtls" PARENT_SCOPE) + endif() + endif() +endfunction() + # Returns a compiler and CFLAGS that should be used to run tests for the # specific architecture. When cross-compiling, this is controled via # COMPILER_RT_TEST_COMPILER and COMPILER_RT_TEST_COMPILER_CFLAGS. diff --git a/compiler-rt/lib/asan/tests/CMakeLists.txt b/compiler-rt/lib/asan/tests/CMakeLists.txt index 95a324766ae7c..047a3fa282c40 100644 --- a/compiler-rt/lib/asan/tests/CMakeLists.txt +++ b/compiler-rt/lib/asan/tests/CMakeLists.txt @@ -179,11 +179,14 @@ function(add_asan_tests arch test_runtime) set("${test_objects}" "${${test_objects}}" PARENT_SCOPE) endfunction() + set(TARGET_LINK_FLAGS) + get_target_link_flags_for_arch(${arch} TARGET_LINK_FLAGS) + set(ASAN_INST_TEST_OBJECTS) generate_asan_tests(ASAN_INST_TEST_OBJECTS AsanUnitTests "Asan-${arch}${TEST_KIND}-Test" SUBDIR "${CONFIG_NAME}" - LINK_FLAGS ${ASAN_UNITTEST_INSTRUMENTED_LINK_FLAGS} + LINK_FLAGS ${ASAN_UNITTEST_INSTRUMENTED_LINK_FLAGS} ${TARGET_LINK_FLAGS} SOURCES ${ASAN_INST_TEST_SOURCES} CFLAGS ${ASAN_UNITTEST_INSTRUMENTED_CFLAGS} ${TEST_CFLAGS}) @@ -209,7 +212,7 @@ function(add_asan_tests arch test_runtime) SUBDIR "${CONFIG_NAME_DYNAMIC}" OBJECTS ${ASAN_INST_TEST_OBJECTS} DEPS asan ${ASAN_INST_TEST_OBJECTS} - LINK_FLAGS ${ASAN_DYNAMIC_UNITTEST_INSTRUMENTED_LINK_FLAGS} + LINK_FLAGS ${ASAN_DYNAMIC_UNITTEST_INSTRUMENTED_LINK_FLAGS} ${TARGET_LINK_FLAGS} ) endif() endif() @@ -220,7 +223,7 @@ function(add_asan_tests arch test_runtime) AsanUnitTests "Asan-${arch}${TEST_KIND}-Noinst-Test" SUBDIR "${CONFIG_NAME}" CFLAGS ${ASAN_UNITTEST_COMMON_CFLAGS} - LINK_FLAGS 
${ASAN_UNITTEST_NOINST_LINK_FLAGS} + LINK_FLAGS ${ASAN_UNITTEST_NOINST_LINK_FLAGS} ${TARGET_LINK_FLAGS} SOURCES ${ASAN_NOINST_TEST_SOURCES} RUNTIME ${test_runtime}) @@ -230,7 +233,7 @@ function(add_asan_tests arch test_runtime) SUBDIR "${CONFIG_NAME}" CFLAGS ${ASAN_UNITTEST_INSTRUMENTED_CFLAGS} SOURCES ${ASAN_BENCHMARKS_SOURCES} - LINK_FLAGS ${ASAN_UNITTEST_INSTRUMENTED_LINK_FLAGS}) + LINK_FLAGS ${ASAN_UNITTEST_INSTRUMENTED_LINK_FLAGS} ${TARGET_LINK_FLAGS}) endfunction() if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index 4ccd2e8281080..25ad825f568bd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -216,7 +216,8 @@ void InitTlsSize() { } // On glibc x86_64, ThreadDescriptorSize() needs to be precise due to the usage // of g_tls_size. On other targets, ThreadDescriptorSize() is only used by lsan // to get the pointer to thread-specific data keys in the thread control block. -#if (SANITIZER_FREEBSD || SANITIZER_LINUX) && !SANITIZER_ANDROID && !SANITIZER_GO +#if (SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_SOLARIS) && \ + !SANITIZER_ANDROID && !SANITIZER_GO // sizeof(struct pthread) from glibc. 
static atomic_uintptr_t thread_descriptor_size; @@ -476,7 +477,7 @@ static void GetTls(uptr *addr, uptr *size) { const uptr pre_tcb_size = TlsPreTcbSize(); *addr = tp - pre_tcb_size; *size = g_tls_size + pre_tcb_size; -#elif SANITIZER_FREEBSD || SANITIZER_LINUX +#elif SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_SOLARIS uptr align; GetStaticTlsBoundary(addr, size, &align); #if defined(__x86_64__) || defined(__i386__) || defined(__s390__) || \ @@ -537,11 +538,6 @@ static void GetTls(uptr *addr, uptr *size) { *addr = (uptr)tcb->tcb_dtv[1]; } } -#elif SANITIZER_SOLARIS - // FIXME - *addr = 0; - *size = 0; -#else #error "Unknown OS" #endif } diff --git a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt index 929ef15bd846d..f536df885abcf 100644 --- a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt +++ b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt @@ -154,6 +154,8 @@ macro(add_sanitizer_tests_for_arch arch) list(APPEND extra_flags "-D_FILE_OFFSET_BITS=64") endif() get_sanitizer_common_lib_for_arch(${arch} SANITIZER_COMMON_LIB) + set(TARGET_LINK_FLAGS) + get_target_link_flags_for_arch(${arch} TARGET_LINK_FLAGS) set(SANITIZER_TEST_OBJECTS) generate_compiler_rt_tests(SANITIZER_TEST_OBJECTS SanitizerUnitTests @@ -163,7 +165,7 @@ macro(add_sanitizer_tests_for_arch arch) COMPILE_DEPS ${SANITIZER_TEST_HEADERS} DEPS gtest CFLAGS ${SANITIZER_TEST_CFLAGS_COMMON} ${extra_flags} - LINK_FLAGS ${SANITIZER_TEST_LINK_FLAGS_COMMON} ${extra_flags}) + LINK_FLAGS ${SANITIZER_TEST_LINK_FLAGS_COMMON} ${TARGET_LINK_FLAGS} ${extra_flags}) if("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" AND "${arch}" STREQUAL "x86_64") # Test that the libc-independent part of sanitizer_common is indeed From 7fb02d2752c06a9000edd969a11eae1e08864b77 Mon Sep 17 00:00:00 2001 From: Zarko Todorovski Date: Tue, 22 Feb 2022 14:34:10 -0500 Subject: [PATCH 534/748] [libc++][AIX] Add AIX error message as expected output AIX's libc generates 
"Error -1 occurred" instead of the "Unknown Error" expected by these test cases. Add this as expected output for AIX only. Reviewed By: daltenty, #powerpc, #libc, zibi, Quuxplusone Differential Revision: https://reviews.llvm.org/D119982 --- .../syserr.errcat.objects/generic_category.pass.cpp | 5 ++++- .../syserr.errcat.objects/system_category.pass.cpp | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp index e01df7f1f1967..8114bcd841ab5 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp @@ -8,7 +8,6 @@ // XFAIL: suse-linux-enterprise-server-11 // XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12}} -// XFAIL: LIBCXX-AIX-FIXME // @@ -28,7 +27,11 @@ void test_message_for_bad_value() { const std::error_category& e_cat1 = std::generic_category(); const std::string msg = e_cat1.message(-1); // Exact message format varies by platform. 
+ #if defined(_AIX) + LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0); + #else LIBCPP_ASSERT(msg.rfind("Unknown error", 0) == 0); + #endif assert(errno == E2BIG); } diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp index 6eeb3175a9e7a..5597900dcd58b 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp @@ -14,7 +14,6 @@ // XFAIL: suse-linux-enterprise-server-11 // XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12}} -// XFAIL: LIBCXX-AIX-FIXME #include #include @@ -28,7 +27,11 @@ void test_message_for_bad_value() { const std::error_category& e_cat1 = std::system_category(); const std::string msg = e_cat1.message(-1); // Exact message format varies by platform. + #if defined(_AIX) + LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0); + #else LIBCPP_ASSERT(msg.rfind("Unknown error", 0) == 0); + #endif assert(errno == E2BIG); } From 30053c1445e2caa2c5f096c51667ec301eb7fbf5 Mon Sep 17 00:00:00 2001 From: Kai Nacke Date: Tue, 22 Feb 2022 13:54:16 -0500 Subject: [PATCH 535/748] [SystemZ/z/OS] Add va intrinsics for XPLINK Add support for va intrinsics for the XPLINK ABI. Only the extended vararg variant, which uses a pointer to next argument, is supported. The standard variant will build on this. 
Reviewed By: uweigand Differential Revision: https://reviews.llvm.org/D120148 --- .../Target/SystemZ/SystemZISelLowering.cpp | 30 +++++++++++++++- llvm/lib/Target/SystemZ/SystemZISelLowering.h | 2 ++ llvm/test/CodeGen/SystemZ/call-zos-vararg.ll | 34 +++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 7fc3e33309830..3594f76e2c1a8 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -3505,6 +3505,32 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op, SDValue SystemZTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const { + + if (Subtarget.isTargetXPLINK64()) + return lowerVASTART_XPLINK(Op, DAG); + else + return lowerVASTART_ELF(Op, DAG); +} + +SDValue SystemZTargetLowering::lowerVASTART_XPLINK(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SystemZMachineFunctionInfo *FuncInfo = + MF.getInfo(); + + SDLoc DL(Op); + + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + +SDValue SystemZTargetLowering::lowerVASTART_ELF(SDValue Op, + SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SystemZMachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -3548,7 +3574,9 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); - return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL), + uint32_t Sz = + Subtarget.isTargetXPLINK64() ? 
getTargetMachine().getPointerSize(0) : 32; + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(Sz, DL), Align(8), /*isVolatile*/ false, /*AlwaysInline*/ false, /*isTailCall*/ false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 448d3d048e02d..04ed0c76be3eb 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -628,6 +628,8 @@ class SystemZTargetLowering : public TargetLowering { SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVASTART_ELF(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVASTART_XPLINK(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerDYNAMIC_STACKALLOC_ELF(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll b/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll index 2efe27172efcc..2af2c29c1d53f 100644 --- a/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll +++ b/llvm/test/CodeGen/SystemZ/call-zos-vararg.ll @@ -189,6 +189,40 @@ entry: ret i64 %retval } +; Derived from C source: +; #define _VARARG_EXT_ +; #include +; +; long pass(long x, ...) { +; va_list va; +; va_start(va, x); +; long ret = va_arg(va, long); +; va_end(va); +; return ret; +; } +; +; CHECK-LABEL: pass_vararg: +; CHECK: aghi 4, -160 +; CHECK: la 0, 2208(4) +; CHECK: stg 0, 2200(4) +define hidden i64 @pass_vararg(i64 %x, ...) 
{ +entry: + %va = alloca i8*, align 8 + %va1 = bitcast i8** %va to i8* + call void @llvm.va_start(i8* %va1) + %argp.cur = load i8*, i8** %va, align 8 + %argp.next = getelementptr inbounds i8, i8* %argp.cur, i64 8 + store i8* %argp.next, i8** %va, align 8 + %0 = bitcast i8* %argp.cur to i64* + %ret = load i64, i64* %0, align 8 + %va2 = bitcast i8** %va to i8* + call void @llvm.va_end(i8* %va2) + ret i64 %ret +} + +declare void @llvm.va_start(i8*) +declare void @llvm.va_end(i8*) + declare i64 @pass_vararg0(i64 %arg0, i64 %arg1, ...) declare i64 @pass_vararg1(fp128 %arg0, ...) declare i64 @pass_vararg2(i64 %arg0, ...) From 0b302be023388e7cec2daf680a3ea6718c6af53f Mon Sep 17 00:00:00 2001 From: Dmitry Vassiliev Date: Wed, 23 Feb 2022 00:10:05 +0400 Subject: [PATCH 536/748] [Transforms] Pre-commit test cases for CorrelatedValuePropagation to handle both values of select This is a pre-commit of test cases relevant for D119643. CorrelatedValuePropagation should handle inverted select condition, but it does not yet. 
--- .../CorrelatedValuePropagation/basic.ll | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll index aa1d305f63bb6..024ff29d363ec 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll @@ -143,6 +143,67 @@ return: ret void } +; "false" case for CorrelatedValuePropagation +define void @loop1(i32* %x, i32* %y) { +; CHECK-LABEL: @loop1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI:%.*]] = phi i32* [ [[F:%.*]], [[LOOP]] ], [ [[X:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[F]] = tail call i32* @f(i32* [[PHI]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32* [[F]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i32* [[F]], i32* null +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32* [[SEL]], null +; CHECK-NEXT: br i1 [[CMP2]], label [[RETURN:%.*]], label [[LOOP]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %phi = phi i32* [ %sel, %loop ], [ %x, %entry ] + %f = tail call i32* @f(i32* %phi) + %cmp1 = icmp ne i32* %f, %y + %sel = select i1 %cmp1, i32* %f, i32* null + %cmp2 = icmp eq i32* %sel, null + br i1 %cmp2, label %return, label %loop + +return: + ret void +} + +; "true" case for CorrelatedValuePropagation +define void @loop2(i32* %x, i32* %y) { +; CHECK-LABEL: @loop2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +;; CorrelatedValuePropagation should handle inverted select condition, but it does not yet. 
+;; CHECK-NEXT: [[PHI:%.*]] = phi i32* [ [[F:%.*]], [[LOOP]] ], [ [[X:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[F:%.*]] = tail call i32* @f(i32* [[PHI]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32* [[F]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i32* null, i32* [[F]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32* [[SEL]], null +; CHECK-NEXT: br i1 [[CMP2]], label [[RETURN:%.*]], label [[LOOP]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %phi = phi i32* [ %sel, %loop ], [ %x, %entry ] + %f = tail call i32* @f(i32* %phi) + %cmp1 = icmp eq i32* %f, %y + %sel = select i1 %cmp1, i32* null, i32* %f + %cmp2 = icmp eq i32* %sel, null + br i1 %cmp2, label %return, label %loop + +return: + ret void +} + define i32 @switch1(i32 %s) { ; CHECK-LABEL: @switch1( ; CHECK-NEXT: entry: From 90a3b310917031c802bb930626e85f067d53ea5b Mon Sep 17 00:00:00 2001 From: Dmitry Vassiliev Date: Wed, 23 Feb 2022 00:11:20 +0400 Subject: [PATCH 537/748] [Transforms] Enhance CorrelatedValuePropagation to handle both values of select The "Correlated Value Propagation" pass was missing a case when handling select instructions. It was only handling the "false" constant value, while in NVPTX the select may have the condition (and thus the branches) inverted, for example: ``` loop: %phi = phi i32* [ %sel, %loop ], [ %x, %entry ] %f = tail call i32* @f(i32* %phi) %cmp1 = icmp ne i32* %f, %y %sel = select i1 %cmp1, i32* %f, i32* null %cmp2 = icmp eq i32* %sel, null br i1 %cmp2, label %return, label %loop ``` But the select condition can be inverted: ``` %cmp1 = icmp eq i32* %f, %y %sel = select i1 %cmp1, i32* null, i32* %f ``` The fix is to enhance "Correlated Value Propagation" to handle both branches of the select instruction. 
Reviewed By: nikic, lebedev.ri Differential Revision: https://reviews.llvm.org/D119643 --- .../lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 9 +++++++++ llvm/test/Transforms/CorrelatedValuePropagation/basic.ll | 5 ++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index cc90be9cfe895..dc28a1c3605c0 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -245,11 +245,20 @@ static Value *getValueOnEdge(LazyValueInfo *LVI, Value *Incoming, // value can never be that constant. In that case replace the incoming // value with the other value of the select. This often allows us to // remove the select later. + + // The "false" case if (auto *C = dyn_cast(SI->getFalseValue())) if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == LazyValueInfo::False) return SI->getTrueValue(); + // The "true" case, + // similar to the select "false" case, but try the select "true" value + if (auto *C = dyn_cast(SI->getTrueValue())) + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, From, To, CxtI) == + LazyValueInfo::False) + return SI->getFalseValue(); + return nullptr; } diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll index 024ff29d363ec..55ae559eabb6d 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll @@ -179,9 +179,8 @@ define void @loop2(i32* %x, i32* %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -;; CorrelatedValuePropagation should handle inverted select condition, but it does not yet. 
-;; CHECK-NEXT: [[PHI:%.*]] = phi i32* [ [[F:%.*]], [[LOOP]] ], [ [[X:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[F:%.*]] = tail call i32* @f(i32* [[PHI]]) +; CHECK-NEXT: [[PHI:%.*]] = phi i32* [ [[F:%.*]], [[LOOP]] ], [ [[X:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[F]] = tail call i32* @f(i32* [[PHI]]) ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32* [[F]], [[Y:%.*]] ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i32* null, i32* [[F]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32* [[SEL]], null From a23f7c0cb6b42a06bc9707fdf46ce2a90080f61f Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 22 Feb 2022 12:47:29 -0800 Subject: [PATCH 538/748] Remove dead code. --- llvm/include/llvm/Object/MachO.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 3350e8215ff9f..423d05e71093f 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -376,7 +376,6 @@ class MachOChainedFixupEntry : public MachOAbstractFixupEntry { private: std::vector FixupTargets; uint32_t FixupIndex = 0; - FixupKind Kind; }; using fixup_iterator = content_iterator; From efe9fd08e04d66d6f110304688932504119cddbf Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 22 Feb 2022 12:47:39 -0800 Subject: [PATCH 539/748] Disable test on big endian machines. Yaml2obj has problems there. 
--- llvm/test/Object/AArch64/chained-fixups-header.test | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Object/AArch64/chained-fixups-header.test b/llvm/test/Object/AArch64/chained-fixups-header.test index b923ac8a8e7d3..9a97a719aa8f8 100644 --- a/llvm/test/Object/AArch64/chained-fixups-header.test +++ b/llvm/test/Object/AArch64/chained-fixups-header.test @@ -1,3 +1,4 @@ +REQUIRES: host-byteorder-little-endian RUN: cat %p/../Inputs/MachO/chained-fixups.yaml \ RUN: | sed 's/__LINKEDIT: 00000000/__LINKEDIT: AB000000/' \ RUN: | yaml2obj | not llvm-objdump --macho --dyld_info - 2>&1 \ From 9b1ae9f67fb38a8623172b6e0f03133a32b78aad Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 22 Feb 2022 12:56:45 -0800 Subject: [PATCH 540/748] Revert "Remove dead code." This reverts commit a23f7c0cb6b42a06bc9707fdf46ce2a90080f61f. Breaks the build. --- llvm/include/llvm/Object/MachO.h | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 423d05e71093f..3350e8215ff9f 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -376,6 +376,7 @@ class MachOChainedFixupEntry : public MachOAbstractFixupEntry { private: std::vector FixupTargets; uint32_t FixupIndex = 0; + FixupKind Kind; }; using fixup_iterator = content_iterator; From 7ebb00a22e7ec78a3090ecc15f59d247e0f390db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Sun, 2 Jan 2022 21:53:02 +0100 Subject: [PATCH 541/748] [clang-format][NFC] Simplify if in ContinuationIndenter::addTokenOCL Setting a boolean within an if and only using it in the very next if is a bit confusing. Merge it into one if. 
Differential Revision: https://reviews.llvm.org/D120237 --- clang/lib/Format/ContinuationIndenter.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 62e0d01871e8d..69508c44dc436 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -784,14 +784,12 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // OuterFunction(InnerFunctionCall( // break // ParameterToInnerFunction)) // break // .SecondInnerFunctionCall(); - bool HasTrailingCall = false; if (Previous.MatchingParen) { const FormatToken *Next = Previous.MatchingParen->getNextNonComment(); - HasTrailingCall = Next && Next->isMemberAccess(); + if (Next && Next->isMemberAccess() && State.Stack.size() > 1 && + State.Stack[State.Stack.size() - 2].CallContinuation == 0) + CurrentState.LastSpace = State.Column; } - if (HasTrailingCall && State.Stack.size() > 1 && - State.Stack[State.Stack.size() - 2].CallContinuation == 0) - CurrentState.LastSpace = State.Column; } } From 923c3755ea809275d6c06caf547525452568eb5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Mon, 21 Feb 2022 22:03:55 +0100 Subject: [PATCH 542/748] [clang-format] Don't break semi after requires clause ... ..regardless of the chosen style. 
Fixes https://github.com/llvm/llvm-project/issues/53818 Differential Revision: https://reviews.llvm.org/D120278 --- clang/lib/Format/TokenAnnotator.cpp | 2 +- clang/unittests/Format/FormatTest.cpp | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 7649263a18a1e..42c271f35be44 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3963,7 +3963,7 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, return Style.BreakBeforeConceptDeclarations == FormatStyle::BBCDS_Always; return Style.AlwaysBreakTemplateDeclarations == FormatStyle::BTDS_Yes; } - if (Left.ClosesRequiresClause) { + if (Left.ClosesRequiresClause && Right.isNot(tok::semi)) { switch (Style.RequiresClausePosition) { case FormatStyle::RCPS_OwnLine: case FormatStyle::RCPS_WithPreceding: diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 7d8b74c9c455f..98a0111d1ea40 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -23861,6 +23861,11 @@ TEST_F(FormatTest, RequiresClausesPositions) { "}", Style); + verifyFormat("template \n" + "int bar(T t)\n" + " requires F;", + Style); + Style.IndentRequiresClause = false; verifyFormat("template \n" "requires F\n" @@ -23881,6 +23886,7 @@ TEST_F(FormatTest, RequiresClausesPositions) { verifyFormat("template requires Foo struct Bar {};\n" "template requires Foo void bar() {}\n" "template void bar() requires Foo {}\n" + "template void bar() requires Foo;\n" "template requires Foo Bar(T) -> Bar;", Style); @@ -23933,6 +23939,9 @@ TEST_F(FormatTest, RequiresClausesPositions) { "void bar()\n" "requires Foo {}\n" "template \n" + "void bar()\n" + "requires Foo;\n" + "template \n" "requires Foo Bar(T) -> Bar;", Style); @@ -23992,6 +24001,7 @@ TEST_F(FormatTest, RequiresClausesPositions) { "template \n" "void bar() requires Foo\n" 
"{}\n" + "template void bar() requires Foo;\n" "template requires Foo\n" "Bar(T) -> Bar;", Style); From 746bd890002826a5663d241e3dc2140c2f3a3cdd Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 22 Feb 2022 16:25:51 -0500 Subject: [PATCH 543/748] fix comment typo to cycle bots --- lld/COFF/Writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 12db942f1db55..f43853e32075a 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -712,7 +712,7 @@ bool Writer::fixGnuImportChunks() { bool hasIdata = false; // Sort all .idata$* chunks, grouping chunks from the same library, - // with alphabetical ordering of the object fils within a library. + // with alphabetical ordering of the object files within a library. for (auto it : partialSections) { PartialSection *pSec = it.second; if (!pSec->name.startswith(".idata")) From 210bb04e23429fe9f79c4c83c7df450a0c3250c5 Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich Date: Tue, 22 Feb 2022 13:26:38 -0800 Subject: [PATCH 544/748] [BOLT][DWARF] Remove patchLowHigh unused function. Cleanup after removing caching mechanims for ranges/abbrevs. Reviewed By: rafauler, yota9 Differential Revision: https://reviews.llvm.org/D120174 --- bolt/include/bolt/Rewrite/DWARFRewriter.h | 5 --- bolt/lib/Rewrite/DWARFRewriter.cpp | 41 ----------------------- 2 files changed, 46 deletions(-) diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h index 2b0888c2329d4..74e10b146c53f 100644 --- a/bolt/include/bolt/Rewrite/DWARFRewriter.h +++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h @@ -147,11 +147,6 @@ class DWARFRewriter { SimpleBinaryPatcher &DebugInfoPatcher, Optional RangesBase = None); - /// Patch DW_AT_(low|high)_pc values for the \p DIE based on \p Range. 
- void patchLowHigh(DWARFDie DIE, DebugAddressRange Range, - SimpleBinaryPatcher &DebugInfoPatcher, - Optional DWOId); - /// Helper function for creating and returning per-DWO patchers/writers. template Patcher *getBinaryDWOPatcherHelper(T &BinaryPatchers, uint64_t DwoId) { diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 40d43bf858f8f..df4e3d2ecf735 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -1364,47 +1364,6 @@ void getRangeAttrData(DWARFDie DIE, Optional &LowPCVal, } // namespace -void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range, - SimpleBinaryPatcher &DebugInfoPatcher, - Optional DWOId) { - Optional LowPCVal = None; - Optional HighPCVal = None; - getRangeAttrData(DIE, LowPCVal, HighPCVal); - uint64_t LowPCOffset = LowPCVal->Offset; - uint64_t HighPCOffset = HighPCVal->Offset; - auto *TempDebugPatcher = &DebugInfoPatcher; - if (LowPCVal->V.getForm() == dwarf::DW_FORM_GNU_addr_index) { - uint32_t AddressIndex = - AddrWriter->getIndexFromAddress(Range.LowPC, *DWOId); - TempDebugPatcher = getBinaryDWODebugInfoPatcher(*DWOId); - TempDebugPatcher->addUDataPatch(LowPCOffset, AddressIndex, LowPCVal->Size); - // 2.17.2 - // If the value of the DW_AT_high_pc is of class address, it is the - // relocated address of the first location past the last instruction - // associated with the entity; if it is of class constant, the value is - // an unsigned integer offset which when added to the low PC gives the - // address of the first location past the last instruction associated - // with the entity. 
- if (!HighPCVal->V.isFormClass(DWARFFormValue::FC_Constant)) { - AddressIndex = AddrWriter->getIndexFromAddress(Range.HighPC, *DWOId); - TempDebugPatcher->addUDataPatch(HighPCOffset, AddressIndex, - HighPCVal->Size); - } - } else { - TempDebugPatcher->addLE64Patch(LowPCOffset, Range.LowPC); - } - - uint64_t HighPC = Range.HighPC; - // The DW_FORM_data* is delta between high and low pc - if (HighPCVal->V.getForm() != dwarf::Form::DW_FORM_addr) - HighPC -= Range.LowPC; - - if (isHighPcFormEightBytes(HighPCVal->V.getForm())) - TempDebugPatcher->addLE64Patch(HighPCOffset, HighPC); - else - TempDebugPatcher->addLE32Patch(HighPCOffset, HighPC); -} - void DWARFRewriter::convertToRangesPatchAbbrev( const DWARFUnit &Unit, const DWARFAbbreviationDeclaration *Abbrev, DebugAbbrevWriter &AbbrevWriter, Optional RangesBase) { From f79f430d4b268429f96be95622facd2775b25624 Mon Sep 17 00:00:00 2001 From: Okwan Kwon Date: Fri, 18 Feb 2022 18:07:36 +0000 Subject: [PATCH 545/748] Fold Tensor.extract_slice into a constant splat. Fold arith.extract_slice into arith.constant when the source is a constant splat and the result type is statically shaped. --- mlir/include/mlir/IR/BuiltinAttributes.h | 5 +++++ mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 7 ++++++- mlir/lib/IR/BuiltinAttributes.cpp | 12 ++++++++++++ mlir/test/Dialect/Tensor/canonicalize.mlir | 11 +++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/IR/BuiltinAttributes.h b/mlir/include/mlir/IR/BuiltinAttributes.h index 5399718f67582..4371a1cb088f9 100644 --- a/mlir/include/mlir/IR/BuiltinAttributes.h +++ b/mlir/include/mlir/IR/BuiltinAttributes.h @@ -655,6 +655,11 @@ class DenseElementsAttr : public Attribute { /// same total number of elements as well as element type. DenseElementsAttr reshape(ShapedType newType); + /// Return a new DenseElementsAttr that has the same data as the current + /// attribute, but with a different shape for a splat type. 
The new type must + /// have the same element type. + DenseElementsAttr resizeSplat(ShapedType newType); + /// Return a new DenseElementsAttr that has the same data as the current /// attribute, but has bitcast elements to 'newElType'. The new type must have /// the same bitwidth as the current element type. diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 5edb620d5cc32..70aa7b5fe57f6 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -1227,7 +1227,12 @@ static Value foldExtractAfterInsertSlice(ExtractSliceOp extractOp) { return {}; } -OpFoldResult ExtractSliceOp::fold(ArrayRef) { +OpFoldResult ExtractSliceOp::fold(ArrayRef operands) { + if (auto splat = operands[0].dyn_cast_or_null()) { + auto resultType = result().getType().cast(); + if (resultType.hasStaticShape()) + return splat.resizeSplat(resultType); + } if (getSourceType() == getType() && succeeded(foldIdentityOffsetSizeAndStrideOpInterface(*this, getType()))) return this->source(); diff --git a/mlir/lib/IR/BuiltinAttributes.cpp b/mlir/lib/IR/BuiltinAttributes.cpp index 79e80f7c1317c..6988d1f8e4c30 100644 --- a/mlir/lib/IR/BuiltinAttributes.cpp +++ b/mlir/lib/IR/BuiltinAttributes.cpp @@ -967,6 +967,18 @@ DenseElementsAttr DenseElementsAttr::reshape(ShapedType newType) { return DenseIntOrFPElementsAttr::getRaw(newType, getRawData(), isSplat()); } +DenseElementsAttr DenseElementsAttr::resizeSplat(ShapedType newType) { + assert(isSplat() && "expected a splat type"); + + ShapedType curType = getType(); + if (curType == newType) + return *this; + + assert(newType.getElementType() == curType.getElementType() && + "expected the same element type"); + return DenseIntOrFPElementsAttr::getRaw(newType, getRawData(), true); +} + /// Return a new DenseElementsAttr that has the same data as the current /// attribute, but has bitcast elements such that it is now 'newType'. 
The new /// type must have the same shape and element types of the same bitwidth as the diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index ce3db8d6039c2..22770c2e67342 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -621,6 +621,17 @@ func @fold_extract_insert(%input : tensor, %slice: tensor<4x?x8xf32>, // ----- +// CHECK-LABEL: func @fold_extract_constant_splat +// CHECK-NOT: tensor.extract_slice +// CHECK: arith.constant dense<42> : tensor<4x4xi32> +func @fold_extract_constant_splat() -> (tensor<4x4xi32>) { + %cst = arith.constant dense<42> : tensor<1024x1024xi32> + %1 = tensor.extract_slice %cst[0,0] [4,4] [1, 1] : tensor<1024x1024xi32> to tensor<4x4xi32> + return %1 : tensor<4x4xi32> +} + +// ----- + // CHECK-LABEL: func @fold_overlapping_insert // CHECK-SAME: %[[INPUT:.+]]: tensor, %{{.+}}: tensor<4x?x8xf32>, %[[SLICE2:.+]]: tensor<4x?x8xf32> func @fold_overlapping_insert(%input : tensor, %slice1: tensor<4x?x8xf32>, %slice2: tensor<4x?x8xf32>, %i: index, %size: index) -> (tensor) { From d7851685a3991aea9b9a3bbce9699c046abf1a07 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Tue, 22 Feb 2022 15:37:50 -0600 Subject: [PATCH 546/748] [polly] Remove trailing whitespace from tests. NFC. 
--- .../CodeGen/OpenMP/invariant_base_pointers_preloaded.ll | 2 +- polly/test/CodeGen/invariant_load_address_space.ll | 2 +- polly/test/CodeGen/invariant_load_escaping.ll | 2 +- polly/test/CodeGen/stmt_split_no_dependence.ll | 2 +- polly/test/DeLICM/pr41656.ll | 2 +- polly/test/DeLICM/pr48783.ll | 2 +- .../DependenceInfo/nonaffine-condition-buildMemoryAccess.ll | 2 +- polly/test/GPGPU/add-scalars-in-scop-to-kills.ll | 6 +++--- polly/test/GPGPU/debug-metadata-leak.ll | 2 +- polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll | 2 +- .../invariant-load-hoisting-with-variable-lower-bound.ll | 2 +- polly/test/GPGPU/privatization-simple.ll | 2 +- polly/test/GPGPU/privatization.ll | 2 +- polly/test/MaximalStaticExpansion/read_from_original.ll | 4 ++-- polly/test/MaximalStaticExpansion/too_many_writes.ll | 6 +++--- .../MaximalStaticExpansion/working_deps_between_inners.ll | 6 +++--- .../working_deps_between_inners_phi.ll | 6 +++--- polly/test/MaximalStaticExpansion/working_expansion.ll | 4 ++-- .../working_expansion_multiple_dependences_per_statement.ll | 4 ++-- .../working_expansion_multiple_instruction_per_statement.ll | 2 +- .../test/MaximalStaticExpansion/working_phi_two_scalars.ll | 2 +- .../ManualOptimization/disable_nonforced.ll | 2 +- polly/test/ScheduleOptimizer/prevectorization.ll | 2 +- polly/test/ScopInfo/Alias-1.ll | 2 +- polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll | 2 +- polly/test/ScopInfo/modulo_zext_1.ll | 2 +- polly/test/ScopInfo/multidim_fortran_srem.ll | 2 +- polly/test/ScopInfo/opaque-struct.ll | 2 +- polly/test/ScopInfo/pointer-comparison-no-nsw.ll | 2 +- polly/test/ScopInfo/reduction_disabled_multiplicative.ll | 2 +- polly/test/ScopInfo/redundant_parameter_constraint.ll | 2 +- polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll | 2 +- polly/test/ScopInfo/switch-4.ll | 2 +- polly/test/ScopInfo/two-loops-one-infinite.ll | 4 ++-- polly/test/Simplify/dead_access_phi.ll | 4 ++-- polly/test/Simplify/dead_access_value.ll | 4 ++-- 
polly/test/Simplify/overwritten.ll | 4 ++-- polly/test/Simplify/overwritten_3store.ll | 4 ++-- polly/test/Simplify/overwritten_loadbetween.ll | 4 ++-- 39 files changed, 56 insertions(+), 56 deletions(-) diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll index 6dcdf9e6e1469..483e302852e6a 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll @@ -8,7 +8,7 @@ ; A[i] += A[0] + A[0]; ; } ; -; CHECK: %polly.subfn.storeaddr.polly.access.A.load = getelementptr inbounds +; CHECK: %polly.subfn.storeaddr.polly.access.A.load = getelementptr inbounds ; CHECK: store float %polly.access.A.load, float* %polly.subfn.storeaddr.polly.access.A.load ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/invariant_load_address_space.ll b/polly/test/CodeGen/invariant_load_address_space.ll index 5ed73dc387cc2..62ff99bd7b809 100644 --- a/polly/test/CodeGen/invariant_load_address_space.ll +++ b/polly/test/CodeGen/invariant_load_address_space.ll @@ -2,7 +2,7 @@ ; ; CHECK-LABEL: polly.preload.begin: ; CHECK-NEXT: %polly.access.B = getelementptr i32, i32 addrspace(1)* %B, i64 0 -; CHECK-NOT: addrspacecast +; CHECK-NOT: addrspacecast ; CHECK-NEXT: %polly.access.B.load = load i32, i32 addrspace(1)* %polly.access.B ; ; CHECK-LABEL: polly.stmt.bb2: diff --git a/polly/test/CodeGen/invariant_load_escaping.ll b/polly/test/CodeGen/invariant_load_escaping.ll index 9c1fbfbc26de9..06d2003f6c19a 100644 --- a/polly/test/CodeGen/invariant_load_escaping.ll +++ b/polly/test/CodeGen/invariant_load_escaping.ll @@ -2,7 +2,7 @@ ; ; int f(int *A, int *B) { ; // Possible aliasing between A and B but if not then *B would be -; // invariant. We assume this and hoist *B but need to use a merged +; // invariant. We assume this and hoist *B but need to use a merged ; // version in the return. 
; int i = 0; ; int x = 0; diff --git a/polly/test/CodeGen/stmt_split_no_dependence.ll b/polly/test/CodeGen/stmt_split_no_dependence.ll index 12aeebc9bdd7a..0ad65b0b8587a 100644 --- a/polly/test/CodeGen/stmt_split_no_dependence.ll +++ b/polly/test/CodeGen/stmt_split_no_dependence.ll @@ -41,4 +41,4 @@ for.end: ; preds = %for.cond ret void } -!0 = !{!"polly_split_after"} +!0 = !{!"polly_split_after"} diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll index 8ef6ca13c9441..a297dcf51c8ca 100644 --- a/polly/test/DeLICM/pr41656.ll +++ b/polly/test/DeLICM/pr41656.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadPolly -polly-scops -polly-delicm -analyze < %s | FileCheck %s ; ; llvm.org/PR41656 -; +; ; This test case has an InvalidContext such that part of the predecessors ; of for.body.us.i lie within the invalid context. This causes a ; consistency check withing the invalid context of PR41656 to fail. diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll index c490d9b55fbfb..d5faf82a374f1 100644 --- a/polly/test/DeLICM/pr48783.ll +++ b/polly/test/DeLICM/pr48783.ll @@ -61,7 +61,7 @@ for.cond2.for.end_crit_edge.us.i: ; preds = %for.body5.us.for.bo fill_samples.exit: ; preds = %for.cond2.for.end_crit_edge.us.i, %for.body.us.i.us ret void } - + declare dso_local i32 @av_get_channel_layout_nb_channels() local_unnamed_addr #0 ; Function Attrs: nounwind readnone speculatable diff --git a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll index a228ca4db5a69..c216c45256f58 100644 --- a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll +++ b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s +; RUN: opt %loadPolly -polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine 
-debug-only=polly-dependence < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; CHECK: MayWriteAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll index f3272dea19583..1649024c0e71c 100644 --- a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll +++ b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll @@ -22,15 +22,15 @@ ; #pragma scop ; for(int i = 0; i < 1000; i++) { ; XLoopInit: x = 0; -; +; ; if (control1 > 2) ; C1Add: x += 10; ; if (control2 > 3) ; C2Add: x += A[i]; -; +; ; BLoopAccumX: B[i] += x; ; } -; +; ; #pragma endscop ; } ; ModuleID = 'test.ll' diff --git a/polly/test/GPGPU/debug-metadata-leak.ll b/polly/test/GPGPU/debug-metadata-leak.ll index d0cb72ba31f4d..24a6f51d5fdd8 100644 --- a/polly/test/GPGPU/debug-metadata-leak.ll +++ b/polly/test/GPGPU/debug-metadata-leak.ll @@ -16,7 +16,7 @@ ; ; https://reviews.llvm.org/D35630 removes this debug metadata before the ; instruction is copied to the GPUModule. 
-; +; ; vec_add_1.c: ; void vec_add_1(int N, int arr[N]) { ; int i=0; diff --git a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll index 7bc14364b9ca5..222817577aaf2 100644 --- a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll +++ b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll @@ -13,7 +13,7 @@ ; } -; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s +; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll index 547aa8bd26ec2..310d7fb062353 100644 --- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll +++ b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll @@ -21,7 +21,7 @@ ; This declaration would not have been generated unless a kernel launch exists. ; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) -; +; ; void f(int *begin, int *arr) { ; for (int i = *begin; i < 100; i++) { ; arr[i] = 0; diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll index 34f543786d6db..621e7b9791841 100644 --- a/polly/test/GPGPU/privatization-simple.ll +++ b/polly/test/GPGPU/privatization-simple.ll @@ -18,7 +18,7 @@ ; x = 0; ; if(control) x = C[i]; ; B[i] = x * A[i]; -; +; ; } ; #pragma endscop ; } diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll index 0b67b0bbc2583..7cbdc130cefaf 100644 --- a/polly/test/GPGPU/privatization.ll +++ b/polly/test/GPGPU/privatization.ll @@ -12,7 +12,7 @@ ; the declare would not be generated unless a call to a kernel exists. 
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*) -; +; ; ; void checkPrivatization(int A[], int B[], int C[], int control) { ; int x; diff --git a/polly/test/MaximalStaticExpansion/read_from_original.ll b/polly/test/MaximalStaticExpansion/read_from_original.ll index c46b25c445daf..afec6dea42450 100644 --- a/polly/test/MaximalStaticExpansion/read_from_original.ll +++ b/polly/test/MaximalStaticExpansion/read_from_original.ll @@ -7,7 +7,7 @@ ; ; #define Ni 2000 ; #define Nj 3000 -; +; ; double mse(double A[Ni], double B[Nj]) { ; int i; ; double tmp = 6; @@ -15,7 +15,7 @@ ; for (int j = 2; j MemRef_tmp_05__phi_Stmt_for_body_expanded[1 + i0] : i0 <= 9998 }; ; CHECK: new: { Stmt_for_inc4[i0] -> MemRef_conv_lcssa__phi_Stmt_for_inc4_expanded[i0] }; ; CHECK: new: { Stmt_for_inc4[i0] -> MemRef_add_lcssa__phi_Stmt_for_inc4_expanded[i0] }; -; +; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll index c041dce0c9e05..ce8bc3a2f66a9 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll @@ -2,7 +2,7 @@ ; ; Check that the disable_nonforced metadata is honored; optimization ; heuristics/rescheduling must not be applied. 
-; +; define void @func(i32 %n, double* noalias nonnull %A) { entry: br label %for diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll index 426972ddc878d..cf222b2543749 100644 --- a/polly/test/ScheduleOptimizer/prevectorization.ll +++ b/polly/test/ScheduleOptimizer/prevectorization.ll @@ -1,6 +1,6 @@ ; RUN: opt -S %loadPolly -basic-aa -polly-opt-isl \ ; RUN: -polly-pattern-matching-based-opts=false -polly-vectorizer=polly \ -; RUN: -polly-ast -analyze < %s | FileCheck %s +; RUN: -polly-ast -analyze < %s | FileCheck %s ; RUN: opt -S %loadPolly -basic-aa -polly-opt-isl \ ; RUN: -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine \ ; RUN: -polly-ast -analyze < %s | FileCheck %s diff --git a/polly/test/ScopInfo/Alias-1.ll b/polly/test/ScopInfo/Alias-1.ll index e592990767192..e358148f3e21a 100644 --- a/polly/test/ScopInfo/Alias-1.ll +++ b/polly/test/ScopInfo/Alias-1.ll @@ -8,7 +8,7 @@ define void @f(i32* nocapture %a, i32* nocapture %b) nounwind { bb.nph: %0 = tail call i32 (...) 
@rnd() nounwind ; [#uses=1] %1 = icmp eq i32 %0, 0 ; [#uses=1] - %sel.b = getelementptr inbounds i32, i32* %b, i64 4 + %sel.b = getelementptr inbounds i32, i32* %b, i64 4 %iftmp.0.0 = select i1 %1, i32* %sel.b, i32* %a ; [#uses=2] br label %bb3 diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll index 0c77832cfb491..efd2ba8f17afc 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll @@ -31,7 +31,7 @@ ; CHECK-NEXT: Schedule := ; CHECK-NEXT: [n] -> { Stmt_bb2[i0] -> [i0] }; ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [n] -> { Stmt_bb2[i0] -> MemRef_A[o0] : (-n + o0) mod 42 = 0 and -41 <= o0 <= 41 and ((n > 0 and o0 >= 0) or (n <= 0 and o0 <= 0)) }; +; CHECK-NEXT: [n] -> { Stmt_bb2[i0] -> MemRef_A[o0] : (-n + o0) mod 42 = 0 and -41 <= o0 <= 41 and ((n > 0 and o0 >= 0) or (n <= 0 and o0 <= 0)) }; ; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [n] -> { Stmt_bb2[i0] -> MemRef_A[o0] : (-n + o0) mod 42 = 0 and -41 <= o0 <= 41 and ((n > 0 and o0 >= 0) or (n <= 0 and o0 <= 0)) }; ; CHECK-NEXT: } diff --git a/polly/test/ScopInfo/modulo_zext_1.ll b/polly/test/ScopInfo/modulo_zext_1.ll index 9116d323e3624..17efa8fe67612 100644 --- a/polly/test/ScopInfo/modulo_zext_1.ll +++ b/polly/test/ScopInfo/modulo_zext_1.ll @@ -14,7 +14,7 @@ ; CHECK-NEXT: ReadAccess := [Reduction Type: +] [Scalar: 0] ; CHECK-NEXT: [N] -> { Stmt_for_body[i0] -> MemRef_A[1] : (1 + i0) mod 2 = 0; Stmt_for_body[i0] -> MemRef_A[0] : (i0) mod 2 = 0 } ; CHECK-NEXT: MustWriteAccess := [Reduction Type: +] [Scalar: 0] -; CHECK-NEXT: [N] -> { Stmt_for_body[i0] -> MemRef_A[1] : (1 + i0) mod 2 = 0; Stmt_for_body[i0] -> MemRef_A[0] : (i0) mod 2 = 0 }; +; CHECK-NEXT: [N] -> { Stmt_for_body[i0] -> MemRef_A[1] : (1 + i0) mod 2 = 0; Stmt_for_body[i0] -> MemRef_A[0] : (i0) mod 2 = 0 }; ; CHECK-NEXT: } ; ; void f(int *A, 
int N) { diff --git a/polly/test/ScopInfo/multidim_fortran_srem.ll b/polly/test/ScopInfo/multidim_fortran_srem.ll index c74e67e0ae002..5994c51b97198 100644 --- a/polly/test/ScopInfo/multidim_fortran_srem.ll +++ b/polly/test/ScopInfo/multidim_fortran_srem.ll @@ -23,7 +23,7 @@ target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 1] ; CHECK-NEXT: [tmp180, tmp177, tmp183, tmp162, tmp157, tmp150, tmp146, tmp140, tmp] -> { Stmt_bb203[i0, i1, i2] -> MemRef_tmp194[] }; ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] -; CHECK-NEXT: [tmp180, tmp177, tmp183, tmp162, tmp157, tmp150, tmp146, tmp140, tmp] -> { Stmt_bb203[i0, i1, i2] -> MemRef_tmp173[o0, 1 + i1, 1 + i2] : (1 - i0 + o0) mod 3 = 0 and 0 <= o0 <= 2 } +; CHECK-NEXT: [tmp180, tmp177, tmp183, tmp162, tmp157, tmp150, tmp146, tmp140, tmp] -> { Stmt_bb203[i0, i1, i2] -> MemRef_tmp173[o0, 1 + i1, 1 + i2] : (1 - i0 + o0) mod 3 = 0 and 0 <= o0 <= 2 } ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: [tmp180, tmp177, tmp183, tmp162, tmp157, tmp150, tmp146, tmp140, tmp] -> { Stmt_bb203[i0, i1, i2] -> MemRef_arg56[1 + i0, 1 + i1, 1 + i2] }; ; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/opaque-struct.ll b/polly/test/ScopInfo/opaque-struct.ll index 2d6a66f578515..3808bba4a6170 100644 --- a/polly/test/ScopInfo/opaque-struct.ll +++ b/polly/test/ScopInfo/opaque-struct.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-scops -disable-output < %s +; RUN: opt %loadPolly -polly-scops -disable-output < %s ; ; Check that we do not crash with unsized (opaque) types. 
; diff --git a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll index 232e119604162..7468f1be18ab6 100644 --- a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll +++ b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll @@ -8,7 +8,7 @@ ; } ; ; CHECK: Invalid Context: -; CHECK-NEXT: [A, B] -> { : (4*floor((A - B)/4) < A - B) or ((-A + B) mod 4 = 0 and B >= 9223372036854775808 + A) or ((-A + B) mod 4 = 0 and B <= -4 + A) } +; CHECK-NEXT: [A, B] -> { : (4*floor((A - B)/4) < A - B) or ((-A + B) mod 4 = 0 and B >= 9223372036854775808 + A) or ((-A + B) mod 4 = 0 and B <= -4 + A) } ; ; CHECK: Domain := ; CHECK-NEXT: [A, B] -> { Stmt_while_body[i0] : (-A + B) mod 4 = 0 and i0 >= 0 and 4i0 <= -4 - A + B } diff --git a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll index 2d02d72faa626..778b117d842d2 100644 --- a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll +++ b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll @@ -10,7 +10,7 @@ ; CHECK: { Stmt_for_body[i0] -> MemRef_prod[0] }; ; ; int sum, prod; -; +; ; void f() { ; int i; ; for (int i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/redundant_parameter_constraint.ll b/polly/test/ScopInfo/redundant_parameter_constraint.ll index 4ab055f03e13d..e1c6801e84518 100644 --- a/polly/test/ScopInfo/redundant_parameter_constraint.ll +++ b/polly/test/ScopInfo/redundant_parameter_constraint.ll @@ -3,7 +3,7 @@ ; The constraint that r2 has to be bigger than r1 is implicitly containted in ; the domain, hence we do not want to see it explicitly. 
; -; CHECK-NOT: r2 >= 1 + r1 +; CHECK-NOT: r2 >= 1 + r1 ; ; void wraps(int *A, int p, short q, char r1, char r2) { ; for (char i = r1; i < r2; i++) diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll index 83574fe2556c4..d581d1ea371c8 100644 --- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll +++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll @@ -2,7 +2,7 @@ ; Derived from test-suite/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c -; The values %mul.i44 is simplified to constant 4 by ScalarEvolution, but +; The values %mul.i44 is simplified to constant 4 by ScalarEvolution, but ; SCEVAffinator used to check whether the sdiv's argument was constant. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/switch-4.ll b/polly/test/ScopInfo/switch-4.ll index 02a917a2c0308..79c2364634f7e 100644 --- a/polly/test/ScopInfo/switch-4.ll +++ b/polly/test/ScopInfo/switch-4.ll @@ -51,7 +51,7 @@ ; CHECK-NEXT: [N] -> { Stmt_sw_bb_5[i0] -> MemRef_A[i0] }; ; CHECK-NEXT: Stmt_sw_bb_9 ; CHECK-NEXT: Domain := -; CHECK-NEXT: [N] -> { Stmt_sw_bb_9[i0] : (1 + i0) mod 4 = 0 and 3 <= i0 < N }; +; CHECK-NEXT: [N] -> { Stmt_sw_bb_9[i0] : (1 + i0) mod 4 = 0 and 3 <= i0 < N }; ; CHECK-NEXT: Schedule := ; CHECK-NEXT: [N] -> { Stmt_sw_bb_9[i0] -> [i0, 0] }; ; CHECK-NEXT: ReadAccess := [Reduction Type: +] [Scalar: 0] diff --git a/polly/test/ScopInfo/two-loops-one-infinite.ll b/polly/test/ScopInfo/two-loops-one-infinite.ll index 974194f04cace..2eb032659f582 100644 --- a/polly/test/ScopInfo/two-loops-one-infinite.ll +++ b/polly/test/ScopInfo/two-loops-one-infinite.ll @@ -8,8 +8,8 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" define void @foo(i32* noalias nocapture readonly %xxx, i32* noalias nocapture readonly %yyy, i8*** nocapture readonly %zzz, i32 %conv6) { while.body.us.preheader: - %a2 = load i8**, i8*** %zzz, align 4 - %sub = 
add nsw i32 %conv6, -1 + %a2 = load i8**, i8*** %zzz, align 4 + %sub = add nsw i32 %conv6, -1 br label %while.body.us while.body.us: ; preds = %while.body.us.preheader, %if.then.us diff --git a/polly/test/Simplify/dead_access_phi.ll b/polly/test/Simplify/dead_access_phi.ll index 10fd3179ffdec..9b6726fed5d24 100644 --- a/polly/test/Simplify/dead_access_phi.ll +++ b/polly/test/Simplify/dead_access_phi.ll @@ -7,7 +7,7 @@ ; for (int j = 0; j < n; j += 1) { ; body: ; double phi = 42; -; +; ; body_succ: ; A[0] = 42.0; ; } @@ -23,7 +23,7 @@ for: body: br label %body_succ - + body_succ: %phi = phi double [42.0, %body] store double 42.0, double* %A diff --git a/polly/test/Simplify/dead_access_value.ll b/polly/test/Simplify/dead_access_value.ll index b95c3843f5b1c..bd36009222b56 100644 --- a/polly/test/Simplify/dead_access_value.ll +++ b/polly/test/Simplify/dead_access_value.ll @@ -7,7 +7,7 @@ ; for (int j = 0; j < n; j += 1) { ; body: ; double val = 12.5 + 12.5; -; +; ; body_succ: ; double unused = val + 21.0; ; A[0] = 42.0; @@ -25,7 +25,7 @@ for: body: %val = fadd double 12.5, 12.5 br label %body_succ - + body_succ: %unused = fadd double %val, 21.0 store double 42.0, double* %A diff --git a/polly/test/Simplify/overwritten.ll b/polly/test/Simplify/overwritten.ll index 8e82d3348eeb1..b8a4b724229b9 100644 --- a/polly/test/Simplify/overwritten.ll +++ b/polly/test/Simplify/overwritten.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-simplify -analyze < %s | FileCheck -match-full-lines %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb "-passes=scop(print)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-simplify -analyze < %s | FileCheck -match-full-lines %s +; RUN: opt %loadPolly -polly-stmt-granularity=bb "-passes=scop(print)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the 
same statement. ; diff --git a/polly/test/Simplify/overwritten_3store.ll b/polly/test/Simplify/overwritten_3store.ll index c1a146be1c054..b5983fc253164 100644 --- a/polly/test/Simplify/overwritten_3store.ll +++ b/polly/test/Simplify/overwritten_3store.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-simplify -analyze < %s | FileCheck -match-full-lines %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb "-passes=scop(print)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-simplify -analyze < %s | FileCheck -match-full-lines %s +; RUN: opt %loadPolly -polly-stmt-granularity=bb "-passes=scop(print)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that even multiple stores are removed. diff --git a/polly/test/Simplify/overwritten_loadbetween.ll b/polly/test/Simplify/overwritten_loadbetween.ll index eb74910e2e98f..8430e8778bb1e 100644 --- a/polly/test/Simplify/overwritten_loadbetween.ll +++ b/polly/test/Simplify/overwritten_loadbetween.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-simplify -analyze < %s | FileCheck -match-full-lines %s -; RUN: opt %loadPolly "-passes=scop(print)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadPolly -polly-simplify -analyze < %s | FileCheck -match-full-lines %s +; RUN: opt %loadPolly "-passes=scop(print)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Do not remove overwrites when the value is read before. ; From b3f4535a039918965adb21509700739afc25f9f1 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 22 Feb 2022 13:46:02 -0800 Subject: [PATCH 547/748] [SLP][NFC]Add a test for bottom to top reordering. 
--- .../X86/bottom-to-top-reorder.ll | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll new file mode 100644 index 0000000000000..2ac4b7d83c41a --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s + +define void @test(i32* %0, i32* %1, i32* %2) { +; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP2:%.*]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4 +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> poison, 
<4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = sub <4 x i32> , [[TMP20]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i32> [[TMP25]], +; CHECK-NEXT: [[TMP27:%.*]] = sub <4 x i32> [[TMP25]], +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = sub <4 x i32> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i32> [[TMP29]], <4 x i32> [[TMP30]], <4 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = add <4 x i32> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = sub <4 x i32> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP32]], <4 x i32> [[TMP33]], <4 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP34]], <4 x i32>* [[TMP35]], align 4 +; CHECK-NEXT: ret void +; + %4 = load i32, i32* %1, align 4 + %5 = load i32, i32* %0, align 4 + %6 = getelementptr inbounds i32, i32* %0, i64 4 + %7 = load i32, i32* %6, align 4 + %8 = getelementptr inbounds i32, i32* %1, i64 1 + %9 = load i32, i32* %8, align 4 + %10 = getelementptr inbounds i32, i32* %0, i64 1 + %11 = load i32, i32* %10, align 4 + %12 = getelementptr inbounds i32, i32* %0, i64 5 + %13 = load i32, i32* %12, align 4 + %14 = getelementptr inbounds i32, i32* %1, 
i64 2 + %15 = load i32, i32* %14, align 4 + %16 = getelementptr inbounds i32, i32* %0, i64 2 + %17 = load i32, i32* %16, align 4 + %18 = getelementptr inbounds i32, i32* %0, i64 6 + %19 = load i32, i32* %18, align 4 + %20 = getelementptr inbounds i32, i32* %1, i64 3 + %21 = load i32, i32* %20, align 4 + %22 = getelementptr inbounds i32, i32* %0, i64 3 + %23 = load i32, i32* %22, align 4 + %24 = getelementptr inbounds i32, i32* %0, i64 7 + %25 = load i32, i32* %24, align 4 + %26 = sub i32 0, %23 + %27 = sub i32 %26, %25 + %28 = add i32 %27, %21 + %29 = sub i32 undef, %17 + %30 = sub i32 %29, %19 + %31 = add i32 %30, %15 + %32 = sub i32 0, %11 + %33 = sub i32 %32, %13 + %34 = add i32 %33, %9 + %35 = sub i32 0, %5 + %36 = sub i32 %35, %7 + %37 = add i32 %36, %4 + %38 = add i32 %31, 1 + %39 = add i32 %38, 0 + %40 = add i32 %39, 0 + store i32 %40, i32* %2, align 4 + %41 = getelementptr inbounds i32, i32* %2, i64 2 + %42 = add i32 0, %34 + %43 = sub i32 %42, 0 + %44 = sub i32 %43, 0 + store i32 %44, i32* %41, align 4 + %45 = getelementptr inbounds i32, i32* %2, i64 1 + %46 = add i32 %37, 0 + %47 = sub i32 %46, 0 + %48 = sub i32 %47, 0 + store i32 %48, i32* %45, align 4 + %49 = getelementptr inbounds i32, i32* %2, i64 3 + %50 = sub i32 %28, 0 + %51 = sub i32 %50, 0 + %52 = add i32 %51, 0 + store i32 %52, i32* %49, align 4 + ret void +} From 3cc15e2cb657f3a814c8bd482bb7108782561abd Mon Sep 17 00:00:00 2001 From: Brendon Cahoon Date: Sun, 20 Feb 2022 18:18:26 -0600 Subject: [PATCH 548/748] [SLP] Fix assert from non-constant index in insertelement A call to getInsertIndex() in getTreeCost() is returning None, which causes an assert because a non-constant index value for insertelement was not expected. This case occurs when the insertelement index value is defined with a PHI. 
Differential Revision: https://reviews.llvm.org/D120223 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 62 ++++++++++--------- .../slp-variable-insertelement.ll | 31 ++++++++++ 2 files changed, 63 insertions(+), 30 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/slp-variable-insertelement.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index bdc6a33eb2278..4ea8a77583cc9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5919,39 +5919,41 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { // to detect it as a final shuffled/identity match. if (auto *VU = dyn_cast_or_null(EU.User)) { if (auto *FTy = dyn_cast(VU->getType())) { - unsigned InsertIdx = *getInsertIndex(VU); - auto *It = find_if(FirstUsers, [VU](Value *V) { - return areTwoInsertFromSameBuildVector(VU, - cast(V)); - }); - int VecId = -1; - if (It == FirstUsers.end()) { - VF.push_back(FTy->getNumElements()); - ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - // Find the insertvector, vectorized in tree, if any. - Value *Base = VU; - while (isa(Base)) { - // Build the mask for the vectorized insertelement instructions. - if (const TreeEntry *E = getTreeEntry(Base)) { - VU = cast(Base); - do { - int Idx = E->findLaneForValue(Base); - ShuffleMask.back()[Idx] = Idx; - Base = cast(Base)->getOperand(0); - } while (E == getTreeEntry(Base)); - break; + Optional InsertIdx = getInsertIndex(VU); + if (InsertIdx) { + auto *It = find_if(FirstUsers, [VU](Value *V) { + return areTwoInsertFromSameBuildVector(VU, + cast(V)); + }); + int VecId = -1; + if (It == FirstUsers.end()) { + VF.push_back(FTy->getNumElements()); + ShuffleMask.emplace_back(VF.back(), UndefMaskElem); + // Find the insertvector, vectorized in tree, if any. + Value *Base = VU; + while (isa(Base)) { + // Build the mask for the vectorized insertelement instructions. 
+ if (const TreeEntry *E = getTreeEntry(Base)) { + VU = cast(Base); + do { + int Idx = E->findLaneForValue(Base); + ShuffleMask.back()[Idx] = Idx; + Base = cast(Base)->getOperand(0); + } while (E == getTreeEntry(Base)); + break; + } + Base = cast(Base)->getOperand(0); } - Base = cast(Base)->getOperand(0); + FirstUsers.push_back(VU); + DemandedElts.push_back(APInt::getZero(VF.back())); + VecId = FirstUsers.size() - 1; + } else { + VecId = std::distance(FirstUsers.begin(), It); } - FirstUsers.push_back(VU); - DemandedElts.push_back(APInt::getZero(VF.back())); - VecId = FirstUsers.size() - 1; - } else { - VecId = std::distance(FirstUsers.begin(), It); + ShuffleMask[VecId][*InsertIdx] = EU.Lane; + DemandedElts[VecId].setBit(*InsertIdx); + continue; } - ShuffleMask[VecId][InsertIdx] = EU.Lane; - DemandedElts[VecId].setBit(InsertIdx); - continue; } } diff --git a/llvm/test/Transforms/SLPVectorizer/slp-variable-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/slp-variable-insertelement.ll new file mode 100644 index 0000000000000..d97bfe59e55c6 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/slp-variable-insertelement.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -slp-vectorizer -slp-max-vf=2 -slp-min-reg-size=32 -S < %s | FileCheck %s + +; It is possible to compute the tree cost for an insertelement that does not +; have a constant index when the index is a PHI. Check if getInsertIndex +; returns None. 
+ +define void @test() local_unnamed_addr { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: unreachable +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ poison, [[FOR_BODY]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ poison, [[ENTRY]] ], [ 0, [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 poison, i32 [[I]] +; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + unreachable + +for.body: + %i = phi i32 [ 0, %entry ], [ poison, %for.body ] + %j = phi i32 [ poison, %entry ], [ 0, %for.body ] + %0 = insertelement <4 x i32> poison, i32 poison, i32 %i + br i1 poison, label %for.cond.cleanup, label %for.body +} From 2df019ab5aab3333397662e69c8c774782e5333b Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 22 Feb 2022 17:47:26 -0500 Subject: [PATCH 549/748] [gn build] bump fmsc-version to 1926 This is needed to pick up the workaround in fb1aa286c1 when building with a modern MSVC (like LLVM now requires). --- llvm/utils/gn/build/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/build/BUILD.gn b/llvm/utils/gn/build/BUILD.gn index fff3e06ad8bb0..f539adacf805d 100644 --- a/llvm/utils/gn/build/BUILD.gn +++ b/llvm/utils/gn/build/BUILD.gn @@ -283,7 +283,7 @@ config("compiler_defaults") { "-Werror=date-time", ] if (current_os == "win") { - cflags += [ "-fmsc-version=1920" ] + cflags += [ "-fmsc-version=1926" ] if (use_lld) { cflags += [ "/Brepro" ] ldflags += [ "/Brepro" ] From ed4f0cb87878e63378f2a37a2af2b538eb493a04 Mon Sep 17 00:00:00 2001 From: Ben Barham Date: Mon, 21 Feb 2022 21:13:38 -0800 Subject: [PATCH 550/748] [VFS] Use generic_category for errors generated from the VFS Errors are generally checked in clients by comparing to the portable error condition in `std::errc`, which will have the `generic_category` (eg. 
`std::errc::no_such_file_or_directory`). While in practice these are usually equivalent for the standard errno's, they are not in *all* implementations. One such example is CentOS 7. Differential Revision: https://reviews.llvm.org/D120299 --- llvm/lib/Support/VirtualFileSystem.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 49151682624d8..590bc1902fbe4 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -485,8 +485,7 @@ class CombiningDirIterImpl : public llvm::vfs::detail::DirIterImpl { } if (IsFirstTime && CurrentDirIter == directory_iterator()) - return std::error_code(static_cast(errc::no_such_file_or_directory), - std::system_category()); + return errc::no_such_file_or_directory; return {}; } @@ -1285,8 +1284,7 @@ directory_iterator RedirectingFileSystem::dir_begin(const Twine &Dir, } if (!S->isDirectory()) { - EC = std::error_code(static_cast(errc::not_a_directory), - std::system_category()); + EC = errc::not_a_directory; return {}; } From 606cb8548a1b7763e0c8489c5efe66803a7ede72 Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Tue, 22 Feb 2022 18:14:47 -0500 Subject: [PATCH 551/748] [lld] Require C++14 in LLD standalone build This is what the Clang standalone build does too. And setting this seems to be required to get the standalone build to work on my Mac. 
Reviewed By: #lld-macho, MaskRay, Ericson2314, smeenai Differential Revision: https://reviews.llvm.org/D120269 --- lld/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt index 9bcc135665d02..f51c864af8375 100644 --- a/lld/CMakeLists.txt +++ b/lld/CMakeLists.txt @@ -11,6 +11,10 @@ endif() include(GNUInstallDirs) if(LLD_BUILT_STANDALONE) + set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ standard to conform to") + set(CMAKE_CXX_STANDARD_REQUIRED YES) + set(CMAKE_CXX_EXTENSIONS NO) + set(CMAKE_INCLUDE_CURRENT_DIR ON) # Rely on llvm-config. From 774b571546915d34a7254b38833001c77745e760 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 22 Feb 2022 23:37:22 +0000 Subject: [PATCH 552/748] [AArch64] Alter mull shuffle(ext(..)) combine to work on buildvectors We have a combine for converting mul(dup(ext(..)), ...) into mul(ext(dup(..)), ..), for allowing more uses of smull and umull instructions. Currently it looks for vector insert and shuffle vectors to detect the element that we can convert to a vector extend. Not all cases will have a shufflevector/insert element though. This started by extending the recognition to buildvectors (with elements that may be individually extended). The new method seems to cover all the cases that the old method captured though, as the shuffle will eventually be lowered to buildvectors, so the old method has been removed to keep the code a little simpler. The new code detects legal build_vector(ext(a), ext(b), ..), converting them to ext(build_vector(a, b, ..)) providing all the extends/types match up. 
Differential Revision: https://reviews.llvm.org/D120018 --- .../Target/AArch64/AArch64ISelLowering.cpp | 75 +++++++---------- llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 80 ++++++++++++++++++- .../AArch64/aarch64-matrix-umull-smull.ll | 49 +++++------- 3 files changed, 127 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 58d91c3412a93..2fe77449b3a07 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13448,33 +13448,17 @@ static EVT calculatePreExtendType(SDValue Extend) { } } -/// Combines a dup(sext/zext) node pattern into sext/zext(dup) +/// Combines a buildvector(sext/zext) node pattern into sext/zext(buildvector) /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt -static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, - SelectionDAG &DAG) { - ShuffleVectorSDNode *ShuffleNode = - dyn_cast(VectorShuffle.getNode()); - if (!ShuffleNode) - return SDValue(); - - // Ensuring the mask is zero before continuing - if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) - return SDValue(); - - SDValue InsertVectorElt = VectorShuffle.getOperand(0); - - if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - SDValue InsertLane = InsertVectorElt.getOperand(2); - ConstantSDNode *Constant = dyn_cast(InsertLane.getNode()); - // Ensures the insert is inserting into lane 0 - if (!Constant || Constant->getZExtValue() != 0) +static SDValue performBuildVectorExtendCombine(SDValue BV, SelectionDAG &DAG) { + EVT VT = BV.getValueType(); + if (BV.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); - SDValue Extend = InsertVectorElt.getOperand(1); + // Use the first item in the buildvector to get the size of the extend, and + // make sure it looks valid. 
+ SDValue Extend = BV->getOperand(0); unsigned ExtendOpcode = Extend.getOpcode(); - bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || ExtendOpcode == ISD::SIGN_EXTEND_INREG || ExtendOpcode == ISD::AssertSext; @@ -13484,30 +13468,29 @@ static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, // Restrict valid pre-extend data type EVT PreExtendType = calculatePreExtendType(Extend); - if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && - PreExtendType != MVT::i32) - return SDValue(); - - EVT TargetType = VectorShuffle.getValueType(); - EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); - if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) + if (PreExtendType == MVT::Other || + PreExtendType.getSizeInBits() != VT.getScalarSizeInBits() / 2) return SDValue(); - SDLoc DL(VectorShuffle); - - SDValue InsertVectorNode = DAG.getNode( - InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), - DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), - DAG.getConstant(0, DL, MVT::i64)); - - std::vector ShuffleMask(TargetType.getVectorNumElements()); - - SDValue VectorShuffleNode = - DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, - DAG.getUNDEF(PreExtendVT), ShuffleMask); + // Make sure all other operands are equally extended + for (SDValue Op : drop_begin(BV->ops())) { + unsigned Opc = Op.getOpcode(); + bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || + Opc == ISD::AssertSext; + if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType) + return SDValue(); + } - return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, - TargetType, VectorShuffleNode); + EVT PreExtendVT = VT.changeVectorElementType(PreExtendType); + EVT PreExtendLegalType = + PreExtendType.getScalarSizeInBits() < 32 ? 
MVT::i32 : PreExtendType; + SDLoc DL(BV); + SmallVector NewOps; + for (SDValue Op : BV->ops()) + NewOps.push_back( + DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, PreExtendLegalType)); + SDValue NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps); + return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV); } /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) @@ -13518,8 +13501,8 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) return SDValue(); - SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); - SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); + SDValue Op0 = performBuildVectorExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performBuildVectorExtendCombine(Mul->getOperand(1), DAG); // Neither operands have been changed, don't make any further changes if (!Op0 && !Op1) diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index bc31d41a55f43..cceb79f97bb93 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -156,10 +156,8 @@ entry: define <8 x i16> @nonsplat_shuffleinsert(i8 %src, <8 x i8> %b) { ; CHECK-LABEL: nonsplat_shuffleinsert: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: dup v1.8h, w8 -; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h +; CHECK-NEXT: dup v1.8b, w0 +; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b ; CHECK-NEXT: ret entry: %in = sext i8 %src to i16 @@ -170,6 +168,80 @@ entry: ret <8 x i16> %out } +define <4 x i32> @nonsplat_shuffleinsert2(<4 x i16> %b, i16 %b0, i16 %b1, i16 %b2, i16 %b3) { +; CHECK-LABEL: nonsplat_shuffleinsert2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: mov v1.h[1], w1 +; CHECK-NEXT: mov v1.h[2], w2 +; CHECK-NEXT: mov v1.h[3], w3 +; CHECK-NEXT: smull 
v0.4s, v1.4h, v0.4h +; CHECK-NEXT: ret +entry: + %s0 = sext i16 %b0 to i32 + %s1 = sext i16 %b1 to i32 + %s2 = sext i16 %b2 to i32 + %s3 = sext i16 %b3 to i32 + %ext.b = sext <4 x i16> %b to <4 x i32> + %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0 + %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1 + %v2 = insertelement <4 x i32> %v1, i32 %s2, i32 2 + %v3 = insertelement <4 x i32> %v2, i32 %s3, i32 3 + %out = mul nsw <4 x i32> %v3, %ext.b + ret <4 x i32> %out +} + +define void @typei1_orig(i64 %a, i8* %p, <8 x i16>* %q) { +; CHECK-LABEL: typei1_orig: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, #0 +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: neg v0.8h, v0.8h +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret + %tmp = xor <16 x i1> zeroinitializer, + %tmp6 = load <8 x i16>, <8 x i16>* %q, align 2 + %tmp7 = sub <8 x i16> zeroinitializer, %tmp6 + %tmp8 = shufflevector <8 x i16> %tmp7, <8 x i16> undef, <16 x i32> + %tmp9 = icmp slt i64 0, %a + %tmp10 = zext i1 %tmp9 to i16 + %tmp11 = insertelement <16 x i16> undef, i16 %tmp10, i64 0 + %tmp12 = shufflevector <16 x i16> %tmp11, <16 x i16> undef, <16 x i32> zeroinitializer + %tmp13 = mul nuw <16 x i16> %tmp8, %tmp12 + %tmp14 = icmp ne <16 x i16> %tmp13, zeroinitializer + %tmp15 = and <16 x i1> %tmp14, %tmp + %tmp16 = sext <16 x i1> %tmp15 to <16 x i8> + %tmp17 = bitcast i8* %p to <16 x i8>* + store <16 x i8> %tmp16, <16 x i8>* %tmp17, align 1 + ret void +} + +define <8 x i16> @typei1_v8i1_v8i16(i1 %src, <8 x i1> %b) { +; CHECK-LABEL: typei1_v8i1_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.8b, #1 +; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: mul v0.8h, v1.8h, 
v0.8h +; CHECK-NEXT: ret +entry: + %in = zext i1 %src to i16 + %ext.b = zext <8 x i1> %b to <8 x i16> + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %out = mul nsw <8 x i16> %broadcast.splat, %ext.b + ret <8 x i16> %out +} + define <8 x i16> @missing_insert(<8 x i8> %b) { ; CHECK-LABEL: missing_insert: ; CHECK: // %bb.0: // %entry diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 4f999edf3d571..12b451f509f73 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -201,25 +201,22 @@ define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i3 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: // %vector.ph ; CHECK-NEXT: and x10, x9, #0xfffffff0 +; CHECK-NEXT: dup v0.4h, w8 ; CHECK-NEXT: add x11, x2, #32 ; CHECK-NEXT: add x12, x0, #16 ; CHECK-NEXT: mov x13, x10 -; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: dup v1.8h, w8 ; CHECK-NEXT: .LBB3_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q1, q2, [x12, #-16] +; CHECK-NEXT: ldp q2, q3, [x12, #-16] ; CHECK-NEXT: subs x13, x13, #16 ; CHECK-NEXT: add x12, x12, #32 -; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s -; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-NEXT: stp q1, q3, [x11, #-32] -; CHECK-NEXT: stp q2, q4, [x11], #64 +; CHECK-NEXT: smull2 v4.4s, v1.8h, v2.8h +; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: smull2 v5.4s, v1.8h, v3.8h +; CHECK-NEXT: smull v3.4s, v0.4h, v3.4h +; CHECK-NEXT: stp q2, q4, [x11, #-32] +; CHECK-NEXT: stp q3, q5, 
[x11], #64 ; CHECK-NEXT: b.ne .LBB3_4 ; CHECK-NEXT: // %bb.5: // %middle.block ; CHECK-NEXT: cmp x10, x9 @@ -317,25 +314,22 @@ define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i3 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: // %vector.ph ; CHECK-NEXT: and x10, x9, #0xfffffff0 +; CHECK-NEXT: dup v0.4h, w8 ; CHECK-NEXT: add x11, x2, #32 ; CHECK-NEXT: add x12, x0, #16 ; CHECK-NEXT: mov x13, x10 -; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: dup v1.8h, w8 ; CHECK-NEXT: .LBB4_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q1, q2, [x12, #-16] +; CHECK-NEXT: ldp q2, q3, [x12, #-16] ; CHECK-NEXT: subs x13, x13, #16 ; CHECK-NEXT: add x12, x12, #32 -; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s -; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s -; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-NEXT: stp q1, q3, [x11, #-32] -; CHECK-NEXT: stp q2, q4, [x11], #64 +; CHECK-NEXT: umull2 v4.4s, v1.8h, v2.8h +; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h +; CHECK-NEXT: umull2 v5.4s, v1.8h, v3.8h +; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h +; CHECK-NEXT: stp q2, q4, [x11, #-32] +; CHECK-NEXT: stp q3, q5, [x11], #64 ; CHECK-NEXT: b.ne .LBB4_4 ; CHECK-NEXT: // %bb.5: // %middle.block ; CHECK-NEXT: cmp x10, x9 @@ -435,12 +429,13 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_4: // %vector.ph +; CHECK-NEXT: dup v2.8b, w9 ; CHECK-NEXT: and x11, x10, #0xfffffff0 -; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov x12, x11 +; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: dup v2.8h, w9 +; CHECK-NEXT: mov x12, x11 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: .LBB5_5: // 
%vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp d3, d4, [x8, #-8] From 029283c1c0d8d06fbf000f5682c56b8595a1101f Mon Sep 17 00:00:00 2001 From: Rahman Lavaee Date: Tue, 22 Feb 2022 14:39:08 -0800 Subject: [PATCH 553/748] Encode address offsets of basic blocks relative to the end of the previous basic blocks. Conceptually, the new encoding emits the offsets and sizes as label differences between each two consecutive basic block begin and end label. When decoding, the offsets must be aggregated along with basic block sizes to calculate the final relative-to-function offsets of basic blocks. This encoding uses smaller values compared to the existing one (offsets relative to function symbol). Smaller values tend to occupy fewer bytes in ULEB128 encoding. As a result, we get about 25% reduction in the size of the bb-address-map section (reduction from about 9MB to 7MB). Reviewed By: tmsriram, jhenderson Differential Revision: https://reviews.llvm.org/D106421 --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 7 +++++-- llvm/lib/CodeGen/BasicBlockSections.cpp | 2 +- llvm/test/CodeGen/X86/basic-block-sections-labels.ll | 6 +++--- llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test | 2 +- llvm/tools/llvm-readobj/ELFDumper.cpp | 5 ++++- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 345cdb7c5597a..2ce587b499f03 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1152,16 +1152,19 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) { OutStreamer->emitSymbolValue(FunctionSymbol, getPointerSize()); // Emit the total number of basic blocks in this function. OutStreamer->emitULEB128IntValue(MF.size()); + const MCSymbol *PrevMBBEndSymbol = FunctionSymbol; // Emit BB Information for each basic block in the funciton. 
for (const MachineBasicBlock &MBB : MF) { const MCSymbol *MBBSymbol = MBB.isEntryBlock() ? FunctionSymbol : MBB.getSymbol(); - // Emit the basic block offset. - emitLabelDifferenceAsULEB128(MBBSymbol, FunctionSymbol); + // Emit the basic block offset relative to the end of the previous block. + // This is zero unless the block is padded due to alignment. + emitLabelDifferenceAsULEB128(MBBSymbol, PrevMBBEndSymbol); // Emit the basic block size. When BBs have alignments, their size cannot // always be computed from their offsets. emitLabelDifferenceAsULEB128(MBB.getEndSymbol(), MBBSymbol); OutStreamer->emitULEB128IntValue(getBBAddrMapMetadata(MBB)); + PrevMBBEndSymbol = MBB.getEndSymbol(); } OutStreamer->PopSection(); } diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index c1901bc46d727..29478b5dbc085 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -60,7 +60,7 @@ // Basic Block Labels // ================== // -// With -fbasic-block-sections=labels, we emit the offsets of BB addresses of +// With -fbasic-block-sections=labels, we encode the offsets of BB addresses of // every function into the .llvm_bb_addr_map section. Along with the function // symbols, this allows for mapping of virtual addresses in PMU profiles back to // the corresponding basic blocks. This logic is implemented in AsmPrinter. This diff --git a/llvm/test/CodeGen/X86/basic-block-sections-labels.ll b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll index b9bcdc5258ada..0de215d099d01 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-labels.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-labels.ll @@ -51,12 +51,12 @@ declare i32 @__gxx_personality_v0(...) 
; CHECK-NEXT: .uleb128 .Lfunc_begin0-.Lfunc_begin0 ; CHECK-NEXT: .uleb128 .LBB_END0_0-.Lfunc_begin0 ; CHECK-NEXT: .byte 8 -; CHECK-NEXT: .uleb128 .LBB0_1-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB0_1-.LBB_END0_0 ; CHECK-NEXT: .uleb128 .LBB_END0_1-.LBB0_1 ; CHECK-NEXT: .byte 8 -; CHECK-NEXT: .uleb128 .LBB0_2-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB0_2-.LBB_END0_1 ; CHECK-NEXT: .uleb128 .LBB_END0_2-.LBB0_2 ; CHECK-NEXT: .byte 1 -; CHECK-NEXT: .uleb128 .LBB0_3-.Lfunc_begin0 +; CHECK-NEXT: .uleb128 .LBB0_3-.LBB_END0_2 ; CHECK-NEXT: .uleb128 .LBB_END0_3-.LBB0_3 ; CHECK-NEXT: .byte 5 diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test index 0545cd959104a..7cb75d2acce38 100644 --- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map.test @@ -30,7 +30,7 @@ # LLVM-NEXT: CanFallThrough: No # LLVM-NEXT: } # LLVM-NEXT: { -# LLVM-NEXT: Offset: 0x3 +# LLVM-NEXT: Offset: 0x4 # LLVM-NEXT: Size: 0x4 # LLVM-NEXT: HasReturn: Yes # LLVM-NEXT: HasTailCall: No diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index c4a20ca932d6a..6583897c77f8f 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -6962,10 +6962,13 @@ template void LLVMELFDumper::printBBAddrMaps() { W.printString("Name", FuncName); ListScope L(W, "BB entries"); + uint32_t FunctionRelativeAddress = 0; for (const BBAddrMap::BBEntry &BBE : AM.BBEntries) { DictScope L(W); - W.printHex("Offset", BBE.Offset); + FunctionRelativeAddress += BBE.Offset; + W.printHex("Offset", FunctionRelativeAddress); W.printHex("Size", BBE.Size); + FunctionRelativeAddress += BBE.Size; W.printBoolean("HasReturn", BBE.HasReturn); W.printBoolean("HasTailCall", BBE.HasTailCall); W.printBoolean("IsEHPad", BBE.IsEHPad); From 4745c994e4a794ca177152c4c0bd0f640d0cbe8b Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Tue, 28 Dec 2021 11:44:49 -0800 Subject: 
[PATCH 554/748] Set std::numeric_limits<>::tinyness_before to true for floating point types on ARM platforms. Set std::numeric_limits<>::tinyness_before to true for floating point types on ARM platforms. Section E1.3.5 in the ARMv8 Architecture Reference Manual specifies: Underflow. The bit is set to 1 if the absolute value of the result of an operation, produced before rounding, is less than the minimum positive normalized number for the destination precision, and the rounded result is inexact. Reviewed By: #libc, majnemer, EricWF Differential Revision: https://reviews.llvm.org/D116338 --- libcxx/include/limits | 12 ++++++++++++ .../numeric.limits.members/tinyness_before.pass.cpp | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/libcxx/include/limits b/libcxx/include/limits index bf5d6d1fc4e71..5afef4bd7e064 100644 --- a/libcxx/include/limits +++ b/libcxx/include/limits @@ -339,7 +339,11 @@ protected: static _LIBCPP_CONSTEXPR const bool is_modulo = false; static _LIBCPP_CONSTEXPR const bool traps = false; +#if (defined(__arm__) || defined(__aarch64__)) + static _LIBCPP_CONSTEXPR const bool tinyness_before = true; +#else static _LIBCPP_CONSTEXPR const bool tinyness_before = false; +#endif static _LIBCPP_CONSTEXPR const float_round_style round_style = round_to_nearest; }; @@ -385,7 +389,11 @@ protected: static _LIBCPP_CONSTEXPR const bool is_modulo = false; static _LIBCPP_CONSTEXPR const bool traps = false; +#if (defined(__arm__) || defined(__aarch64__)) + static _LIBCPP_CONSTEXPR const bool tinyness_before = true; +#else static _LIBCPP_CONSTEXPR const bool tinyness_before = false; +#endif static _LIBCPP_CONSTEXPR const float_round_style round_style = round_to_nearest; }; @@ -435,7 +443,11 @@ protected: static _LIBCPP_CONSTEXPR const bool is_modulo = false; static _LIBCPP_CONSTEXPR const bool traps = false; +#if (defined(__arm__) || defined(__aarch64__)) + static _LIBCPP_CONSTEXPR const bool tinyness_before = true; +#else static _LIBCPP_CONSTEXPR const bool 
tinyness_before = false; +#endif static _LIBCPP_CONSTEXPR const float_round_style round_style = round_to_nearest; }; diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp index e132d4fc1449b..3231a63a54bde 100644 --- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/tinyness_before.pass.cpp @@ -50,9 +50,15 @@ int main(int, char**) test<__int128_t, false>(); test<__uint128_t, false>(); #endif +#if (defined(__arm__) || defined(__aarch64__)) + test(); + test(); + test(); +#else test(); test(); test(); +#endif return 0; } From 57a6d921639213b95d3c3e594e24a8be6fb6981e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 15:52:10 -0800 Subject: [PATCH 555/748] [instcombine] Add test coverage for a tricky bit of reasoning about unescaped mallocs --- .../InstCombine/compare-unescaped.ll | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/compare-unescaped.ll b/llvm/test/Transforms/InstCombine/compare-unescaped.ll index fefe036b0e7c1..d7b2eb817268c 100644 --- a/llvm/test/Transforms/InstCombine/compare-unescaped.ll +++ b/llvm/test/Transforms/InstCombine/compare-unescaped.ll @@ -203,4 +203,120 @@ define i1 @compare_distinct_pointer_escape() { ret i1 %cmp } +; The next block of tests demonstrate a very subtle correctness requirement. +; We can generally assume any *single* heap layout we chose for the result of +; a malloc call, but we can't simultanious assume two different ones. As a +; result, we must make sure that we only fold conditions if we can ensure that +; we fold *all* potentially address capturing compares the same. This is +; the same point that applies to allocas, applied to noaiias/malloc. 
+ +; These two functions represents either a) forging a pointer via inttoptr or +; b) indexing off an adjacent allocation. In either case, the operation is +; obscured by an uninlined helper and not visible to instcombine. +declare i8* @hidden_inttoptr() +declare i8* @hidden_offset(i8* %other) + +; FIXME: Missed oppurtunity +define i1 @ptrtoint_single_cmp() { +; CHECK-LABEL: @ptrtoint_single_cmp( +; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[M]], inttoptr (i64 2048 to i8*) +; CHECK-NEXT: ret i1 [[CMP]] +; + %m = call i8* @malloc(i64 4) + %rhs = inttoptr i64 2048 to i8* + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +define i1 @offset_single_cmp() { +; CHECK-LABEL: @offset_single_cmp( +; CHECK-NEXT: ret i1 false +; + %m = call i8* @malloc(i64 4) + %n = call i8* @malloc(i64 4) + %rhs = getelementptr i8, i8* %n, i32 4 + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +define i1 @neg_consistent_fold1() { +; CHECK-LABEL: @neg_consistent_fold1( +; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8* [[M]], inttoptr (i64 2048 to i8*) +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8* [[RHS2]], inttoptr (i64 2048 to i8*) +; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[CMP1]], [[TMP1]] +; CHECK-NEXT: ret i1 [[TMP2]] +; + %m = call i8* @malloc(i64 4) + %rhs = inttoptr i64 2048 to i8* + %rhs2 = call i8* @hidden_inttoptr() + %cmp1 = icmp eq i8* %m, %rhs + %cmp2 = icmp eq i8* %m, %rhs2 + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @neg_consistent_fold2() { +; CHECK-LABEL: @neg_consistent_fold2( +; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: [[N:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: [[RHS:%.*]] = getelementptr i8, i8* [[N]], i64 4 +; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_offset(i8* [[N]]) +; CHECK-NEXT: 
[[CMP1:%.*]] = icmp eq i8* [[M]], [[RHS]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M]], [[RHS2]] +; CHECK-NEXT: [[RES:%.*]] = and i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret i1 [[RES]] +; + %m = call i8* @malloc(i64 4) + %n = call i8* @malloc(i64 4) + %rhs = getelementptr i8, i8* %n, i32 4 + %rhs2 = call i8* @hidden_offset(i8* %n) + %cmp1 = icmp eq i8* %m, %rhs + %cmp2 = icmp eq i8* %m, %rhs2 + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @neg_consistent_fold3() { +; CHECK-LABEL: @neg_consistent_fold3( +; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: [[BC:%.*]] = bitcast i8* [[M]] to i32* +; CHECK-NEXT: [[LGP:%.*]] = load i32*, i32** @gp, align 8 +; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32* [[LGP]], [[BC]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M]], [[RHS2]] +; CHECK-NEXT: [[RES:%.*]] = and i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret i1 [[RES]] +; + %m = call i8* @malloc(i64 4) + %bc = bitcast i8* %m to i32* + %lgp = load i32*, i32** @gp, align 8 + %rhs2 = call i8* @hidden_inttoptr() + %cmp1 = icmp eq i32* %bc, %lgp + %cmp2 = icmp eq i8* %m, %rhs2 + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + +; FIXME: This appears correct, but the current implementation relies +; on visiting both cmps in the same pass. We may have an simplification order +; under which one is missed, and that would be a bug. 
+define i1 @neg_consistent_fold4() { +; CHECK-LABEL: @neg_consistent_fold4( +; CHECK-NEXT: ret i1 false +; + %m = call i8* @malloc(i64 4) + %bc = bitcast i8* %m to i32* + %lgp = load i32*, i32** @gp, align 8 + %cmp1 = icmp eq i32* %bc, %lgp + %cmp2 = icmp eq i32* %bc, %lgp + %res = and i1 %cmp1, %cmp2 + ret i1 %res +} + + !0 = !{} + + From 2cca2c7d18f9e6ccb11e91fe19066f3c39dab76d Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 16:01:39 -0800 Subject: [PATCH 556/748] [instcombine] Extend test coverage for a tricky bit of reasoning about unescaped mallocs --- .../InstCombine/compare-unescaped.ll | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/compare-unescaped.ll b/llvm/test/Transforms/InstCombine/compare-unescaped.ll index d7b2eb817268c..8cc77667d8cbf 100644 --- a/llvm/test/Transforms/InstCombine/compare-unescaped.ll +++ b/llvm/test/Transforms/InstCombine/compare-unescaped.ll @@ -316,6 +316,53 @@ define i1 @neg_consistent_fold4() { ret i1 %res } +declare void @unknown(i8*) + +; Points out that a nocapture call can't cause a consistent result issue +; as it is (by assumption) not able to contain a comparison which might +; capture the address. 
+ +define i1 @consistent_nocapture_inttoptr() { +; CHECK-LABEL: @consistent_nocapture_inttoptr( +; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: call void @unknown(i8* nocapture [[M]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[M]], inttoptr (i64 2048 to i8*) +; CHECK-NEXT: ret i1 [[CMP]] +; + %m = call i8* @malloc(i64 4) + call void @unknown(i8* nocapture %m) + %rhs = inttoptr i64 2048 to i8* + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +define i1 @consistent_nocapture_offset() { +; CHECK-LABEL: @consistent_nocapture_offset( +; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: call void @unknown(i8* nocapture [[M]]) +; CHECK-NEXT: ret i1 false +; + %m = call i8* @malloc(i64 4) + call void @unknown(i8* nocapture %m) + %n = call i8* @malloc(i64 4) + %rhs = getelementptr i8, i8* %n, i32 4 + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +define i1 @consistent_nocapture_through_global() { +; CHECK-LABEL: @consistent_nocapture_through_global( +; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) +; CHECK-NEXT: call void @unknown(i8* nocapture [[M]]) +; CHECK-NEXT: ret i1 false +; + %m = call i8* @malloc(i64 4) + call void @unknown(i8* nocapture %m) + %bc = bitcast i8* %m to i32* + %lgp = load i32*, i32** @gp, align 8, !nonnull !0 + %cmp = icmp eq i32* %bc, %lgp + ret i1 %cmp +} !0 = !{} From 8b9f42b61b33ec1493e6d71d0240da6dfc847be2 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 16:02:27 -0800 Subject: [PATCH 557/748] [instcombine] Autogen a test for ease of update --- .../Transforms/InstCombine/compare-alloca.ll | 63 +++++++++++++------ 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/compare-alloca.ll b/llvm/test/Transforms/InstCombine/compare-alloca.ll index ff55176412057..a9e10724805ae 100644 --- a/llvm/test/Transforms/InstCombine/compare-alloca.ll +++ 
b/llvm/test/Transforms/InstCombine/compare-alloca.ll @@ -1,54 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=instcombine -S %s | FileCheck %s target datalayout = "p:32:32" define i1 @alloca_argument_compare(i64* %arg) { +; CHECK-LABEL: @alloca_argument_compare( +; CHECK-NEXT: ret i1 false +; %alloc = alloca i64 %cmp = icmp eq i64* %arg, %alloc ret i1 %cmp - ; CHECK-LABEL: alloca_argument_compare - ; CHECK: ret i1 false } define i1 @alloca_argument_compare_swapped(i64* %arg) { +; CHECK-LABEL: @alloca_argument_compare_swapped( +; CHECK-NEXT: ret i1 false +; %alloc = alloca i64 %cmp = icmp eq i64* %alloc, %arg ret i1 %cmp - ; CHECK-LABEL: alloca_argument_compare_swapped - ; CHECK: ret i1 false } define i1 @alloca_argument_compare_ne(i64* %arg) { +; CHECK-LABEL: @alloca_argument_compare_ne( +; CHECK-NEXT: ret i1 true +; %alloc = alloca i64 %cmp = icmp ne i64* %arg, %alloc ret i1 %cmp - ; CHECK-LABEL: alloca_argument_compare_ne - ; CHECK: ret i1 true } define i1 @alloca_argument_compare_derived_ptrs(i64* %arg, i64 %x) { +; CHECK-LABEL: @alloca_argument_compare_derived_ptrs( +; CHECK-NEXT: ret i1 false +; %alloc = alloca i64, i64 8 %p = getelementptr i64, i64* %arg, i64 %x %q = getelementptr i64, i64* %alloc, i64 3 %cmp = icmp eq i64* %p, %q ret i1 %cmp - ; CHECK-LABEL: alloca_argument_compare_derived_ptrs - ; CHECK: ret i1 false } declare void @escape(i64*) define i1 @alloca_argument_compare_escaped_alloca(i64* %arg) { +; CHECK-LABEL: @alloca_argument_compare_escaped_alloca( +; CHECK-NEXT: [[ALLOC:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @escape(i64* nonnull [[ALLOC]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64* [[ALLOC]], [[ARG:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; %alloc = alloca i64 call void @escape(i64* %alloc) %cmp = icmp eq i64* %alloc, %arg ret i1 %cmp - ; CHECK-LABEL: alloca_argument_compare_escaped_alloca - ; CHECK: %cmp = icmp eq i64* %alloc, %arg - ; CHECK: ret i1 %cmp } declare void 
@check_compares(i1, i1) define void @alloca_argument_compare_two_compares(i64* %p) { +; CHECK-LABEL: @alloca_argument_compare_two_compares( +; CHECK-NEXT: [[Q1:%.*]] = alloca [8 x i64], align 8 +; CHECK-NEXT: [[Q1_SUB:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[Q1]], i32 0, i32 0 +; CHECK-NEXT: [[R:%.*]] = getelementptr i64, i64* [[P:%.*]], i32 1 +; CHECK-NEXT: [[S:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[Q1]], i32 0, i32 2 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64* [[Q1_SUB]], [[P]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64* [[R]], [[S]] +; CHECK-NEXT: call void @check_compares(i1 [[CMP1]], i1 [[CMP2]]) +; CHECK-NEXT: ret void +; %q = alloca i64, i64 8 %r = getelementptr i64, i64* %p, i64 1 %s = getelementptr i64, i64* %q, i64 2 @@ -57,24 +75,29 @@ define void @alloca_argument_compare_two_compares(i64* %p) { call void @check_compares(i1 %cmp1, i1 %cmp2) ret void ; We will only fold if there is a single cmp. - ; CHECK-LABEL: alloca_argument_compare_two_compares - ; CHECK: call void @check_compares(i1 %cmp1, i1 %cmp2) } define i1 @alloca_argument_compare_escaped_through_store(i64* %arg, i64** %ptr) { +; CHECK-LABEL: @alloca_argument_compare_escaped_through_store( +; CHECK-NEXT: [[ALLOC:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64* [[ALLOC]], [[ARG:%.*]] +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds i64, i64* [[ALLOC]], i32 1 +; CHECK-NEXT: store i64* [[P]], i64** [[PTR:%.*]], align 4 +; CHECK-NEXT: ret i1 [[CMP]] +; %alloc = alloca i64 %cmp = icmp eq i64* %alloc, %arg %p = getelementptr i64, i64* %alloc, i64 1 store i64* %p, i64** %ptr ret i1 %cmp - ; CHECK-LABEL: alloca_argument_compare_escaped_through_store - ; CHECK: %cmp = icmp eq i64* %alloc, %arg - ; CHECK: ret i1 %cmp } declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) define i1 @alloca_argument_compare_benign_instrs(i8* %arg) { +; CHECK-LABEL: @alloca_argument_compare_benign_instrs( +; 
CHECK-NEXT: ret i1 false +; %alloc = alloca i8 call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloc) %cmp = icmp eq i8* %arg, %alloc @@ -82,16 +105,16 @@ define i1 @alloca_argument_compare_benign_instrs(i8* %arg) { store i8 %x, i8* %alloc call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloc) ret i1 %cmp - ; CHECK-LABEL: alloca_argument_compare_benign_instrs - ; CHECK: ret i1 false } declare i64* @allocator() define i1 @alloca_call_compare() { +; CHECK-LABEL: @alloca_call_compare( +; CHECK-NEXT: [[Q:%.*]] = call i64* @allocator() +; CHECK-NEXT: ret i1 false +; %p = alloca i64 %q = call i64* @allocator() %cmp = icmp eq i64* %p, %q ret i1 %cmp - ; CHECK-LABEL: alloca_call_compare - ; CHECK: ret i1 false } From 2368f18eb305ae9d5a4f2110c048e5daf5007992 Mon Sep 17 00:00:00 2001 From: Wouter van Oortmerssen Date: Tue, 22 Feb 2022 15:45:49 -0800 Subject: [PATCH 558/748] [WebAssembly] Fixed AsmPrinter not emitting .functype for intrinsics Intrinsics like `memset` were not emitted as `.functype` because WebAssemblyAsmPrinter::emitExternalDecls explicitly skips symbols that are isIntrinsic. Removing that check doesn't work, since the symbol from the module refers to a 4-argument `llvm.memset.p0i8.i32` rather than the 3-argument `memset` symbol referenced in the call. Our `WebAssemblyMCLowerPrePass` however does collect the `memset` symbol, so the current solution is as simple as emitting `.functype` for those. 
Fixes: https://github.com/llvm/llvm-project/issues/53712 Differential Revision: https://reviews.llvm.org/D120365 --- .../WebAssembly/WebAssemblyAsmPrinter.cpp | 7 ++++- .../WebAssembly/extern-functype-intrinsic.ll | 30 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 4dc2c4c9e29e2..a518938dd908c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -302,7 +302,12 @@ void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) { // not be found here. MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo(); for (const auto &Name : MMIW.MachineSymbolsUsed) { - getOrCreateWasmSymbol(Name.getKey()); + auto *WasmSym = cast(getOrCreateWasmSymbol(Name.getKey())); + if (WasmSym->isFunction()) { + // TODO(wvo): is there any case where this overlaps with the call to + // emitFunctionType in the loop below? 
+ getTargetStreamer()->emitFunctionType(WasmSym); + } } for (auto &It : OutContext.getSymbols()) { diff --git a/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll b/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll new file mode 100644 index 0000000000000..f2677262e3b96 --- /dev/null +++ b/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll @@ -0,0 +1,30 @@ +; RUN: llc %s -o - | FileCheck %s +; RUN: llc %s -o - | llvm-mc -triple=wasm32-unknown-unknown | FileCheck %s + +; ModuleID = 'test.c' +source_filename = "test.c" +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-wasi" + +; Function Attrs: nounwind +define hidden i32 @d() local_unnamed_addr #0 { +entry: + %0 = call i32 bitcast (i32 (...)* @g to i32 ()*)() #3 + call void @llvm.memset.p0i8.i32(i8* nonnull align 4 inttoptr (i32 4 to i8*), i8 0, i32 %0, i1 false) ; preds = %for.body.preheader, %entry + ret i32 undef +} + +declare i32 @g(...) local_unnamed_addr #1 + +; Function Attrs: argmemonly nofree nounwind willreturn writeonly +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg) #2 + +attributes #0 = { nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" } +attributes #1 = { "frame-pointer"="none" "no-prototype" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" } +attributes #2 = { argmemonly nofree nounwind willreturn writeonly } +attributes #3 = { nounwind } + +; CHECK: .functype memset (i32, i32, i32) -> (i32) +; CHECK: .functype g () -> (i32) +; CHECK: call g +; CHECK: call memset From 105ddd0fdca0e585db6be05ee8f4a1941d113ca2 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 22 Feb 2022 16:11:49 -0800 Subject: [PATCH 559/748] [NFC] Remove dead code (try 2) This is causing ../../llvm/include/llvm/Object/MachO.h:379:13: warning: private field 'Kind' is not used 
[-Wunused-private-field] FixupKind Kind; Previous attempt in a23f7c0cb6b42a06bc9707fdf46ce2a90080f61f. --- llvm/include/llvm/Object/MachO.h | 7 ++----- llvm/lib/Object/MachOObjectFile.cpp | 12 +++++------- llvm/tools/llvm-objdump/MachODump.cpp | 8 +++----- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 3350e8215ff9f..de911c005b53d 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -364,8 +364,7 @@ class MachOChainedFixupEntry : public MachOAbstractFixupEntry { public: enum class FixupKind { All, Bind, WeakBind, Rebase }; - MachOChainedFixupEntry(Error *Err, const MachOObjectFile *O, FixupKind Kind, - bool Parse); + MachOChainedFixupEntry(Error *Err, const MachOObjectFile *O, bool Parse); bool operator==(const MachOChainedFixupEntry &) const; @@ -376,7 +375,6 @@ class MachOChainedFixupEntry : public MachOAbstractFixupEntry { private: std::vector FixupTargets; uint32_t FixupIndex = 0; - FixupKind Kind; }; using fixup_iterator = content_iterator; @@ -523,8 +521,7 @@ class MachOObjectFile : public ObjectFile { iterator_range bindTable(Error &Err); /// For iterating over all chained fixups. - iterator_range - fixupTable(Error &Err, MachOChainedFixupEntry::FixupKind Kind); + iterator_range fixupTable(Error &Err); /// For use iterating over all lazy bind table entries. 
iterator_range lazyBindTable(Error &Err); diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 051055173d3f1..5d6e237c8a99a 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -3256,8 +3256,8 @@ void MachOAbstractFixupEntry::moveToEnd() { Done = true; } MachOChainedFixupEntry::MachOChainedFixupEntry(Error *E, const MachOObjectFile *O, - FixupKind Kind, bool Parse) - : MachOAbstractFixupEntry(E, O), Kind(Kind) { + bool Parse) + : MachOAbstractFixupEntry(E, O) { ErrorAsOutParameter e(E); if (Parse) { if (auto FixupTargetsOrErr = O->getDyldChainedFixupTargets()) @@ -4298,13 +4298,11 @@ iterator_range MachOObjectFile::weakBindTable(Error &Err) { MachOBindEntry::Kind::Weak); } -iterator_range -MachOObjectFile::fixupTable(Error &Err, - MachOChainedFixupEntry::FixupKind Kind) { - MachOChainedFixupEntry Start(&Err, this, Kind, true); +iterator_range MachOObjectFile::fixupTable(Error &Err) { + MachOChainedFixupEntry Start(&Err, this, true); Start.moveToFirst(); - MachOChainedFixupEntry Finish(&Err, this, Kind, false); + MachOChainedFixupEntry Finish(&Err, this, false); Finish.moveToEnd(); return make_range(fixup_iterator(Start), fixup_iterator(Finish)); diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index 88731e828598b..4ec555ecf3a30 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -1184,11 +1184,9 @@ static void PrintLinkOptHints(MachOObjectFile *O) { } } -static void printMachOChainedFixups(object::MachOObjectFile *Obj, - MachOChainedFixupEntry::FixupKind Type) { +static void printMachOChainedFixups(object::MachOObjectFile *Obj) { Error Err = Error::success(); - for (const object::MachOChainedFixupEntry &Entry : - Obj->fixupTable(Err, Type)) { + for (const object::MachOChainedFixupEntry &Entry : Obj->fixupTable(Err)) { (void)Entry; } if (Err) @@ -1197,7 +1195,7 @@ static void 
printMachOChainedFixups(object::MachOObjectFile *Obj, static void PrintDyldInfo(MachOObjectFile *O) { outs() << "dyld information:" << '\n'; - printMachOChainedFixups(O, MachOChainedFixupEntry::FixupKind::Bind); + printMachOChainedFixups(O); } static void PrintDylibs(MachOObjectFile *O, bool JustId) { From 3ef7e6c53c825903d77a8d004a82c865f493e1bf Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 22 Feb 2022 16:19:06 -0800 Subject: [PATCH 560/748] [clang] Remove an Address::deprecated() call in CGClass.cpp --- clang/lib/CodeGen/CGClass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 612209ef8fe8f..f6cacd07a66f2 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2148,7 +2148,7 @@ void CodeGenFunction::EmitCXXConstructorCall(const CXXConstructorDecl *D, assert(Args.size() == 2 && "unexpected argcount for trivial ctor"); QualType SrcTy = D->getParamDecl(0)->getType().getNonReferenceType(); - Address Src = Address::deprecated(Args[1].getRValue(*this).getScalarVal(), + Address Src = Address(Args[1].getRValue(*this).getScalarVal(), ConvertTypeForMem(SrcTy), CGM.getNaturalTypeAlignment(SrcTy)); LValue SrcLVal = MakeAddrLValue(Src, SrcTy); QualType DestTy = getContext().getTypeDeclType(ClassDecl); From 3de5322b5f719d9414423d4237a6533fe43cd7f8 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 22 Feb 2022 16:20:40 -0800 Subject: [PATCH 561/748] [sanitizer] Refactor GetNextInstructionPc/GetPreviousInstructionPc x86 uses offset 1 while most RISC architectures use offset 4. Check x86 first to prevent changes for new RISC architectures. 
Reviewed By: #sanitizers, vitalybuka Differential Revision: https://reviews.llvm.org/D120362 --- .../lib/sanitizer_common/sanitizer_stacktrace.cpp | 11 ++++++----- .../lib/sanitizer_common/sanitizer_stacktrace.h | 7 +++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp index 37e9e6dd08d7b..5a6329eec484d 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp @@ -20,11 +20,10 @@ namespace __sanitizer { uptr StackTrace::GetNextInstructionPc(uptr pc) { -#if defined(__sparc__) || defined(__mips__) - return pc + 8; -#elif defined(__powerpc__) || defined(__arm__) || defined(__aarch64__) || \ - defined(__hexagon__) +#if defined(__aarch64__) return STRIP_PAC_PC((void *)pc) + 4; +#elif defined(__sparc__) || defined(__mips__) + return pc + 8; #elif SANITIZER_RISCV64 // Current check order is 4 -> 2 -> 6 -> 8 u8 InsnByte = *(u8 *)(pc); @@ -47,8 +46,10 @@ uptr StackTrace::GetNextInstructionPc(uptr pc) { } // bail-out if could not figure out the instruction size return 0; -#else +#elif SANITIZER_I386 || SANITIZER_X32 || SANITIZER_X64 return pc + 1; +#else + return pc + 4; #endif } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h index aebd504669d2d..82c2fda351277 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h @@ -88,9 +88,6 @@ uptr StackTrace::GetPreviousInstructionPc(uptr pc) { // so we return (pc-2) in that case in order to be safe. // For A32 mode we return (pc-4) because all instructions are 32 bit long. return (pc - 3) & (~1); -#elif defined(__powerpc__) || defined(__powerpc64__) || defined(__aarch64__) - // PCs are always 4 byte aligned. 
- return pc - 4; #elif defined(__sparc__) || defined(__mips__) return pc - 8; #elif SANITIZER_RISCV64 @@ -101,8 +98,10 @@ uptr StackTrace::GetPreviousInstructionPc(uptr pc) { // It seems difficult to figure out the exact instruction length - // pc - 2 seems like a safe option for the purposes of stack tracing return pc - 2; -#else +#elif SANITIZER_I386 || SANITIZER_X32 || SANITIZER_X64 return pc - 1; +#else + return pc - 4; #endif } From 9030d90aeb842c43a9e7d44bbf280dca250a72d9 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 16:18:39 -0800 Subject: [PATCH 562/748] [instcombine] Add coverage for consistent use of unescaped malloc case --- .../Transforms/InstCombine/compare-alloca.ll | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/compare-alloca.ll b/llvm/test/Transforms/InstCombine/compare-alloca.ll index a9e10724805ae..21ec6cac3d681 100644 --- a/llvm/test/Transforms/InstCombine/compare-alloca.ll +++ b/llvm/test/Transforms/InstCombine/compare-alloca.ll @@ -118,3 +118,169 @@ define i1 @alloca_call_compare() { %cmp = icmp eq i64* %p, %q ret i1 %cmp } + + +; The next block of tests demonstrate a very subtle correctness requirement. +; We can generally assume any *single* stack layout we chose for the result of +; an alloca, but we can't simultanious assume two different ones. As a +; result, we must make sure that we only fold conditions if we can ensure that +; we fold *all* potentially address capturing compares the same. + +; These two functions represents either a) forging a pointer via inttoptr or +; b) indexing off an adjacent allocation. In either case, the operation is +; obscured by an uninlined helper and not visible to instcombine. 
+declare i8* @hidden_inttoptr() +declare i8* @hidden_offset(i8* %other) + +define i1 @ptrtoint_single_cmp() { +; CHECK-LABEL: @ptrtoint_single_cmp( +; CHECK-NEXT: ret i1 false +; + %m = alloca i8, i32 4 + %rhs = inttoptr i64 2048 to i8* + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +define i1 @offset_single_cmp() { +; CHECK-LABEL: @offset_single_cmp( +; CHECK-NEXT: ret i1 false +; + %m = alloca i8, i32 4 + %n = alloca i8, i32 4 + %rhs = getelementptr i8, i8* %n, i32 4 + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +define i1 @neg_consistent_fold1() { +; CHECK-LABEL: @neg_consistent_fold1( +; CHECK-NEXT: [[M1:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[M1_SUB:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[M1]], i32 0, i32 0 +; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8* [[M1_SUB]], inttoptr (i64 2048 to i8*) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M1_SUB]], [[RHS2]] +; CHECK-NEXT: [[RES:%.*]] = or i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret i1 [[RES]] +; + %m = alloca i8, i32 4 + %rhs = inttoptr i64 2048 to i8* + %rhs2 = call i8* @hidden_inttoptr() + %cmp1 = icmp eq i8* %m, %rhs + %cmp2 = icmp eq i8* %m, %rhs2 + %res = or i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @neg_consistent_fold2() { +; CHECK-LABEL: @neg_consistent_fold2( +; CHECK-NEXT: [[M1:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[N2:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[N2_SUB:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[N2]], i32 0, i32 0 +; CHECK-NEXT: [[M1_SUB:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[M1]], i32 0, i32 0 +; CHECK-NEXT: [[RHS:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[N2]], i32 0, i32 4 +; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_offset(i8* nonnull [[N2_SUB]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8* [[M1_SUB]], [[RHS]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M1_SUB]], [[RHS2]] +; CHECK-NEXT: [[RES:%.*]] = or i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret i1 [[RES]] 
+; + %m = alloca i8, i32 4 + %n = alloca i8, i32 4 + %rhs = getelementptr i8, i8* %n, i32 4 + %rhs2 = call i8* @hidden_offset(i8* %n) + %cmp1 = icmp eq i8* %m, %rhs + %cmp2 = icmp eq i8* %m, %rhs2 + %res = or i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @neg_consistent_fold3() { +; CHECK-LABEL: @neg_consistent_fold3( +; CHECK-NEXT: [[M1:%.*]] = alloca i32, align 1 +; CHECK-NEXT: [[M1_SUB:%.*]] = bitcast i32* [[M1]] to i8* +; CHECK-NEXT: [[LGP:%.*]] = load i32*, i32** @gp, align 8 +; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32* [[M1]], [[LGP]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[RHS2]], [[M1_SUB]] +; CHECK-NEXT: [[RES:%.*]] = or i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: ret i1 [[RES]] +; + %m = alloca i8, i32 4 + %bc = bitcast i8* %m to i32* + %lgp = load i32*, i32** @gp, align 8 + %rhs2 = call i8* @hidden_inttoptr() + %cmp1 = icmp eq i32* %bc, %lgp + %cmp2 = icmp eq i8* %m, %rhs2 + %res = or i1 %cmp1, %cmp2 + ret i1 %res +} + +define i1 @neg_consistent_fold4() { +; CHECK-LABEL: @neg_consistent_fold4( +; CHECK-NEXT: ret i1 false +; + %m = alloca i8, i32 4 + %bc = bitcast i8* %m to i32* + %lgp = load i32*, i32** @gp, align 8 + %cmp1 = icmp eq i32* %bc, %lgp + %cmp2 = icmp eq i32* %bc, %lgp + %res = or i1 %cmp1, %cmp2 + ret i1 %res +} + +; A nocapture call can't cause a consistent result issue as it is (by +; assumption) not able to contain a comparison which might capture the +; address. 
+ +declare void @unknown(i8*) + +; TODO: Missing optimization +define i1 @consistent_nocapture_inttoptr() { +; CHECK-LABEL: @consistent_nocapture_inttoptr( +; CHECK-NEXT: [[M1:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[M1_SUB:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[M1]], i32 0, i32 0 +; CHECK-NEXT: call void @unknown(i8* nocapture nonnull [[M1_SUB]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[M1_SUB]], inttoptr (i64 2048 to i8*) +; CHECK-NEXT: ret i1 [[CMP]] +; + %m = alloca i8, i32 4 + call void @unknown(i8* nocapture %m) + %rhs = inttoptr i64 2048 to i8* + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +define i1 @consistent_nocapture_offset() { +; CHECK-LABEL: @consistent_nocapture_offset( +; CHECK-NEXT: [[M1:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[M1_SUB:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[M1]], i32 0, i32 0 +; CHECK-NEXT: call void @unknown(i8* nocapture nonnull [[M1_SUB]]) +; CHECK-NEXT: ret i1 false +; + %m = alloca i8, i32 4 + call void @unknown(i8* nocapture %m) + %n = alloca i8, i32 4 + %rhs = getelementptr i8, i8* %n, i32 4 + %cmp = icmp eq i8* %m, %rhs + ret i1 %cmp +} + +@gp = global i32* null, align 8 +; TODO: Missing optimization +define i1 @consistent_nocapture_through_global() { +; CHECK-LABEL: @consistent_nocapture_through_global( +; CHECK-NEXT: [[M1:%.*]] = alloca i32, align 1 +; CHECK-NEXT: [[M1_SUB:%.*]] = bitcast i32* [[M1]] to i8* +; CHECK-NEXT: call void @unknown(i8* nocapture nonnull [[M1_SUB]]) +; CHECK-NEXT: [[LGP:%.*]] = load i32*, i32** @gp, align 8, !nonnull !0 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[M1]], [[LGP]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %m = alloca i8, i32 4 + call void @unknown(i8* nocapture %m) + %bc = bitcast i8* %m to i32* + %lgp = load i32*, i32** @gp, align 8, !nonnull !{} + %cmp = icmp eq i32* %bc, %lgp + ret i1 %cmp +} From 8b83b8f131a4ea7e7d892d6afd76d54429d5bc09 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Tue, 22 Feb 2022 12:21:07 -0800 Subject: [PATCH 563/748] 
[mlir][sparse] refactor sparse compiler pipeline to single place Reviewed By: bixia Differential Revision: https://reviews.llvm.org/D120347 --- .../Dialect/SparseTensor/python/test_SDDMM.py | 21 +++------ .../Dialect/SparseTensor/python/test_SpMM.py | 21 +++------ .../test_elementwise_add_sparse_output.py | 17 +------ .../SparseTensor/python/test_output.py | 21 +++------ .../SparseTensor/python/test_stress.py | 44 ++++++------------- .../python/tools/sparse_compiler.py | 19 ++++++++ 6 files changed, 50 insertions(+), 93 deletions(-) create mode 100644 mlir/test/Integration/Dialect/SparseTensor/python/tools/sparse_compiler.py diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py index c52b30c2c21d9..538d5c853901a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py @@ -4,18 +4,19 @@ import ctypes import numpy as np import os - -import mlir.all_passes_registration +import sys from mlir import ir from mlir import runtime as rt from mlir import execution_engine -from mlir import passmanager from mlir.dialects import sparse_tensor as st from mlir.dialects import builtin from mlir.dialects.linalg.opdsl import lang as dsl +_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_PATH) +from tools import sparse_compiler @dsl.linalg_structured_op def sddmm_dsl( @@ -119,18 +120,6 @@ def build_compile_and_run_SDDMMM(attr: st.EncodingAttr, opt: str, quit(f'FAILURE') -class SparseCompiler: - """Sparse compiler passes.""" - - def __init__(self, options: str): - pipeline = ( - f'sparse-compiler{{{options} reassociate-fp-reductions=1 enable-index-optimizations=1}}') - self.pipeline = pipeline - - def __call__(self, module: ir.Module): - passmanager.PassManager.parse(self.pipeline).run(module) - - def main(): support_lib = os.getenv('SUPPORT_LIB') assert support_lib is not 
None, 'SUPPORT_LIB is undefined' @@ -166,7 +155,7 @@ def main(): opt = (f'parallelization-strategy={par} ' f'vectorization-strategy={vec} ' f'vl={vl} enable-simd-index32={e}') - compiler = SparseCompiler(options=opt) + compiler = sparse_compiler.SparseCompiler(options=opt) build_compile_and_run_SDDMMM(attr, opt, support_lib, compiler) count = count + 1 # CHECK: Passed 16 tests diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py index 1b66628ad7bda..77b94ea887767 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py @@ -4,18 +4,19 @@ import ctypes import numpy as np import os - -import mlir.all_passes_registration +import sys from mlir import ir from mlir import runtime as rt from mlir import execution_engine -from mlir import passmanager from mlir.dialects import sparse_tensor as st from mlir.dialects import builtin from mlir.dialects.linalg.opdsl import lang as dsl +_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_PATH) +from tools import sparse_compiler @dsl.linalg_structured_op def matmul_dsl( @@ -108,18 +109,6 @@ def build_compile_and_run_SpMM(attr: st.EncodingAttr, support_lib: str, quit(f'FAILURE') -class SparseCompiler: - """Sparse compiler passes.""" - - def __init__(self, options: str): - pipeline = ( - f'sparse-compiler{{{options} reassociate-fp-reductions=1 enable-index-optimizations=1}}') - self.pipeline = pipeline - - def __call__(self, module: ir.Module): - passmanager.PassManager.parse(self.pipeline).run(module) - - def main(): support_lib = os.getenv('SUPPORT_LIB') assert support_lib is not None, 'SUPPORT_LIB is undefined' @@ -155,7 +144,7 @@ def main(): for pwidth in bitwidths: for iwidth in bitwidths: attr = st.EncodingAttr.get(level, ordering, pwidth, iwidth) - compiler = SparseCompiler(options=opt) + compiler = 
sparse_compiler.SparseCompiler(options=opt) build_compile_and_run_SpMM(attr, support_lib, compiler) count = count + 1 # CHECK: Passed 8 tests diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_elementwise_add_sparse_output.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_elementwise_add_sparse_output.py index 52e089eac8fcd..1cc79c43f728a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_elementwise_add_sparse_output.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_elementwise_add_sparse_output.py @@ -5,12 +5,9 @@ import os import sys -import mlir.all_passes_registration - from mlir import ir from mlir import runtime as rt from mlir import execution_engine -from mlir import passmanager from mlir.dialects import sparse_tensor as st from mlir.dialects import builtin from mlir.dialects.linalg.opdsl import lang as dsl @@ -18,6 +15,7 @@ _SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) sys.path.append(_SCRIPT_PATH) from tools import np_to_sparse_tensor as test_tools +from tools import sparse_compiler # TODO: Use linalg_structured_op to generate the kernel after making it to # handle sparse tensor outputs. 
@@ -61,21 +59,10 @@ """ -class _SparseCompiler: - """Sparse compiler passes.""" - - def __init__(self): - self.pipeline = ( - f'sparse-compiler{{reassociate-fp-reductions=1 enable-index-optimizations=1}}') - - def __call__(self, module: ir.Module): - passmanager.PassManager.parse(self.pipeline).run(module) - - def _run_test(support_lib, kernel): """Compiles, runs and checks results.""" module = ir.Module.parse(kernel) - _SparseCompiler()(module) + sparse_compiler.SparseCompiler(options='')(module) engine = execution_engine.ExecutionEngine( module, opt_level=0, shared_libs=[support_lib]) diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py index c29f618e26980..5e2210b2d81ba 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py @@ -3,18 +3,19 @@ import ctypes import os +import sys import tempfile -import mlir.all_passes_registration - from mlir import execution_engine from mlir import ir -from mlir import passmanager from mlir import runtime as rt from mlir.dialects import builtin from mlir.dialects import sparse_tensor as st +_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_PATH) +from tools import sparse_compiler # TODO: move more into actual IR building. 
def boilerplate(attr: st.EncodingAttr): @@ -68,18 +69,6 @@ def build_compile_and_run_output(attr: st.EncodingAttr, support_lib: str, quit('FAILURE') -class SparseCompiler: - """Sparse compiler passes.""" - - def __init__(self): - pipeline = ( - f'sparse-compiler{{reassociate-fp-reductions=1 enable-index-optimizations=1}}') - self.pipeline = pipeline - - def __call__(self, module: ir.Module): - passmanager.PassManager.parse(self.pipeline).run(module) - - def main(): support_lib = os.getenv('SUPPORT_LIB') assert support_lib is not None, 'SUPPORT_LIB is undefined' @@ -103,7 +92,7 @@ def main(): for ordering in orderings: for bwidth in bitwidths: attr = st.EncodingAttr.get(level, ordering, bwidth, bwidth) - compiler = SparseCompiler() + compiler = sparse_compiler.SparseCompiler(options='') build_compile_and_run_output(attr, support_lib, compiler) count = count + 1 diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py index ccf1ffd6cd263..7958e76862c46 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py @@ -6,21 +6,23 @@ import itertools import os import sys + from typing import List, Callable import numpy as np -import mlir.all_passes_registration - from mlir import ir from mlir import runtime as rt from mlir.execution_engine import ExecutionEngine -from mlir.passmanager import PassManager from mlir.dialects import builtin from mlir.dialects import std from mlir.dialects import sparse_tensor as st +_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_PATH) +from tools import sparse_compiler + # ===----------------------------------------------------------------------=== # # TODO: move this boilerplate to its own module, so it can be used by @@ -137,13 +139,15 @@ def writeTo(self, filename): f.write(str(self._module)) return self - def compile(self, 
compiler: Callable[[ir.Module], ExecutionEngine]): + def compile(self, compiler, support_lib: str): """Compile the ir.Module.""" assert self._module is not None, \ 'StressTest: must call build() before compile()' assert self._engine is None, \ 'StressTest: must not call compile() repeatedly' - self._engine = compiler(self._module) + compiler(self._module) + self._engine = ExecutionEngine( + self._module, opt_level=0, shared_libs=[support_lib]) return self def run(self, np_arg0: np.ndarray) -> np.ndarray: @@ -163,24 +167,6 @@ def run(self, np_arg0: np.ndarray) -> np.ndarray: # ===----------------------------------------------------------------------=== # -# TODO: move this boilerplate to its own module, so it can be used by -# other tests and programs. -class SparseCompiler: - """Sparse compiler passes.""" - - def __init__(self, sparsification_options: str, support_lib: str): - self._support_lib = support_lib - self._pipeline = ( - f'sparse-compiler{{{sparsification_options} reassociate-fp-reductions=1 enable-index-optimizations=1}}') - # Must be in the scope of a `with ir.Context():` - self._passmanager = PassManager.parse(self._pipeline) - - def __call__(self, module: ir.Module) -> ExecutionEngine: - self._passmanager.run(module) - return ExecutionEngine(module, opt_level=0, shared_libs=[self._support_lib]) - -# ===----------------------------------------------------------------------=== # - def main(): """ USAGE: python3 test_stress.py [raw_module.mlir [compiled_module.mlir]] @@ -208,7 +194,7 @@ def main(): f'vectorization-strategy={vec} ' f'vl={vl} ' f'enable-simd-index32={e}') - compiler = SparseCompiler(sparsification_options, support_lib) + compiler = sparse_compiler.SparseCompiler(options=sparsification_options) f64 = ir.F64Type.get() # Be careful about increasing this because # len(types) = 1 + 2^rank * rank! 
* len(bitwidths)^2 @@ -243,12 +229,10 @@ def main(): size *= d np_arg0 = np.arange(size, dtype=tyconv.irtype_to_dtype(f64)).reshape(*shape) np_out = ( - StressTest(tyconv) - .build(types) - .writeTo(sys.argv[1] if len(sys.argv) > 1 else None) - .compile(compiler) - .writeTo(sys.argv[2] if len(sys.argv) > 2 else None) - .run(np_arg0)) + StressTest(tyconv).build(types).writeTo( + sys.argv[1] if len(sys.argv) > 1 else None).compile( + compiler, support_lib).writeTo( + sys.argv[2] if len(sys.argv) > 2 else None).run(np_arg0)) # CHECK: Passed if np.allclose(np_out, np_arg0): print('Passed') diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/tools/sparse_compiler.py b/mlir/test/Integration/Dialect/SparseTensor/python/tools/sparse_compiler.py new file mode 100644 index 0000000000000..47b145ff3cb3c --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/python/tools/sparse_compiler.py @@ -0,0 +1,19 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# This file contains the sparse compiler class. + +from mlir import all_passes_registration +from mlir import ir +from mlir import passmanager + +class SparseCompiler: + """Sparse compiler definition.""" + + def __init__(self, options: str): + pipeline = f'sparse-compiler{{{options} reassociate-fp-reductions=1 enable-index-optimizations=1}}' + self.pipeline = pipeline + + def __call__(self, module: ir.Module): + passmanager.PassManager.parse(self.pipeline).run(module) From fc0bd3c2cee929ffbd75b5cca486f4c77f7d5c59 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 22 Feb 2022 16:25:57 -0800 Subject: [PATCH 564/748] [libFuzzer] Refactor GetNextInstructionPc/GetPreviousInstructionPc Port the change to compiler-rt/lib/fuzzer/FuzzerTracePC.cpp . 
Update RISCV to use PC-2: this is coarse (C extension may be disabled) but sufficient for pure symbolization purpose. The commit is separate from D120362 so that bisecting/reverting is easier. --- compiler-rt/lib/fuzzer/FuzzerTracePC.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/fuzzer/FuzzerTracePC.cpp b/compiler-rt/lib/fuzzer/FuzzerTracePC.cpp index af8d1ce50f3fb..f12f7aa61bc4a 100644 --- a/compiler-rt/lib/fuzzer/FuzzerTracePC.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerTracePC.cpp @@ -133,13 +133,14 @@ inline ALWAYS_INLINE uintptr_t GetPreviousInstructionPc(uintptr_t PC) { // so we return (pc-2) in that case in order to be safe. // For A32 mode we return (pc-4) because all instructions are 32 bit long. return (PC - 3) & (~1); -#elif defined(__powerpc__) || defined(__powerpc64__) || defined(__aarch64__) - // PCs are always 4 byte aligned. - return PC - 4; #elif defined(__sparc__) || defined(__mips__) return PC - 8; -#else +#elif defined(__riscv__) + return PC - 2; +#elif defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) return PC - 1; +#else + return PC - 4; #endif } From 939d62c18530fce6544aae7bcdec7f1e3a600044 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 22 Feb 2022 16:20:09 -0800 Subject: [PATCH 565/748] [AMDGPU] Pre-commit load/store combine tests. NFC. 
--- .../AMDGPU/merge-global-load-store.mir | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir index d2404fca19b50..8d4fcca8f741d 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir @@ -8,13 +8,13 @@ body: | ; GCN-LABEL: name: merge_global_load_dword_2 ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF - ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `float addrspace(1)* undef` + 4, align 4, addrspace 1) ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1 ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] %0:vreg_64_align2 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) S_NOP 0, implicit %1, implicit %2 ... @@ -397,3 +397,40 @@ body: | %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1) S_NOP 0, implicit %2, implicit %3 ... 
+ +--- +name: merge_global_load_dword_2_out_of_order +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_2_out_of_order + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef`, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub0 + ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, align 8, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: merge_global_load_dword_3_out_of_order +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_3_out_of_order + ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 [[DEF]], 0, 0, implicit $exec :: (load (s96) from `i32 addrspace(1)* undef`, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX3_]].sub0_sub1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0 + ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]] + %0:vreg_64_align2 = IMPLICIT_DEF + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, align 4, addrspace 1) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 16, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 
0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, align 8, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...

From 1ec9dd3aae0b8c90a91f845ad629ef7d199986c0 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Tue, 22 Feb 2022 16:30:02 -0800
Subject: [PATCH 566/748] [sancov] Refactor getPreviousInstructionPc

Note: on some architectures like AArch64, the PC does not match
compiler-rt/lib/sanitizer_common/sanitizer_coverage_libcdep_new.cpp:`__sanitizer_cov_trace_pc_guard`

---
 llvm/tools/sancov/sancov.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp
index c997154bac47c..9a523984df75d 100644
--- a/llvm/tools/sancov/sancov.cpp
+++ b/llvm/tools/sancov/sancov.cpp
@@ -687,17 +687,20 @@ findSanitizerCovFunctions(const object::ObjectFile &O) {
   return Result;
 }
 
+// Ported from
+// compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h:GetPreviousInstructionPc
+// GetPreviousInstructionPc.
 static uint64_t getPreviousInstructionPc(uint64_t PC, Triple TheTriple) {
-  if (TheTriple.isARM()) {
+  if (TheTriple.isARM())
     return (PC - 3) & (~1);
-  } else if (TheTriple.isAArch64()) {
-    return PC - 4;
-  } else if (TheTriple.isMIPS()) {
+  if (TheTriple.isMIPS())
     return PC - 8;
-  } else {
+  if (TheTriple.isRISCV())
+    return PC - 2;
+  if (TheTriple.isX86())
     return PC - 1;
-  }
+  return PC - 4;
 }
 
 // Locate addresses of all coverage points in a file.
Coverage point From ea31442279601c7870b6c82b7d1d69578834eb59 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 16:36:08 -0800 Subject: [PATCH 567/748] [NFC] Add a bit more coverage for an upcoming patch --- .../InstCombine/compare-unescaped.ll | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/compare-unescaped.ll b/llvm/test/Transforms/InstCombine/compare-unescaped.ll index 8cc77667d8cbf..245432ee74ff8 100644 --- a/llvm/test/Transforms/InstCombine/compare-unescaped.ll +++ b/llvm/test/Transforms/InstCombine/compare-unescaped.ll @@ -364,6 +364,53 @@ define i1 @consistent_nocapture_through_global() { ret i1 %cmp } +; End consistent heap layout tests + +; We can fold this by assuming a single heap layout +define i1 @two_nonnull_mallocs() { +; CHECK-LABEL: @two_nonnull_mallocs( +; CHECK-NEXT: ret i1 false +; + %m = call nonnull i8* @malloc(i64 4) + %n = call nonnull i8* @malloc(i64 4) + %cmp = icmp eq i8* %m, %n + ret i1 %cmp +} + +; The address of %n is captured, but %m can be arranged to make +; the comparison non-equal. +define i1 @two_nonnull_mallocs2() { +; CHECK-LABEL: @two_nonnull_mallocs2( +; CHECK-NEXT: [[N:%.*]] = call nonnull dereferenceable(4) i8* @malloc(i64 4) +; CHECK-NEXT: call void @unknown(i8* nonnull [[N]]) +; CHECK-NEXT: ret i1 false +; + %m = call nonnull i8* @malloc(i64 4) + %n = call nonnull i8* @malloc(i64 4) + call void @unknown(i8* %n) + %cmp = icmp eq i8* %m, %n + ret i1 %cmp +} + +; TODO: We can fold this, but don't with the current scheme. 
+define i1 @two_nonnull_mallocs_hidden() { +; CHECK-LABEL: @two_nonnull_mallocs_hidden( +; CHECK-NEXT: [[M:%.*]] = call nonnull dereferenceable(4) i8* @malloc(i64 4) +; CHECK-NEXT: [[N:%.*]] = call nonnull dereferenceable(4) i8* @malloc(i64 4) +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, i8* [[M]], i64 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, i8* [[N]], i64 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8* [[GEP1]], [[GEP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %m = call nonnull i8* @malloc(i64 4) + %n = call nonnull i8* @malloc(i64 4) + %gep1 = getelementptr i8, i8* %m, i32 1 + %gep2 = getelementptr i8, i8* %n, i32 2 + %cmp = icmp eq i8* %gep1, %gep2 + ret i1 %cmp +} + + !0 = !{} From ed69e3266ca54c674c421318497d2de89f94463b Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Tue, 22 Feb 2022 16:38:57 -0800 Subject: [PATCH 568/748] [Docs]Add office hours. --- llvm/docs/GettingInvolved.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index f7cd97b19e9db..82d89e4e70470 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -241,6 +241,14 @@ don't find anyone present, chances are they happen to be off that day. `ics `__ - `Jitsi `__ - English, Flemish, Dutch + * - Alina Sbirlea + - General questions on how to contribute to LLVM; women in compilers; + MemorySSA, BatchAA, various loop passes, new pass manager. + - Monthly, 2nd Tuesdays, 10.00am PT/7:00pm CET, for 30 minutes. 
+ `ics `__ + `gcal `__ + - `GoogleMeet `__ + - English, Romanian IRC From 7fea963a4535e6d05c7e1931ec5a0f78c6be6045 Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Tue, 22 Feb 2022 16:43:33 -0800 Subject: [PATCH 569/748] [Docs] Add self to credits --- llvm/CREDITS.TXT | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/CREDITS.TXT b/llvm/CREDITS.TXT index c7b0cf63bb9f8..721dcd74b7326 100644 --- a/llvm/CREDITS.TXT +++ b/llvm/CREDITS.TXT @@ -464,6 +464,10 @@ N: Ruchira Sasanka E: sasanka@uiuc.edu D: Graph coloring register allocator for the Sparc64 backend +N: Alina Sbirlea +E: alina.sbirlea@gmail.com +D: MemorySSA, BatchAA, misc loop and new pass manager work. + N: Arnold Schwaighofer E: arnold.schwaighofer@gmail.com D: Tail call optimization for the x86 backend From cde658fa1f1449d2ec966b8c0df0444b882eb69f Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 22 Feb 2022 16:54:09 -0800 Subject: [PATCH 570/748] [clang] Remove Address::deprecated() calls in CGVTables.cpp --- clang/lib/CodeGen/CGVTables.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index 34df7da7985b4..536db8dc4b41a 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -91,7 +91,10 @@ static RValue PerformReturnAdjustment(CodeGenFunction &CGF, auto ClassDecl = ResultType->getPointeeType()->getAsCXXRecordDecl(); auto ClassAlign = CGF.CGM.getClassPointerAlignment(ClassDecl); ReturnValue = CGF.CGM.getCXXABI().performReturnAdjustment( - CGF, Address::deprecated(ReturnValue, ClassAlign), Thunk.Return); + CGF, + Address(ReturnValue, CGF.ConvertTypeForMem(ResultType->getPointeeType()), + ClassAlign), + Thunk.Return); if (NullCheckValue) { CGF.Builder.CreateBr(AdjustEnd); @@ -198,7 +201,8 @@ CodeGenFunction::GenerateVarArgsThunk(llvm::Function *Fn, // Find the first store of "this", which will be to the alloca associated // with "this". 
Address ThisPtr = - Address::deprecated(&*AI, CGM.getClassPointerAlignment(MD->getParent())); + Address(&*AI, ConvertTypeForMem(MD->getThisType()->getPointeeType()), + CGM.getClassPointerAlignment(MD->getParent())); llvm::BasicBlock *EntryBB = &Fn->front(); llvm::BasicBlock::iterator ThisStore = llvm::find_if(*EntryBB, [&](llvm::Instruction &I) { From b661470bce1454f7e08c7efe932067a25737db7f Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Tue, 22 Feb 2022 16:33:18 -0800 Subject: [PATCH 571/748] Revert "Revert "[AArch64][GlobalISel] Optimize conjunctions of compares to conditional compares."" This reverts commit 55c181a6c786cfbfa8b7aabe0a8ba721a65b1445. The original commit I made was an old patch, mea culpa. Committing the right implementation with test case for the reported crash. --- .../CodeGen/GlobalISel/GenericMachineInstrs.h | 32 ++ .../GISel/AArch64InstructionSelector.cpp | 374 +++++++++++++++++- llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 281 +++++-------- 3 files changed, 493 insertions(+), 194 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 7103656365b1b..58fe48200e732 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -14,6 +14,7 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H #define LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H +#include "llvm/IR/Instructions.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -226,6 +227,37 @@ class GSelect : public GenericMachineInstr { } }; +/// Represent a G_ICMP or G_FCMP. 
+class GAnyCmp : public GenericMachineInstr { +public: + CmpInst::Predicate getCond() const { + return static_cast(getOperand(1).getPredicate()); + } + Register getLHSReg() const { return getReg(2); } + Register getRHSReg() const { return getReg(3); } + + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_ICMP || + MI->getOpcode() == TargetOpcode::G_FCMP; + } +}; + +/// Represent a G_ICMP. +class GICmp : public GAnyCmp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_ICMP; + } +}; + +/// Represent a G_FCMP. +class GFCmp : public GAnyCmp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_FCMP; + } +}; + } // namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 8a79d2426c8f0..5426844e59ca1 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" @@ -63,6 +64,7 @@ namespace { #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATE_BITSET + class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, @@ -294,6 +296,20 @@ class AArch64InstructionSelector : public InstructionSelector { emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; + /// Emit expression as a conjunction (a 
series of CCMP/CFCMP ops). + /// In some cases this is even possible with OR operations in the expression. + MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC, + MachineIRBuilder &MIB) const; + MachineInstr *emitConditionalComparison(Register LHS, Register RHS, + CmpInst::Predicate CC, + AArch64CC::CondCode Predicate, + AArch64CC::CondCode OutCC, + MachineIRBuilder &MIB) const; + MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC, + bool Negate, Register CCOp, + AArch64CC::CondCode Predicate, + MachineIRBuilder &MIB) const; + /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. /// \p IsNegative is true if the test should be "not zero". /// This will also optimize the test bit instruction when possible. @@ -425,7 +441,8 @@ class AArch64InstructionSelector : public InstructionSelector { void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags); // Optimization methods. - bool tryOptSelect(MachineInstr &MI); + bool tryOptSelect(GSelect &Sel); + bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI); MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const; @@ -1310,6 +1327,90 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { } } +/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC. 
+static void changeFPCCToORAArch64CC(CmpInst::Predicate CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case CmpInst::FCMP_OEQ: + CondCode = AArch64CC::EQ; + break; + case CmpInst::FCMP_OGT: + CondCode = AArch64CC::GT; + break; + case CmpInst::FCMP_OGE: + CondCode = AArch64CC::GE; + break; + case CmpInst::FCMP_OLT: + CondCode = AArch64CC::MI; + break; + case CmpInst::FCMP_OLE: + CondCode = AArch64CC::LS; + break; + case CmpInst::FCMP_ONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case CmpInst::FCMP_ORD: + CondCode = AArch64CC::VC; + break; + case CmpInst::FCMP_UNO: + CondCode = AArch64CC::VS; + break; + case CmpInst::FCMP_UEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case CmpInst::FCMP_UGT: + CondCode = AArch64CC::HI; + break; + case CmpInst::FCMP_UGE: + CondCode = AArch64CC::PL; + break; + case CmpInst::FCMP_ULT: + CondCode = AArch64CC::LT; + break; + case CmpInst::FCMP_ULE: + CondCode = AArch64CC::LE; + break; + case CmpInst::FCMP_UNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// Convert an IR fp condition code to an AArch64 CC. +/// This differs from changeFPCCToAArch64CC in that it returns cond codes that +/// should be AND'ed instead of OR'ed. 
+static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + changeFPCCToORAArch64CC(CC, CondCode, CondCode2); + assert(CondCode2 == AArch64CC::AL); + break; + case CmpInst::FCMP_ONE: + // (a one b) + // == ((a olt b) || (a ogt b)) + // == ((a ord b) && (a une b)) + CondCode = AArch64CC::VC; + CondCode2 = AArch64CC::NE; + break; + case CmpInst::FCMP_UEQ: + // (a ueq b) + // == ((a uno b) || (a oeq b)) + // == ((a ule b) && (a uge b)) + CondCode = AArch64CC::PL; + CondCode2 = AArch64CC::LE; + break; + } +} + /// Return a register which can be used as a bit to test in a TB(N)Z. static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, MachineRegisterInfo &MRI) { @@ -3292,17 +3393,18 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_SELECT: { - if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { + auto &Sel = cast(I); + if (MRI.getType(Sel.getCondReg()) != LLT::scalar(1)) { LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty << ", expected: " << LLT::scalar(1) << '\n'); return false; } - const Register CondReg = I.getOperand(1).getReg(); - const Register TReg = I.getOperand(2).getReg(); - const Register FReg = I.getOperand(3).getReg(); + const Register CondReg = Sel.getCondReg(); + const Register TReg = Sel.getTrueReg(); + const Register FReg = Sel.getFalseReg(); - if (tryOptSelect(I)) + if (tryOptSelect(Sel)) return true; // Make sure to use an unused vreg instead of wzr, so that the peephole @@ -3311,9 +3413,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB)) + if 
(!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB)) return false; - I.eraseFromParent(); + Sel.eraseFromParent(); return true; } case TargetOpcode::G_ICMP: { @@ -4702,7 +4804,256 @@ AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, } } -bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { +/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be +/// expressed as a conjunction. +/// \param CanNegate Set to true if we can negate the whole sub-tree just by +/// changing the conditions on the CMP tests. +/// (this means we can call emitConjunctionRec() with +/// Negate==true on this sub-tree) +/// \param MustBeFirst Set to true if this subtree needs to be negated and we +/// cannot do the negation naturally. We are required to +/// emit the subtree first in this case. +/// \param WillNegate Is true if are called when the result of this +/// subexpression must be negated. This happens when the +/// outer expression is an OR. We can use this fact to know +/// that we have a double negation (or (or ...) ...) that +/// can be implemented for free. +static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst, + bool WillNegate, MachineRegisterInfo &MRI, + unsigned Depth = 0) { + if (!MRI.hasOneNonDBGUse(Val)) + return false; + MachineInstr *ValDef = MRI.getVRegDef(Val); + unsigned Opcode = ValDef->getOpcode(); + if (Opcode == TargetOpcode::G_TRUNC) { + // Look through a trunc. + Val = ValDef->getOperand(1).getReg(); + ValDef = MRI.getVRegDef(Val); + Opcode = ValDef->getOpcode(); + } + if (isa(ValDef)) { + CanNegate = true; + MustBeFirst = false; + return true; + } + // Protect against exponential runtime and stack overflow. 
+ if (Depth > 6) + return false; + if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) { + bool IsOR = Opcode == TargetOpcode::G_OR; + Register O0 = ValDef->getOperand(1).getReg(); + Register O1 = ValDef->getOperand(2).getReg(); + bool CanNegateL; + bool MustBeFirstL; + if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1)) + return false; + bool CanNegateR; + bool MustBeFirstR; + if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1)) + return false; + + if (MustBeFirstL && MustBeFirstR) + return false; + + if (IsOR) { + // For an OR expression we need to be able to naturally negate at least + // one side or we cannot do the transformation at all. + if (!CanNegateL && !CanNegateR) + return false; + // If we the result of the OR will be negated and we can naturally negate + // the leaves, then this sub-tree as a whole negates naturally. + CanNegate = WillNegate && CanNegateL && CanNegateR; + // If we cannot naturally negate the whole sub-tree, then this must be + // emitted first. + MustBeFirst = !CanNegate; + } else { + assert(Opcode == TargetOpcode::G_AND && "Must be G_AND"); + // We cannot naturally negate an AND operation. + CanNegate = false; + MustBeFirst = MustBeFirstL || MustBeFirstR; + } + return true; + } + return false; +} + +MachineInstr *AArch64InstructionSelector::emitConditionalComparison( + Register LHS, Register RHS, CmpInst::Predicate CC, + AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, + MachineIRBuilder &MIB) const { + // TODO: emit CMN as an optimization. + auto &MRI = *MIB.getMRI(); + LLT OpTy = MRI.getType(LHS); + assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64); + unsigned CCmpOpc; + if (CmpInst::isIntPredicate(CC)) { + CCmpOpc = OpTy.getSizeInBits() == 32 ? 
AArch64::CCMPWr : AArch64::CCMPXr; + } else { + switch (OpTy.getSizeInBits()) { + case 16: + CCmpOpc = AArch64::FCCMPHrr; + break; + case 32: + CCmpOpc = AArch64::FCCMPSrr; + break; + case 64: + CCmpOpc = AArch64::FCCMPDrr; + break; + default: + return nullptr; + } + } + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + auto CCmp = + MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate); + constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI); + return &*CCmp; +} + +MachineInstr *AArch64InstructionSelector::emitConjunctionRec( + Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp, + AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const { + // We're at a tree leaf, produce a conditional comparison operation. + auto &MRI = *MIB.getMRI(); + MachineInstr *ValDef = MRI.getVRegDef(Val); + unsigned Opcode = ValDef->getOpcode(); + if (Opcode == TargetOpcode::G_TRUNC) { + // Look through a trunc. + Val = ValDef->getOperand(1).getReg(); + ValDef = MRI.getVRegDef(Val); + Opcode = ValDef->getOpcode(); + } + if (auto *Cmp = dyn_cast(ValDef)) { + Register LHS = Cmp->getLHSReg(); + Register RHS = Cmp->getRHSReg(); + CmpInst::Predicate CC = Cmp->getCond(); + if (Negate) + CC = CmpInst::getInversePredicate(CC); + if (isa(Cmp)) { + OutCC = changeICMPPredToAArch64CC(CC); + } else { + // Handle special FP cases. + AArch64CC::CondCode ExtraCC; + changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); + // Some floating point conditions can't be tested with a single condition + // code. Construct an additional comparison in this case. 
+ if (ExtraCC != AArch64CC::AL) { + MachineInstr *ExtraCmp; + if (!CCOp) + ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC); + else + ExtraCmp = + emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB); + CCOp = ExtraCmp->getOperand(0).getReg(); + Predicate = ExtraCC; + } + } + + // Produce a normal comparison if we are first in the chain + if (!CCOp) { + auto Dst = MRI.cloneVirtualRegister(LHS); + if (isa(Cmp)) + return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); + return emitFPCompare(Cmp->getOperand(2).getReg(), + Cmp->getOperand(3).getReg(), MIB); + } + // Otherwise produce a ccmp. + return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB); + } + assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree"); + + bool IsOR = Opcode == TargetOpcode::G_OR; + + Register LHS = ValDef->getOperand(1).getReg(); + bool CanNegateL; + bool MustBeFirstL; + bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI); + assert(ValidL && "Valid conjunction/disjunction tree"); + (void)ValidL; + + Register RHS = ValDef->getOperand(2).getReg(); + bool CanNegateR; + bool MustBeFirstR; + bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI); + assert(ValidR && "Valid conjunction/disjunction tree"); + (void)ValidR; + + // Swap sub-tree that must come first to the right side. + if (MustBeFirstL) { + assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); + std::swap(LHS, RHS); + std::swap(CanNegateL, CanNegateR); + std::swap(MustBeFirstL, MustBeFirstR); + } + + bool NegateR; + bool NegateAfterR; + bool NegateL; + bool NegateAfterAll; + if (Opcode == TargetOpcode::G_OR) { + // Swap the sub-tree that we can negate naturally to the left. 
+ if (!CanNegateL) { + assert(CanNegateR && "at least one side must be negatable"); + assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); + assert(!Negate); + std::swap(LHS, RHS); + NegateR = false; + NegateAfterR = true; + } else { + // Negate the left sub-tree if possible, otherwise negate the result. + NegateR = CanNegateR; + NegateAfterR = !CanNegateR; + } + NegateL = true; + NegateAfterAll = !Negate; + } else { + assert(Opcode == TargetOpcode::G_AND && + "Valid conjunction/disjunction tree"); + assert(!Negate && "Valid conjunction/disjunction tree"); + + NegateL = false; + NegateR = false; + NegateAfterR = false; + NegateAfterAll = false; + } + + // Emit sub-trees. + AArch64CC::CondCode RHSCC; + MachineInstr *CmpR = + emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB); + if (NegateAfterR) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + MachineInstr *CmpL = emitConjunctionRec( + LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB); + if (NegateAfterAll) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +MachineInstr *AArch64InstructionSelector::emitConjunction( + Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const { + bool DummyCanNegate; + bool DummyMustBeFirst; + if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false, + *MIB.getMRI())) + return nullptr; + return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB); +} + +bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI, + MachineInstr &CondMI) { + AArch64CC::CondCode AArch64CC; + MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB); + if (!ConjMI) + return false; + + emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB); + SelI.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::tryOptSelect(GSelect &I) { MachineRegisterInfo &MRI = *MIB.getMRI(); // We want to recognize this pattern: // @@ -4755,8 +5106,11 @@ bool 
AArch64InstructionSelector::tryOptSelect(MachineInstr &I) { return false; unsigned CondOpc = CondDef->getOpcode(); - if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) + if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) { + if (tryOptSelectConjunction(I, *CondDef)) + return true; return false; + } AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index f81ed69b137f6..d1430096e0c22 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -569,14 +569,10 @@ define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_and: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: mov w9, #5 -; GISEL-NEXT: cmp w9, w1 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel x0, x2, x3, ne +; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: cmp w8, w1 +; GISEL-NEXT: ccmp w0, w1, #0, ne +; GISEL-NEXT: csel x0, x2, x3, lt ; GISEL-NEXT: ret %1 = icmp slt i32 %w0, %w1 %2 = icmp ne i32 5, %w1 @@ -595,14 +591,10 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ; ; GISEL-LABEL: select_or: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, lt -; GISEL-NEXT: mov w9, #5 -; GISEL-NEXT: cmp w9, w1 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel x0, x2, x3, ne +; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: cmp w8, w1 +; GISEL-NEXT: ccmp w0, w1, #8, eq +; GISEL-NEXT: csel x0, x2, x3, lt ; GISEL-NEXT: ret %1 = icmp slt i32 %w0, %w1 %2 = icmp ne i32 5, %w1 @@ -611,6 +603,28 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) { ret i64 %sel } +define float @select_or_float(i32 %w0, i32 %w1, float %x2, float %x3) { +; CHECK-LABEL: select_or_float: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w1, #5 +; CHECK-NEXT: ccmp 
w0, w1, #8, eq +; CHECK-NEXT: fcsel s0, s0, s1, lt +; CHECK-NEXT: ret +; +; GISEL-LABEL: select_or_float: +; GISEL: ; %bb.0: +; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: cmp w8, w1 +; GISEL-NEXT: ccmp w0, w1, #8, eq +; GISEL-NEXT: fcsel s0, s0, s1, lt +; GISEL-NEXT: ret + %1 = icmp slt i32 %w0, %w1 + %2 = icmp ne i32 5, %w1 + %3 = or i1 %1, %2 + %sel = select i1 %3, float %x2,float %x3 + ret float %sel +} + define i64 @gccbug(i64 %x0, i64 %x1) { ; CHECK-LABEL: gccbug: ; CHECK: ; %bb.0: @@ -623,17 +637,12 @@ define i64 @gccbug(i64 %x0, i64 %x1) { ; ; GISEL-LABEL: gccbug: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp x1, #0 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: mov w9, #2 +; GISEL-NEXT: mov w8, #2 +; GISEL-NEXT: mov w9, #4 ; GISEL-NEXT: cmp x0, #2 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: cmp x0, #4 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w11, w10 -; GISEL-NEXT: and w8, w10, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csinc x0, x9, xzr, ne +; GISEL-NEXT: ccmp x0, x9, #4, ne +; GISEL-NEXT: ccmp x1, xzr, #0, eq +; GISEL-NEXT: csinc x0, x8, xzr, eq ; GISEL-NEXT: ret %cmp0 = icmp eq i64 %x1, 0 %cmp1 = icmp eq i64 %x0, 2 @@ -658,19 +667,13 @@ define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) { ; ; GISEL-LABEL: select_ororand: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cmp w1, #13 -; GISEL-NEXT: cset w9, hi -; GISEL-NEXT: cmp w2, #2 -; GISEL-NEXT: cset w10, lt +; GISEL-NEXT: mov w8, #13 +; GISEL-NEXT: mov w9, #2 ; GISEL-NEXT: cmp w3, #4 -; GISEL-NEXT: cset w11, gt -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: and w9, w10, w11 -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w3, wzr, ne +; GISEL-NEXT: ccmp w2, w9, #0, gt +; GISEL-NEXT: ccmp w1, w8, #2, ge +; GISEL-NEXT: ccmp w0, wzr, #4, ls +; GISEL-NEXT: csel w0, w3, wzr, eq ; GISEL-NEXT: ret %c0 = icmp eq i32 %w0, 0 %c1 = icmp ugt i32 %w1, 13 @@ -694,16 +697,10 @@ define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) { ; ; 
GISEL-LABEL: select_andor: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, w1 -; GISEL-NEXT: cset w8, eq ; GISEL-NEXT: cmp w1, w2 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: orr w9, w10, w9 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: ccmp w0, wzr, #4, lt +; GISEL-NEXT: ccmp w0, w1, #0, eq +; GISEL-NEXT: csel w0, w0, w1, eq ; GISEL-NEXT: ret %c0 = icmp eq i32 %v1, %v2 %c1 = icmp sge i32 %v2, %v3 @@ -872,14 +869,9 @@ define i32 @select_and_olt_one(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_olt_one: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #4, mi +; GISEL-NEXT: fccmp d2, d3, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vc ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp one double %v2, %v3 @@ -900,14 +892,9 @@ define i32 @select_and_one_olt(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_one_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #1, ne +; GISEL-NEXT: fccmp d2, d3, #0, vc +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp one double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -928,14 +915,9 @@ define i32 @select_and_olt_ueq(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_olt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: cset w10, vs -; 
GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #0, mi +; GISEL-NEXT: fccmp d2, d3, #8, le +; GISEL-NEXT: csel w0, w0, w1, pl ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -956,14 +938,9 @@ define i32 @select_and_ueq_olt(double %v0, double %v1, double %v2, double %v3, i ; GISEL-LABEL: select_and_ueq_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cset w9, vs -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #8, le +; GISEL-NEXT: fccmp d2, d3, #0, pl +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp ueq double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -984,14 +961,9 @@ define i32 @select_or_olt_one(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_olt_one: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #0, pl +; GISEL-NEXT: fccmp d2, d3, #8, le +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp one double %v2, %v3 @@ -1012,14 +984,9 @@ define i32 @select_or_one_olt(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_one_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #8, le +; GISEL-NEXT: fccmp d2, d3, #8, pl +; 
GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp one double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -1040,14 +1007,9 @@ define i32 @select_or_olt_ueq(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_olt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: cset w10, vs -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #4, pl +; GISEL-NEXT: fccmp d2, d3, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vs ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -1068,14 +1030,9 @@ define i32 @select_or_ueq_olt(double %v0, double %v1, double %v2, double %v3, i3 ; GISEL-LABEL: select_or_ueq_olt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, eq -; GISEL-NEXT: cset w9, vs -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d0, d1, #1, ne +; GISEL-NEXT: fccmp d2, d3, #8, vc +; GISEL-NEXT: csel w0, w0, w1, mi ; GISEL-NEXT: ret %c0 = fcmp ueq double %v0, %v1 %c1 = fcmp olt double %v2, %v3 @@ -1097,17 +1054,10 @@ define i32 @select_or_olt_ogt_ueq(double %v0, double %v1, double %v2, double %v3 ; GISEL-LABEL: select_or_olt_ogt_ueq: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, gt -; GISEL-NEXT: fcmp d4, d5 -; GISEL-NEXT: cset w10, eq -; GISEL-NEXT: cset w11, vs -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: orr w8, w10, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #0, pl +; GISEL-NEXT: fccmp d4, d5, #4, le +; GISEL-NEXT: fccmp d4, d5, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vs ; GISEL-NEXT: ret %c0 = fcmp olt 
double %v0, %v1 %c1 = fcmp ogt double %v2, %v3 @@ -1131,17 +1081,10 @@ define i32 @select_or_olt_ueq_ogt(double %v0, double %v1, double %v2, double %v3 ; GISEL-LABEL: select_or_olt_ueq_ogt: ; GISEL: ; %bb.0: ; GISEL-NEXT: fcmp d0, d1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcmp d2, d3 -; GISEL-NEXT: cset w9, eq -; GISEL-NEXT: cset w10, vs -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: fcmp d4, d5 -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w8, w9, w8 -; GISEL-NEXT: orr w8, w10, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp d2, d3, #4, pl +; GISEL-NEXT: fccmp d2, d3, #1, ne +; GISEL-NEXT: fccmp d4, d5, #0, vc +; GISEL-NEXT: csel w0, w0, w1, gt ; GISEL-NEXT: ret %c0 = fcmp olt double %v0, %v1 %c1 = fcmp ueq double %v2, %v3 @@ -1170,15 +1113,11 @@ define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: fcvt s1, h1 +; GISEL-NEXT: fcvt s2, h2 +; GISEL-NEXT: fcvt s3, h3 ; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcvt s0, h2 -; GISEL-NEXT: fcvt s1, h3 -; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w9, ge -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp s2, s3, #8, mi +; GISEL-NEXT: csel w0, w0, w1, ge ; GISEL-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp oge half %v2, %v3 @@ -1204,17 +1143,12 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32 ; GISEL: ; %bb.0: ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: fcvt s1, h1 +; GISEL-NEXT: fcvt s2, h2 +; GISEL-NEXT: fcvt s3, h3 ; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w8, mi -; GISEL-NEXT: fcvt s0, h2 -; GISEL-NEXT: fcvt s1, h3 -; GISEL-NEXT: fcmp s0, s1 -; GISEL-NEXT: cset w9, mi -; GISEL-NEXT: cset w10, gt -; GISEL-NEXT: orr w9, w9, w10 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 -; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: fccmp s2, s3, #4, mi +; GISEL-NEXT: 
fccmp s2, s3, #1, ne +; GISEL-NEXT: csel w0, w0, w1, vc ; GISEL-NEXT: ret %c0 = fcmp olt half %v0, %v1 %c1 = fcmp one half %v2, %v3 @@ -1294,18 +1228,11 @@ define i32 @deep_or(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: cmp w2, #15 -; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: mov w8, #15 ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: and w9, w10, w9 -; GISEL-NEXT: and w8, w9, w8 -; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: ccmp w2, w8, #4, ne +; GISEL-NEXT: ccmp w1, wzr, #4, eq +; GISEL-NEXT: ccmp w0, wzr, #4, ne ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 @@ -1333,18 +1260,11 @@ define i32 @deep_or1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or1: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: cmp w2, #15 -; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: mov w8, #15 ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: and w8, w8, w10 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: ccmp w2, w8, #4, ne +; GISEL-NEXT: ccmp w0, wzr, #4, eq +; GISEL-NEXT: ccmp w1, wzr, #4, ne ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 @@ -1372,18 +1292,11 @@ define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) { ; ; GISEL-LABEL: deep_or2: ; GISEL: ; %bb.0: -; GISEL-NEXT: cmp w0, #0 -; GISEL-NEXT: cset w8, ne -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: cmp w2, #15 -; GISEL-NEXT: cset w10, eq +; GISEL-NEXT: mov w8, #15 ; GISEL-NEXT: cmp w2, #20 -; GISEL-NEXT: cset w11, eq -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: and w8, w8, w10 -; 
GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: ccmp w2, w8, #4, ne +; GISEL-NEXT: ccmp w1, wzr, #4, eq +; GISEL-NEXT: ccmp w0, wzr, #4, ne ; GISEL-NEXT: csel w0, w4, w5, ne ; GISEL-NEXT: ret %c0 = icmp ne i32 %a0, 0 From a9861d3c85e7087099d78950e80d288f4c7f4df0 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 22 Feb 2022 17:25:06 -0800 Subject: [PATCH 572/748] [instcombine] Avoid binops for comparison consistency tests It turns out that instcombine is smarter than I am, and several of these ended up folded for the wrong reasons. --- .../Transforms/InstCombine/compare-alloca.ll | 41 ++++++++++-------- .../InstCombine/compare-unescaped.ll | 43 ++++++++++--------- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/compare-alloca.ll b/llvm/test/Transforms/InstCombine/compare-alloca.ll index 21ec6cac3d681..6201b346126dc 100644 --- a/llvm/test/Transforms/InstCombine/compare-alloca.ll +++ b/llvm/test/Transforms/InstCombine/compare-alloca.ll @@ -153,26 +153,28 @@ define i1 @offset_single_cmp() { ret i1 %cmp } -define i1 @neg_consistent_fold1() { +declare void @witness(i1, i1) + +define void @neg_consistent_fold1() { ; CHECK-LABEL: @neg_consistent_fold1( ; CHECK-NEXT: [[M1:%.*]] = alloca [4 x i8], align 1 ; CHECK-NEXT: [[M1_SUB:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[M1]], i32 0, i32 0 ; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8* [[M1_SUB]], inttoptr (i64 2048 to i8*) ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M1_SUB]], [[RHS2]] -; CHECK-NEXT: [[RES:%.*]] = or i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: call void @witness(i1 [[CMP1]], i1 [[CMP2]]) +; CHECK-NEXT: ret void ; %m = alloca i8, i32 4 %rhs = inttoptr i64 2048 to i8* %rhs2 = call i8* @hidden_inttoptr() %cmp1 = icmp eq i8* %m, %rhs %cmp2 = icmp eq i8* %m, %rhs2 - %res = or i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } -define i1 
@neg_consistent_fold2() { +define void @neg_consistent_fold2() { ; CHECK-LABEL: @neg_consistent_fold2( ; CHECK-NEXT: [[M1:%.*]] = alloca [4 x i8], align 1 ; CHECK-NEXT: [[N2:%.*]] = alloca [4 x i8], align 1 @@ -182,8 +184,8 @@ define i1 @neg_consistent_fold2() { ; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_offset(i8* nonnull [[N2_SUB]]) ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8* [[M1_SUB]], [[RHS]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M1_SUB]], [[RHS2]] -; CHECK-NEXT: [[RES:%.*]] = or i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: call void @witness(i1 [[CMP1]], i1 [[CMP2]]) +; CHECK-NEXT: ret void ; %m = alloca i8, i32 4 %n = alloca i8, i32 4 @@ -191,11 +193,11 @@ define i1 @neg_consistent_fold2() { %rhs2 = call i8* @hidden_offset(i8* %n) %cmp1 = icmp eq i8* %m, %rhs %cmp2 = icmp eq i8* %m, %rhs2 - %res = or i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } -define i1 @neg_consistent_fold3() { +define void @neg_consistent_fold3() { ; CHECK-LABEL: @neg_consistent_fold3( ; CHECK-NEXT: [[M1:%.*]] = alloca i32, align 1 ; CHECK-NEXT: [[M1_SUB:%.*]] = bitcast i32* [[M1]] to i8* @@ -203,8 +205,8 @@ define i1 @neg_consistent_fold3() { ; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32* [[M1]], [[LGP]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[RHS2]], [[M1_SUB]] -; CHECK-NEXT: [[RES:%.*]] = or i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: call void @witness(i1 [[CMP1]], i1 [[CMP2]]) +; CHECK-NEXT: ret void ; %m = alloca i8, i32 4 %bc = bitcast i8* %m to i32* @@ -212,21 +214,22 @@ define i1 @neg_consistent_fold3() { %rhs2 = call i8* @hidden_inttoptr() %cmp1 = icmp eq i32* %bc, %lgp %cmp2 = icmp eq i8* %m, %rhs2 - %res = or i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } -define i1 @neg_consistent_fold4() { +define void @neg_consistent_fold4() { ; CHECK-LABEL: @neg_consistent_fold4( -; CHECK-NEXT: ret i1 false +; 
CHECK-NEXT: call void @witness(i1 false, i1 false) +; CHECK-NEXT: ret void ; %m = alloca i8, i32 4 %bc = bitcast i8* %m to i32* %lgp = load i32*, i32** @gp, align 8 %cmp1 = icmp eq i32* %bc, %lgp %cmp2 = icmp eq i32* %bc, %lgp - %res = or i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } ; A nocapture call can't cause a consistent result issue as it is (by diff --git a/llvm/test/Transforms/InstCombine/compare-unescaped.ll b/llvm/test/Transforms/InstCombine/compare-unescaped.ll index 245432ee74ff8..9d9c2d2308195 100644 --- a/llvm/test/Transforms/InstCombine/compare-unescaped.ll +++ b/llvm/test/Transforms/InstCombine/compare-unescaped.ll @@ -240,25 +240,27 @@ define i1 @offset_single_cmp() { ret i1 %cmp } -define i1 @neg_consistent_fold1() { +declare void @witness(i1, i1) + +define void @neg_consistent_fold1() { ; CHECK-LABEL: @neg_consistent_fold1( ; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) ; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8* [[M]], inttoptr (i64 2048 to i8*) -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8* [[RHS2]], inttoptr (i64 2048 to i8*) -; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[CMP1]], [[TMP1]] -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M]], [[RHS2]] +; CHECK-NEXT: call void @witness(i1 [[CMP1]], i1 [[CMP2]]) +; CHECK-NEXT: ret void ; %m = call i8* @malloc(i64 4) %rhs = inttoptr i64 2048 to i8* %rhs2 = call i8* @hidden_inttoptr() %cmp1 = icmp eq i8* %m, %rhs %cmp2 = icmp eq i8* %m, %rhs2 - %res = and i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } -define i1 @neg_consistent_fold2() { +define void @neg_consistent_fold2() { ; CHECK-LABEL: @neg_consistent_fold2( ; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) ; CHECK-NEXT: [[N:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) @@ -266,8 +268,8 @@ define i1 @neg_consistent_fold2() { ; 
CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_offset(i8* [[N]]) ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8* [[M]], [[RHS]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M]], [[RHS2]] -; CHECK-NEXT: [[RES:%.*]] = and i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: call void @witness(i1 [[CMP1]], i1 [[CMP2]]) +; CHECK-NEXT: ret void ; %m = call i8* @malloc(i64 4) %n = call i8* @malloc(i64 4) @@ -275,11 +277,11 @@ define i1 @neg_consistent_fold2() { %rhs2 = call i8* @hidden_offset(i8* %n) %cmp1 = icmp eq i8* %m, %rhs %cmp2 = icmp eq i8* %m, %rhs2 - %res = and i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } -define i1 @neg_consistent_fold3() { +define void @neg_consistent_fold3() { ; CHECK-LABEL: @neg_consistent_fold3( ; CHECK-NEXT: [[M:%.*]] = call dereferenceable_or_null(4) i8* @malloc(i64 4) ; CHECK-NEXT: [[BC:%.*]] = bitcast i8* [[M]] to i32* @@ -287,8 +289,8 @@ define i1 @neg_consistent_fold3() { ; CHECK-NEXT: [[RHS2:%.*]] = call i8* @hidden_inttoptr() ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32* [[LGP]], [[BC]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8* [[M]], [[RHS2]] -; CHECK-NEXT: [[RES:%.*]] = and i1 [[CMP1]], [[CMP2]] -; CHECK-NEXT: ret i1 [[RES]] +; CHECK-NEXT: call void @witness(i1 [[CMP1]], i1 [[CMP2]]) +; CHECK-NEXT: ret void ; %m = call i8* @malloc(i64 4) %bc = bitcast i8* %m to i32* @@ -296,24 +298,25 @@ define i1 @neg_consistent_fold3() { %rhs2 = call i8* @hidden_inttoptr() %cmp1 = icmp eq i32* %bc, %lgp %cmp2 = icmp eq i8* %m, %rhs2 - %res = and i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } ; FIXME: This appears correct, but the current implementation relies ; on visiting both cmps in the same pass. We may have an simplification order ; under which one is missed, and that would be a bug. 
-define i1 @neg_consistent_fold4() { +define void @neg_consistent_fold4() { ; CHECK-LABEL: @neg_consistent_fold4( -; CHECK-NEXT: ret i1 false +; CHECK-NEXT: call void @witness(i1 false, i1 false) +; CHECK-NEXT: ret void ; %m = call i8* @malloc(i64 4) %bc = bitcast i8* %m to i32* %lgp = load i32*, i32** @gp, align 8 %cmp1 = icmp eq i32* %bc, %lgp %cmp2 = icmp eq i32* %bc, %lgp - %res = and i1 %cmp1, %cmp2 - ret i1 %res + call void @witness(i1 %cmp1, i1 %cmp2) + ret void } declare void @unknown(i8*) From 2172b1758d9a943c7cd0b6fd886c07b7f7bd47c7 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 22 Feb 2022 17:49:38 -0800 Subject: [PATCH 573/748] Remove redundant word word in AsmParser DIFlag parsing --- llvm/lib/AsmParser/LLParser.cpp | 4 ++-- llvm/test/Assembler/invalid-diflag-bad.ll | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Assembler/invalid-diflag-bad.ll diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 769601c7e6338..adeb3ba15a92a 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4167,8 +4167,8 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, DIFlagField &Result) { Val = DINode::getFlag(Lex.getStrVal()); if (!Val) - return tokError(Twine("invalid debug info flag flag '") + - Lex.getStrVal() + "'"); + return tokError(Twine("invalid debug info flag '") + Lex.getStrVal() + + "'"); Lex.Lex(); return false; }; diff --git a/llvm/test/Assembler/invalid-diflag-bad.ll b/llvm/test/Assembler/invalid-diflag-bad.ll new file mode 100644 index 0000000000000..cde59f6ce6674 --- /dev/null +++ b/llvm/test/Assembler/invalid-diflag-bad.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s + +; CHECK: :[[@LINE+1]]:27: error: invalid debug info flag 'DIFlagUnknown' +!0 = !DISubprogram(flags: DIFlagUnknown) From 55cb84d9fbea1ead9434de519e1b4e1d2d803048 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 22 Feb 2022 21:13:03 -0500 
Subject: [PATCH 574/748] [OpenMP] Unrecognized objects should not be considered failure Summary: This patch removes the error we recieve when attempting to extract offloading sections. We shouldn't consider this a failure because extracting bitcode isn't necessarily required. --- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 4aca707348b2d..60a5a0fab59f6 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -517,7 +517,7 @@ extractFromBuffer(std::unique_ptr Buffer, return extractFromArchive(*LibFile->get(), DeviceFiles); } default: - return errorCodeToError(object_error::invalid_file_type); + return None; } } @@ -1227,8 +1227,7 @@ int main(int argc, const char **argv) { if (Optional Library = searchLibrary(Arg, LibraryPaths)) Filename = *Library; - if ((sys::path::extension(Filename) == ".o" || - sys::path::extension(Filename) == ".a")) { + if (sys::fs::exists(Filename) && !sys::fs::is_directory(Filename)) { ErrorOr> BufferOrErr = MemoryBuffer::getFileOrSTDIN(Filename); if (std::error_code EC = BufferOrErr.getError()) From 6a0b78af9175af45641c854a88761f284361aaee Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 22 Feb 2022 21:21:33 -0500 Subject: [PATCH 575/748] [OpenMP] Remove static allocator in linker wrapper Summary: We don't need this static allocator to survive the entire file, the strings stored have a defined lifetime. 
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 60a5a0fab59f6..2d73e0e574cbb 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -142,9 +142,6 @@ static SmallVector TempFiles; /// Codegen flags for LTO backend. static codegen::RegisterCodeGenFlags CodeGenFlags; -/// Static buffer to hold StringRef values. -static BumpPtrAllocator Alloc; - /// Magic section string that marks the existence of offloading data. The /// section string will be formatted as `.llvm.offloading..`. #define OFFLOAD_SECTION_MAGIC_STR ".llvm.offloading." @@ -866,6 +863,7 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, SmallVector NewInputFiles; DenseSet UsedInRegularObj; DenseSet UsedInSharedLib; + BumpPtrAllocator Alloc; StringSaver Saver(Alloc); // Search for bitcode files in the input and create an LTO input file. 
If it From 36e335eeb577b6dc559de3a66bc832afae1f56c4 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 22 Feb 2022 18:28:32 -0800 Subject: [PATCH 576/748] [clang] Remove Address::deprecated() calls in CodeGenFunction.cpp --- clang/lib/CodeGen/CodeGenFunction.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index c4ccc8e1b0424..9c3e5d5460014 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1103,9 +1103,10 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, EI->getType()->getPointerElementType(), &*EI, Idx); llvm::Type *Ty = cast(Addr)->getResultElementType(); - ReturnValuePointer = Address::deprecated(Addr, getPointerAlign()); + ReturnValuePointer = Address(Addr, Ty, getPointerAlign()); Addr = Builder.CreateAlignedLoad(Ty, Addr, getPointerAlign(), "agg.result"); - ReturnValue = Address::deprecated(Addr, CGM.getNaturalTypeAlignment(RetTy)); + ReturnValue = + Address(Addr, ConvertType(RetTy), CGM.getNaturalTypeAlignment(RetTy)); } else { ReturnValue = CreateIRTemp(RetTy, "retval"); @@ -2481,7 +2482,7 @@ Address CodeGenFunction::EmitFieldAnnotations(const FieldDecl *D, V = Builder.CreateBitCast(V, VTy); } - return Address::deprecated(V, Addr.getAlignment()); + return Address(V, Addr.getElementType(), Addr.getAlignment()); } CodeGenFunction::CGCapturedStmtInfo::~CGCapturedStmtInfo() { } From 251640ab575634256de6d6fde5c5359fe21efe91 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 22 Feb 2022 19:20:55 -0800 Subject: [PATCH 577/748] [ELF][test] Terminate .debug_info with a null entry to fix warnings --- lld/test/ELF/comdat-discarded-gdb-index.s | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lld/test/ELF/comdat-discarded-gdb-index.s b/lld/test/ELF/comdat-discarded-gdb-index.s index 43505960498a2..b9f36d8f66083 100644 --- a/lld/test/ELF/comdat-discarded-gdb-index.s +++ 
b/lld/test/ELF/comdat-discarded-gdb-index.s @@ -1,6 +1,6 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o -# RUN: ld.lld --gdb-index %t.o %t.o -o %t +# RUN: ld.lld --gdb-index -e 0 %t.o %t.o -o /dev/null 2>&1 | count 0 ## .debug_info has a relocation to .text.foo . The second %t.o is discarded. ## Check we don't error on the relocation. @@ -48,6 +48,7 @@ foo: .byte 2 # Abbrev [2] DW_TAG_subprogram .asciz "foo" # DW_AT_name .byte 0 + .byte 0 .Lcu_end0: .section .debug_gnu_pubnames,"",@progbits From f5153d9e72622ac83005e8bf82c4456db6f66689 Mon Sep 17 00:00:00 2001 From: Alex Lorenz Date: Mon, 14 Feb 2022 14:20:28 -0800 Subject: [PATCH 578/748] [compiler-rt][builtins] build the macOS compiler-rt built-ins with Mac Catalyst support This patch extends compiler-rt's cmake config to build macOS builtins with both macOS and Mac Catalyst support. This is done by telling the compiler to emit macho files with two build version load commands. Differential Revision: https://reviews.llvm.org/D118875 --- compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake | 8 ++++++++ compiler-rt/cmake/base-config-ix.cmake | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake index 276fcbb9c0e3c..2f8cb12e03a40 100644 --- a/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTDarwinUtils.cmake @@ -298,6 +298,14 @@ macro(darwin_add_builtin_library name suffix) -target "${LIB_ARCH}-apple-${base_os}${DARWIN_${LIBOS}_BUILTIN_MIN_VER}-simulator") endif() + if ("${COMPILER_RT_ENABLE_MACCATALYST}" AND + "${LIB_OS}" MATCHES "^osx$") + # Build the macOS builtins with Mac Catalyst support. 
+ list(APPEND builtin_cflags + -target ${LIB_ARCH}-apple-macos${DARWIN_osx_BUILTIN_MIN_VER} + -darwin-target-variant ${LIB_ARCH}-apple-ios13.1-macabi) + endif() + set_target_compile_flags(${libname} ${sysroot_flag} ${DARWIN_${LIB_OS}_BUILTIN_MIN_VER_FLAG} diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake index d7b0124f35463..257666727c8ad 100644 --- a/compiler-rt/cmake/base-config-ix.cmake +++ b/compiler-rt/cmake/base-config-ix.cmake @@ -3,6 +3,7 @@ # .o files. This is particularly useful in producing larger, more complex # runtime libraries. +include(BuiltinTests) include(CheckIncludeFile) include(CheckCXXSourceCompiles) include(GNUInstallDirs) @@ -138,6 +139,12 @@ if(APPLE) set(OSX_SYSROOT_FLAG "") endif() + try_compile_only(COMPILER_RT_HAS_DARWIN_TARGET_VARIANT_FLAG + FLAGS + "-target" "x86_64-apple-macos10.15" + "-darwin-target-variant" "x86_64-apple-ios13.1-macabi" + "-Werror") + option(COMPILER_RT_ENABLE_MACCATALYST "Enable building for Mac Catalyst" ${COMPILER_RT_HAS_DARWIN_TARGET_VARIANT_FLAG}) option(COMPILER_RT_ENABLE_IOS "Enable building for iOS" On) option(COMPILER_RT_ENABLE_WATCHOS "Enable building for watchOS - Experimental" Off) option(COMPILER_RT_ENABLE_TVOS "Enable building for tvOS - Experimental" Off) From 045f07b7dc0ccdafea6e874c65929cfd5c957966 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 22 Feb 2022 20:29:08 -0800 Subject: [PATCH 579/748] [ProfileData] Remove unused and racy FunctionSamples::Format after D51643 The write may be racy if ThinLTO creates multiple `InProcessThinBackend` instances. 
--- llvm/include/llvm/ProfileData/SampleProf.h | 2 -- llvm/lib/ProfileData/SampleProf.cpp | 1 - llvm/lib/ProfileData/SampleProfReader.cpp | 1 - 3 files changed, 4 deletions(-) diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 2e255b90d1b5f..fdd5f290b1216 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -1056,8 +1056,6 @@ class FunctionSamples { void setContext(const SampleContext &FContext) { Context = FContext; } - static SampleProfileFormat Format; - /// Whether the profile uses MD5 to represent string. static bool UseMD5; diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index 5e11df6b6aad4..7d33743ecc575 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -43,7 +43,6 @@ cl::opt GenerateMergedBaseProfiles( namespace llvm { namespace sampleprof { -SampleProfileFormat FunctionSamples::Format; bool FunctionSamples::ProfileIsProbeBased = false; bool FunctionSamples::ProfileIsCSFlat = false; bool FunctionSamples::ProfileIsCSNested = false; diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 5ccf734ef4d8d..22f57c8e676e9 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1882,7 +1882,6 @@ SampleProfileReader::create(std::unique_ptr &B, LLVMContext &C, Reader->Remapper = std::move(ReaderOrErr.get()); } - FunctionSamples::Format = Reader->getFormat(); if (std::error_code EC = Reader->readHeader()) { return EC; } From b96fc4860f1615d8d1f686f1e400cc1f8e0d58ac Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 22 Feb 2022 21:42:15 -0800 Subject: [PATCH 580/748] [ELF][test] Fix CU address_size in some gdb-index tests Revert 251640ab575634256de6d6fde5c5359fe21efe91 which fixed the wrong thing. While here, add `2>&1 | count 0` to assert no warning from lib/DebugInfo/DWARF. 
--- lld/test/ELF/comdat-discarded-gdb-index.s | 3 +-- lld/test/ELF/gdb-index-multiple-cu-2.s | 6 +++--- lld/test/ELF/gdb-index-multiple-cu.s | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/lld/test/ELF/comdat-discarded-gdb-index.s b/lld/test/ELF/comdat-discarded-gdb-index.s index b9f36d8f66083..96905626d3c7e 100644 --- a/lld/test/ELF/comdat-discarded-gdb-index.s +++ b/lld/test/ELF/comdat-discarded-gdb-index.s @@ -40,7 +40,7 @@ foo: .long .Lcu_end0 - .Lcu_begin0 - 4 .short 4 # DWARF version number .long 0 # Offset Into Abbrev. Section - .byte 4 # Address Size + .byte 8 # Address Size .Ldie0: .byte 1 # Abbrev [1] DW_TAG_compile_unit .quad .Lfunc_begin0 # DW_AT_low_pc @@ -48,7 +48,6 @@ foo: .byte 2 # Abbrev [2] DW_TAG_subprogram .asciz "foo" # DW_AT_name .byte 0 - .byte 0 .Lcu_end0: .section .debug_gnu_pubnames,"",@progbits diff --git a/lld/test/ELF/gdb-index-multiple-cu-2.s b/lld/test/ELF/gdb-index-multiple-cu-2.s index 06316860871da..d4c32a102553e 100644 --- a/lld/test/ELF/gdb-index-multiple-cu-2.s +++ b/lld/test/ELF/gdb-index-multiple-cu-2.s @@ -1,7 +1,7 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %p/Inputs/gdb-index-multiple-cu-2.s -o %t1.o -# RUN: ld.lld --gdb-index %t.o %t1.o -o %t +# RUN: ld.lld --gdb-index %t.o %t1.o -o %t 2>&1 | count 0 # RUN: llvm-dwarfdump -gdb-index %t | FileCheck %s # %t.o has 2 CUs while %t1 has 1, thus _start in %t1.o should have cuIndex 2. @@ -22,7 +22,7 @@ .long .Lcu_end0 - .Lcu_begin0 - 4 .short 4 # DWARF version number .long 0 # Offset Into Abbrev. Section - .byte 4 # Address Size + .byte 8 # Address Size .byte 1 # Abbrev [1] DW_TAG_compile_unit .byte 0 .Lcu_end0: @@ -30,7 +30,7 @@ .long .Lcu_end1 - .Lcu_begin1 - 4 .short 4 # DWARF version number .long 0 # Offset Into Abbrev. 
Section - .byte 4 # Address Size + .byte 8 # Address Size .byte 1 # Abbrev [1] DW_TAG_compile_unit .byte 0 .Lcu_end1: diff --git a/lld/test/ELF/gdb-index-multiple-cu.s b/lld/test/ELF/gdb-index-multiple-cu.s index 8702d9f3924db..5227681e96964 100644 --- a/lld/test/ELF/gdb-index-multiple-cu.s +++ b/lld/test/ELF/gdb-index-multiple-cu.s @@ -1,6 +1,6 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o -# RUN: ld.lld --gdb-index %t.o -o %t +# RUN: ld.lld --gdb-index %t.o -o %t 2>&1 | count 0 # RUN: llvm-dwarfdump -gdb-index %t | FileCheck %s # cuIndexAndAttrs of _start: @@ -40,7 +40,7 @@ foo: .long .Lcu_end0 - .Lcu_begin0 - 4 .short 4 # DWARF version number .long 0 # Offset Into Abbrev. Section - .byte 4 # Address Size + .byte 8 # Address Size .Ldie0: .byte 1 # Abbrev [1] DW_TAG_compile_unit .byte 2 # Abbrev [2] DW_TAG_subprogram @@ -52,7 +52,7 @@ foo: .long .Lcu_end1 - .Lcu_begin1 - 4 .short 4 # DWARF version number .long 0 # Offset Into Abbrev. Section - .byte 4 # Address Size + .byte 8 # Address Size .Ldie1: .byte 1 # Abbrev [1] DW_TAG_compile_unit .byte 2 # Abbrev [2] DW_TAG_subprogram From f415d74d1df3995795c889333d493b0dcc31a863 Mon Sep 17 00:00:00 2001 From: minglotus-6 Date: Mon, 21 Feb 2022 19:17:06 -0800 Subject: [PATCH 581/748] [SampleProfile] Handle the case when the option `MaxNumPromotions` is zero. In places where `MaxNumPromotions` is used to allocated an array, bail out early to prevent allocating an array of length 0. 
Differential Revision: https://reviews.llvm.org/D120295 --- llvm/lib/Transforms/IPO/SampleProfile.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index fc0b6f5991a1f..5985281019b0a 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -841,6 +841,13 @@ static void updateIDTMetaData(Instruction &Inst, const SmallVectorImpl &CallTargets, uint64_t Sum) { + // Bail out early if MaxNumPromotions is zero. + // This prevents allocating an array of zero length below. + // + // Note `updateIDTMetaData` is called in two places so check + // `MaxNumPromotions` inside it. + if (MaxNumPromotions == 0) + return; uint32_t NumVals = 0; // OldSum is the existing total count in the value profile data. uint64_t OldSum = 0; @@ -924,6 +931,10 @@ updateIDTMetaData(Instruction &Inst, bool SampleProfileLoader::tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, SmallVector *InlinedCallSite) { + // Bail out early if MaxNumPromotions is zero. + // This prevents allocating an array of zero length in callees below. + if (MaxNumPromotions == 0) + return false; auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName(); auto R = SymbolMap.find(CalleeFunctionName); if (R == SymbolMap.end() || !R->getValue()) From 7abcb7ba87b10544b8a7c52015a2951e509f765e Mon Sep 17 00:00:00 2001 From: Lian Wang Date: Wed, 23 Feb 2022 06:14:58 +0000 Subject: [PATCH 582/748] [RISCV] Supplement more tests for GREVI aliaes in Zbp extension Supplement tests for some aliaes of grevi. 
RV32: add rev4.h/rev2.h in rv32zbp.ll add rev/rev2/rev4/rev8/rev16 in rv32zbp-intrinsic.ll RV64: add rev4.h/rev2.h in rv64zbp.ll add rev.h/rev/rev2/rev4/rev8/rev16/rev32/rev.w/rev2.w/ rev4.w/rev8.w/rev16.w in rv64zbp-intrinsic.ll Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D120304 --- llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll | 45 +++ llvm/test/CodeGen/RISCV/rv32zbp.ll | 94 ++++++ llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll | 108 +++++++ llvm/test/CodeGen/RISCV/rv64zbp.ll | 296 +++++++++++++++---- 4 files changed, 489 insertions(+), 54 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll index 4f5ccca74b2cb..475d3b5460993 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll @@ -32,6 +32,51 @@ define i32 @grevi32(i32 %a) nounwind { ret i32 %tmp } +define i32 @revi32(i32 %a) nounwind { +; RV32ZBP-LABEL: revi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: rev a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.grev.i32(i32 %a, i32 31) + ret i32 %tmp +} + +define i32 @rev2i32(i32 %a) nounwind { +; RV32ZBP-LABEL: rev2i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: rev2 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.grev.i32(i32 %a, i32 30) + ret i32 %tmp +} + +define i32 @rev4i32(i32 %a) nounwind { +; RV32ZBP-LABEL: rev4i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: rev4 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.grev.i32(i32 %a, i32 28) + ret i32 %tmp +} + +define i32 @rev8i32(i32 %a) nounwind { +; RV32ZBP-LABEL: rev8i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: rev8 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.grev.i32(i32 %a, i32 24) + ret i32 %tmp +} + +define i32 @rev16i32(i32 %a) nounwind { +; RV32ZBP-LABEL: rev16i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: rev16 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.grev.i32(i32 %a, i32 16) + ret i32 %tmp +} + declare i32 
@llvm.riscv.gorc.i32(i32 %a, i32 %b) define i32 @gorc32(i32 %a, i32 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll index 7e113d6be7d0a..7203aeb2a99b6 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll @@ -1607,6 +1607,100 @@ define i32 @grev8_i32(i32 %a) nounwind { ret i32 %or } +define i32 @grev12_i32(i32 %a) nounwind { +; RV32I-LABEL: grev12_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: lui a2, 986895 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: lui a2, 1044496 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: lui a2, 4080 +; RV32I-NEXT: addi a2, a2, 255 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: grev12_i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: rev4.h a0, a0 +; RV32ZBP-NEXT: ret + %and1 = shl i32 %a, 4 + %shl1 = and i32 %and1, -252645136 + %and1b = lshr i32 %a, 4 + %shr1 = and i32 %and1b, 252645135 + %or1 = or i32 %shl1, %shr1 + %and2 = shl i32 %or1, 8 + %shl2 = and i32 %and2, -16711936 + %and2b = lshr i32 %or1, 8 + %shr2 = and i32 %and2b, 16711935 + %or2 = or i32 %shl2, %shr2 + ret i32 %or2 +} + +define i32 @grev14_i32(i32 %a) nounwind { +; RV32I-LABEL: grev14_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: lui a2, 838861 +; RV32I-NEXT: addi a2, a2, -820 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: lui a2, 986895 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: lui a2, 
61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: lui a2, 1044496 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: lui a2, 4080 +; RV32I-NEXT: addi a2, a2, 255 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: grev14_i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: rev2.h a0, a0 +; RV32ZBP-NEXT: ret + %and1 = shl i32 %a, 2 + %shl1 = and i32 %and1, -858993460 + %and1b = lshr i32 %a, 2 + %shr1 = and i32 %and1b, 858993459 + %or1 = or i32 %shl1, %shr1 + %and2 = shl i32 %or1, 4 + %shl2 = and i32 %and2, -252645136 + %and2b = lshr i32 %or1, 4 + %shr2 = and i32 %and2b, 252645135 + %or2 = or i32 %shl2, %shr2 + %and3 = shl i32 %or2, 8 + %shl3 = and i32 %and3, -16711936 + %and3b = lshr i32 %or2, 8 + %shr3 = and i32 %and3b, 16711935 + %or3 = or i32 %shl3, %shr3 + ret i32 %or3 +} + define i64 @grev8_i64(i64 %a) nounwind { ; RV32I-LABEL: grev8_i64: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll index b236fb6f060e8..b82d520efbfa0 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll @@ -342,6 +342,114 @@ define i64 @grevi64(i64 %a) nounwind { ret i64 %tmp } +define i64 @revhwi64(i64 %a) nounwind { +; RV64ZBP-LABEL: revhwi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 15) + ret i64 %tmp +} + +define i64 @rev16wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev16wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev16.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 16) + ret i64 %tmp +} + +define i64 @rev8wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev8wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev8.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 
@llvm.riscv.grev.i64(i64 %a, i64 24) + ret i64 %tmp +} + +define i64 @rev4wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev4wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev4.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 28) + ret i64 %tmp +} + +define i64 @rev2wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev2wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev2.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 30) + ret i64 %tmp +} + +define i64 @revwi64(i64 %a) nounwind { +; RV64ZBP-LABEL: revwi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 31) + ret i64 %tmp +} + +define i64 @rev32i64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev32i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev32 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 32) + ret i64 %tmp +} + +define i64 @rev16i64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev16i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev16 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 48) + ret i64 %tmp +} + +define i64 @rev8i64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev8i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev8 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 56) + ret i64 %tmp +} + +define i64 @rev4i64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev4i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev4 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 60) + ret i64 %tmp +} + +define i64 @rev2i64(i64 %a) nounwind { +; RV64ZBP-LABEL: rev2i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev2 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 62) + ret i64 %tmp +} + +define i64 @revi64(i64 %a) nounwind { +; RV64ZBP-LABEL: revi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.grev.i64(i64 %a, i64 63) + ret i64 %tmp +} + declare i64 
@llvm.riscv.gorc.i64(i64 %a, i64 %b) define i64 @gorc64(i64 %a, i64 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll index 6a4376409fab5..552991076629e 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll @@ -1510,6 +1510,194 @@ define i64 @grev8_i64(i64 %a) nounwind { ret i64 %or } +define signext i32 @grev12_i32(i32 signext %a) nounwind { +; RV64I-LABEL: grev12_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slliw a1, a0, 4 +; RV64I-NEXT: lui a2, 986895 +; RV64I-NEXT: addiw a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: slliw a1, a0, 8 +; RV64I-NEXT: lui a2, 1044496 +; RV64I-NEXT: addiw a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: lui a2, 4080 +; RV64I-NEXT: addiw a2, a2, 255 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: grev12_i32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: greviw a0, a0, 12 +; RV64ZBP-NEXT: ret + %and1 = shl i32 %a, 4 + %shl1 = and i32 %and1, -252645136 + %and1b = lshr i32 %a, 4 + %shr1 = and i32 %and1b, 252645135 + %or1 = or i32 %shl1, %shr1 + %and2 = shl i32 %or1, 8 + %shl2 = and i32 %and2, -16711936 + %and2b = lshr i32 %or1, 8 + %shr2 = and i32 %and2b, 16711935 + %or2 = or i32 %shl2, %shr2 + ret i32 %or2 +} + +define i64 @grev12_i64(i64 %a) nounwind { +; RV64I-LABEL: grev12_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, %hi(.LCPI44_0) +; RV64I-NEXT: ld a1, %lo(.LCPI44_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI44_1) +; RV64I-NEXT: ld a2, %lo(.LCPI44_1)(a2) +; RV64I-NEXT: slli a3, a0, 4 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: lui a1, %hi(.LCPI44_2) +; RV64I-NEXT: ld a1, %lo(.LCPI44_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI44_3) +; 
RV64I-NEXT: ld a2, %lo(.LCPI44_3)(a2) +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: grev12_i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev4.h a0, a0 +; RV64ZBP-NEXT: ret + %and1 = shl i64 %a, 4 + %shl1 = and i64 %and1, -1085102592571150096 + %and1b = lshr i64 %a, 4 + %shr1 = and i64 %and1b, 1085102592571150095 + %or1 = or i64 %shl1, %shr1 + %and2 = shl i64 %or1, 8 + %shl2 = and i64 %and2, -71777214294589696 + %and2b = lshr i64 %or1, 8 + %shr2 = and i64 %and2b, 71777214294589695 + %or2 = or i64 %shl2, %shr2 + ret i64 %or2 +} + +define signext i32 @grev14_i32(i32 signext %a) nounwind { +; RV64I-LABEL: grev14_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slliw a1, a0, 2 +; RV64I-NEXT: lui a2, 838861 +; RV64I-NEXT: addiw a2, a2, -820 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: slliw a1, a0, 4 +; RV64I-NEXT: lui a2, 986895 +; RV64I-NEXT: addiw a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: slliw a1, a0, 8 +; RV64I-NEXT: lui a2, 1044496 +; RV64I-NEXT: addiw a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: lui a2, 4080 +; RV64I-NEXT: addiw a2, a2, 255 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: grev14_i32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: greviw a0, a0, 14 +; RV64ZBP-NEXT: ret + %and1 = shl i32 %a, 2 + %shl1 = and i32 %and1, -858993460 + %and1b = lshr i32 %a, 2 + %shr1 = and i32 %and1b, 858993459 + %or1 = or i32 %shl1, %shr1 + %and2 = shl i32 %or1, 4 + %shl2 = and i32 %and2, -252645136 + %and2b = lshr i32 %or1, 4 + %shr2 = and i32 
%and2b, 252645135 + %or2 = or i32 %shl2, %shr2 + %and3 = shl i32 %or2, 8 + %shl3 = and i32 %and3, -16711936 + %and3b = lshr i32 %or2, 8 + %shr3 = and i32 %and3b, 16711935 + %or3 = or i32 %shl3, %shr3 + ret i32 %or3 +} + +define i64 @grev14_i64(i64 %a) nounwind { +; RV64I-LABEL: grev14_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, %hi(.LCPI46_0) +; RV64I-NEXT: ld a1, %lo(.LCPI46_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI46_1) +; RV64I-NEXT: ld a2, %lo(.LCPI46_1)(a2) +; RV64I-NEXT: slli a3, a0, 2 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: lui a1, %hi(.LCPI46_2) +; RV64I-NEXT: ld a1, %lo(.LCPI46_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI46_3) +; RV64I-NEXT: ld a2, %lo(.LCPI46_3)(a2) +; RV64I-NEXT: slli a3, a0, 4 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: lui a1, %hi(.LCPI46_4) +; RV64I-NEXT: ld a1, %lo(.LCPI46_4)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI46_5) +; RV64I-NEXT: ld a2, %lo(.LCPI46_5)(a2) +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: grev14_i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: rev2.h a0, a0 +; RV64ZBP-NEXT: ret + %and1 = shl i64 %a, 2 + %shl1 = and i64 %and1, -3689348814741910324 + %and1b = lshr i64 %a, 2 + %shr1 = and i64 %and1b, 3689348814741910323 + %or1 = or i64 %shl1, %shr1 + %and2 = shl i64 %or1, 4 + %shl2 = and i64 %and2, -1085102592571150096 + %and2b = lshr i64 %or1, 4 + %shr2 = and i64 %and2b, 1085102592571150095 + %or2 = or i64 %shl2, %shr2 + %and3 = shl i64 %or2, 8 + %shl3 = and i64 %and3, -71777214294589696 + %and3b = lshr i64 %or2, 8 + %shr3 = and i64 %and3b, 71777214294589695 + %or3 = or i64 %shl3, %shr3 + ret i64 %or3 +} + define signext i32 @grev16_i32(i32 signext %a) nounwind { ; RV64I-LABEL: grev16_i32: ; RV64I: # %bb.0: @@ 
-1653,19 +1841,19 @@ define signext i32 @grev3b_i32(i32 signext %a) nounwind { define i64 @grev3b_i64(i64 %a) nounwind { ; RV64I-LABEL: grev3b_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI49_0) -; RV64I-NEXT: ld a1, %lo(.LCPI49_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI49_1) -; RV64I-NEXT: ld a2, %lo(.LCPI49_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI53_0) +; RV64I-NEXT: ld a1, %lo(.LCPI53_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI53_1) +; RV64I-NEXT: ld a2, %lo(.LCPI53_1)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI49_2) -; RV64I-NEXT: ld a1, %lo(.LCPI49_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI49_3) -; RV64I-NEXT: ld a2, %lo(.LCPI49_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI53_2) +; RV64I-NEXT: ld a1, %lo(.LCPI53_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI53_3) +; RV64I-NEXT: ld a2, %lo(.LCPI53_3)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 @@ -1745,19 +1933,19 @@ define signext i32 @grev2b_i32(i32 signext %a) nounwind { define i64 @grev2b_i64(i64 %a) nounwind { ; RV64I-LABEL: grev2b_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI51_0) -; RV64I-NEXT: ld a1, %lo(.LCPI51_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI51_1) -; RV64I-NEXT: ld a2, %lo(.LCPI51_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI55_0) +; RV64I-NEXT: ld a1, %lo(.LCPI55_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI55_1) +; RV64I-NEXT: ld a2, %lo(.LCPI55_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a3, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: lui a3, %hi(.LCPI51_2) -; RV64I-NEXT: ld a3, %lo(.LCPI51_2)(a3) -; RV64I-NEXT: lui a4, %hi(.LCPI51_3) -; RV64I-NEXT: ld a4, %lo(.LCPI51_3)(a4) +; RV64I-NEXT: lui a3, %hi(.LCPI55_2) +; RV64I-NEXT: ld a3, %lo(.LCPI55_2)(a3) +; RV64I-NEXT: lui a4, %hi(.LCPI55_3) +; RV64I-NEXT: ld a4, %lo(.LCPI55_3)(a4) ; RV64I-NEXT: slli a5, 
a0, 2 ; RV64I-NEXT: and a3, a5, a3 ; RV64I-NEXT: srli a0, a0, 2 @@ -1856,19 +2044,19 @@ define signext i32 @grev0_i32(i32 signext %a) nounwind { define i64 @grev0_i64(i64 %a) nounwind { ; RV64I-LABEL: grev0_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI53_0) -; RV64I-NEXT: ld a1, %lo(.LCPI53_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI53_1) -; RV64I-NEXT: ld a2, %lo(.LCPI53_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI57_0) +; RV64I-NEXT: ld a1, %lo(.LCPI57_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI57_1) +; RV64I-NEXT: ld a2, %lo(.LCPI57_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a3, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: lui a3, %hi(.LCPI53_2) -; RV64I-NEXT: ld a3, %lo(.LCPI53_2)(a3) -; RV64I-NEXT: lui a4, %hi(.LCPI53_3) -; RV64I-NEXT: ld a4, %lo(.LCPI53_3)(a4) +; RV64I-NEXT: lui a3, %hi(.LCPI57_2) +; RV64I-NEXT: ld a3, %lo(.LCPI57_2)(a3) +; RV64I-NEXT: lui a4, %hi(.LCPI57_3) +; RV64I-NEXT: ld a4, %lo(.LCPI57_3)(a4) ; RV64I-NEXT: slli a5, a0, 2 ; RV64I-NEXT: and a5, a5, a3 ; RV64I-NEXT: srli a0, a0, 2 @@ -2263,22 +2451,22 @@ define i64 @bitreverse_i64(i64 %a) nounwind { ; RV64I-NEXT: and a3, a4, a3 ; RV64I-NEXT: slli a0, a0, 56 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lui a3, %hi(.LCPI64_0) -; RV64I-NEXT: ld a3, %lo(.LCPI64_0)(a3) +; RV64I-NEXT: lui a3, %hi(.LCPI68_0) +; RV64I-NEXT: ld a3, %lo(.LCPI68_0)(a3) ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: lui a2, %hi(.LCPI64_1) -; RV64I-NEXT: ld a2, %lo(.LCPI64_1)(a2) +; RV64I-NEXT: lui a2, %hi(.LCPI68_1) +; RV64I-NEXT: ld a2, %lo(.LCPI68_1)(a2) ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: lui a2, %hi(.LCPI64_2) -; RV64I-NEXT: ld a2, %lo(.LCPI64_2)(a2) +; RV64I-NEXT: lui a2, %hi(.LCPI68_2) +; RV64I-NEXT: ld a2, 
%lo(.LCPI68_2)(a2) ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 @@ -2386,20 +2574,20 @@ define i32 @bitreverse_bswap_i32(i32 %a) { define i64 @bitreverse_bswap_i64(i64 %a) { ; RV64I-LABEL: bitreverse_bswap_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI68_0) -; RV64I-NEXT: ld a1, %lo(.LCPI68_0)(a1) +; RV64I-NEXT: lui a1, %hi(.LCPI72_0) +; RV64I-NEXT: ld a1, %lo(.LCPI72_0)(a1) ; RV64I-NEXT: srli a2, a0, 4 ; RV64I-NEXT: and a2, a2, a1 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI68_1) -; RV64I-NEXT: ld a1, %lo(.LCPI68_1)(a1) +; RV64I-NEXT: lui a1, %hi(.LCPI72_1) +; RV64I-NEXT: ld a1, %lo(.LCPI72_1)(a1) ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: srli a2, a0, 2 ; RV64I-NEXT: and a2, a2, a1 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI68_2) -; RV64I-NEXT: ld a1, %lo(.LCPI68_2)(a1) +; RV64I-NEXT: lui a1, %hi(.LCPI72_2) +; RV64I-NEXT: ld a1, %lo(.LCPI72_2)(a1) ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: srli a2, a0, 1 @@ -2453,14 +2641,14 @@ define signext i32 @shfl1_i32(i32 signext %a, i32 signext %b) nounwind { define i64 @shfl1_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl1_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI70_1) -; RV64I-NEXT: ld a1, %lo(.LCPI70_1)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI70_0) -; RV64I-NEXT: ld a2, %lo(.LCPI70_0)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI74_1) +; RV64I-NEXT: ld a1, %lo(.LCPI74_1)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI74_0) +; RV64I-NEXT: ld a2, %lo(.LCPI74_0)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, %hi(.LCPI70_2) -; RV64I-NEXT: ld a3, %lo(.LCPI70_2)(a3) +; RV64I-NEXT: lui a3, %hi(.LCPI74_2) +; RV64I-NEXT: ld a3, %lo(.LCPI74_2)(a3) ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 1 @@ -2517,14 +2705,14 @@ define signext i32 @shfl2_i32(i32 signext %a, i32 signext %b) nounwind { define i64 @shfl2_i64(i64 %a, i64 
%b) nounwind { ; RV64I-LABEL: shfl2_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI72_1) -; RV64I-NEXT: ld a1, %lo(.LCPI72_1)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI72_0) -; RV64I-NEXT: ld a2, %lo(.LCPI72_0)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI76_1) +; RV64I-NEXT: ld a1, %lo(.LCPI76_1)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI76_0) +; RV64I-NEXT: ld a2, %lo(.LCPI76_0)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, %hi(.LCPI72_2) -; RV64I-NEXT: ld a3, %lo(.LCPI72_2)(a3) +; RV64I-NEXT: lui a3, %hi(.LCPI76_2) +; RV64I-NEXT: ld a3, %lo(.LCPI76_2)(a3) ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 2 @@ -2581,13 +2769,13 @@ define signext i32 @shfl4_i32(i32 signext %a, i32 signext %b) nounwind { define i64 @shfl4_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl4_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI74_0) -; RV64I-NEXT: ld a1, %lo(.LCPI74_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI74_1) -; RV64I-NEXT: ld a2, %lo(.LCPI74_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI78_0) +; RV64I-NEXT: ld a1, %lo(.LCPI78_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI78_1) +; RV64I-NEXT: ld a2, %lo(.LCPI78_1)(a2) ; RV64I-NEXT: slli a3, a0, 4 -; RV64I-NEXT: lui a4, %hi(.LCPI74_2) -; RV64I-NEXT: ld a4, %lo(.LCPI74_2)(a4) +; RV64I-NEXT: lui a4, %hi(.LCPI78_2) +; RV64I-NEXT: ld a4, %lo(.LCPI78_2)(a4) ; RV64I-NEXT: and a2, a3, a2 ; RV64I-NEXT: and a1, a0, a1 ; RV64I-NEXT: srli a0, a0, 4 From b3d1f073de971f7597ba937d8065dbba56cd8fc7 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Wed, 23 Feb 2022 08:27:10 +0100 Subject: [PATCH 583/748] [flang] Lower real constant This patch handles lowering of real constant. This patch is part of the upstreaming effort from fir-dev branch. 
Reviewed By: PeteSteinfeld Differential Revision: https://reviews.llvm.org/D120354 Co-authored-by: Eric Schweitz Co-authored-by: Jean Perier --- flang/lib/Lower/ConvertExpr.cpp | 31 ++++++++++++++++++++++++++++++- flang/lib/Lower/ConvertType.cpp | 2 +- flang/test/Lower/assignment.f90 | 29 +++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 76bee213c96b6..013adb797da93 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -15,6 +15,7 @@ #include "flang/Evaluate/real.h" #include "flang/Evaluate/traverse.h" #include "flang/Lower/AbstractConverter.h" +#include "flang/Lower/ConvertType.h" #include "flang/Lower/IntrinsicCall.h" #include "flang/Lower/SymbolMap.h" #include "flang/Lower/Todo.h" @@ -138,6 +139,14 @@ class ScalarExprLowering { return builder.createBool(getLoc(), value); } + /// Generate a real constant with a value `value`. + template + mlir::Value genRealConstant(mlir::MLIRContext *context, + const llvm::APFloat &value) { + mlir::Type fltTy = Fortran::lower::convertReal(context, KIND); + return builder.createRealConstant(getLoc(), fltTy, value); + } + /// Returns a reference to a symbol or its box/boxChar descriptor if it has /// one. 
ExtValue gen(Fortran::semantics::SymbolRef sym) { @@ -350,7 +359,27 @@ class ScalarExprLowering { } else if constexpr (TC == Fortran::common::TypeCategory::Logical) { return genBoolConstant(value.IsTrue()); } else if constexpr (TC == Fortran::common::TypeCategory::Real) { - TODO(getLoc(), "genval real constant"); + std::string str = value.DumpHexadecimal(); + if constexpr (KIND == 2) { + llvm::APFloat floatVal{llvm::APFloatBase::IEEEhalf(), str}; + return genRealConstant(builder.getContext(), floatVal); + } else if constexpr (KIND == 3) { + llvm::APFloat floatVal{llvm::APFloatBase::BFloat(), str}; + return genRealConstant(builder.getContext(), floatVal); + } else if constexpr (KIND == 4) { + llvm::APFloat floatVal{llvm::APFloatBase::IEEEsingle(), str}; + return genRealConstant(builder.getContext(), floatVal); + } else if constexpr (KIND == 10) { + llvm::APFloat floatVal{llvm::APFloatBase::x87DoubleExtended(), str}; + return genRealConstant(builder.getContext(), floatVal); + } else if constexpr (KIND == 16) { + llvm::APFloat floatVal{llvm::APFloatBase::IEEEquad(), str}; + return genRealConstant(builder.getContext(), floatVal); + } else { + // convert everything else to double + llvm::APFloat floatVal{llvm::APFloatBase::IEEEdouble(), str}; + return genRealConstant(builder.getContext(), floatVal); + } } else if constexpr (TC == Fortran::common::TypeCategory::Complex) { TODO(getLoc(), "genval complex constant"); } else /*constexpr*/ { diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 39424d3ff0b0a..429fae81e25cc 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -520,7 +520,7 @@ mlir::Type Fortran::lower::translateVariableToFIRType( } mlir::Type Fortran::lower::convertReal(mlir::MLIRContext *context, int kind) { - return genFIRType(context, kind); + return genRealType(context, kind); } mlir::Type Fortran::lower::getSequenceRefType(mlir::Type refType) { diff --git a/flang/test/Lower/assignment.f90 
b/flang/test/Lower/assignment.f90 index ce9689a708a8f..26aa33631d0e4 100644 --- a/flang/test/Lower/assignment.f90 +++ b/flang/test/Lower/assignment.f90 @@ -255,3 +255,32 @@ real function divf(a, b) ! CHECK: fir.store %[[DIV]] to %[[FCTRES]] : !fir.ref> ! CHECK: %[[RET:.*]] = fir.load %[[FCTRES]] : !fir.ref> ! CHECK: return %[[RET]] : !fir.complex<4> + +subroutine real_constant() + real(2) :: a + real(4) :: b + real(8) :: c + real(10) :: d + real(16) :: e + a = 2.0_2 + b = 4.0_4 + c = 8.0_8 + d = 10.0_10 + e = 16.0_16 +end + +! CHECK: %[[A:.*]] = fir.alloca f16 +! CHECK: %[[B:.*]] = fir.alloca f32 +! CHECK: %[[C:.*]] = fir.alloca f64 +! CHECK: %[[D:.*]] = fir.alloca f80 +! CHECK: %[[E:.*]] = fir.alloca f128 +! CHECK: %[[C2:.*]] = arith.constant 2.000000e+00 : f16 +! CHECK: fir.store %[[C2]] to %[[A]] : !fir.ref +! CHECK: %[[C4:.*]] = arith.constant 4.000000e+00 : f32 +! CHECK: fir.store %[[C4]] to %[[B]] : !fir.ref +! CHECK: %[[C8:.*]] = arith.constant 8.000000e+00 : f64 +! CHECK: fir.store %[[C8]] to %[[C]] : !fir.ref +! CHECK: %[[C10:.*]] = arith.constant 1.000000e+01 : f80 +! CHECK: fir.store %[[C10]] to %[[D]] : !fir.ref +! CHECK: %[[C16:.*]] = arith.constant 1.600000e+01 : f128 +! CHECK: fir.store %[[C16]] to %[[E]] : !fir.ref From 3497124771aa3730073360afd6470bf57122fede Mon Sep 17 00:00:00 2001 From: Lian Wang Date: Wed, 23 Feb 2022 08:01:51 +0000 Subject: [PATCH 584/748] [RISCV] Add more test for GORCI aliaes in Zbp extension Supplement tests for some aliaes of gorci. 
RV32: add orc4.h/orc2.h in rv32zbp.ll add orc.h/orc16/orc8/orc4/orc2/orc in rv32zbp-intrinsic.ll RV64: add orc4.h/orc2.h in rv64zbp.ll add orc.h/orc32/orc16/orc8/orc4/orc2/orc/orc16.w/orc8.w/ orc4.w/orc2.w/orc.w in rv64zbp-intrinsic.ll Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D120388 --- llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll | 54 +++ llvm/test/CodeGen/RISCV/rv32zbp.ll | 240 +++++++++ llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll | 108 +++++ llvm/test/CodeGen/RISCV/rv64zbp.ll | 485 +++++++++++++------ 4 files changed, 749 insertions(+), 138 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll index 475d3b5460993..f7f9f8ad7e21f 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp-intrinsic.ll @@ -107,6 +107,60 @@ define i32 @gorci32(i32 %a) nounwind { ret i32 %tmp } +define i32 @orchi32(i32 %a) nounwind { +; RV32ZBP-LABEL: orchi32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc.h a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.gorc.i32(i32 %a, i32 15) + ret i32 %tmp +} + +define i32 @orc16i32(i32 %a) nounwind { +; RV32ZBP-LABEL: orc16i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc16 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.gorc.i32(i32 %a, i32 16) + ret i32 %tmp +} + +define i32 @orc8i32(i32 %a) nounwind { +; RV32ZBP-LABEL: orc8i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc8 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.gorc.i32(i32 %a, i32 24) + ret i32 %tmp +} + +define i32 @orc4i32(i32 %a) nounwind { +; RV32ZBP-LABEL: orc4i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc4 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.gorc.i32(i32 %a, i32 28) + ret i32 %tmp +} + +define i32 @orc2i32(i32 %a) nounwind { +; RV32ZBP-LABEL: orc2i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc2 a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.gorc.i32(i32 %a, i32 30) + ret i32 %tmp +} + +define 
i32 @orci32(i32 %a) nounwind { +; RV32ZBP-LABEL: orci32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc a0, a0 +; RV32ZBP-NEXT: ret + %tmp = call i32 @llvm.riscv.gorc.i32(i32 %a, i32 31) + ret i32 %tmp +} + declare i32 @llvm.riscv.shfl.i32(i32 %a, i32 %b) define i32 @shfl32(i32 %a, i32 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll index 7203aeb2a99b6..76464a5024484 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll @@ -698,6 +698,246 @@ define i64 @gorc8_i64(i64 %a) nounwind { ret i64 %or2 } +define i32 @gorc12_i32(i32 %a) nounwind { +; RV32I-LABEL: gorc12_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: lui a2, 986895 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: addi a3, a3, -241 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: lui a2, 1044496 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: addi a3, a3, 255 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: gorc12_i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc4.h a0, a0 +; RV32ZBP-NEXT: ret + %and1 = shl i32 %a, 4 + %shl1 = and i32 %and1, -252645136 + %and1b = lshr i32 %a, 4 + %shr1 = and i32 %and1b, 252645135 + %or1 = or i32 %shr1, %a + %or1b = or i32 %or1, %shl1 + %and2 = shl i32 %or1b, 8 + %shl2 = and i32 %and2, -16711936 + %and2b = lshr i32 %or1b, 8 + %shr2 = and i32 %and2b, 16711935 + %or2 = or i32 %shr2, %or1b + %or2b = or i32 %or2, %shl2 + ret i32 %or2b +} + +define i64 @gorc12_i64(i64 %a) nounwind { +; RV32I-LABEL: gorc12_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 4 +; RV32I-NEXT: slli a3, a0, 4 +; RV32I-NEXT: lui a4, 986895 +; RV32I-NEXT: addi a4, a4, 240 +; 
RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a0, 4 +; RV32I-NEXT: srli a5, a1, 4 +; RV32I-NEXT: lui a6, 61681 +; RV32I-NEXT: addi a6, a6, -241 +; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: lui a4, 1044496 +; RV32I-NEXT: addi a4, a4, -256 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a1, 8 +; RV32I-NEXT: srli a5, a0, 8 +; RV32I-NEXT: lui a6, 4080 +; RV32I-NEXT: addi a6, a6, 255 +; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: gorc12_i64: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc4.h a0, a0 +; RV32ZBP-NEXT: orc4.h a1, a1 +; RV32ZBP-NEXT: ret + %and1 = shl i64 %a, 4 + %shl1 = and i64 %and1, -1085102592571150096 + %and1b = lshr i64 %a, 4 + %shr1 = and i64 %and1b, 1085102592571150095 + %or1 = or i64 %shr1, %a + %or1b = or i64 %or1, %shl1 + %and2 = shl i64 %or1b, 8 + %shl2 = and i64 %and2, -71777214294589696 + %and2b = lshr i64 %or1b, 8 + %shr2 = and i64 %and2b, 71777214294589695 + %or2 = or i64 %shr2, %or1b + %or2b = or i64 %or2, %shl2 + ret i64 %or2b +} + +define i32 @gorc14_i32(i32 %a) nounwind { +; RV32I-LABEL: gorc14_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: lui a2, 838861 +; RV32I-NEXT: addi a2, a2, -820 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: lui a2, 986895 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: 
addi a3, a3, -241 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: lui a2, 1044496 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: addi a3, a3, 255 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: gorc14_i32: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc2.h a0, a0 +; RV32ZBP-NEXT: ret + %and1 = shl i32 %a, 2 + %shl1 = and i32 %and1, -858993460 + %and1b = lshr i32 %a, 2 + %shr1 = and i32 %and1b, 858993459 + %or1 = or i32 %shr1, %a + %or1b = or i32 %or1, %shl1 + %and2 = shl i32 %or1b, 4 + %shl2 = and i32 %and2, -252645136 + %and2b = lshr i32 %or1b, 4 + %shr2 = and i32 %and2b, 252645135 + %or2 = or i32 %shr2, %or1b + %or2b = or i32 %or2, %shl2 + %and3 = shl i32 %or2b, 8 + %shl3 = and i32 %and3, -16711936 + %and3b = lshr i32 %or2b, 8 + %shr3 = and i32 %and3b, 16711935 + %or3 = or i32 %shr3, %or2b + %or3b = or i32 %or3, %shl3 + ret i32 %or3b +} + +define i64 @gorc14_i64(i64 %a) nounwind { +; RV32I-LABEL: gorc14_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: slli a3, a1, 2 +; RV32I-NEXT: lui a4, 838861 +; RV32I-NEXT: addi a4, a4, -820 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a1, 2 +; RV32I-NEXT: srli a5, a0, 2 +; RV32I-NEXT: lui a6, 209715 +; RV32I-NEXT: addi a6, a6, 819 +; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: slli a2, a1, 4 +; RV32I-NEXT: slli a3, a0, 4 +; RV32I-NEXT: lui a4, 986895 +; RV32I-NEXT: addi a4, a4, 240 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a0, 4 +; RV32I-NEXT: srli a5, a1, 4 +; RV32I-NEXT: lui a6, 61681 +; RV32I-NEXT: addi a6, a6, -241 +; RV32I-NEXT: and a5, 
a5, a6 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: lui a4, 1044496 +; RV32I-NEXT: addi a4, a4, -256 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a1, 8 +; RV32I-NEXT: srli a5, a0, 8 +; RV32I-NEXT: lui a6, 4080 +; RV32I-NEXT: addi a6, a6, 255 +; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: ret +; +; RV32ZBP-LABEL: gorc14_i64: +; RV32ZBP: # %bb.0: +; RV32ZBP-NEXT: orc2.h a0, a0 +; RV32ZBP-NEXT: orc2.h a1, a1 +; RV32ZBP-NEXT: ret + %and1 = shl i64 %a, 2 + %shl1 = and i64 %and1, -3689348814741910324 + %and1b = lshr i64 %a, 2 + %shr1 = and i64 %and1b, 3689348814741910323 + %or1 = or i64 %shr1, %a + %or1b = or i64 %or1, %shl1 + %and2 = shl i64 %or1b, 4 + %shl2 = and i64 %and2, -1085102592571150096 + %and2b = lshr i64 %or1b, 4 + %shr2 = and i64 %and2b, 1085102592571150095 + %or2 = or i64 %shr2, %or1b + %or2b = or i64 %or2, %shl2 + %and3 = shl i64 %or2b, 8 + %shl3 = and i64 %and3, -71777214294589696 + %and3b = lshr i64 %or2b, 8 + %shr3 = and i64 %and3b, 71777214294589695 + %or3 = or i64 %shr3, %or2b + %or3b = or i64 %or3, %shl3 + ret i64 %or3b +} + define i32 @gorc16_i32(i32 %a) nounwind { ; RV32I-LABEL: gorc16_i32: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll index b82d520efbfa0..89130bc1dc4ef 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp-intrinsic.ll @@ -482,6 +482,114 @@ define i64 @gorci64(i64 %a) nounwind { ret i64 %tmp } +define i64 @orchi64(i64 %a) nounwind { +; RV64ZBP-LABEL: orchi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc.h a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 
@llvm.riscv.gorc.i64(i64 %a, i64 15) + ret i64 %tmp +} + +define i64 @orc16wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc16wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc16.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 16) + ret i64 %tmp +} + +define i64 @orc8wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc8wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc8.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 24) + ret i64 %tmp +} + +define i64 @orc4wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc4wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc4.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 28) + ret i64 %tmp +} + +define i64 @orc2wi64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc2wi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc2.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 30) + ret i64 %tmp +} + +define i64 @orcwi64(i64 %a) nounwind { +; RV64ZBP-LABEL: orcwi64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc.w a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 31) + ret i64 %tmp +} + +define i64 @orc32i64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc32i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc32 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 32) + ret i64 %tmp +} + +define i64 @orc16i64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc16i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc16 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 48) + ret i64 %tmp +} + +define i64 @orc8i64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc8i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc8 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 56) + ret i64 %tmp +} + +define i64 @orc4i64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc4i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc4 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 60) + ret i64 %tmp +} + 
+define i64 @orc2i64(i64 %a) nounwind { +; RV64ZBP-LABEL: orc2i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc2 a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 62) + ret i64 %tmp +} + +define i64 @orci64(i64 %a) nounwind { +; RV64ZBP-LABEL: orci64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc a0, a0 +; RV64ZBP-NEXT: ret + %tmp = call i64 @llvm.riscv.gorc.i64(i64 %a, i64 63) + ret i64 %tmp +} + declare i64 @llvm.riscv.shfl.i64(i64 %a, i64 %b) define i64 @shfl64(i64 %a, i64 %b) nounwind { diff --git a/llvm/test/CodeGen/RISCV/rv64zbp.ll b/llvm/test/CodeGen/RISCV/rv64zbp.ll index 552991076629e..7210e2d41f686 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbp.ll @@ -613,6 +613,215 @@ define i64 @gorc8_i64(i64 %a) nounwind { ret i64 %or2 } +define signext i32 @gorc12_i32(i32 signext %a) nounwind { +; RV64I-LABEL: gorc12_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slliw a1, a0, 4 +; RV64I-NEXT: lui a2, 986895 +; RV64I-NEXT: addiw a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slliw a1, a0, 8 +; RV64I-NEXT: lui a2, 1044496 +; RV64I-NEXT: addiw a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: lui a3, 4080 +; RV64I-NEXT: addiw a3, a3, 255 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: gorc12_i32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: gorciw a0, a0, 12 +; RV64ZBP-NEXT: ret + %and1 = shl i32 %a, 4 + %shl1 = and i32 %and1, -252645136 + %and1b = lshr i32 %a, 4 + %shr1 = and i32 %and1b, 252645135 + %or1 = or i32 %shr1, %a + %or1b = or i32 %or1, %shl1 + %and2 = shl i32 %or1b, 8 + %shl2 = and i32 %and2, -16711936 + %and2b = lshr i32 %or1b, 8 + %shr2 = and i32 %and2b, 16711935 + %or2 = or i32 %shr2, %or1b + %or2b = or i32 
%or2, %shl2 + ret i32 %or2b +} + +define i64 @gorc12_i64(i64 %a) nounwind { +; RV64I-LABEL: gorc12_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, %hi(.LCPI17_0) +; RV64I-NEXT: ld a1, %lo(.LCPI17_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI17_1) +; RV64I-NEXT: ld a2, %lo(.LCPI17_1)(a2) +; RV64I-NEXT: slli a3, a0, 4 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a3, a0, 4 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, %hi(.LCPI17_2) +; RV64I-NEXT: ld a1, %lo(.LCPI17_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI17_3) +; RV64I-NEXT: ld a2, %lo(.LCPI17_3)(a2) +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: gorc12_i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc4.h a0, a0 +; RV64ZBP-NEXT: ret + %and1 = shl i64 %a, 4 + %shl1 = and i64 %and1, -1085102592571150096 + %and1b = lshr i64 %a, 4 + %shr1 = and i64 %and1b, 1085102592571150095 + %or1 = or i64 %shr1, %a + %or1b = or i64 %or1, %shl1 + %and2 = shl i64 %or1b, 8 + %shl2 = and i64 %and2, -71777214294589696 + %and2b = lshr i64 %or1b, 8 + %shr2 = and i64 %and2b, 71777214294589695 + %or2 = or i64 %shr2, %or1b + %or2b = or i64 %or2, %shl2 + ret i64 %or2b +} + +define signext i32 @gorc14_i32(i32 signext %a) nounwind { +; RV64I-LABEL: gorc14_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slliw a1, a0, 2 +; RV64I-NEXT: lui a2, 838861 +; RV64I-NEXT: addiw a2, a2, -820 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slliw a1, a0, 4 +; RV64I-NEXT: lui a2, 986895 +; RV64I-NEXT: addiw a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: and a2, a2, a3 
+; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: lui a2, 1044496 +; RV64I-NEXT: addiw a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: lui a3, 4080 +; RV64I-NEXT: addiw a3, a3, 255 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: gorc14_i32: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: gorciw a0, a0, 14 +; RV64ZBP-NEXT: ret + %and1 = shl i32 %a, 2 + %shl1 = and i32 %and1, -858993460 + %and1b = lshr i32 %a, 2 + %shr1 = and i32 %and1b, 858993459 + %or1 = or i32 %shr1, %a + %or1b = or i32 %or1, %shl1 + %and2 = shl i32 %or1b, 4 + %shl2 = and i32 %and2, -252645136 + %and2b = lshr i32 %or1b, 4 + %shr2 = and i32 %and2b, 252645135 + %or2 = or i32 %shr2, %or1b + %or2b = or i32 %or2, %shl2 + %and3 = shl i32 %or2b, 8 + %shl3 = and i32 %and3, -16711936 + %and3b = lshr i32 %or2b, 8 + %shr3 = and i32 %and3b, 16711935 + %or3 = or i32 %shr3, %or2b + %or3b = or i32 %or3, %shl3 + ret i32 %or3b +} + +define i64 @gorc14_i64(i64 %a) nounwind { +; RV64I-LABEL: gorc14_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, %hi(.LCPI19_0) +; RV64I-NEXT: ld a1, %lo(.LCPI19_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI19_1) +; RV64I-NEXT: ld a2, %lo(.LCPI19_1)(a2) +; RV64I-NEXT: slli a3, a0, 2 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a3, a0, 2 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, %hi(.LCPI19_2) +; RV64I-NEXT: ld a1, %lo(.LCPI19_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI19_3) +; RV64I-NEXT: ld a2, %lo(.LCPI19_3)(a2) +; RV64I-NEXT: slli a3, a0, 4 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a3, a0, 4 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, %hi(.LCPI19_4) +; RV64I-NEXT: ld a1, %lo(.LCPI19_4)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI19_5) +; RV64I-NEXT: ld a2, 
%lo(.LCPI19_5)(a2) +; RV64I-NEXT: slli a3, a0, 8 +; RV64I-NEXT: and a1, a3, a1 +; RV64I-NEXT: srli a3, a0, 8 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBP-LABEL: gorc14_i64: +; RV64ZBP: # %bb.0: +; RV64ZBP-NEXT: orc2.h a0, a0 +; RV64ZBP-NEXT: ret + %and1 = shl i64 %a, 2 + %shl1 = and i64 %and1, -3689348814741910324 + %and1b = lshr i64 %a, 2 + %shr1 = and i64 %and1b, 3689348814741910323 + %or1 = or i64 %shr1, %a + %or1b = or i64 %or1, %shl1 + %and2 = shl i64 %or1b, 4 + %shl2 = and i64 %and2, -1085102592571150096 + %and2b = lshr i64 %or1b, 4 + %shr2 = and i64 %and2b, 1085102592571150095 + %or2 = or i64 %shr2, %or1b + %or2b = or i64 %or2, %shl2 + %and3 = shl i64 %or2b, 8 + %shl3 = and i64 %and3, -71777214294589696 + %and3b = lshr i64 %or2b, 8 + %shr3 = and i64 %and3b, 71777214294589695 + %or3 = or i64 %shr3, %or2b + %or3b = or i64 %or3, %shl3 + ret i64 %or3b +} + define signext i32 @gorc16_i32(i32 signext %a) nounwind { ; RV64I-LABEL: gorc16_i32: ; RV64I: # %bb.0: @@ -765,10 +974,10 @@ define signext i32 @gorc2b_i32(i32 signext %a) nounwind { define i64 @gorc2b_i64(i64 %a) nounwind { ; RV64I-LABEL: gorc2b_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI22_0) -; RV64I-NEXT: ld a1, %lo(.LCPI22_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI22_1) -; RV64I-NEXT: ld a2, %lo(.LCPI22_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI26_0) +; RV64I-NEXT: ld a1, %lo(.LCPI26_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI26_1) +; RV64I-NEXT: ld a2, %lo(.LCPI26_1)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a3, a3, a1 ; RV64I-NEXT: srli a4, a0, 2 @@ -864,20 +1073,20 @@ define signext i32 @gorc3b_i32(i32 signext %a) nounwind { define i64 @gorc3b_i64(i64 %a) nounwind { ; RV64I-LABEL: gorc3b_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI24_0) -; RV64I-NEXT: ld a1, %lo(.LCPI24_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI24_1) -; RV64I-NEXT: ld a2, %lo(.LCPI24_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI28_0) +; RV64I-NEXT: 
ld a1, %lo(.LCPI28_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI28_1) +; RV64I-NEXT: ld a2, %lo(.LCPI28_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a3, a3, a1 ; RV64I-NEXT: srli a4, a0, 1 ; RV64I-NEXT: and a4, a4, a2 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lui a3, %hi(.LCPI24_2) -; RV64I-NEXT: ld a3, %lo(.LCPI24_2)(a3) -; RV64I-NEXT: lui a4, %hi(.LCPI24_3) -; RV64I-NEXT: ld a4, %lo(.LCPI24_3)(a4) +; RV64I-NEXT: lui a3, %hi(.LCPI28_2) +; RV64I-NEXT: ld a3, %lo(.LCPI28_2)(a3) +; RV64I-NEXT: lui a4, %hi(.LCPI28_3) +; RV64I-NEXT: ld a4, %lo(.LCPI28_3)(a4) ; RV64I-NEXT: slli a5, a0, 2 ; RV64I-NEXT: and a3, a5, a3 ; RV64I-NEXT: srli a5, a0, 2 @@ -982,10 +1191,10 @@ define signext i32 @grev1_i32(i32 signext %a) nounwind { define i64 @grev1_i64(i64 %a) nounwind { ; RV64I-LABEL: grev1_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI28_0) -; RV64I-NEXT: ld a1, %lo(.LCPI28_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI28_1) -; RV64I-NEXT: ld a2, %lo(.LCPI28_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI32_0) +; RV64I-NEXT: ld a1, %lo(.LCPI32_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI32_1) +; RV64I-NEXT: ld a2, %lo(.LCPI32_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 @@ -1034,10 +1243,10 @@ define signext i32 @grev2_i32(i32 signext %a) nounwind { define i64 @grev2_i64(i64 %a) nounwind { ; RV64I-LABEL: grev2_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI30_0) -; RV64I-NEXT: ld a1, %lo(.LCPI30_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI30_1) -; RV64I-NEXT: ld a2, %lo(.LCPI30_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI34_0) +; RV64I-NEXT: ld a1, %lo(.LCPI34_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI34_1) +; RV64I-NEXT: ld a2, %lo(.LCPI34_1)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 2 @@ -1100,19 +1309,19 @@ define signext i32 @grev3_i32(i32 signext %a) nounwind { define i64 @grev3_i64(i64 %a) nounwind { ; RV64I-LABEL: grev3_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: 
lui a1, %hi(.LCPI32_0) -; RV64I-NEXT: ld a1, %lo(.LCPI32_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI32_1) -; RV64I-NEXT: ld a2, %lo(.LCPI32_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI36_0) +; RV64I-NEXT: ld a1, %lo(.LCPI36_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI36_1) +; RV64I-NEXT: ld a2, %lo(.LCPI36_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI32_2) -; RV64I-NEXT: ld a1, %lo(.LCPI32_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI32_3) -; RV64I-NEXT: ld a2, %lo(.LCPI32_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI36_2) +; RV64I-NEXT: ld a1, %lo(.LCPI36_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI36_3) +; RV64I-NEXT: ld a2, %lo(.LCPI36_3)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 2 @@ -1166,10 +1375,10 @@ define signext i32 @grev4_i32(i32 signext %a) nounwind { define i64 @grev4_i64(i64 %a) nounwind { ; RV64I-LABEL: grev4_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI34_0) -; RV64I-NEXT: ld a1, %lo(.LCPI34_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI34_1) -; RV64I-NEXT: ld a2, %lo(.LCPI34_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI38_0) +; RV64I-NEXT: ld a1, %lo(.LCPI38_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI38_1) +; RV64I-NEXT: ld a2, %lo(.LCPI38_1)(a2) ; RV64I-NEXT: slli a3, a0, 4 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 4 @@ -1232,19 +1441,19 @@ define signext i32 @grev5_i32(i32 signext %a) nounwind { define i64 @grev5_i64(i64 %a) nounwind { ; RV64I-LABEL: grev5_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI36_0) -; RV64I-NEXT: ld a1, %lo(.LCPI36_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI36_1) -; RV64I-NEXT: ld a2, %lo(.LCPI36_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI40_0) +; RV64I-NEXT: ld a1, %lo(.LCPI40_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI40_1) +; RV64I-NEXT: ld a2, %lo(.LCPI40_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: 
and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI36_2) -; RV64I-NEXT: ld a1, %lo(.LCPI36_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI36_3) -; RV64I-NEXT: ld a2, %lo(.LCPI36_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI40_2) +; RV64I-NEXT: ld a1, %lo(.LCPI40_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI40_3) +; RV64I-NEXT: ld a2, %lo(.LCPI40_3)(a2) ; RV64I-NEXT: slli a3, a0, 4 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 4 @@ -1313,19 +1522,19 @@ define signext i32 @grev6_i32(i32 signext %a) nounwind { define i64 @grev6_i64(i64 %a) nounwind { ; RV64I-LABEL: grev6_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI38_0) -; RV64I-NEXT: ld a1, %lo(.LCPI38_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI38_1) -; RV64I-NEXT: ld a2, %lo(.LCPI38_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI42_0) +; RV64I-NEXT: ld a1, %lo(.LCPI42_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI42_1) +; RV64I-NEXT: ld a2, %lo(.LCPI42_1)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI38_2) -; RV64I-NEXT: ld a1, %lo(.LCPI38_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI38_3) -; RV64I-NEXT: ld a2, %lo(.LCPI38_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI42_2) +; RV64I-NEXT: ld a1, %lo(.LCPI42_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI42_3) +; RV64I-NEXT: ld a2, %lo(.LCPI42_3)(a2) ; RV64I-NEXT: slli a3, a0, 4 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 4 @@ -1407,28 +1616,28 @@ define signext i32 @grev7_i32(i32 signext %a) nounwind { define i64 @grev7_i64(i64 %a) nounwind { ; RV64I-LABEL: grev7_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI40_0) -; RV64I-NEXT: ld a1, %lo(.LCPI40_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI40_1) -; RV64I-NEXT: ld a2, %lo(.LCPI40_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI44_0) +; RV64I-NEXT: ld a1, %lo(.LCPI44_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI44_1) +; RV64I-NEXT: ld a2, %lo(.LCPI44_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: 
and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI40_2) -; RV64I-NEXT: ld a1, %lo(.LCPI40_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI40_3) -; RV64I-NEXT: ld a2, %lo(.LCPI40_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI44_2) +; RV64I-NEXT: ld a1, %lo(.LCPI44_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI44_3) +; RV64I-NEXT: ld a2, %lo(.LCPI44_3)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI40_4) -; RV64I-NEXT: ld a1, %lo(.LCPI40_4)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI40_5) -; RV64I-NEXT: ld a2, %lo(.LCPI40_5)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI44_4) +; RV64I-NEXT: ld a1, %lo(.LCPI44_4)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI44_5) +; RV64I-NEXT: ld a2, %lo(.LCPI44_5)(a2) ; RV64I-NEXT: slli a3, a0, 4 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 4 @@ -1487,10 +1696,10 @@ define signext i32 @grev8_i32(i32 signext %a) nounwind { define i64 @grev8_i64(i64 %a) nounwind { ; RV64I-LABEL: grev8_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI42_0) -; RV64I-NEXT: ld a1, %lo(.LCPI42_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI42_1) -; RV64I-NEXT: ld a2, %lo(.LCPI42_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI46_0) +; RV64I-NEXT: ld a1, %lo(.LCPI46_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI46_1) +; RV64I-NEXT: ld a2, %lo(.LCPI46_1)(a2) ; RV64I-NEXT: slli a3, a0, 8 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 8 @@ -1553,19 +1762,19 @@ define signext i32 @grev12_i32(i32 signext %a) nounwind { define i64 @grev12_i64(i64 %a) nounwind { ; RV64I-LABEL: grev12_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI44_0) -; RV64I-NEXT: ld a1, %lo(.LCPI44_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI44_1) -; RV64I-NEXT: ld a2, %lo(.LCPI44_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI48_0) +; RV64I-NEXT: ld a1, %lo(.LCPI48_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI48_1) +; RV64I-NEXT: ld a2, 
%lo(.LCPI48_1)(a2) ; RV64I-NEXT: slli a3, a0, 4 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 4 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI44_2) -; RV64I-NEXT: ld a1, %lo(.LCPI44_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI44_3) -; RV64I-NEXT: ld a2, %lo(.LCPI44_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI48_2) +; RV64I-NEXT: ld a1, %lo(.LCPI48_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI48_3) +; RV64I-NEXT: ld a2, %lo(.LCPI48_3)(a2) ; RV64I-NEXT: slli a3, a0, 8 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 8 @@ -1647,28 +1856,28 @@ define signext i32 @grev14_i32(i32 signext %a) nounwind { define i64 @grev14_i64(i64 %a) nounwind { ; RV64I-LABEL: grev14_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI46_0) -; RV64I-NEXT: ld a1, %lo(.LCPI46_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI46_1) -; RV64I-NEXT: ld a2, %lo(.LCPI46_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI50_0) +; RV64I-NEXT: ld a1, %lo(.LCPI50_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI50_1) +; RV64I-NEXT: ld a2, %lo(.LCPI50_1)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI46_2) -; RV64I-NEXT: ld a1, %lo(.LCPI46_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI46_3) -; RV64I-NEXT: ld a2, %lo(.LCPI46_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI50_2) +; RV64I-NEXT: ld a1, %lo(.LCPI50_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI50_3) +; RV64I-NEXT: ld a2, %lo(.LCPI50_3)(a2) ; RV64I-NEXT: slli a3, a0, 4 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 4 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI46_4) -; RV64I-NEXT: ld a1, %lo(.LCPI46_4)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI46_5) -; RV64I-NEXT: ld a2, %lo(.LCPI46_5)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI50_4) +; RV64I-NEXT: ld a1, %lo(.LCPI50_4)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI50_5) +; RV64I-NEXT: ld a2, %lo(.LCPI50_5)(a2) ; RV64I-NEXT: slli a3, a0, 8 
; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 8 @@ -1841,19 +2050,19 @@ define signext i32 @grev3b_i32(i32 signext %a) nounwind { define i64 @grev3b_i64(i64 %a) nounwind { ; RV64I-LABEL: grev3b_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI53_0) -; RV64I-NEXT: ld a1, %lo(.LCPI53_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI53_1) -; RV64I-NEXT: ld a2, %lo(.LCPI53_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI57_0) +; RV64I-NEXT: ld a1, %lo(.LCPI57_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI57_1) +; RV64I-NEXT: ld a2, %lo(.LCPI57_1)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: lui a1, %hi(.LCPI53_2) -; RV64I-NEXT: ld a1, %lo(.LCPI53_2)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI53_3) -; RV64I-NEXT: ld a2, %lo(.LCPI53_3)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI57_2) +; RV64I-NEXT: ld a1, %lo(.LCPI57_2)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI57_3) +; RV64I-NEXT: ld a2, %lo(.LCPI57_3)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a1, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 @@ -1933,19 +2142,19 @@ define signext i32 @grev2b_i32(i32 signext %a) nounwind { define i64 @grev2b_i64(i64 %a) nounwind { ; RV64I-LABEL: grev2b_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI55_0) -; RV64I-NEXT: ld a1, %lo(.LCPI55_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI55_1) -; RV64I-NEXT: ld a2, %lo(.LCPI55_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI59_0) +; RV64I-NEXT: ld a1, %lo(.LCPI59_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI59_1) +; RV64I-NEXT: ld a2, %lo(.LCPI59_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a3, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: lui a3, %hi(.LCPI55_2) -; RV64I-NEXT: ld a3, %lo(.LCPI55_2)(a3) -; RV64I-NEXT: lui a4, %hi(.LCPI55_3) -; RV64I-NEXT: ld a4, %lo(.LCPI55_3)(a4) +; RV64I-NEXT: lui a3, %hi(.LCPI59_2) +; RV64I-NEXT: ld a3, %lo(.LCPI59_2)(a3) +; RV64I-NEXT: lui a4, %hi(.LCPI59_3) +; 
RV64I-NEXT: ld a4, %lo(.LCPI59_3)(a4) ; RV64I-NEXT: slli a5, a0, 2 ; RV64I-NEXT: and a3, a5, a3 ; RV64I-NEXT: srli a0, a0, 2 @@ -2044,19 +2253,19 @@ define signext i32 @grev0_i32(i32 signext %a) nounwind { define i64 @grev0_i64(i64 %a) nounwind { ; RV64I-LABEL: grev0_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI57_0) -; RV64I-NEXT: ld a1, %lo(.LCPI57_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI57_1) -; RV64I-NEXT: ld a2, %lo(.LCPI57_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI61_0) +; RV64I-NEXT: ld a1, %lo(.LCPI61_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI61_1) +; RV64I-NEXT: ld a2, %lo(.LCPI61_1)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a3, a3, a1 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: lui a3, %hi(.LCPI57_2) -; RV64I-NEXT: ld a3, %lo(.LCPI57_2)(a3) -; RV64I-NEXT: lui a4, %hi(.LCPI57_3) -; RV64I-NEXT: ld a4, %lo(.LCPI57_3)(a4) +; RV64I-NEXT: lui a3, %hi(.LCPI61_2) +; RV64I-NEXT: ld a3, %lo(.LCPI61_2)(a3) +; RV64I-NEXT: lui a4, %hi(.LCPI61_3) +; RV64I-NEXT: ld a4, %lo(.LCPI61_3)(a4) ; RV64I-NEXT: slli a5, a0, 2 ; RV64I-NEXT: and a5, a5, a3 ; RV64I-NEXT: srli a0, a0, 2 @@ -2451,22 +2660,22 @@ define i64 @bitreverse_i64(i64 %a) nounwind { ; RV64I-NEXT: and a3, a4, a3 ; RV64I-NEXT: slli a0, a0, 56 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: lui a3, %hi(.LCPI68_0) -; RV64I-NEXT: ld a3, %lo(.LCPI68_0)(a3) +; RV64I-NEXT: lui a3, %hi(.LCPI72_0) +; RV64I-NEXT: ld a3, %lo(.LCPI72_0)(a3) ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: and a1, a1, a3 ; RV64I-NEXT: and a0, a0, a3 -; RV64I-NEXT: lui a2, %hi(.LCPI68_1) -; RV64I-NEXT: ld a2, %lo(.LCPI68_1)(a2) +; RV64I-NEXT: lui a2, %hi(.LCPI72_1) +; RV64I-NEXT: ld a2, %lo(.LCPI72_1)(a2) ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 2 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: lui a2, %hi(.LCPI68_2) -; RV64I-NEXT: ld a2, %lo(.LCPI68_2)(a2) +; 
RV64I-NEXT: lui a2, %hi(.LCPI72_2) +; RV64I-NEXT: ld a2, %lo(.LCPI72_2)(a2) ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srli a1, a0, 1 @@ -2574,20 +2783,20 @@ define i32 @bitreverse_bswap_i32(i32 %a) { define i64 @bitreverse_bswap_i64(i64 %a) { ; RV64I-LABEL: bitreverse_bswap_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI72_0) -; RV64I-NEXT: ld a1, %lo(.LCPI72_0)(a1) +; RV64I-NEXT: lui a1, %hi(.LCPI76_0) +; RV64I-NEXT: ld a1, %lo(.LCPI76_0)(a1) ; RV64I-NEXT: srli a2, a0, 4 ; RV64I-NEXT: and a2, a2, a1 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI72_1) -; RV64I-NEXT: ld a1, %lo(.LCPI72_1)(a1) +; RV64I-NEXT: lui a1, %hi(.LCPI76_1) +; RV64I-NEXT: ld a1, %lo(.LCPI76_1)(a1) ; RV64I-NEXT: slli a0, a0, 4 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: srli a2, a0, 2 ; RV64I-NEXT: and a2, a2, a1 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, %hi(.LCPI72_2) -; RV64I-NEXT: ld a1, %lo(.LCPI72_2)(a1) +; RV64I-NEXT: lui a1, %hi(.LCPI76_2) +; RV64I-NEXT: ld a1, %lo(.LCPI76_2)(a1) ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: srli a2, a0, 1 @@ -2641,14 +2850,14 @@ define signext i32 @shfl1_i32(i32 signext %a, i32 signext %b) nounwind { define i64 @shfl1_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl1_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI74_1) -; RV64I-NEXT: ld a1, %lo(.LCPI74_1)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI74_0) -; RV64I-NEXT: ld a2, %lo(.LCPI74_0)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI78_1) +; RV64I-NEXT: ld a1, %lo(.LCPI78_1)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI78_0) +; RV64I-NEXT: ld a2, %lo(.LCPI78_0)(a2) ; RV64I-NEXT: slli a3, a0, 1 ; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, %hi(.LCPI74_2) -; RV64I-NEXT: ld a3, %lo(.LCPI74_2)(a3) +; RV64I-NEXT: lui a3, %hi(.LCPI78_2) +; RV64I-NEXT: ld a3, %lo(.LCPI78_2)(a3) ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 1 @@ -2705,14 +2914,14 @@ define signext i32 @shfl2_i32(i32 signext %a, i32 
signext %b) nounwind { define i64 @shfl2_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl2_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI76_1) -; RV64I-NEXT: ld a1, %lo(.LCPI76_1)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI76_0) -; RV64I-NEXT: ld a2, %lo(.LCPI76_0)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI80_1) +; RV64I-NEXT: ld a1, %lo(.LCPI80_1)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI80_0) +; RV64I-NEXT: ld a2, %lo(.LCPI80_0)(a2) ; RV64I-NEXT: slli a3, a0, 2 ; RV64I-NEXT: and a1, a3, a1 -; RV64I-NEXT: lui a3, %hi(.LCPI76_2) -; RV64I-NEXT: ld a3, %lo(.LCPI76_2)(a3) +; RV64I-NEXT: lui a3, %hi(.LCPI80_2) +; RV64I-NEXT: ld a3, %lo(.LCPI80_2)(a3) ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 2 @@ -2769,13 +2978,13 @@ define signext i32 @shfl4_i32(i32 signext %a, i32 signext %b) nounwind { define i64 @shfl4_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl4_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a1, %hi(.LCPI78_0) -; RV64I-NEXT: ld a1, %lo(.LCPI78_0)(a1) -; RV64I-NEXT: lui a2, %hi(.LCPI78_1) -; RV64I-NEXT: ld a2, %lo(.LCPI78_1)(a2) +; RV64I-NEXT: lui a1, %hi(.LCPI82_0) +; RV64I-NEXT: ld a1, %lo(.LCPI82_0)(a1) +; RV64I-NEXT: lui a2, %hi(.LCPI82_1) +; RV64I-NEXT: ld a2, %lo(.LCPI82_1)(a2) ; RV64I-NEXT: slli a3, a0, 4 -; RV64I-NEXT: lui a4, %hi(.LCPI78_2) -; RV64I-NEXT: ld a4, %lo(.LCPI78_2)(a4) +; RV64I-NEXT: lui a4, %hi(.LCPI82_2) +; RV64I-NEXT: ld a4, %lo(.LCPI82_2)(a4) ; RV64I-NEXT: and a2, a3, a2 ; RV64I-NEXT: and a1, a0, a1 ; RV64I-NEXT: srli a0, a0, 4 From a2fab82f33bb8cc38cd1dfe7856dae706ce4297a Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 11 Feb 2022 14:09:15 +0100 Subject: [PATCH 585/748] [pseudo] Implement LRTable. This patch introduces a dense implementation of the LR parsing table, which is used by LR parsers. We build a SLR(1) parsing table from the LR(0) graph. 
Statistics of the LR parsing table on the C++ spec grammar: - number of states: 1449 - number of actions: 83069 - size of the table (bytes): 334928 Differential Revision: https://reviews.llvm.org/D118196 --- .../clang/Tooling/Syntax/Pseudo/Grammar.h | 4 +- .../clang/Tooling/Syntax/Pseudo/LRTable.h | 182 ++++++++++++++++++ .../lib/Tooling/Syntax/Pseudo/CMakeLists.txt | 4 +- clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp | 17 ++ .../lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp | 12 -- clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp | 124 ++++++++++++ .../Tooling/Syntax/Pseudo/LRTableBuild.cpp | 143 ++++++++++++++ clang/test/Syntax/check-cxx-bnf.test | 2 +- clang/test/Syntax/lr-build-basic.test | 24 +++ clang/test/Syntax/lr-build-conflicts.test | 47 +++++ clang/tools/clang-pseudo/ClangPseudo.cpp | 43 +++-- .../Tooling/Syntax/Pseudo/CMakeLists.txt | 2 +- .../Tooling/Syntax/Pseudo/LRGraphTest.cpp | 84 -------- .../Tooling/Syntax/Pseudo/LRTableTest.cpp | 56 ++++++ 14 files changed, 631 insertions(+), 113 deletions(-) create mode 100644 clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h create mode 100644 clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp create mode 100644 clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp create mode 100644 clang/test/Syntax/lr-build-basic.test create mode 100644 clang/test/Syntax/lr-build-conflicts.test delete mode 100644 clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp create mode 100644 clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h index a7ecfea902b6d..086809ef41423 100644 --- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h +++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h @@ -154,6 +154,8 @@ std::vector> followSets(const Grammar &); // It can be constructed dynamically (from compiling BNF file) or statically // (a compiled data-source). 
struct GrammarTable { + GrammarTable(); + struct Nonterminal { std::string Name; // Corresponding rules that construct the non-terminal, it is a [start, end) @@ -169,7 +171,7 @@ struct GrammarTable { std::vector Rules; // A table of terminals (aka tokens). It corresponds to the clang::Token. // clang::tok::TokenKind is the index of the table. - std::vector Terminals; + llvm::ArrayRef Terminals; // A table of nonterminals, sorted by name. // SymbolID is the index of the table. std::vector Nonterminals; diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h new file mode 100644 index 0000000000000..025f7f141633a --- /dev/null +++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h @@ -0,0 +1,182 @@ +//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The LRTable (referred as LR parsing table in the LR literature) is the core +// component in LR parsers, it drives the LR parsers by specifying an action to +// take given the current state on the top of the stack and the current +// lookahead token. +// +// The LRTable can be described as a matrix where the rows represent +// the states of the LR graph, the columns represent the symbols of the +// grammar, and each entry of the matrix (called action) represents a +// state transition in the graph. 
+// +// Typically, based on the category of the grammar symbol, the LRTable is +// broken into two logically separate tables: +// - ACTION table with terminals as columns -- e.g ACTION[S, a] specifies +// next action (shift/reduce/accept/error) on state S under a lookahead +// terminal a +// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specify +// the state which we transist to from the state S with the nonterminal X +// +// LRTable is *performance-critial* as it is consulted frequently during a +// parse. In general, LRTable is very sparse (most of the entries are empty). +// For example, for the C++ language, the SLR table has ~1500 states and 650 +// symbols which results in a matrix having 975K entries, ~90% of entries are +// empty. +// +// This file implements a speed-and-space-efficient LRTable. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H +#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H + +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/ArrayRef.h" +#include +#include + +namespace clang { +namespace syntax { +namespace pseudo { + +// Represents the LR parsing table, which can efficiently the question "what is +// the next step given the lookahead token and current state on top of the +// stack?". +// +// This is a dense implementation, which only takes an amount of space that is +// proportional to the number of non-empty entries in the table. +// +// Unlike the typical LR parsing table which allows at most one available action +// per entry, conflicted actions are allowed in LRTable. The LRTable is designed +// to be used in nondeterministic LR parsers (e.g. GLR). +class LRTable { +public: + // StateID is only 13 bits wide. 
+ using StateID = uint16_t; + static constexpr unsigned StateBits = 13; + + // Action represents the terminal and nonterminal actions, it combines the + // entry of the ACTION and GOTO tables from the LR literature. + class Action { + public: + enum Kind : uint8_t { + Sentinel = 0, + // Terminal actions, corresponding to entries of ACTION table. + + // Shift to state n: move forward with the lookahead, and push state n + // onto the state stack. + // A shift is a forward transition, and the value n is the next state that + // the parser is to enter. + Shift, + // Reduce by a rule: pop the state stack. + Reduce, + // Signals that we have parsed the input successfully. + Accept, + + // Nonterminal actions, corresponding to entry of GOTO table. + + // Go to state n: push state n onto the state stack. + // Similar to Shift, but it is a nonterminal forward transition. + GoTo, + }; + + static Action accept(RuleID RID) { return Action(Accept, RID); } + static Action goTo(StateID S) { return Action(GoTo, S); } + static Action shift(StateID S) { return Action(Shift, S); } + static Action reduce(RuleID RID) { return Action(Reduce, RID); } + static Action sentinel() { return Action(Sentinel, 0); } + + StateID getShiftState() const { + assert(kind() == Shift); + return Value; + } + StateID getGoToState() const { + assert(kind() == GoTo); + return Value; + } + RuleID getReduceRule() const { + assert(kind() == Reduce); + return Value; + } + Kind kind() const { return static_cast(K); } + + bool operator==(const Action &L) const { return opaque() == L.opaque(); } + uint16_t opaque() const { return K << ValueBits | Value; }; + + private: + Action(Kind K1, unsigned Value) : K(K1), Value(Value) {} + static constexpr unsigned ValueBits = StateBits; + static constexpr unsigned KindBits = 3; + static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID"); + static_assert(KindBits + ValueBits <= 16, + "Must be able to store kind and value efficiently"); + uint16_t K : 
KindBits; + // Either StateID or RuleID, depending on the Kind. + uint16_t Value : ValueBits; + }; + + // Returns all available actions for the given state on a terminal. + // Expected to be called by LR parsers. + llvm::ArrayRef getActions(StateID State, SymbolID Terminal) const; + // Returns the state after we reduce a nonterminal. + // Expected to be called by LR parsers. + StateID getGoToState(StateID State, SymbolID Nonterminal) const; + + // Looks up available actions. + // Returns empty if no available actions in the table. + llvm::ArrayRef find(StateID State, SymbolID Symbol) const; + + size_t bytes() const { + return sizeof(*this) + Actions.capacity() * sizeof(Action) + + States.capacity() * sizeof(StateID) + + NontermOffset.capacity() * sizeof(uint32_t) + + TerminalOffset.capacity() * sizeof(uint32_t); + } + + std::string dumpStatistics() const; + std::string dumpForTests(const Grammar &G) const; + + // Build a SLR(1) parsing table. + static LRTable buildSLR(const Grammar &G); + + class Builder; + // Represents an entry in the table, used for building the LRTable. + struct Entry { + StateID State; + SymbolID Symbol; + Action Act; + }; + // Build a specifid table for testing purposes. + static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef); + +private: + // Conceptually the LR table is a multimap from (State, SymbolID) => Action. + // Our physical representation is quite different for compactness. + + // Index is nonterminal SymbolID, value is the offset into States/Actions + // where the entries for this nonterminal begin. + // Give a non-terminal id, the corresponding half-open range of StateIdx is + // [NontermIdx[id], NontermIdx[id+1]). + std::vector NontermOffset; + // Similar to NontermOffset, but for terminals, index is tok::TokenKind. + std::vector TerminalOffset; + // Parallel to Actions, the value is State (rows of the matrix). + // Grouped by the SymbolID, and only subranges are sorted. 
+ std::vector States; + // A flat list of available actions, sorted by (SymbolID, State). + std::vector Actions; +}; +llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &); + +} // namespace pseudo +} // namespace syntax +} // namespace clang + +#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt index 43fab1f98a063..8afe7f73f3085 100644 --- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -4,7 +4,9 @@ add_clang_library(clangToolingSyntaxPseudo Grammar.cpp GrammarBNF.cpp LRGraph.cpp - + LRTable.cpp + LRTableBuild.cpp + LINK_LIBS clangBasic clangLex diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp index a2cd51a6c7569..4f1a5111ea73c 100644 --- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp @@ -163,6 +163,23 @@ std::vector> followSets(const Grammar &G) { return FollowSets; } +static llvm::ArrayRef getTerminalNames() { + static const std::vector *TerminalNames = []() { + static std::vector TerminalNames; + TerminalNames.reserve(NumTerminals); + for (unsigned I = 0; I < NumTerminals; ++I) { + tok::TokenKind K = static_cast(I); + if (const auto *Punc = tok::getPunctuatorSpelling(K)) + TerminalNames.push_back(Punc); + else + TerminalNames.push_back(llvm::StringRef(tok::getTokenName(K)).upper()); + } + return &TerminalNames; + }(); + return *TerminalNames; +} +GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {} + } // namespace pseudo } // namespace syntax } // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp index b19bed3449ba9..bc90a9674d9ef 100644 --- a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp +++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp @@ -21,17 +21,6 @@ namespace { static const 
llvm::StringRef OptSuffix = "_opt"; static const llvm::StringRef StartSymbol = "_"; -void initTerminals(std::vector &Out) { - Out.clear(); - Out.reserve(NumTerminals); - for (unsigned I = 0; I < NumTerminals; ++I) { - tok::TokenKind K = static_cast(I); - if (const auto *Punc = tok::getPunctuatorSpelling(K)) - Out.push_back(Punc); - else - Out.push_back(llvm::StringRef(tok::getTokenName(K)).upper()); - } -} // Builds grammar from BNF files. class GrammarBuilder { public: @@ -53,7 +42,6 @@ class GrammarBuilder { "Optional symbols should be eliminated!"); auto T = std::make_unique(); - initTerminals(T->Terminals); // Assemble the name->ID and ID->nonterminal name maps. llvm::DenseSet UniqueNonterminals; diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp new file mode 100644 index 0000000000000..2ecb9b1cd2ce2 --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp @@ -0,0 +1,124 @@ +//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace syntax { +namespace pseudo { + +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) { + switch (A.kind()) { + case LRTable::Action::Shift: + return OS << llvm::formatv("shift state {0}", A.getShiftState()); + case LRTable::Action::Reduce: + return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule()); + case LRTable::Action::GoTo: + return OS << llvm::formatv("go to state {0}", A.getGoToState()); + case LRTable::Action::Accept: + return OS << "acc"; + case LRTable::Action::Sentinel: + llvm_unreachable("unexpected Sentinel action kind!"); + } +} + +std::string LRTable::dumpStatistics() const { + StateID NumOfStates = 0; + for (StateID It : States) + NumOfStates = std::max(It, NumOfStates); + return llvm::formatv(R"( +Statistics of the LR parsing table: + number of states: {0} + number of actions: {1} + size of the table (bytes): {2} +)", + NumOfStates, Actions.size(), bytes()) + .str(); +} + +std::string LRTable::dumpForTests(const Grammar &G) const { + std::string Result; + llvm::raw_string_ostream OS(Result); + StateID MaxState = 0; + for (StateID It : States) + MaxState = std::max(MaxState, It); + OS << "LRTable:\n"; + for (StateID S = 0; S <= MaxState; ++S) { + OS << llvm::formatv("State {0}\n", S); + for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) { + SymbolID TokID = tokenSymbol(static_cast(Terminal)); + for (auto A : find(S, TokID)) { + if (A.kind() == LRTable::Action::Shift) + OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n", + 
G.symbolName(TokID), A.getShiftState()); + else if (A.kind() == LRTable::Action::Reduce) + OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n", + G.symbolName(TokID), A.getReduceRule(), + G.dumpRule(A.getReduceRule())); + else if (A.kind() == LRTable::Action::Accept) + OS.indent(4) << llvm::formatv("'{0}': accept\n", G.symbolName(TokID)); + } + } + for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size(); + ++NontermID) { + if (find(S, NontermID).empty()) + continue; + OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n", + G.symbolName(NontermID), + getGoToState(S, NontermID)); + } + } + return OS.str(); +} + +llvm::ArrayRef LRTable::getActions(StateID State, + SymbolID Terminal) const { + assert(pseudo::isToken(Terminal) && "expect terminal symbol!"); + return find(State, Terminal); +} + +LRTable::StateID LRTable::getGoToState(StateID State, + SymbolID Nonterminal) const { + assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!"); + auto Result = find(State, Nonterminal); + assert(Result.size() == 1 && Result.front().kind() == Action::GoTo); + return Result.front().getGoToState(); +} + +llvm::ArrayRef LRTable::find(StateID Src, SymbolID ID) const { + size_t Idx = isToken(ID) ? symbolToToken(ID) : ID; + assert(isToken(ID) ? Idx + 1 < TerminalOffset.size() + : Idx + 1 < NontermOffset.size()); + std::pair TargetStateRange = + isToken(ID) ? 
std::make_pair(TerminalOffset[Idx], TerminalOffset[Idx + 1]) + : std::make_pair(NontermOffset[Idx], NontermOffset[Idx + 1]); + auto TargetedStates = + llvm::makeArrayRef(States.data() + TargetStateRange.first, + States.data() + TargetStateRange.second); + + assert(llvm::is_sorted(TargetedStates) && + "subrange of the StateIdx should be sorted!"); + const LRTable::StateID *It = llvm::partition_point( + TargetedStates, [&Src](LRTable::StateID S) { return S < Src; }); + if (It == TargetedStates.end()) + return {}; + size_t Start = It - States.data(), End = Start; + while (End < States.size() && States[End] == Src) + ++End; + return llvm::makeArrayRef(&Actions[Start], &Actions[End]); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp new file mode 100644 index 0000000000000..f07d8b106806e --- /dev/null +++ b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp @@ -0,0 +1,143 @@ +//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include + +namespace llvm { +template <> struct DenseMapInfo { + using Entry = clang::syntax::pseudo::LRTable::Entry; + static inline Entry getEmptyKey() { + static Entry E{static_cast(-1), 0, + clang::syntax::pseudo::LRTable::Action::sentinel()}; + return E; + } + static inline Entry getTombstoneKey() { + static Entry E{static_cast(-2), 0, + clang::syntax::pseudo::LRTable::Action::sentinel()}; + return E; + } + static unsigned getHashValue(const Entry &I) { + return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque()); + } + static bool isEqual(const Entry &LHS, const Entry &RHS) { + return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol && + LHS.Act == RHS.Act; + } +}; +} // namespace llvm + +namespace clang { +namespace syntax { +namespace pseudo { + +class LRTable::Builder { +public: + bool insert(Entry E) { return Entries.insert(std::move(E)).second; } + LRTable build(const GrammarTable >) && { + // E.g. given the following parsing table with 3 states and 3 terminals: + // + // a b c + // +-------+----+-------+-+ + // |state0 | | s0,r0 | | + // |state1 | acc| | | + // |state2 | | r1 | | + // +-------+----+-------+-+ + // + // The final LRTable: + // - TerminalOffset: [a] = 0, [b] = 1, [c] = 4, [d] = 4 (d is a sentinel) + // - States: [ 1, 0, 0, 2] + // Actions: [ acc, s0, r0, r1] + // ~~~ corresponding range for terminal a + // ~~~~~~~~~~ corresponding range for terminal b + // First step, we sort all entries by (Symbol, State, Action). 
+ std::vector Sorted(Entries.begin(), Entries.end()); + llvm::sort(Sorted, [](const Entry &L, const Entry &R) { + return std::forward_as_tuple(L.Symbol, L.State, L.Act.opaque()) < + std::forward_as_tuple(R.Symbol, R.State, R.Act.opaque()); + }); + + LRTable Table; + Table.Actions.reserve(Sorted.size()); + Table.States.reserve(Sorted.size()); + // We are good to finalize the States and Actions. + for (const auto &E : Sorted) { + Table.Actions.push_back(E.Act); + Table.States.push_back(E.State); + } + // Initialize the terminal and nonterminal idx, all ranges are empty by + // default. + Table.TerminalOffset = std::vector(GT.Terminals.size() + 1, 0); + Table.NontermOffset = std::vector(GT.Nonterminals.size() + 1, 0); + size_t SortedIndex = 0; + for (SymbolID NonterminalID = 0; NonterminalID < Table.NontermOffset.size(); + ++NonterminalID) { + Table.NontermOffset[NonterminalID] = SortedIndex; + while (SortedIndex < Sorted.size() && + Sorted[SortedIndex].Symbol == NonterminalID) + ++SortedIndex; + } + for (size_t Terminal = 0; Terminal < Table.TerminalOffset.size(); + ++Terminal) { + Table.TerminalOffset[Terminal] = SortedIndex; + while (SortedIndex < Sorted.size() && + Sorted[SortedIndex].Symbol == + tokenSymbol(static_cast(Terminal))) + ++SortedIndex; + } + return Table; + } + +private: + llvm::DenseSet Entries; +}; + +LRTable LRTable::buildForTests(const GrammarTable >, + llvm::ArrayRef Entries) { + Builder Build; + for (const Entry &E : Entries) + Build.insert(E); + return std::move(Build).build(GT); +} + +LRTable LRTable::buildSLR(const Grammar &G) { + Builder Build; + auto Graph = LRGraph::buildLR0(G); + for (const auto &T : Graph.edges()) { + Action Act = isToken(T.Label) ? 
Action::shift(T.Dst) : Action::goTo(T.Dst); + Build.insert({T.Src, T.Label, Act}); + } + assert(Graph.states().size() <= (1 << StateBits) && + "Graph states execceds the maximum limit!"); + auto FollowSets = followSets(G); + for (StateID SID = 0; SID < Graph.states().size(); ++SID) { + for (const Item &I : Graph.states()[SID].Items) { + // If we've just parsed the start symbol, we can accept the input. + if (G.lookupRule(I.rule()).Target == G.startSymbol() && !I.hasNext()) { + Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())}); + continue; + } + if (!I.hasNext()) { + // If we've reached the end of a rule A := ..., then we can reduce if + // the next token is in the follow set of A". + for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) { + assert(isToken(Follow)); + Build.insert({SID, Follow, Action::reduce(I.rule())}); + } + } + } + } + return std::move(Build).build(G.table()); +} + +} // namespace pseudo +} // namespace syntax +} // namespace clang diff --git a/clang/test/Syntax/check-cxx-bnf.test b/clang/test/Syntax/check-cxx-bnf.test index fcc0fa6a1ecc7..e7e7194257629 100644 --- a/clang/test/Syntax/check-cxx-bnf.test +++ b/clang/test/Syntax/check-cxx-bnf.test @@ -1,2 +1,2 @@ // verify clang/lib/Tooling/Syntax/Pseudo/cxx.bnf -// RUN: clang-pseudo -check-grammar=%cxx-bnf-file +// RUN: clang-pseudo -grammar=%cxx-bnf-file diff --git a/clang/test/Syntax/lr-build-basic.test b/clang/test/Syntax/lr-build-basic.test new file mode 100644 index 0000000000000..d6538338991e1 --- /dev/null +++ b/clang/test/Syntax/lr-build-basic.test @@ -0,0 +1,24 @@ +_ := expr +expr := IDENTIFIER + +# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH +# GRAPH: States: +# GRPAH-NEXT: State 0 +# GRPAH-NEXT: _ := • expr +# GRPAH-NEXT: expr := • IDENTIFIER +# GRPAH-NEXT: State 1 +# GRPAH-NEXT: _ := expr • +# GRPAH-NEXT: State 2 +# GRPAH-NEXT: expr := IDENTIFIER • +# GRPAH-NEXT: 0 ->[expr] 1 +# GRPAH-NEXT: 0 ->[IDENTIFIER] 2 + +# RUN: 
clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE +# TABLE: LRTable: +# TABLE-NEXT: State 0 +# TABLE-NEXT: 'IDENTIFIER': shift state 2 +# TABLE-NEXT: 'expr': go to state 1 +# TABLE-NEXT: State 1 +# TABLE-NEXT: 'EOF': accept +# TABLE-NEXT: State 2 +# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER' diff --git a/clang/test/Syntax/lr-build-conflicts.test b/clang/test/Syntax/lr-build-conflicts.test new file mode 100644 index 0000000000000..4292a7184e0f8 --- /dev/null +++ b/clang/test/Syntax/lr-build-conflicts.test @@ -0,0 +1,47 @@ +_ := expr +expr := expr - expr # S/R conflict at state 4 on '-' token +expr := IDENTIFIER + +# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH +# GRAPH: States +# GRAPH-NEXT: State 0 +# GRAPH-NEXT: _ := • expr +# GRAPH-NEXT: expr := • expr - expr +# GRAPH-NEXT: expr := • IDENTIFIER +# GRAPH-NEXT: State 1 +# GRAPH-NEXT: _ := expr • +# GRAPH-NEXT: expr := expr • - expr +# GRAPH-NEXT: State 2 +# GRAPH-NEXT: expr := IDENTIFIER • +# GRAPH-NEXT: State 3 +# GRAPH-NEXT: expr := • expr - expr +# GRAPH-NEXT: expr := expr - • expr +# GRAPH-NEXT: expr := • IDENTIFIER +# GRAPH-NEXT: State 4 +# GRAPH-NEXT: expr := expr - expr • +# GRAPH-NEXT: expr := expr • - expr +# GRAPH-NEXT: 0 ->[expr] 1 +# GRAPH-NEXT: 0 ->[IDENTIFIER] 2 +# GRAPH-NEXT: 1 ->[-] 3 +# GRAPH-NEXT: 3 ->[expr] 4 +# GRAPH-NEXT: 3 ->[IDENTIFIER] 2 +# GRAPH-NEXT: 4 ->[-] 3 + +# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE +# TABLE: LRTable: +# TABLE-NEXT: State 0 +# TABLE-NEXT: 'IDENTIFIER': shift state 2 +# TABLE-NEXT: 'expr': go to state 1 +# TABLE-NEXT: State 1 +# TABLE-NEXT: 'EOF': accept +# TABLE-NEXT: '-': shift state 3 +# TABLE-NEXT: State 2 +# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER' +# TABLE-NEXT: '-': reduce by rule 1 'expr := IDENTIFIER' +# TABLE-NEXT: State 3 +# TABLE-NEXT: 'IDENTIFIER': shift state 2 +# TABLE-NEXT: 'expr': go to state 4 +# TABLE-NEXT: State 4 +# 
TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr' +# TABLE-NEXT: '-': shift state 3 +# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr' diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp index 6fb8f58fa016c..449b9181f3ee0 100644 --- a/clang/tools/clang-pseudo/ClangPseudo.cpp +++ b/clang/tools/clang-pseudo/ClangPseudo.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" @@ -18,30 +20,45 @@ using llvm::cl::init; using llvm::cl::opt; static opt - CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."), - init("")); + Grammar("grammar", desc("Parse and check a BNF grammar file."), init("")); +static opt PrintGraph("print-graph", + desc("Print the LR graph for the grammar")); +static opt PrintTable("print-table", + desc("Print the LR table for the grammar")); + +static std::string readOrDie(llvm::StringRef Path) { + llvm::ErrorOr> Text = + llvm::MemoryBuffer::getFile(Path); + if (std::error_code EC = Text.getError()) { + llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message() + << "\n"; + ::exit(1); + } + return Text.get()->getBuffer().str(); +} int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); - if (CheckGrammar.getNumOccurrences()) { - llvm::ErrorOr> Text = - llvm::MemoryBuffer::getFile(CheckGrammar); - if (std::error_code EC = Text.getError()) { - llvm::errs() << "Error: can't read grammar file '" << CheckGrammar - << "': " << EC.message() << "\n"; - return 1; - } + if (Grammar.getNumOccurrences()) { + std::string Text = readOrDie(Grammar); std::vector Diags; - auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags); + 
auto G = Grammar::parseBNF(Text, Diags); if (!Diags.empty()) { llvm::errs() << llvm::join(Diags, "\n"); return 2; } - llvm::errs() << llvm::formatv("grammar file {0} is parsed successfully\n", - CheckGrammar); + llvm::outs() << llvm::formatv("grammar file {0} is parsed successfully\n", + Grammar); + if (PrintGraph) + llvm::outs() << clang::syntax::pseudo::LRGraph::buildLR0(*G).dumpForTests( + *G); + if (PrintTable) + llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests( + *G); return 0; } + return 0; } diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt index de1e1216c58d6..509e9e4a1598b 100644 --- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt @@ -4,7 +4,7 @@ set(LLVM_LINK_COMPONENTS add_clang_unittest(ClangPseudoTests GrammarTest.cpp - LRGraphTest.cpp + LRTableTest.cpp ) clang_target_link_libraries(ClangPseudoTests diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp deleted file mode 100644 index e7f7e1a7e65d9..0000000000000 --- a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp +++ /dev/null @@ -1,84 +0,0 @@ -//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang/Tooling/Syntax/Pseudo/LRGraph.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include - -namespace clang { -namespace syntax { -namespace pseudo { -namespace { - -TEST(LRGraph, Build) { - struct TestCase { - llvm::StringRef BNF; - llvm::StringRef ExpectedStates; - }; - - TestCase Cases[] = {{ - R"bnf( -_ := expr -expr := IDENTIFIER - )bnf", - R"(States: -State 0 - _ := • expr - expr := • IDENTIFIER -State 1 - _ := expr • -State 2 - expr := IDENTIFIER • -0 ->[expr] 1 -0 ->[IDENTIFIER] 2 -)"}, - {// A grammar with a S/R conflict in SLR table: - // (id-id)-id, or id-(id-id). - R"bnf( -_ := expr -expr := expr - expr # S/R conflict at state 4 on '-' token -expr := IDENTIFIER - )bnf", - R"(States: -State 0 - _ := • expr - expr := • expr - expr - expr := • IDENTIFIER -State 1 - _ := expr • - expr := expr • - expr -State 2 - expr := IDENTIFIER • -State 3 - expr := • expr - expr - expr := expr - • expr - expr := • IDENTIFIER -State 4 - expr := expr - expr • - expr := expr • - expr -0 ->[expr] 1 -0 ->[IDENTIFIER] 2 -1 ->[-] 3 -3 ->[expr] 4 -3 ->[IDENTIFIER] 2 -4 ->[-] 3 -)"}}; - for (const auto &C : Cases) { - std::vector Diags; - auto G = Grammar::parseBNF(C.BNF, Diags); - ASSERT_THAT(Diags, testing::IsEmpty()); - auto LR0 = LRGraph::buildLR0(*G); - EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates); - } -} - -} // namespace -} // namespace pseudo -} // namespace syntax -} // namespace clang diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp new file mode 100644 index 0000000000000..88ac697ce250d --- /dev/null +++ b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp @@ -0,0 +1,56 @@ +//===--- LRTableTest.cpp - ---------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/Syntax/Pseudo/LRTable.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Tooling/Syntax/Pseudo/Grammar.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace clang { +namespace syntax { +namespace pseudo { +namespace { + +using testing::IsEmpty; +using testing::UnorderedElementsAre; +using Action = LRTable::Action; + +TEST(LRTable, Builder) { + GrammarTable GTable; + + // eof semi ... + // +-------+----+-------+--- + // |state0 | | s0,r0 |... + // |state1 | acc| |... + // |state2 | | r1 |... + // +-------+----+-------+--- + std::vector Entries = { + {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)}, + {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)}, + {/* State */ 1, tokenSymbol(tok::eof), Action::accept(2)}, + {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}}; + GrammarTable GT; + LRTable T = LRTable::buildForTests(GT, Entries); + EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty()); + EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)), + UnorderedElementsAre(Action::shift(0), Action::reduce(0))); + EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)), + UnorderedElementsAre(Action::accept(2))); + EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty()); + EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)), + UnorderedElementsAre(Action::reduce(1))); + // Verify the behaivor for other non-available-actions terminals. 
+ EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty()); +} + +} // namespace +} // namespace pseudo +} // namespace syntax +} // namespace clang From ef9a659631112fb714e1f6ab85cb526ac83e22d0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 23 Feb 2022 08:21:48 +0000 Subject: [PATCH 586/748] [gn build] Port a2fab82f33bb --- .../utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn | 2 ++ .../gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn index 297e0ceb0f04b..c6d549ef68aa6 100644 --- a/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/Pseudo/BUILD.gn @@ -10,5 +10,7 @@ static_library("Pseudo") { "Grammar.cpp", "GrammarBNF.cpp", "LRGraph.cpp", + "LRTable.cpp", + "LRTableBuild.cpp", ] } diff --git a/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn index f45d70d4fed37..99a3396428f58 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Tooling/Syntax/Pseudo/BUILD.gn @@ -12,6 +12,6 @@ unittest("ClangPseudoTests") { ] sources = [ "GrammarTest.cpp", - "LRGraphTest.cpp", + "LRTableTest.cpp", ] } From 7ea103de140b59a64fc884fa90afd2213619384d Mon Sep 17 00:00:00 2001 From: Stanislav Gatev Date: Fri, 18 Feb 2022 18:51:42 +0000 Subject: [PATCH 587/748] [clang][dataflow] Add support for global storage values This is part of the implementation of the dataflow analysis framework. See "[RFC] A dataflow analysis framework for Clang AST" on cfe-dev. 
Reviewed-by: ymandel, xazax.hun Differential Revision: https://reviews.llvm.org/D120149 --- .../FlowSensitive/DataflowEnvironment.h | 5 + .../FlowSensitive/DataflowEnvironment.cpp | 46 +++++ clang/lib/Analysis/FlowSensitive/Transfer.cpp | 23 +++ .../Analysis/FlowSensitive/TransferTest.cpp | 167 ++++++++++++++++++ 4 files changed, 241 insertions(+) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index af613c95bb8dc..bab20418a016a 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -49,6 +49,11 @@ enum class SkipPast { }; /// Holds the state of the program (store and heap) at a given program point. +/// +/// WARNING: Symbolic values that are created by the environment for static +/// local and global variables are not currently invalidated on function calls. +/// This is unsound and should be taken into account when designing dataflow +/// analyses. class Environment { public: /// Supplements `Environment` with non-standard comparison and join diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index eca58b313761b..f20c747c56c2d 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/ErrorHandling.h" +#include #include #include @@ -56,10 +57,55 @@ static bool equivalentValues(QualType Type, Value *Val1, Value *Val2, return Model.compareEquivalent(Type, *Val1, *Val2); } +/// Initializes a global storage value. 
+static void initGlobalVar(const VarDecl &D, Environment &Env) { + if (!D.hasGlobalStorage() || + Env.getStorageLocation(D, SkipPast::None) != nullptr) + return; + + auto &Loc = Env.createStorageLocation(D); + Env.setStorageLocation(D, Loc); + if (auto *Val = Env.createValue(D.getType())) + Env.setValue(Loc, *Val); +} + +/// Initializes a global storage value. +static void initGlobalVar(const Decl &D, Environment &Env) { + if (auto *V = dyn_cast(&D)) + initGlobalVar(*V, Env); +} + +/// Initializes global storage values that are declared or referenced from +/// sub-statements of `S`. +// FIXME: Add support for resetting globals after function calls to enable +// the implementation of sound analyses. +static void initGlobalVars(const Stmt &S, Environment &Env) { + for (auto *Child : S.children()) { + if (Child != nullptr) + initGlobalVars(*Child, Env); + } + + if (auto *DS = dyn_cast(&S)) { + if (DS->isSingleDecl()) { + const auto &D = *cast(DS->getSingleDecl()); + initGlobalVar(D, Env); + } else { + for (auto *D : DS->getDeclGroup()) + initGlobalVar(*D, Env); + } + } else if (auto *E = dyn_cast(&S)) { + initGlobalVar(*E->getDecl(), Env); + } else if (auto *E = dyn_cast(&S)) { + initGlobalVar(*E->getMemberDecl(), Env); + } +} + Environment::Environment(DataflowAnalysisContext &DACtx, const DeclContext &DeclCtx) : Environment(DACtx) { if (const auto *FuncDecl = dyn_cast(&DeclCtx)) { + assert(FuncDecl->getBody() != nullptr); + initGlobalVars(*FuncDecl->getBody(), *this); for (const auto *ParamDecl : FuncDecl->parameters()) { assert(ParamDecl != nullptr); auto &ParamLoc = createStorageLocation(*ParamDecl); diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index cd9b8b0e454e4..4b5d23593a4bd 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -136,6 +136,11 @@ class TransferVisitor : public ConstStmtVisitor { // Group decls are converted into single decls 
in the CFG so the cast below // is safe. const auto &D = *cast(S->getSingleDecl()); + + // Static local vars are already initialized in `Environment`. + if (D.hasGlobalStorage()) + return; + auto &Loc = Env.createStorageLocation(D); Env.setStorageLocation(D, Loc); @@ -291,6 +296,24 @@ class TransferVisitor : public ConstStmtVisitor { if (Member->isFunctionOrFunctionTemplate()) return; + if (auto *D = dyn_cast(Member)) { + if (D->hasGlobalStorage()) { + auto *VarDeclLoc = Env.getStorageLocation(*D, SkipPast::None); + if (VarDeclLoc == nullptr) + return; + + if (VarDeclLoc->getType()->isReferenceType()) { + Env.setStorageLocation(*S, *VarDeclLoc); + } else { + auto &Loc = Env.createStorageLocation(*S); + Env.setStorageLocation(*S, Loc); + Env.setValue(Loc, Env.takeOwnership( + std::make_unique(*VarDeclLoc))); + } + return; + } + } + // The receiver can be either a value or a pointer to a value. Skip past the // indirection to handle both cases. auto *BaseLoc = cast_or_null( diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 83ccba1a25382..fda4af435c4a7 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -2153,4 +2153,171 @@ TEST_F(TransferTest, AssignFromBoolNegation) { }); } +TEST_F(TransferTest, StaticIntSingleVarDecl) { + std::string Code = R"( + void target() { + static int Foo; + // [[p]] + } + )"; + runDataflow(Code, + [](llvm::ArrayRef< + std::pair>> + Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo"); + ASSERT_THAT(FooDecl, NotNull()); + + const StorageLocation *FooLoc = + Env.getStorageLocation(*FooDecl, SkipPast::None); + ASSERT_TRUE(isa_and_nonnull(FooLoc)); + + const Value *FooVal = Env.getValue(*FooLoc); + EXPECT_TRUE(isa_and_nonnull(FooVal)); + }); +} 
+ +TEST_F(TransferTest, StaticIntGroupVarDecl) { + std::string Code = R"( + void target() { + static int Foo, Bar; + (void)0; + // [[p]] + } + )"; + runDataflow(Code, + [](llvm::ArrayRef< + std::pair>> + Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo"); + ASSERT_THAT(FooDecl, NotNull()); + + const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); + ASSERT_THAT(BarDecl, NotNull()); + + const StorageLocation *FooLoc = + Env.getStorageLocation(*FooDecl, SkipPast::None); + ASSERT_TRUE(isa_and_nonnull(FooLoc)); + + const StorageLocation *BarLoc = + Env.getStorageLocation(*BarDecl, SkipPast::None); + ASSERT_TRUE(isa_and_nonnull(BarLoc)); + + const Value *FooVal = Env.getValue(*FooLoc); + EXPECT_TRUE(isa_and_nonnull(FooVal)); + + const Value *BarVal = Env.getValue(*BarLoc); + EXPECT_TRUE(isa_and_nonnull(BarVal)); + + EXPECT_NE(FooVal, BarVal); + }); +} + +TEST_F(TransferTest, GlobalIntVarDecl) { + std::string Code = R"( + static int Foo; + + void target() { + int Bar = Foo; + int Baz = Foo; + // [[p]] + } + )"; + runDataflow(Code, + [](llvm::ArrayRef< + std::pair>> + Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); + ASSERT_THAT(BarDecl, NotNull()); + + const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz"); + ASSERT_THAT(BazDecl, NotNull()); + + const Value *BarVal = + cast(Env.getValue(*BarDecl, SkipPast::None)); + const Value *BazVal = + cast(Env.getValue(*BazDecl, SkipPast::None)); + EXPECT_EQ(BarVal, BazVal); + }); +} + +TEST_F(TransferTest, StaticMemberIntVarDecl) { + std::string Code = R"( + struct A { + static int Foo; + }; + + void target(A a) { + int Bar = a.Foo; + int Baz = a.Foo; + // [[p]] + } + )"; + runDataflow(Code, + [](llvm::ArrayRef< + std::pair>> + Results, + 
ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); + ASSERT_THAT(BarDecl, NotNull()); + + const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz"); + ASSERT_THAT(BazDecl, NotNull()); + + const Value *BarVal = + cast(Env.getValue(*BarDecl, SkipPast::None)); + const Value *BazVal = + cast(Env.getValue(*BazDecl, SkipPast::None)); + EXPECT_EQ(BarVal, BazVal); + }); +} + +TEST_F(TransferTest, StaticMemberRefVarDecl) { + std::string Code = R"( + struct A { + static int &Foo; + }; + + void target(A a) { + int Bar = a.Foo; + int Baz = a.Foo; + // [[p]] + } + )"; + runDataflow(Code, + [](llvm::ArrayRef< + std::pair>> + Results, + ASTContext &ASTCtx) { + ASSERT_THAT(Results, ElementsAre(Pair("p", _))); + const Environment &Env = Results[0].second.Env; + + const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar"); + ASSERT_THAT(BarDecl, NotNull()); + + const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz"); + ASSERT_THAT(BazDecl, NotNull()); + + const Value *BarVal = + cast(Env.getValue(*BarDecl, SkipPast::None)); + const Value *BazVal = + cast(Env.getValue(*BazDecl, SkipPast::None)); + EXPECT_EQ(BarVal, BazVal); + }); +} + } // namespace From f85a6a812718cfdaeb1d0dc971ce3875aa82fafe Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Wed, 23 Feb 2022 16:22:55 +0800 Subject: [PATCH 588/748] [NFC] Add unittest for Decl::isInExportDeclContext --- clang/unittests/AST/DeclTest.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/clang/unittests/AST/DeclTest.cpp b/clang/unittests/AST/DeclTest.cpp index 588ef859a181e..a84ebbd9835db 100644 --- a/clang/unittests/AST/DeclTest.cpp +++ b/clang/unittests/AST/DeclTest.cpp @@ -157,3 +157,17 @@ TEST(Decl, EnumDeclRange) { EXPECT_EQ(SM.getFileOffset(BarRange.getBegin()), Code.range().Begin); EXPECT_EQ(SM.getFileOffset(BarRange.getEnd()), Code.range().End); } + +TEST(Decl, IsInExportDeclContext) { + 
llvm::Annotations Code(R"( + export module m; + export template + void f() {})"); + auto AST = + tooling::buildASTFromCodeWithArgs(Code.code(), /*Args=*/{"-std=c++20"}); + ASTContext &Ctx = AST->getASTContext(); + + const auto *f = + selectFirst("f", match(functionDecl().bind("f"), Ctx)); + EXPECT_TRUE(f->isInExportDeclContext()); +} From c34d89818341b3c7c96bc8f59e3f98063d4ae9fd Mon Sep 17 00:00:00 2001 From: Nathan James Date: Wed, 23 Feb 2022 08:33:58 +0000 Subject: [PATCH 589/748] [ASTMatchers] Expand isInline matcher to VarDecl Add support to the `isInline` matcher for C++17's inline variables. Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D118900 --- clang/docs/LibASTMatchersReference.html | 25 +++++++++++++++++-- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/ASTMatchers/ASTMatchers.h | 14 +++++++---- .../ASTMatchers/ASTMatchersInternalTest.cpp | 2 ++ 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index a3f57996a6fb2..d552f4ccd7668 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -4322,7 +4322,7 @@

Narrowing Matchers

Matcher<
FunctionDecl>isInline -
Matches function and namespace declarations that are marked with
+
Matches functions, variables and namespace declarations that are marked with
 the inline keyword.
 
 Given
@@ -4331,8 +4331,10 @@ 

Narrowing Matchers

namespace n { inline namespace m {} } + inline int Foo = 5; functionDecl(isInline()) will match ::f(). namespaceDecl(isInline()) will match n::m. +varDecl(isInline()) will match Foo;
@@ -4697,7 +4699,7 @@

Narrowing Matchers

Matcher<NamespaceDecl>isInline -
Matches function and namespace declarations that are marked with
+
Matches functions, variables and namespace declarations that are marked with
 the inline keyword.
 
 Given
@@ -4706,8 +4708,10 @@ 

Narrowing Matchers

namespace n { inline namespace m {} } + inline int Foo = 5; functionDecl(isInline()) will match ::f(). namespaceDecl(isInline()) will match n::m. +varDecl(isInline()) will match Foo;
@@ -5728,6 +5732,23 @@

Narrowing Matchers

+Matcher<VarDecl>isInline +
Matches functions, variables and namespace declarations that are marked with
+the inline keyword.
+
+Given
+  inline void f();
+  void g();
+  namespace n {
+  inline namespace m {}
+  }
+  inline int Foo = 5;
+functionDecl(isInline()) will match ::f().
+namespaceDecl(isInline()) will match n::m.
+varDecl(isInline()) will match Foo;
+
+ + Matcher<VarDecl>isStaticLocal
Matches a static variable with local scope.
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 68a867409c160..4131c022f5944 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -186,6 +186,8 @@ Build System Changes
 AST Matchers
 ------------
 
+- Expanded ``isInline`` narrowing matcher to support c++17 inline variables.
+
 clang-format
 ------------
 
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 86bd44091b593..6664a5bcfe7fb 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -7673,7 +7673,7 @@ AST_MATCHER_P(FunctionDecl, hasExplicitSpecifier, internal::Matcher,
   return InnerMatcher.matches(*ES.getExpr(), Finder, Builder);
 }
 
-/// Matches function and namespace declarations that are marked with
+/// Matches functions, variables and namespace declarations that are marked with
 /// the inline keyword.
 ///
 /// Given
@@ -7683,18 +7683,22 @@ AST_MATCHER_P(FunctionDecl, hasExplicitSpecifier, internal::Matcher,
 ///   namespace n {
 ///   inline namespace m {}
 ///   }
+///   inline int Foo = 5;
 /// \endcode
 /// functionDecl(isInline()) will match ::f().
 /// namespaceDecl(isInline()) will match n::m.
-AST_POLYMORPHIC_MATCHER(isInline,
-                        AST_POLYMORPHIC_SUPPORTED_TYPES(NamespaceDecl,
-                                                        FunctionDecl)) {
+/// varDecl(isInline()) will match Foo;
+AST_POLYMORPHIC_MATCHER(isInline, AST_POLYMORPHIC_SUPPORTED_TYPES(NamespaceDecl,
+                                                                  FunctionDecl,
+                                                                  VarDecl)) {
   // This is required because the spelling of the function used to determine
   // whether inline is specified or not differs between the polymorphic types.
   if (const auto *FD = dyn_cast(&Node))
     return FD->isInlineSpecified();
-  else if (const auto *NSD = dyn_cast(&Node))
+  if (const auto *NSD = dyn_cast(&Node))
     return NSD->isInline();
+  if (const auto *VD = dyn_cast(&Node))
+    return VD->isInline();
   llvm_unreachable("Not a valid polymorphic type");
 }
 
diff --git a/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
index 6fec0d2e73694..2766065f9e5d1 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersInternalTest.cpp
@@ -197,6 +197,8 @@ TEST(IsInlineMatcher, IsInline) {
                       functionDecl(isInline(), hasName("f"))));
   EXPECT_TRUE(matches("namespace n { inline namespace m {} }",
                       namespaceDecl(isInline(), hasName("m"))));
+  EXPECT_TRUE(matches("inline int Foo = 5;",
+                      varDecl(isInline(), hasName("Foo")), {Lang_CXX17}));
 }
 
 // FIXME: Figure out how to specify paths so the following tests pass on

From 79353f940cf441e69f32af0a78a48baee89e8517 Mon Sep 17 00:00:00 2001
From: Nathan James 
Date: Wed, 23 Feb 2022 08:35:30 +0000
Subject: [PATCH 590/748] [clang-tidy][NFC] Remove Tristate from CachedGlobList

The tristate is a little redundant as we can determine if the item was already in the cache based on the return from try_emplace.

Reviewed By: salman-javed-nz

Differential Revision: https://reviews.llvm.org/D120196
---
 clang-tools-extra/clang-tidy/GlobList.cpp | 16 ++++++----------
 clang-tools-extra/clang-tidy/GlobList.h   |  3 +--
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/GlobList.cpp b/clang-tools-extra/clang-tidy/GlobList.cpp
index fe41feef38abf..a55cac412cf63 100644
--- a/clang-tools-extra/clang-tidy/GlobList.cpp
+++ b/clang-tools-extra/clang-tidy/GlobList.cpp
@@ -65,16 +65,12 @@ bool GlobList::contains(StringRef S) const {
 }
 
 bool CachedGlobList::contains(StringRef S) const {
-  switch (auto &Result = Cache[S]) {
-  case Yes:
-    return true;
-  case No:
-    return false;
-  case None:
-    Result = GlobList::contains(S) ? Yes : No;
-    return Result == Yes;
-  }
-  llvm_unreachable("invalid enum");
+  auto Entry = Cache.try_emplace(S);
+  bool &Value = Entry.first->getValue();
+  // If the entry was just inserted, determine its required value.
+  if (Entry.second)
+    Value = GlobList::contains(S);
+  return Value;
 }
 
 } // namespace tidy
diff --git a/clang-tools-extra/clang-tidy/GlobList.h b/clang-tools-extra/clang-tidy/GlobList.h
index de7020ef3f165..3eec92edaa695 100644
--- a/clang-tools-extra/clang-tidy/GlobList.h
+++ b/clang-tools-extra/clang-tidy/GlobList.h
@@ -59,8 +59,7 @@ class CachedGlobList final : public GlobList {
   bool contains(StringRef S) const override;
 
 private:
-  enum Tristate { None, Yes, No };
-  mutable llvm::StringMap Cache;
+  mutable llvm::StringMap Cache;
 };
 
 } // namespace tidy

From 65dc78d63ee2eb20fbed54401091f08a685ef8c1 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Tue, 22 Feb 2022 17:53:14 +0100
Subject: [PATCH 591/748] [InstCombine] Remove one-use limitation from X-Y==0
 fold

This one-use limitation is artificial; we do not increase the
instruction count if we perform the fold with multiple uses. The
motivating case is shown in @sub_eq_zero_select, where the one-use
limitation causes us to miss a subsequent select fold.

I believe the backend is pretty good about reusing flag-producing
subs for cmps with the same operands, so I think doing this is fine.

Differential Revision: https://reviews.llvm.org/D120337
---
 .../InstCombine/InstCombineCompares.cpp       | 10 ++--
 llvm/test/Transforms/InstCombine/icmp-sub.ll  |  8 +--
 .../InstCombine/prevent-cmp-merge.ll          |  6 +-
 ...ult-of-usub-is-non-zero-and-no-overflow.ll | 56 +++++++++----------
 4 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 13540a77b511c..eab4aa93df48f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2593,6 +2593,11 @@ Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
       !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
     return new ICmpInst(SwappedPred, Y, ConstantInt::get(Ty, SubResult));
 
+  // X - Y == 0 --> X == Y.
+  // X - Y != 0 --> X != Y.
+  if (Cmp.isEquality() && C.isZero())
+    return new ICmpInst(Pred, X, Y);
+
   // The following transforms are only worth it if the only user of the subtract
   // is the icmp.
   // TODO: This is an artificial restriction for all of the transforms below
@@ -2600,11 +2605,6 @@ Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
   if (!Sub->hasOneUse())
     return nullptr;
 
-  // X - Y == 0 --> X == Y.
-  // X - Y != 0 --> X != Y.
-  if (Cmp.isEquality() && C.isZero())
-    return new ICmpInst(Pred, X, Y);
-
   if (Sub->hasNoSignedWrap()) {
     // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
     if (Pred == ICmpInst::ICMP_SGT && C.isAllOnes())
diff --git a/llvm/test/Transforms/InstCombine/icmp-sub.ll b/llvm/test/Transforms/InstCombine/icmp-sub.ll
index 2efa2bbe269ad..0541f2f0f1369 100644
--- a/llvm/test/Transforms/InstCombine/icmp-sub.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-sub.ll
@@ -467,7 +467,7 @@ define i1 @sub_eq_zero_use(i32 %x, i32 %y) {
 ; CHECK-LABEL: @sub_eq_zero_use(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    call void @use(i32 [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[SUB]], 0
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[X]], [[Y]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %sub = sub i32 %x, %y
@@ -480,7 +480,7 @@ define <2 x i1> @sub_ne_zero_use(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @sub_ne_zero_use(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i8> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    call void @use_vec(<2 x i8> [[SUB]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i8> [[SUB]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i8> [[X]], [[Y]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %sub = sub <2 x i8> %x, %y
@@ -493,9 +493,7 @@ define i32 @sub_eq_zero_select(i32 %a, i32 %b, i32* %p) {
 ; CHECK-LABEL: @sub_eq_zero_select(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    store i32 [[SUB]], i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[SUB]], 0
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[B]]
-; CHECK-NEXT:    ret i32 [[SEL]]
+; CHECK-NEXT:    ret i32 [[B]]
 ;
   %sub = sub i32 %a, %b
   store i32 %sub, i32* %p
diff --git a/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll b/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll
index 23dfb956263c4..cd05022b0d35d 100644
--- a/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll
+++ b/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll
@@ -56,7 +56,7 @@ define zeroext i1 @test2(i32 %lhs, i32 %rhs) {
 define zeroext i1 @test3(i32 %lhs, i32 %rhs) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[LHS:%.*]], [[RHS:%.*]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[SUB]], 0
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[LHS]], [[RHS]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SUB]], 31
 ; CHECK-NEXT:    [[SEL:%.*]] = or i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    ret i1 [[SEL]]
@@ -72,9 +72,9 @@ define zeroext i1 @test3(i32 %lhs, i32 %rhs) {
 define zeroext i1 @test3_logical(i32 %lhs, i32 %rhs) {
 ; CHECK-LABEL: @test3_logical(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[LHS:%.*]], [[RHS:%.*]]
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[SUB]], 0
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[LHS]], [[RHS]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SUB]], 31
-; CHECK-NEXT:    [[SEL:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]]
 ; CHECK-NEXT:    ret i1 [[SEL]]
 ;
 
diff --git a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
index d81d26c25a521..300fa0c49c88a 100644
--- a/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/result-of-usub-is-non-zero-and-no-overflow.ll
@@ -22,7 +22,7 @@ define i1 @t0_noncanonical_ignoreme(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -43,7 +43,7 @@ define i1 @t0_noncanonical_ignoreme_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -64,7 +64,7 @@ define i1 @t1(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -85,7 +85,7 @@ define i1 @t1_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -105,7 +105,7 @@ define i1 @t1_strict(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    ret i1 [[NO_UNDERFLOW]]
 ;
@@ -125,7 +125,7 @@ define i1 @t1_strict_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    ret i1 [[NO_UNDERFLOW]]
 ;
@@ -201,7 +201,7 @@ define i1 @t3_commutability0(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -222,7 +222,7 @@ define i1 @t3_commutability0_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -242,7 +242,7 @@ define i1 @t4_commutability1(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -263,7 +263,7 @@ define i1 @t4_commutability1_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -283,7 +283,7 @@ define i1 @t5_commutability2(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -304,7 +304,7 @@ define i1 @t5_commutability2_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp uge i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -382,7 +382,7 @@ define i1 @t7(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -403,7 +403,7 @@ define i1 @t7_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -423,7 +423,7 @@ define i1 @t7_nonstrict(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NULL]])
 ; CHECK-NEXT:    ret i1 [[UNDERFLOW]]
 ;
@@ -443,7 +443,7 @@ define i1 @t7_nonstrict_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NULL]])
 ; CHECK-NEXT:    ret i1 [[UNDERFLOW]]
 ;
@@ -511,7 +511,7 @@ define i1 @t9_commutative(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -532,7 +532,7 @@ define i1 @t9_commutative_logical(i8 %base, i8 %offset) {
 ; CHECK-NEXT:    call void @use8(i8 [[ADJUSTED]])
 ; CHECK-NEXT:    [[UNDERFLOW:%.*]] = icmp ult i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[UNDERFLOW]])
-; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NULL:%.*]] = icmp eq i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i8 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -556,7 +556,7 @@ define i1 @t10(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -580,7 +580,7 @@ define i1 @t10_logical(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -603,7 +603,7 @@ define i1 @t11_commutative(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -627,7 +627,7 @@ define i1 @t11_commutative_logical(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ule i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -651,7 +651,7 @@ define i1 @t12(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -675,7 +675,7 @@ define i1 @t12_logical(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -698,7 +698,7 @@ define i1 @t13(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -722,7 +722,7 @@ define i1 @t13_logical(i64 %base, i64* nonnull %offsetptr) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ugt i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp eq i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[OFFSET]], [[BASE]]
 ; CHECK-NEXT:    ret i1 [[TMP1]]
@@ -745,7 +745,7 @@ define i1 @t14_bad(i64 %base, i64 %offset) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ult i64 [[ADJUSTED]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -766,7 +766,7 @@ define i1 @t14_bad_logical(i64 %base, i64 %offset) {
 ; CHECK-NEXT:    call void @use64(i64 [[ADJUSTED]])
 ; CHECK-NEXT:    [[NO_UNDERFLOW:%.*]] = icmp ult i64 [[ADJUSTED]], [[BASE]]
 ; CHECK-NEXT:    call void @use1(i1 [[NO_UNDERFLOW]])
-; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[ADJUSTED]], 0
+; CHECK-NEXT:    [[NOT_NULL:%.*]] = icmp ne i64 [[BASE]], [[OFFSET]]
 ; CHECK-NEXT:    call void @use1(i1 [[NOT_NULL]])
 ; CHECK-NEXT:    [[R:%.*]] = and i1 [[NOT_NULL]], [[NO_UNDERFLOW]]
 ; CHECK-NEXT:    ret i1 [[R]]

From a10789d6cda71cd2ea13309fd6daf854d44e0906 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 23 Feb 2022 08:55:54 +0000
Subject: [PATCH 592/748] [ARM] Recognize SSAT and USAT from SMIN/SMAX

We have some recognition of SSAT and USAT from SELECT_CC at the moment.
This extends the matching to SMIN/SMAX which can help catch more cases,
either from min/max being the canonical form in instcombine or from some
expanded nodes like fp_to_si_sat.

Differential Revision: https://reviews.llvm.org/D119819
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp    |  51 +++++++++
 llvm/test/CodeGen/ARM/fpclamptosat.ll      | 117 ++++-----------------
 llvm/test/CodeGen/ARM/sadd_sat.ll          |  24 +----
 llvm/test/CodeGen/ARM/sadd_sat_plus.ll     |  24 +----
 llvm/test/CodeGen/ARM/ssat-unroll-loops.ll |  74 +++++--------
 llvm/test/CodeGen/ARM/ssat.ll              |  76 ++-----------
 llvm/test/CodeGen/ARM/ssub_sat.ll          |  24 +----
 llvm/test/CodeGen/ARM/ssub_sat_plus.ll     |  24 +----
 llvm/test/CodeGen/ARM/usat.ll              |  64 ++---------
 9 files changed, 129 insertions(+), 349 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index c678901bb3280..cdf5caff228e5 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1564,6 +1564,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setTargetDAGCombine(ISD::SRL);
   if (Subtarget->isThumb1Only())
     setTargetDAGCombine(ISD::SHL);
+  // Attempt to lower smin/smax to ssat/usat
+  if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
+      Subtarget->isThumb2()) {
+    setTargetDAGCombine(ISD::SMIN);
+    setTargetDAGCombine(ISD::SMAX);
+  }
 
   setStackPointerRegisterToSaveRestore(ARM::SP);
 
@@ -17557,12 +17563,57 @@ static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
+// constant bounds.
+static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
+                                         const ARMSubtarget *Subtarget) {
+  if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
+      !Subtarget->isThumb2())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDValue Op0 = Op.getOperand(0);
+
+  if (VT != MVT::i32 ||
+      (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
+      !isa<ConstantSDNode>(Op.getOperand(1)) ||
+      !isa<ConstantSDNode>(Op0.getOperand(1)))
+    return SDValue();
+
+  SDValue Min = Op;
+  SDValue Max = Op0;
+  SDValue Input = Op0.getOperand(0);
+  if (Min.getOpcode() == ISD::SMAX)
+    std::swap(Min, Max);
+
+  APInt MinC = Min.getConstantOperandAPInt(1);
+  APInt MaxC = Max.getConstantOperandAPInt(1);
+
+  if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
+      !(MinC + 1).isPowerOf2())
+    return SDValue();
+
+  SDLoc DL(Op);
+  if (MinC == ~MaxC)
+    return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
+                       DAG.getConstant(MinC.countTrailingOnes(), DL, VT));
+  if (MaxC == 0)
+    return DAG.getNode(ARMISD::USAT, DL, VT, Input,
+                       DAG.getConstant(MinC.countTrailingOnes(), DL, VT));
+
+  return SDValue();
+}
+
 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
 /// saturates.
 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
+
+  if (VT == MVT::i32)
+    return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
+
   if (!ST->hasMVEIntegerOps())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll
index 5223ae1286f53..48241424ac6e2 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll
@@ -2718,28 +2718,14 @@ define i16 @stest_f64i16_mm(double %x) {
 ; VFP2-NEXT:    push {r7, lr}
 ; VFP2-NEXT:    vmov r0, r1, d0
 ; VFP2-NEXT:    bl __aeabi_d2iz
-; VFP2-NEXT:    movw r1, #32767
-; VFP2-NEXT:    cmp r0, r1
-; VFP2-NEXT:    it ge
-; VFP2-NEXT:    movge r0, r1
-; VFP2-NEXT:    movw r1, #32768
-; VFP2-NEXT:    movt r1, #65535
-; VFP2-NEXT:    cmn.w r0, #32768
-; VFP2-NEXT:    it le
-; VFP2-NEXT:    movle r0, r1
+; VFP2-NEXT:    ssat r0, #16, r0
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f64i16_mm:
 ; FULL:       @ %bb.0: @ %entry
 ; FULL-NEXT:    vcvt.s32.f64 s0, d0
-; FULL-NEXT:    movw r1, #32767
 ; FULL-NEXT:    vmov r0, s0
-; FULL-NEXT:    cmp r0, r1
-; FULL-NEXT:    csel r0, r0, r1, lt
-; FULL-NEXT:    movw r1, #32768
-; FULL-NEXT:    movt r1, #65535
-; FULL-NEXT:    cmn.w r0, #32768
-; FULL-NEXT:    csel r0, r0, r1, gt
+; FULL-NEXT:    ssat r0, #16, r0
 ; FULL-NEXT:    bx lr
 entry:
   %conv = fptosi double %x to i32
@@ -2820,21 +2806,14 @@ define i16 @ustest_f64i16_mm(double %x) {
 ; VFP2-NEXT:    push {r7, lr}
 ; VFP2-NEXT:    vmov r0, r1, d0
 ; VFP2-NEXT:    bl __aeabi_d2iz
-; VFP2-NEXT:    movw r1, #65535
-; VFP2-NEXT:    cmp r0, r1
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, r0
-; VFP2-NEXT:    bic.w r0, r1, r1, asr #31
+; VFP2-NEXT:    usat r0, #16, r0
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: ustest_f64i16_mm:
 ; FULL:       @ %bb.0: @ %entry
 ; FULL-NEXT:    vcvt.s32.f64 s0, d0
-; FULL-NEXT:    movw r1, #65535
 ; FULL-NEXT:    vmov r0, s0
-; FULL-NEXT:    cmp r0, r1
-; FULL-NEXT:    csel r0, r0, r1, lt
-; FULL-NEXT:    bic.w r0, r0, r0, asr #31
+; FULL-NEXT:    usat r0, #16, r0
 ; FULL-NEXT:    bx lr
 entry:
   %conv = fptosi double %x to i32
@@ -2870,33 +2849,12 @@ define i16 @stest_f32i16_mm(float %x) {
 ; SOFT-NEXT:  .LCPI39_1:
 ; SOFT-NEXT:    .long 4294934528 @ 0xffff8000
 ;
-; VFP2-LABEL: stest_f32i16_mm:
-; VFP2:       @ %bb.0: @ %entry
-; VFP2-NEXT:    vcvt.s32.f32 s0, s0
-; VFP2-NEXT:    movw r1, #32767
-; VFP2-NEXT:    vmov r0, s0
-; VFP2-NEXT:    cmp r0, r1
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, r0
-; VFP2-NEXT:    movw r0, #32768
-; VFP2-NEXT:    cmn.w r1, #32768
-; VFP2-NEXT:    movt r0, #65535
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r0, r1
-; VFP2-NEXT:    bx lr
-;
-; FULL-LABEL: stest_f32i16_mm:
-; FULL:       @ %bb.0: @ %entry
-; FULL-NEXT:    vcvt.s32.f32 s0, s0
-; FULL-NEXT:    movw r1, #32767
-; FULL-NEXT:    vmov r0, s0
-; FULL-NEXT:    cmp r0, r1
-; FULL-NEXT:    csel r0, r0, r1, lt
-; FULL-NEXT:    movw r1, #32768
-; FULL-NEXT:    movt r1, #65535
-; FULL-NEXT:    cmn.w r0, #32768
-; FULL-NEXT:    csel r0, r0, r1, gt
-; FULL-NEXT:    bx lr
+; VFP-LABEL: stest_f32i16_mm:
+; VFP:       @ %bb.0: @ %entry
+; VFP-NEXT:    vcvt.s32.f32 s0, s0
+; VFP-NEXT:    vmov r0, s0
+; VFP-NEXT:    ssat r0, #16, r0
+; VFP-NEXT:    bx lr
 entry:
   %conv = fptosi float %x to i32
   %spec.store.select = call i32 @llvm.smin.i32(i32 %conv, i32 32767)
@@ -2968,26 +2926,12 @@ define i16 @ustest_f32i16_mm(float %x) {
 ; SOFT-NEXT:  .LCPI41_0:
 ; SOFT-NEXT:    .long 65535 @ 0xffff
 ;
-; VFP2-LABEL: ustest_f32i16_mm:
-; VFP2:       @ %bb.0: @ %entry
-; VFP2-NEXT:    vcvt.s32.f32 s0, s0
-; VFP2-NEXT:    movw r1, #65535
-; VFP2-NEXT:    vmov r0, s0
-; VFP2-NEXT:    cmp r0, r1
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, r0
-; VFP2-NEXT:    bic.w r0, r1, r1, asr #31
-; VFP2-NEXT:    bx lr
-;
-; FULL-LABEL: ustest_f32i16_mm:
-; FULL:       @ %bb.0: @ %entry
-; FULL-NEXT:    vcvt.s32.f32 s0, s0
-; FULL-NEXT:    movw r1, #65535
-; FULL-NEXT:    vmov r0, s0
-; FULL-NEXT:    cmp r0, r1
-; FULL-NEXT:    csel r0, r0, r1, lt
-; FULL-NEXT:    bic.w r0, r0, r0, asr #31
-; FULL-NEXT:    bx lr
+; VFP-LABEL: ustest_f32i16_mm:
+; VFP:       @ %bb.0: @ %entry
+; VFP-NEXT:    vcvt.s32.f32 s0, s0
+; VFP-NEXT:    vmov r0, s0
+; VFP-NEXT:    usat r0, #16, r0
+; VFP-NEXT:    bx lr
 entry:
   %conv = fptosi float %x to i32
   %spec.store.select = call i32 @llvm.smin.i32(i32 %conv, i32 65535)
@@ -3031,30 +2975,16 @@ define i16 @stest_f16i16_mm(half %x) {
 ; VFP2-NEXT:    vmov r0, s0
 ; VFP2-NEXT:    bl __aeabi_h2f
 ; VFP2-NEXT:    vmov s0, r0
-; VFP2-NEXT:    movw r1, #32767
 ; VFP2-NEXT:    vcvt.s32.f32 s0, s0
 ; VFP2-NEXT:    vmov r0, s0
-; VFP2-NEXT:    cmp r0, r1
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, r0
-; VFP2-NEXT:    movw r0, #32768
-; VFP2-NEXT:    cmn.w r1, #32768
-; VFP2-NEXT:    movt r0, #65535
-; VFP2-NEXT:    it gt
-; VFP2-NEXT:    movgt r0, r1
+; VFP2-NEXT:    ssat r0, #16, r0
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: stest_f16i16_mm:
 ; FULL:       @ %bb.0: @ %entry
 ; FULL-NEXT:    vcvt.s32.f16 s0, s0
-; FULL-NEXT:    movw r1, #32767
 ; FULL-NEXT:    vmov r0, s0
-; FULL-NEXT:    cmp r0, r1
-; FULL-NEXT:    csel r0, r0, r1, lt
-; FULL-NEXT:    movw r1, #32768
-; FULL-NEXT:    movt r1, #65535
-; FULL-NEXT:    cmn.w r0, #32768
-; FULL-NEXT:    csel r0, r0, r1, gt
+; FULL-NEXT:    ssat r0, #16, r0
 ; FULL-NEXT:    bx lr
 entry:
   %conv = fptosi half %x to i32
@@ -3143,23 +3073,16 @@ define i16 @ustest_f16i16_mm(half %x) {
 ; VFP2-NEXT:    vmov r0, s0
 ; VFP2-NEXT:    bl __aeabi_h2f
 ; VFP2-NEXT:    vmov s0, r0
-; VFP2-NEXT:    movw r1, #65535
 ; VFP2-NEXT:    vcvt.s32.f32 s0, s0
 ; VFP2-NEXT:    vmov r0, s0
-; VFP2-NEXT:    cmp r0, r1
-; VFP2-NEXT:    it lt
-; VFP2-NEXT:    movlt r1, r0
-; VFP2-NEXT:    bic.w r0, r1, r1, asr #31
+; VFP2-NEXT:    usat r0, #16, r0
 ; VFP2-NEXT:    pop {r7, pc}
 ;
 ; FULL-LABEL: ustest_f16i16_mm:
 ; FULL:       @ %bb.0: @ %entry
 ; FULL-NEXT:    vcvt.s32.f16 s0, s0
-; FULL-NEXT:    movw r1, #65535
 ; FULL-NEXT:    vmov r0, s0
-; FULL-NEXT:    cmp r0, r1
-; FULL-NEXT:    csel r0, r0, r1, lt
-; FULL-NEXT:    bic.w r0, r0, r0, asr #31
+; FULL-NEXT:    usat r0, #16, r0
 ; FULL-NEXT:    bx lr
 entry:
   %conv = fptosi half %x to i32
diff --git a/llvm/test/CodeGen/ARM/sadd_sat.ll b/llvm/test/CodeGen/ARM/sadd_sat.ll
index 287e52d5044d6..e0aca8e433805 100644
--- a/llvm/test/CodeGen/ARM/sadd_sat.ll
+++ b/llvm/test/CodeGen/ARM/sadd_sat.ll
@@ -148,15 +148,7 @@ define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind {
 ; CHECK-T2NODSP-LABEL: func16:
 ; CHECK-T2NODSP:       @ %bb.0:
 ; CHECK-T2NODSP-NEXT:    add r0, r1
-; CHECK-T2NODSP-NEXT:    movw r1, #32767
-; CHECK-T2NODSP-NEXT:    cmp r0, r1
-; CHECK-T2NODSP-NEXT:    it lt
-; CHECK-T2NODSP-NEXT:    movlt r1, r0
-; CHECK-T2NODSP-NEXT:    movw r0, #32768
-; CHECK-T2NODSP-NEXT:    cmn.w r1, #32768
-; CHECK-T2NODSP-NEXT:    movt r0, #65535
-; CHECK-T2NODSP-NEXT:    it gt
-; CHECK-T2NODSP-NEXT:    movgt r0, r1
+; CHECK-T2NODSP-NEXT:    ssat r0, #16, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func16:
@@ -219,12 +211,7 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind {
 ; CHECK-T2NODSP-LABEL: func8:
 ; CHECK-T2NODSP:       @ %bb.0:
 ; CHECK-T2NODSP-NEXT:    add r0, r1
-; CHECK-T2NODSP-NEXT:    cmp r0, #127
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #127
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #128
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #127
+; CHECK-T2NODSP-NEXT:    ssat r0, #8, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func8:
@@ -280,12 +267,7 @@ define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind {
 ; CHECK-T2NODSP-LABEL: func3:
 ; CHECK-T2NODSP:       @ %bb.0:
 ; CHECK-T2NODSP-NEXT:    add r0, r1
-; CHECK-T2NODSP-NEXT:    cmp r0, #7
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #7
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #8
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #7
+; CHECK-T2NODSP-NEXT:    ssat r0, #4, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func3:
diff --git a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
index 6a8d9def56638..bbdfa6cea6e47 100644
--- a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll
@@ -151,15 +151,7 @@ define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounw
 ; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
 ; CHECK-T2NODSP-NEXT:    sxth r1, r1
 ; CHECK-T2NODSP-NEXT:    add r0, r1
-; CHECK-T2NODSP-NEXT:    movw r1, #32767
-; CHECK-T2NODSP-NEXT:    cmp r0, r1
-; CHECK-T2NODSP-NEXT:    it lt
-; CHECK-T2NODSP-NEXT:    movlt r1, r0
-; CHECK-T2NODSP-NEXT:    movw r0, #32768
-; CHECK-T2NODSP-NEXT:    movt r0, #65535
-; CHECK-T2NODSP-NEXT:    cmn.w r1, #32768
-; CHECK-T2NODSP-NEXT:    it gt
-; CHECK-T2NODSP-NEXT:    movgt r0, r1
+; CHECK-T2NODSP-NEXT:    ssat r0, #16, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func16:
@@ -205,12 +197,7 @@ define signext i8 @func8(i8 signext %x, i8 signext %y, i8 signext %z) nounwind {
 ; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
 ; CHECK-T2NODSP-NEXT:    sxtb r1, r1
 ; CHECK-T2NODSP-NEXT:    add r0, r1
-; CHECK-T2NODSP-NEXT:    cmp r0, #127
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #127
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #128
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #127
+; CHECK-T2NODSP-NEXT:    ssat r0, #8, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func8:
@@ -257,12 +244,7 @@ define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
 ; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
 ; CHECK-T2NODSP-NEXT:    lsls r1, r1, #28
 ; CHECK-T2NODSP-NEXT:    add.w r0, r0, r1, asr #28
-; CHECK-T2NODSP-NEXT:    cmp r0, #7
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #7
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #8
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #7
+; CHECK-T2NODSP-NEXT:    ssat r0, #4, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func4:
diff --git a/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll b/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll
index 1f7574a8cca98..def54a046bfc0 100644
--- a/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll
+++ b/llvm/test/CodeGen/ARM/ssat-unroll-loops.ll
@@ -125,63 +125,43 @@ while.end:                                        ; preds = %while.body, %while.
 define void @ssat_unroll_minmax(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i16* nocapture writeonly %pDst, i32 %blockSize) {
 ; CHECK-LABEL: ssat_unroll_minmax:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r11, lr}
-; CHECK-NEXT:    push {r4, r5, r11, lr}
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB1_6
+; CHECK-NEXT:    beq .LBB1_5
 ; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT:    movw r12, #32768
-; CHECK-NEXT:    sub lr, r3, #1
+; CHECK-NEXT:    sub r12, r3, #1
 ; CHECK-NEXT:    tst r3, #1
-; CHECK-NEXT:    movt r12, #65535
 ; CHECK-NEXT:    beq .LBB1_3
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.prol.preheader
-; CHECK-NEXT:    ldrsh r3, [r0], #2
-; CHECK-NEXT:    ldrsh r4, [r1], #2
-; CHECK-NEXT:    smulbb r3, r4, r3
-; CHECK-NEXT:    asr r4, r3, #14
-; CHECK-NEXT:    cmn r4, #32768
-; CHECK-NEXT:    mov r4, r12
-; CHECK-NEXT:    asrgt r4, r3, #14
-; CHECK-NEXT:    movw r3, #32767
-; CHECK-NEXT:    cmp r4, r3
-; CHECK-NEXT:    movge r4, r3
-; CHECK-NEXT:    mov r3, lr
-; CHECK-NEXT:    strh r4, [r2], #2
+; CHECK-NEXT:    ldrsh lr, [r0], #2
+; CHECK-NEXT:    ldrsh r3, [r1], #2
+; CHECK-NEXT:    smulbb r3, r3, lr
+; CHECK-NEXT:    ssat r3, #16, r3, asr #14
+; CHECK-NEXT:    strh r3, [r2], #2
+; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:  .LBB1_3: @ %while.body.prol.loopexit
-; CHECK-NEXT:    cmp lr, #0
-; CHECK-NEXT:    beq .LBB1_6
-; CHECK-NEXT:  @ %bb.4: @ %while.body.preheader1
-; CHECK-NEXT:    movw lr, #32767
-; CHECK-NEXT:  .LBB1_5: @ %while.body
+; CHECK-NEXT:    cmp r12, #0
+; CHECK-NEXT:    popeq {r11, pc}
+; CHECK-NEXT:  .LBB1_4: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrsh r4, [r0]
-; CHECK-NEXT:    ldrsh r5, [r1]
-; CHECK-NEXT:    smulbb r4, r5, r4
-; CHECK-NEXT:    asr r5, r4, #14
-; CHECK-NEXT:    cmn r5, #32768
-; CHECK-NEXT:    mov r5, r12
-; CHECK-NEXT:    asrgt r5, r4, #14
-; CHECK-NEXT:    cmp r5, lr
-; CHECK-NEXT:    movge r5, lr
-; CHECK-NEXT:    strh r5, [r2]
-; CHECK-NEXT:    ldrsh r4, [r0, #2]
+; CHECK-NEXT:    ldrsh r12, [r0]
+; CHECK-NEXT:    subs r3, r3, #2
+; CHECK-NEXT:    ldrsh lr, [r1]
+; CHECK-NEXT:    smulbb r12, lr, r12
+; CHECK-NEXT:    ssat r12, #16, r12, asr #14
+; CHECK-NEXT:    strh r12, [r2]
+; CHECK-NEXT:    ldrsh r12, [r0, #2]
 ; CHECK-NEXT:    add r0, r0, #4
-; CHECK-NEXT:    ldrsh r5, [r1, #2]
+; CHECK-NEXT:    ldrsh lr, [r1, #2]
 ; CHECK-NEXT:    add r1, r1, #4
-; CHECK-NEXT:    smulbb r4, r5, r4
-; CHECK-NEXT:    asr r5, r4, #14
-; CHECK-NEXT:    cmn r5, #32768
-; CHECK-NEXT:    mov r5, r12
-; CHECK-NEXT:    asrgt r5, r4, #14
-; CHECK-NEXT:    cmp r5, lr
-; CHECK-NEXT:    movge r5, lr
-; CHECK-NEXT:    subs r3, r3, #2
-; CHECK-NEXT:    strh r5, [r2, #2]
+; CHECK-NEXT:    smulbb r12, lr, r12
+; CHECK-NEXT:    ssat r12, #16, r12, asr #14
+; CHECK-NEXT:    strh r12, [r2, #2]
 ; CHECK-NEXT:    add r2, r2, #4
-; CHECK-NEXT:    bne .LBB1_5
-; CHECK-NEXT:  .LBB1_6: @ %while.end
-; CHECK-NEXT:    pop {r4, r5, r11, pc}
+; CHECK-NEXT:    bne .LBB1_4
+; CHECK-NEXT:  .LBB1_5: @ %while.end
+; CHECK-NEXT:    pop {r11, pc}
 entry:
   %cmp.not7 = icmp eq i32 %blockSize, 0
   br i1 %cmp.not7, label %while.end, label %while.body.preheader
diff --git a/llvm/test/CodeGen/ARM/ssat.ll b/llvm/test/CodeGen/ARM/ssat.ll
index ff16b59489f5b..f792a38987af5 100644
--- a/llvm/test/CodeGen/ARM/ssat.ll
+++ b/llvm/test/CodeGen/ARM/ssat.ll
@@ -669,14 +669,7 @@ define i32 @mm_sat_base_32bit(i32 %x) {
 ;
 ; V6T2-LABEL: mm_sat_base_32bit:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movge r0, r1
-; V6T2-NEXT:    movw r1, #0
-; V6T2-NEXT:    movt r1, #65408
-; V6T2-NEXT:    cmn r0, #8388608
-; V6T2-NEXT:    movle r0, r1
+; V6T2-NEXT:    ssat r0, #24, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607)
@@ -705,13 +698,7 @@ define i16 @mm_sat_base_16bit(i16 %x) {
 ; V6T2-LABEL: mm_sat_base_16bit:
 ; V6T2:       @ %bb.0: @ %entry
 ; V6T2-NEXT:    sxth r0, r0
-; V6T2-NEXT:    movw r1, #2047
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movlt r1, r0
-; V6T2-NEXT:    movw r0, #63488
-; V6T2-NEXT:    movt r0, #65535
-; V6T2-NEXT:    cmn r1, #2048
-; V6T2-NEXT:    movgt r0, r1
+; V6T2-NEXT:    ssat r0, #12, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i16 @llvm.smin.i16(i16 %x, i16 2047)
@@ -734,10 +721,7 @@ define i8 @mm_sat_base_8bit(i8 %x) {
 ; V6T2-LABEL: mm_sat_base_8bit:
 ; V6T2:       @ %bb.0: @ %entry
 ; V6T2-NEXT:    sxtb r0, r0
-; V6T2-NEXT:    cmp r0, #31
-; V6T2-NEXT:    movge r0, #31
-; V6T2-NEXT:    cmn r0, #32
-; V6T2-NEXT:    mvnle r0, #31
+; V6T2-NEXT:    ssat r0, #6, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i8 @llvm.smin.i8(i8 %x, i8 31)
@@ -763,14 +747,7 @@ define i32 @mm_sat_lower_upper_1(i32 %x) {
 ;
 ; V6T2-LABEL: mm_sat_lower_upper_1:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movge r0, r1
-; V6T2-NEXT:    movw r1, #0
-; V6T2-NEXT:    movt r1, #65408
-; V6T2-NEXT:    cmn r0, #8388608
-; V6T2-NEXT:    movle r0, r1
+; V6T2-NEXT:    ssat r0, #24, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607)
@@ -796,14 +773,7 @@ define i32 @mm_sat_lower_upper_2(i32 %x) {
 ;
 ; V6T2-LABEL: mm_sat_lower_upper_2:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movge r0, r1
-; V6T2-NEXT:    movw r1, #0
-; V6T2-NEXT:    movt r1, #65408
-; V6T2-NEXT:    cmn r0, #8388608
-; V6T2-NEXT:    movle r0, r1
+; V6T2-NEXT:    ssat r0, #24, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607)
@@ -829,14 +799,7 @@ define i32 @mm_sat_upper_lower_1(i32 %x) {
 ;
 ; V6T2-LABEL: mm_sat_upper_lower_1:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #0
-; V6T2-NEXT:    cmn r0, #8388608
-; V6T2-NEXT:    movt r1, #65408
-; V6T2-NEXT:    movle r0, r1
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movge r0, r1
+; V6T2-NEXT:    ssat r0, #24, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smax.i32(i32 %x, i32 -8388608)
@@ -862,14 +825,7 @@ define i32 @mm_sat_upper_lower_2(i32 %x) {
 ;
 ; V6T2-LABEL: mm_sat_upper_lower_2:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #0
-; V6T2-NEXT:    cmn r0, #8388608
-; V6T2-NEXT:    movt r1, #65408
-; V6T2-NEXT:    movle r0, r1
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movge r0, r1
+; V6T2-NEXT:    ssat r0, #24, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smax.i32(i32 %x, i32 -8388608)
@@ -895,14 +851,7 @@ define i32 @mm_sat_upper_lower_3(i32 %x) {
 ;
 ; V6T2-LABEL: mm_sat_upper_lower_3:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #0
-; V6T2-NEXT:    cmn r0, #8388608
-; V6T2-NEXT:    movt r1, #65408
-; V6T2-NEXT:    movle r0, r1
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movge r0, r1
+; V6T2-NEXT:    ssat r0, #24, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smax.i32(i32 %x, i32 -8388608)
@@ -928,14 +877,7 @@ define i32 @mm_sat_le_ge(i32 %x) {
 ;
 ; V6T2-LABEL: mm_sat_le_ge:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #0
-; V6T2-NEXT:    cmn r0, #8388608
-; V6T2-NEXT:    movt r1, #65408
-; V6T2-NEXT:    movle r0, r1
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movge r0, r1
+; V6T2-NEXT:    ssat r0, #24, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smax.i32(i32 %x, i32 -8388608)
diff --git a/llvm/test/CodeGen/ARM/ssub_sat.ll b/llvm/test/CodeGen/ARM/ssub_sat.ll
index 30d7a683654a9..1bafba3b49ed7 100644
--- a/llvm/test/CodeGen/ARM/ssub_sat.ll
+++ b/llvm/test/CodeGen/ARM/ssub_sat.ll
@@ -147,15 +147,7 @@ define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind {
 ; CHECK-T2NODSP-LABEL: func16:
 ; CHECK-T2NODSP:       @ %bb.0:
 ; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
-; CHECK-T2NODSP-NEXT:    movw r1, #32767
-; CHECK-T2NODSP-NEXT:    cmp r0, r1
-; CHECK-T2NODSP-NEXT:    it lt
-; CHECK-T2NODSP-NEXT:    movlt r1, r0
-; CHECK-T2NODSP-NEXT:    movw r0, #32768
-; CHECK-T2NODSP-NEXT:    cmn.w r1, #32768
-; CHECK-T2NODSP-NEXT:    movt r0, #65535
-; CHECK-T2NODSP-NEXT:    it gt
-; CHECK-T2NODSP-NEXT:    movgt r0, r1
+; CHECK-T2NODSP-NEXT:    ssat r0, #16, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func16:
@@ -218,12 +210,7 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind {
 ; CHECK-T2NODSP-LABEL: func8:
 ; CHECK-T2NODSP:       @ %bb.0:
 ; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
-; CHECK-T2NODSP-NEXT:    cmp r0, #127
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #127
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #128
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #127
+; CHECK-T2NODSP-NEXT:    ssat r0, #8, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func8:
@@ -279,12 +266,7 @@ define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind {
 ; CHECK-T2NODSP-LABEL: func3:
 ; CHECK-T2NODSP:       @ %bb.0:
 ; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
-; CHECK-T2NODSP-NEXT:    cmp r0, #7
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #7
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #8
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #7
+; CHECK-T2NODSP-NEXT:    ssat r0, #4, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func3:
diff --git a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
index 5bf7b326c5b91..0a2d1f0e7a240 100644
--- a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll
@@ -151,15 +151,7 @@ define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounw
 ; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
 ; CHECK-T2NODSP-NEXT:    sxth r1, r1
 ; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
-; CHECK-T2NODSP-NEXT:    movw r1, #32767
-; CHECK-T2NODSP-NEXT:    cmp r0, r1
-; CHECK-T2NODSP-NEXT:    it lt
-; CHECK-T2NODSP-NEXT:    movlt r1, r0
-; CHECK-T2NODSP-NEXT:    movw r0, #32768
-; CHECK-T2NODSP-NEXT:    movt r0, #65535
-; CHECK-T2NODSP-NEXT:    cmn.w r1, #32768
-; CHECK-T2NODSP-NEXT:    it gt
-; CHECK-T2NODSP-NEXT:    movgt r0, r1
+; CHECK-T2NODSP-NEXT:    ssat r0, #16, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func16:
@@ -205,12 +197,7 @@ define signext i8 @func8(i8 signext %x, i8 signext %y, i8 signext %z) nounwind {
 ; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
 ; CHECK-T2NODSP-NEXT:    sxtb r1, r1
 ; CHECK-T2NODSP-NEXT:    subs r0, r0, r1
-; CHECK-T2NODSP-NEXT:    cmp r0, #127
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #127
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #128
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #127
+; CHECK-T2NODSP-NEXT:    ssat r0, #8, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func8:
@@ -257,12 +244,7 @@ define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
 ; CHECK-T2NODSP-NEXT:    muls r1, r2, r1
 ; CHECK-T2NODSP-NEXT:    lsls r1, r1, #28
 ; CHECK-T2NODSP-NEXT:    sub.w r0, r0, r1, asr #28
-; CHECK-T2NODSP-NEXT:    cmp r0, #7
-; CHECK-T2NODSP-NEXT:    it ge
-; CHECK-T2NODSP-NEXT:    movge r0, #7
-; CHECK-T2NODSP-NEXT:    cmn.w r0, #8
-; CHECK-T2NODSP-NEXT:    it le
-; CHECK-T2NODSP-NEXT:    mvnle r0, #7
+; CHECK-T2NODSP-NEXT:    ssat r0, #4, r0
 ; CHECK-T2NODSP-NEXT:    bx lr
 ;
 ; CHECK-T2DSP-LABEL: func4:
diff --git a/llvm/test/CodeGen/ARM/usat.ll b/llvm/test/CodeGen/ARM/usat.ll
index 077aa9de317d2..024a98dd29346 100644
--- a/llvm/test/CodeGen/ARM/usat.ll
+++ b/llvm/test/CodeGen/ARM/usat.ll
@@ -624,23 +624,12 @@ define i32 @mm_unsigned_sat_base_32bit(i32 %x) {
 ;
 ; V6-LABEL: mm_unsigned_sat_base_32bit:
 ; V6:       @ %bb.0: @ %entry
-; V6-NEXT:    ldr r1, .LCPI15_0
-; V6-NEXT:    cmp r0, r1
-; V6-NEXT:    movlt r1, r0
-; V6-NEXT:    bic r0, r1, r1, asr #31
+; V6-NEXT:    usat r0, #23, r0
 ; V6-NEXT:    bx lr
-; V6-NEXT:    .p2align 2
-; V6-NEXT:  @ %bb.1:
-; V6-NEXT:  .LCPI15_0:
-; V6-NEXT:    .long 8388607 @ 0x7fffff
 ;
 ; V6T2-LABEL: mm_unsigned_sat_base_32bit:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movlt r1, r0
-; V6T2-NEXT:    bic r0, r1, r1, asr #31
+; V6T2-NEXT:    usat r0, #23, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607)
@@ -662,21 +651,14 @@ define i16 @mm_unsigned_sat_base_16bit(i16 %x) {
 ;
 ; V6-LABEL: mm_unsigned_sat_base_16bit:
 ; V6:       @ %bb.0: @ %entry
-; V6-NEXT:    mov r1, #255
 ; V6-NEXT:    sxth r0, r0
-; V6-NEXT:    orr r1, r1, #1792
-; V6-NEXT:    cmp r0, r1
-; V6-NEXT:    movlt r1, r0
-; V6-NEXT:    bic r0, r1, r1, asr #31
+; V6-NEXT:    usat r0, #11, r0
 ; V6-NEXT:    bx lr
 ;
 ; V6T2-LABEL: mm_unsigned_sat_base_16bit:
 ; V6T2:       @ %bb.0: @ %entry
 ; V6T2-NEXT:    sxth r0, r0
-; V6T2-NEXT:    movw r1, #2047
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movlt r1, r0
-; V6T2-NEXT:    bic r0, r1, r1, asr #31
+; V6T2-NEXT:    usat r0, #11, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i16 @llvm.smin.i16(i16 %x, i16 2047)
@@ -698,17 +680,13 @@ define i8 @mm_unsigned_sat_base_8bit(i8 %x) {
 ; V6-LABEL: mm_unsigned_sat_base_8bit:
 ; V6:       @ %bb.0: @ %entry
 ; V6-NEXT:    sxtb r0, r0
-; V6-NEXT:    cmp r0, #31
-; V6-NEXT:    movge r0, #31
-; V6-NEXT:    bic r0, r0, r0, asr #31
+; V6-NEXT:    usat r0, #5, r0
 ; V6-NEXT:    bx lr
 ;
 ; V6T2-LABEL: mm_unsigned_sat_base_8bit:
 ; V6T2:       @ %bb.0: @ %entry
 ; V6T2-NEXT:    sxtb r0, r0
-; V6T2-NEXT:    cmp r0, #31
-; V6T2-NEXT:    movge r0, #31
-; V6T2-NEXT:    bic r0, r0, r0, asr #31
+; V6T2-NEXT:    usat r0, #5, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i8 @llvm.smin.i8(i8 %x, i8 31)
@@ -731,23 +709,12 @@ define i32 @mm_unsigned_sat_lower_upper_1(i32 %x) {
 ;
 ; V6-LABEL: mm_unsigned_sat_lower_upper_1:
 ; V6:       @ %bb.0: @ %entry
-; V6-NEXT:    ldr r1, .LCPI18_0
-; V6-NEXT:    cmp r0, r1
-; V6-NEXT:    movlt r1, r0
-; V6-NEXT:    bic r0, r1, r1, asr #31
+; V6-NEXT:    usat r0, #23, r0
 ; V6-NEXT:    bx lr
-; V6-NEXT:    .p2align 2
-; V6-NEXT:  @ %bb.1:
-; V6-NEXT:  .LCPI18_0:
-; V6-NEXT:    .long 8388607 @ 0x7fffff
 ;
 ; V6T2-LABEL: mm_unsigned_sat_lower_upper_1:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movlt r1, r0
-; V6T2-NEXT:    bic r0, r1, r1, asr #31
+; V6T2-NEXT:    usat r0, #23, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607)
@@ -770,23 +737,12 @@ define i32 @mm_unsigned_sat_lower_upper_2(i32 %x) {
 ;
 ; V6-LABEL: mm_unsigned_sat_lower_upper_2:
 ; V6:       @ %bb.0: @ %entry
-; V6-NEXT:    ldr r1, .LCPI19_0
-; V6-NEXT:    cmp r0, r1
-; V6-NEXT:    movlt r1, r0
-; V6-NEXT:    bic r0, r1, r1, asr #31
+; V6-NEXT:    usat r0, #23, r0
 ; V6-NEXT:    bx lr
-; V6-NEXT:    .p2align 2
-; V6-NEXT:  @ %bb.1:
-; V6-NEXT:  .LCPI19_0:
-; V6-NEXT:    .long 8388607 @ 0x7fffff
 ;
 ; V6T2-LABEL: mm_unsigned_sat_lower_upper_2:
 ; V6T2:       @ %bb.0: @ %entry
-; V6T2-NEXT:    movw r1, #65535
-; V6T2-NEXT:    movt r1, #127
-; V6T2-NEXT:    cmp r0, r1
-; V6T2-NEXT:    movlt r1, r0
-; V6T2-NEXT:    bic r0, r1, r1, asr #31
+; V6T2-NEXT:    usat r0, #23, r0
 ; V6T2-NEXT:    bx lr
 entry:
   %0 = call i32 @llvm.smin.i32(i32 %x, i32 8388607)

From a5bbc6ef99bbc7fcf321326df2889e063ed77004 Mon Sep 17 00:00:00 2001
From: Bill Wendling 
Date: Wed, 23 Feb 2022 01:20:48 -0800
Subject: [PATCH 593/748] [NFC] Remove unnecessary "#include"s from header
 files

---
 llvm/include/llvm/Analysis/SparsePropagation.h        |  1 +
 llvm/include/llvm/Analysis/TargetFolder.h             |  1 +
 llvm/include/llvm/Analysis/ValueLattice.h             |  3 ++-
 llvm/include/llvm/IR/AbstractCallSite.h               |  6 +++---
 llvm/include/llvm/IR/Argument.h                       |  1 -
 llvm/include/llvm/IR/AutoUpgrade.h                    |  3 ++-
 llvm/include/llvm/IR/BasicBlock.h                     |  3 ---
 llvm/include/llvm/IR/CFG.h                            |  1 -
 llvm/include/llvm/IR/ConstantFolder.h                 |  1 -
 llvm/include/llvm/IR/Function.h                       |  2 --
 llvm/include/llvm/IR/InstVisitor.h                    |  1 -
 llvm/include/llvm/IR/InstrTypes.h                     | 10 ++++------
 llvm/include/llvm/IR/Instruction.h                    |  1 -
 llvm/include/llvm/IR/Instructions.h                   | 11 +++++------
 llvm/include/llvm/IR/IntrinsicInst.h                  |  3 ++-
 llvm/lib/Analysis/DomTreeUpdater.cpp                  |  1 +
 llvm/lib/CodeGen/MachineModuleInfo.cpp                |  1 +
 llvm/lib/CodeGen/MachineVerifier.cpp                  |  1 +
 llvm/lib/IR/ReplaceConstant.cpp                       |  1 +
 llvm/lib/Passes/StandardInstrumentations.cpp          |  1 +
 .../lib/Target/AArch64/AArch64MachineFunctionInfo.cpp |  5 +++--
 .../AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp      |  1 +
 llvm/lib/Transforms/IPO/CalledValuePropagation.cpp    |  2 ++
 llvm/lib/Transforms/Utils/CallGraphUpdater.cpp        |  1 +
 llvm/lib/Transforms/Utils/UnifyLoopExits.cpp          |  1 +
 llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp     |  1 +
 llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp   |  1 +
 llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp     |  1 +
 llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp  |  1 +
 .../tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp |  1 +
 30 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Analysis/SparsePropagation.h b/llvm/include/llvm/Analysis/SparsePropagation.h
index 6eb6d5518a41b..428238c5fa0bc 100644
--- a/llvm/include/llvm/Analysis/SparsePropagation.h
+++ b/llvm/include/llvm/Analysis/SparsePropagation.h
@@ -15,6 +15,7 @@
 #define LLVM_ANALYSIS_SPARSEPROPAGATION_H
 
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/Debug.h"
 #include 
diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h
index 1fd16add07d1b..df4267f5fb645 100644
--- a/llvm/include/llvm/Analysis/TargetFolder.h
+++ b/llvm/include/llvm/Analysis/TargetFolder.h
@@ -20,6 +20,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilderFolder.h"
 
 namespace llvm {
diff --git a/llvm/include/llvm/Analysis/ValueLattice.h b/llvm/include/llvm/Analysis/ValueLattice.h
index 1b3b02c084419..bc6b279e9ed52 100644
--- a/llvm/include/llvm/Analysis/ValueLattice.h
+++ b/llvm/include/llvm/Analysis/ValueLattice.h
@@ -9,9 +9,10 @@
 #ifndef LLVM_ANALYSIS_VALUELATTICE_H
 #define LLVM_ANALYSIS_VALUELATTICE_H
 
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Instructions.h"
-//
+
 //===----------------------------------------------------------------------===//
 //                               ValueLatticeElement
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/AbstractCallSite.h b/llvm/include/llvm/IR/AbstractCallSite.h
index 69048554a05cb..50afe016f0d63 100644
--- a/llvm/include/llvm/IR/AbstractCallSite.h
+++ b/llvm/include/llvm/IR/AbstractCallSite.h
@@ -14,17 +14,17 @@
 #ifndef LLVM_IR_ABSTRACTCALLSITE_H
 #define LLVM_IR_ABSTRACTCALLSITE_H
 
-#include "llvm/IR/Argument.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Use.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
 #include 
 
 namespace llvm {
 
+class Argument;
+class Use;
+
 /// AbstractCallSite
 ///
 /// An abstract call site is a wrapper that allows to treat direct,
diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h
index 7cbfa2a7b6cec..3b74853cdafa1 100644
--- a/llvm/include/llvm/IR/Argument.h
+++ b/llvm/include/llvm/IR/Argument.h
@@ -14,7 +14,6 @@
 #define LLVM_IR_ARGUMENT_H
 
 #include "llvm/ADT/Twine.h"
-#include "llvm/ADT/ilist_node.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Value.h"
 
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index b0db790da99d1..bcece46f89e02 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -21,9 +21,10 @@ namespace llvm {
   class Constant;
   class Function;
   class Instruction;
+  class GlobalVariable;
   class MDNode;
   class Module;
-  class GlobalVariable;
+  class StringRef;
   class Type;
   class Value;
 
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 184ddfc01c296..3204ec76b340d 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -22,9 +22,6 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/SymbolTableListTraits.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/CBindingWrapping.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
 #include 
 #include 
 #include 
diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h
index 0ee584f8af7ed..28a8d31a4cc60 100644
--- a/llvm/include/llvm/IR/CFG.h
+++ b/llvm/include/llvm/IR/CFG.h
@@ -25,7 +25,6 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
 #include 
 #include 
 #include 
diff --git a/llvm/include/llvm/IR/ConstantFolder.h b/llvm/include/llvm/IR/ConstantFolder.h
index 28dc63a5886e5..6ec316ea61c4f 100644
--- a/llvm/include/llvm/IR/ConstantFolder.h
+++ b/llvm/include/llvm/IR/ConstantFolder.h
@@ -20,7 +20,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilderFolder.h"
-#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 
 namespace llvm {
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index 1b9843e08b28a..a0aeec5f1af9f 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -32,8 +32,6 @@
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/SymbolTableListTraits.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
 #include 
 #include 
 #include 
diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index 585129904dd4d..89bf234c9de7c 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -15,7 +15,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
 
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index b50a02c5dde52..58f867cb11e18 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -21,22 +21,16 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/OperandTraits.h"
-#include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
 #include 
 #include 
 #include 
@@ -47,6 +41,10 @@
 
 namespace llvm {
 
+class StringRef;
+class Type;
+class Value;
+
 namespace Intrinsic {
 typedef unsigned ID;
 }
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 1937ffd36f7b1..8d0a8363cdfb4 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -24,7 +24,6 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
 #include 
 #include 
 
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 5929cff3b4fb3..e9ea6c6d86c34 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -21,24 +21,18 @@
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/OperandTraits.h"
-#include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
 #include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include 
 #include 
@@ -47,9 +41,14 @@
 
 namespace llvm {
 
+class APFloat;
 class APInt;
+class BasicBlock;
 class ConstantInt;
 class DataLayout;
+class StringRef;
+class Type;
+class Value;
 
 //===----------------------------------------------------------------------===//
 //                                AllocaInst Class
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 05652204c281d..959cbbdb1d403 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -31,7 +31,6 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include 
@@ -39,6 +38,8 @@
 
 namespace llvm {
 
+class Metadata;
+
 /// A wrapper class for inspecting calls to intrinsic functions.
 /// This allows the standard isa/dyncast/cast functionality to work with calls
 /// to intrinsic functions.
diff --git a/llvm/lib/Analysis/DomTreeUpdater.cpp b/llvm/lib/Analysis/DomTreeUpdater.cpp
index 6e299263e66da..6b9376543b1a9 100644
--- a/llvm/lib/Analysis/DomTreeUpdater.cpp
+++ b/llvm/lib/Analysis/DomTreeUpdater.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/GenericDomTree.h"
 #include 
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 31d4fc7d02bf1..a50de50844943 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Instructions.h"
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index c9d3e473062b1..95676530a54ce 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -55,6 +55,7 @@
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index d2f676192e7fc..069da26e63b12 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueMap.h"
 
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 5861c59175dd6..a8d6578793b39 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 6950675c5d536..7d08f073fd81d 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -15,8 +15,9 @@
 
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64InstrInfo.h"
-#include 
-#include 
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
 
 using namespace llvm;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index 3ddfab1b670ec..fb7709d66c761 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPU.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
diff --git a/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp b/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
index 927dceec8865b..ec5d578eb8277 100644
--- a/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
+++ b/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -19,11 +19,13 @@
 #include "llvm/Transforms/IPO/CalledValuePropagation.h"
 #include "llvm/Analysis/SparsePropagation.h"
 #include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "called-value-propagation"
diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
index ac3839f2a4ab5..a5cdb583cef09 100644
--- a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace llvm;
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 0b718ed6136ea..036c29b4ee266 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Transforms/Utils/UnifyLoopExits.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Transforms/Utils.h"
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp
index 04ac47ebc4a76..c9def7ee35547 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp
@@ -14,6 +14,7 @@
 #include "ReduceArguments.h"
 #include "Delta.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Intrinsics.h"
 #include 
 #include 
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp
index c76322b255372..b51891c883457 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp
@@ -14,6 +14,7 @@
 #include "ReduceBasicBlocks.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
index 2a26d4340af83..53f5b7d3fcec2 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
@@ -15,6 +15,7 @@
 #include "ReduceFunctions.h"
 #include "Delta.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include 
 #include 
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp
index f27959fb98af5..692d7c2e9385d 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ReduceInstructions.h"
+#include "llvm/IR/Constants.h"
 
 using namespace llvm;
 
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp
index f1dcf24c6ea72..04099ba3cd3c4 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp
@@ -9,6 +9,7 @@
 #include "ReduceOperandsToArgs.h"
 #include "Delta.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"

From 8182dba0c2981690dda3cb8be6e4ec085bf9b2a1 Mon Sep 17 00:00:00 2001
From: Tobias Hieta 
Date: Tue, 22 Feb 2022 09:52:09 +0100
Subject: [PATCH 594/748] [compiler-rt][builtins] Fix CMake builtin target flag

clang-cl doesn't support `-target <value>`; instead it only supports
`--target=<value>`, so building a RUNTIME configuration for clang-cl
ended up never building builtins. This in turn led to clang-cl
not being able to find the runtime libraries, because we depend
on the compiler_rt.builtins.lib being in the runtime dir for the
Driver to add it as a candidate.

I don't think this should have any downsides, since most of the code
these days is using `--target=` instead of the old syntax.

Reviewed By: mstorsjo

Differential Revision: https://reviews.llvm.org/D120308
---
 compiler-rt/cmake/Modules/BuiltinTests.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/cmake/Modules/BuiltinTests.cmake b/compiler-rt/cmake/Modules/BuiltinTests.cmake
index 904904d545a5b..7d71ca3f71efd 100644
--- a/compiler-rt/cmake/Modules/BuiltinTests.cmake
+++ b/compiler-rt/cmake/Modules/BuiltinTests.cmake
@@ -46,7 +46,7 @@ function(try_compile_only output)
 
   set(TRY_COMPILE_FLAGS "${ARG_FLAGS}")
   if(CMAKE_C_COMPILER_ID MATCHES Clang AND CMAKE_C_COMPILER_TARGET)
-    list(APPEND TRY_COMPILE_FLAGS "-target ${CMAKE_C_COMPILER_TARGET}")
+    list(APPEND TRY_COMPILE_FLAGS "--target=${CMAKE_C_COMPILER_TARGET}")
   endif()
 
   string(REPLACE ";" " " extra_flags "${TRY_COMPILE_FLAGS}")

From c1b9672534cdc798f2a7ba6b7b6be85dea0d8a5a Mon Sep 17 00:00:00 2001
From: Bill Wendling 
Date: Wed, 23 Feb 2022 01:23:21 -0800
Subject: [PATCH 595/748] [NFC] Add #include for constants

---
 llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp
index 023a41c5bfd69..c142729e2c6f4 100644
--- a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp
@@ -10,6 +10,7 @@
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"

From 823b32fbfba6aa354f8bc4908423acec07fb5f85 Mon Sep 17 00:00:00 2001
From: Bill Wendling 
Date: Wed, 23 Feb 2022 01:26:53 -0800
Subject: [PATCH 596/748] [NFC] Add #include for constants

---
 llvm/unittests/Analysis/CGSCCPassManagerTest.cpp | 1 +
 llvm/unittests/Analysis/LoadsTest.cpp            | 1 +
 llvm/unittests/Analysis/PhiValuesTest.cpp        | 1 +
 3 files changed, 3 insertions(+)

diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp
index e0b0aeda5bb28..d0bca9d1004d9 100644
--- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp
+++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp
@@ -10,6 +10,7 @@
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
diff --git a/llvm/unittests/Analysis/LoadsTest.cpp b/llvm/unittests/Analysis/LoadsTest.cpp
index 5570b747a4464..0111cfeefa41a 100644
--- a/llvm/unittests/Analysis/LoadsTest.cpp
+++ b/llvm/unittests/Analysis/LoadsTest.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Analysis/Loads.h"
 #include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
diff --git a/llvm/unittests/Analysis/PhiValuesTest.cpp b/llvm/unittests/Analysis/PhiValuesTest.cpp
index 82c02337ef2c9..a1506515f5430 100644
--- a/llvm/unittests/Analysis/PhiValuesTest.cpp
+++ b/llvm/unittests/Analysis/PhiValuesTest.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Analysis/PhiValues.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"

From eb4c8608115c1c9af0fc8cb5b1e9f2bc960014ef Mon Sep 17 00:00:00 2001
From: serge-sans-paille 
Date: Sun, 20 Feb 2022 00:03:20 +0100
Subject: [PATCH 597/748] Cleanup llvm/DebugInfo/PDB headers

accumulated preprocessed size:
before: 1065515095
after: 1065629059

Discourse thread: https://discourse.llvm.org/t/include-what-you-use-include-cleanup
Differential Revision: https://reviews.llvm.org/D120195
---
 lld/COFF/PDB.cpp                              |  2 ++
 .../Plugins/SymbolFile/PDB/PDBASTParser.cpp   |  2 ++
 .../Plugins/SymbolFile/PDB/SymbolFilePDB.cpp  |  3 +++
 llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h  |  4 ++-
 .../PDB/Native/DbiModuleDescriptorBuilder.h   | 10 +++----
 .../DebugInfo/PDB/Native/DbiStreamBuilder.h   | 22 +++++++---------
 .../DebugInfo/PDB/Native/GSIStreamBuilder.h   | 14 +++++-----
 .../DebugInfo/PDB/Native/InfoStreamBuilder.h  |  8 +++---
 .../DebugInfo/PDB/Native/NativeEnumGlobals.h  |  2 +-
 .../PDB/Native/NativeEnumLineNumbers.h        |  4 ---
 .../DebugInfo/PDB/Native/NativeEnumSymbols.h  |  2 +-
 .../DebugInfo/PDB/Native/NativeEnumTypes.h    |  7 +++--
 .../DebugInfo/PDB/Native/NativeExeSymbol.h    |  5 +++-
 .../PDB/Native/NativeFunctionSymbol.h         |  7 +++--
 .../PDB/Native/NativeInlineSiteSymbol.h       |  5 +++-
 .../DebugInfo/PDB/Native/NativeLineNumber.h   |  4 ++-
 .../DebugInfo/PDB/Native/NativePublicSymbol.h |  5 ++--
 .../llvm/DebugInfo/PDB/Native/NativeSession.h | 12 ++++++---
 .../DebugInfo/PDB/Native/NativeSourceFile.h   |  5 ++--
 .../PDB/Native/NativeSymbolEnumerator.h       |  8 ++++--
 .../DebugInfo/PDB/Native/NativeTypeEnum.h     |  8 +++---
 .../PDB/Native/NativeTypeFunctionSig.h        |  8 +++---
 .../DebugInfo/PDB/Native/NativeTypePointer.h  |  5 ++--
 .../DebugInfo/PDB/Native/NativeTypeTypedef.h  |  8 +++++-
 .../llvm/DebugInfo/PDB/Native/NativeTypeUDT.h |  8 ++++--
 .../DebugInfo/PDB/Native/NativeTypeVTShape.h  |  6 +++--
 .../DebugInfo/PDB/Native/PDBFileBuilder.h     | 14 ++++++----
 .../llvm/DebugInfo/PDB/Native/SymbolCache.h   | 16 ++++++++----
 .../DebugInfo/PDB/Native/TpiStreamBuilder.h   | 15 +++--------
 llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h   |  5 ++--
 .../llvm/DebugInfo/PDB/PDBSymbolAnnotation.h  |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolBlock.h       |  2 --
 .../DebugInfo/PDB/PDBSymbolCompilandDetails.h |  1 -
 .../DebugInfo/PDB/PDBSymbolCompilandEnv.h     |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolCustom.h      |  2 --
 .../llvm/DebugInfo/PDB/PDBSymbolData.h        |  6 ++---
 .../llvm/DebugInfo/PDB/PDBSymbolFunc.h        | 11 +++++---
 .../DebugInfo/PDB/PDBSymbolFuncDebugEnd.h     |  2 --
 .../DebugInfo/PDB/PDBSymbolFuncDebugStart.h   |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolLabel.h       |  1 -
 .../DebugInfo/PDB/PDBSymbolPublicSymbol.h     |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolThunk.h       |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeArray.h   |  1 -
 .../DebugInfo/PDB/PDBSymbolTypeBaseClass.h    |  6 ++---
 .../llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h  |  1 -
 .../DebugInfo/PDB/PDBSymbolTypeDimension.h    |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h    |  8 +++---
 .../llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h  |  1 -
 .../DebugInfo/PDB/PDBSymbolTypeFunctionArg.h  |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypePointer.h |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h     |  9 +++----
 .../llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h  |  1 -
 .../DebugInfo/PDB/PDBSymbolTypeVTableShape.h  |  1 -
 .../llvm/DebugInfo/PDB/PDBSymbolUnknown.h     |  1 -
 .../DebugInfo/PDB/PDBSymbolUsingNamespace.h   |  1 -
 llvm/include/llvm/DebugInfo/PDB/UDTLayout.h   |  1 -
 llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp |  1 -
 llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp  |  1 -
 .../PDB/Native/DbiModuleDescriptorBuilder.cpp | 10 ++++---
 .../DebugInfo/PDB/Native/DbiStreamBuilder.cpp |  1 -
 .../DebugInfo/PDB/Native/GSIStreamBuilder.cpp |  3 ++-
 .../PDB/Native/InfoStreamBuilder.cpp          |  2 --
 .../PDB/Native/NativeCompilandSymbol.cpp      |  2 --
 .../PDB/Native/NativeEnumGlobals.cpp          |  4 ++-
 .../PDB/Native/NativeEnumLineNumbers.cpp      |  7 ++---
 .../PDB/Native/NativeEnumModules.cpp          |  5 +---
 .../PDB/Native/NativeEnumSymbols.cpp          |  4 +--
 .../DebugInfo/PDB/Native/NativeEnumTypes.cpp  | 10 ++++---
 .../DebugInfo/PDB/Native/NativeExeSymbol.cpp  |  6 ++---
 .../PDB/Native/NativeFunctionSymbol.cpp       |  8 ++++--
 .../PDB/Native/NativeInlineSiteSymbol.cpp     |  4 +++
 .../DebugInfo/PDB/Native/NativeLineNumber.cpp |  1 +
 .../PDB/Native/NativePublicSymbol.cpp         |  3 +--
 .../DebugInfo/PDB/Native/NativeRawSymbol.cpp  |  1 -
 .../DebugInfo/PDB/Native/NativeSession.cpp    | 26 ++++++++++++-------
 .../DebugInfo/PDB/Native/NativeSourceFile.cpp |  2 ++
 .../PDB/Native/NativeSymbolEnumerator.cpp     |  2 +-
 .../DebugInfo/PDB/Native/NativeTypeArray.cpp  |  7 ++---
 .../DebugInfo/PDB/Native/NativeTypeEnum.cpp   |  5 ++--
 .../PDB/Native/NativeTypeFunctionSig.cpp      |  3 ++-
 .../PDB/Native/NativeTypePointer.cpp          |  3 ++-
 .../DebugInfo/PDB/Native/NativeTypeUDT.cpp    |  9 ++++---
 .../PDB/Native/NativeTypeVTShape.cpp          |  3 +++
 .../DebugInfo/PDB/Native/PDBFileBuilder.cpp   | 16 ++++++++----
 llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp | 15 ++++++++---
 .../DebugInfo/PDB/Native/TpiStreamBuilder.cpp |  6 +----
 llvm/lib/DebugInfo/PDB/PDB.cpp                |  1 -
 llvm/lib/DebugInfo/PDB/PDBContext.cpp         |  1 +
 llvm/lib/DebugInfo/PDB/PDBExtras.cpp          |  1 -
 llvm/lib/DebugInfo/PDB/PDBSymbol.cpp          |  2 +-
 .../lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp |  2 --
 llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp     |  3 ---
 llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp |  3 ++-
 .../PDB/PDBSymbolCompilandDetails.cpp         |  3 ---
 .../DebugInfo/PDB/PDBSymbolCompilandEnv.cpp   |  4 +--
 llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp    |  3 ---
 llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp      |  3 +--
 llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp       |  3 +--
 llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp      |  1 +
 .../DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp   |  3 ---
 .../DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp |  4 +--
 llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp     |  2 --
 .../DebugInfo/PDB/PDBSymbolPublicSymbol.cpp   |  4 +--
 llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp     |  2 --
 llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp |  2 --
 .../DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp  |  4 +--
 .../DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp    |  2 --
 .../lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp |  3 ---
 .../DebugInfo/PDB/PDBSymbolTypeDimension.cpp  |  3 ---
 llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp  |  5 ++--
 .../lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp |  3 ---
 .../PDB/PDBSymbolTypeFunctionArg.cpp          |  2 --
 .../DebugInfo/PDB/PDBSymbolTypeManaged.cpp    |  3 ---
 .../DebugInfo/PDB/PDBSymbolTypePointer.cpp    |  3 ---
 .../DebugInfo/PDB/PDBSymbolTypeTypedef.cpp    |  2 --
 llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp   |  9 +------
 .../lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp |  2 --
 .../PDB/PDBSymbolTypeVTableShape.cpp          |  4 +--
 llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp   |  3 ---
 .../DebugInfo/PDB/PDBSymbolUsingNamespace.cpp |  3 ---
 llvm/lib/DebugInfo/PDB/UDTLayout.cpp          |  2 ++
 llvm/tools/llvm-pdbutil/LinePrinter.cpp       |  1 +
 .../PrettyClassDefinitionDumper.cpp           |  2 ++
 .../PrettyClassLayoutGraphicalDumper.cpp      |  2 ++
 llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp  |  2 ++
 .../PrettyExternalSymbolDumper.cpp            |  1 +
 .../llvm-pdbutil/PrettyFunctionDumper.cpp     |  3 +++
 llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp  |  3 +++
 .../llvm-pdbutil/PrettyTypedefDumper.cpp      |  2 ++
 .../llvm-pdbutil/PrettyVariableDumper.cpp     |  5 +++-
 llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp      |  3 +++
 .../DebugInfo/PDB/NativeSessionTest.cpp       |  2 ++
 .../DebugInfo/PDB/NativeSymbolReuseTest.cpp   |  3 +++
 136 files changed, 300 insertions(+), 290 deletions(-)

diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp
index dea84eca5b121..2ceb4fb98031c 100644
--- a/lld/COFF/PDB.cpp
+++ b/lld/COFF/PDB.cpp
@@ -18,6 +18,8 @@
 #include "Writer.h"
 #include "lld/Common/Timer.h"
 #include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
 #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
diff --git a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp
index b4085edd0b97e..b466f334af39a 100644
--- a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp
+++ b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp
@@ -23,12 +23,14 @@
 #include "lldb/Symbol/TypeMap.h"
 #include "lldb/Symbol/TypeSystem.h"
 #include "lldb/Utility/LLDBLog.h"
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
index 7f450af703331..16a8ad8a10422 100644
--- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
@@ -28,6 +28,7 @@
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/RegularExpression.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/GenericError.h"
 #include "llvm/DebugInfo/PDB/IPDBDataStream.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
@@ -45,7 +46,9 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 
diff --git a/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h b/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h
index 1a03d42ded922..2ac18a8efaba8 100644
--- a/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h
+++ b/llvm/include/llvm/DebugInfo/MSF/MSFBuilder.h
@@ -11,7 +11,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Error.h"
 #include 
@@ -22,6 +22,8 @@ namespace llvm {
 class FileBufferByteStream;
 namespace msf {
 
+struct MSFLayout;
+
 class MSFBuilder {
 public:
   /// Create a new `MSFBuilder`.
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
index 8a49f46320b07..287f319e01b09 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
@@ -9,13 +9,12 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_DBIMODULEDESCRIPTORBUILDER_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_DBIMODULEDESCRIPTORBUILDER_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
-#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
-#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 #include 
 #include 
@@ -23,9 +22,8 @@
 
 namespace llvm {
 class BinaryStreamWriter;
-
 namespace codeview {
-class DebugSubsectionRecordBuilder;
+class DebugSubsection;
 }
 
 namespace msf {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
index ef441d4330404..2f99aa942a056 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
@@ -10,35 +10,33 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_DBISTREAMBUILDER_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Error.h"
 
 #include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include "llvm/Support/BinaryByteStream.h"
-#include "llvm/Support/BinaryStreamReader.h"
-#include "llvm/Support/Endian.h"
+#include "llvm/Support/BinaryStreamRef.h"
 
 namespace llvm {
+
+class BinaryStreamWriter;
 namespace codeview {
 struct FrameData;
 }
 namespace msf {
 class MSFBuilder;
-}
-namespace object {
-struct coff_section;
-struct FpoData;
+struct MSFLayout;
 }
 namespace pdb {
-class DbiStream;
-struct DbiStreamHeader;
 class DbiModuleDescriptorBuilder;
-class PDBFile;
 
 class DbiStreamBuilder {
 public:
@@ -134,7 +132,7 @@ class DbiStreamBuilder {
   std::vector SectionMap;
   std::array, (int)DbgHeaderType::Max> DbgStreams;
 };
-}
+} // namespace pdb
 }
 
 #endif
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
index 9530a15849d5e..28a72c887f25a 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
@@ -10,18 +10,20 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_GSISTREAMBUILDER_H
 
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
-#include "llvm/Support/BinaryByteStream.h"
-#include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamRef.h"
-#include "llvm/Support/BinaryStreamWriter.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
+namespace codeview {
+class ConstantSym;
+class DataSym;
+class ProcRefSym;
+} // namespace codeview
+template  struct BinaryItemTraits;
 
 template <> struct BinaryItemTraits {
   static size_t length(const codeview::CVSymbol &Item) {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
index 4952173c5873e..7acd336bbe082 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
@@ -12,10 +12,9 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/Error.h"
 
-#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/CodeView/GUID.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 class WritableBinaryStreamRef;
@@ -24,7 +23,6 @@ namespace msf {
 class MSFBuilder;
 }
 namespace pdb {
-class PDBFile;
 class NamedStreamMap;
 
 class InfoStreamBuilder {
@@ -70,7 +68,7 @@ class InfoStreamBuilder {
 
   NamedStreamMap &NamedStreams;
 };
-}
+} // namespace pdb
 }
 
 #endif
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h
index 073878afd1293..c10e652efa8d2 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMGLOBALS_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMGLOBALS_H
 
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h
index 32a4515d557e6..14b28c346bf24 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h
@@ -9,16 +9,12 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMLINENUMBERS_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMLINENUMBERS_H
 
-#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
-#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
-#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
 
 namespace llvm {
 namespace pdb {
-class IPDBLineNumber;
 
 class NativeEnumLineNumbers : public IPDBEnumChildren {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h
index 480b3fb11419d..5fc91675f2092 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h
@@ -9,9 +9,9 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMSYMBOLS_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMSYMBOLS_H
 
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 #include 
 
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h
index 25c56567384fb..2ca000c1c0fe6 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeEnumTypes.h
@@ -9,14 +9,17 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMTYPES_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMTYPES_H
 
-#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 
 #include 
 
 namespace llvm {
+namespace codeview {
+class LazyRandomTypeCollection;
+}
 namespace pdb {
 
 class NativeSession;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
index 280358d02305c..82fdff130c4f7 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
@@ -9,12 +9,15 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEEXESYMBOL_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEEXESYMBOL_H
 
+#include "llvm/DebugInfo/CodeView/GUID.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
 
+class NativeSession;
+
 class DbiStream;
 
 class NativeExeSymbol : public NativeRawSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h
index b219055d21539..c15e22f61077a 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h
@@ -9,14 +9,17 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEFUNCTIONSYMBOL_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEFUNCTIONSYMBOL_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+class raw_ostream;
 namespace pdb {
 
+class NativeSession;
+
 class NativeFunctionSymbol : public NativeRawSymbol {
 public:
   NativeFunctionSymbol(NativeSession &Session, SymIndexId Id,
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h
index 2f6aba038ae8e..b2818a0aadfda 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h
@@ -11,12 +11,15 @@
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
 
+class NativeSession;
+
 class NativeInlineSiteSymbol : public NativeRawSymbol {
 public:
   NativeInlineSiteSymbol(NativeSession &Session, SymIndexId Id,
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h
index be0ddf0a063aa..53f2985833fd9 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeLineNumber.h
@@ -11,10 +11,12 @@
 
 #include "llvm/DebugInfo/CodeView/Line.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
 namespace llvm {
 namespace pdb {
+
+class NativeSession;
+
 class NativeLineNumber : public IPDBLineNumber {
 public:
   explicit NativeLineNumber(const NativeSession &Session,
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h
index 9f410e27f4cbe..43de80507d023 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativePublicSymbol.h
@@ -9,13 +9,14 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEPUBLICSYMBOL_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEPUBLICSYMBOL_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
 namespace llvm {
+
+class raw_ostream;
 namespace pdb {
+class NativeSession;
 
 class NativePublicSymbol : public NativeRawSymbol {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h
index 5f8fc587e5466..95be7d09aae98 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSession.h
@@ -9,13 +9,11 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVESESSION_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVESESSION_H
 
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntervalMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
-#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Error.h"
 
@@ -24,6 +22,12 @@ class MemoryBuffer;
 namespace pdb {
 class PDBFile;
 class NativeExeSymbol;
+class IPDBSourceFile;
+class ModuleDebugStreamRef;
+class PDBSymbol;
+class PDBSymbolCompiland;
+class PDBSymbolExe;
+template  class IPDBEnumChildren;
 
 class NativeSession : public IPDBSession {
   struct PdbSearchOptions {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h
index eb6336f268e80..c6653368bc0ce 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSourceFile.h
@@ -11,11 +11,12 @@
 
 #include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
 #include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
+class PDBSymbolCompiland;
+template  class IPDBEnumChildren;
 class NativeSession;
 
 class NativeSourceFile : public IPDBSourceFile {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h
index d6a3125ee40b7..ab4abc4d3c2cc 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h
@@ -9,12 +9,16 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVESYMBOLENUMERATOR_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVESYMBOLENUMERATOR_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+
+class raw_ostream;
 namespace pdb {
+class NativeSession;
 class NativeTypeEnum;
 
 class NativeSymbolEnumerator : public NativeRawSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h
index 2068c88fc74a0..429c06f29ac7f 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeEnum.h
@@ -10,12 +10,14 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEENUM_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+class raw_ostream;
 namespace pdb {
 
 class NativeTypeBuiltin;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
index 90b5d80689591..47ea722313c36 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeFunctionSig.h
@@ -9,17 +9,15 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEFUNCTIONSIG_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEFUNCTIONSIG_H
 
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
 
-class NativeTypeUDT;
-
 class NativeTypeFunctionSig : public NativeRawSymbol {
 protected:
   void initialize() override;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h
index 7a3dfaecefebf..1f357754ac0f5 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypePointer.h
@@ -10,10 +10,11 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEPOINTER_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h
index 292fc48e7b6d4..9ce079c5e5a9a 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeTypedef.h
@@ -9,14 +9,20 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPETYPEDEF_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPETYPEDEF_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+
+class raw_ostream;
+
 namespace pdb {
 
+class NativeSession;
+
 class NativeTypeTypedef : public NativeRawSymbol {
 public:
   // Create a pointer record for a non-simple type.
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h
index e1b31a256c12a..a1dd39c0b4be4 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeUDT.h
@@ -10,13 +10,17 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEUDT_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+
+class raw_ostream;
 namespace pdb {
+class NativeSession;
 
 class NativeTypeUDT : public NativeRawSymbol {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
index 21995ca665c17..92d51706c1dac 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h
@@ -9,13 +9,15 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEVTSHAPE_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_NATIVETYPEVTSHAPE_H
 
-#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
 namespace pdb {
+class NativeSession;
 
 class NativeTypeVTShape : public NativeRawSymbol {
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
index 004d005280d48..c23d958f8ed06 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/PDBFileBuilder.h
@@ -9,24 +9,28 @@
 #ifndef LLVM_DEBUGINFO_PDB_NATIVE_PDBFILEBUILDER_H
 #define LLVM_DEBUGINFO_PDB_NATIVE_PDBFILEBUILDER_H
 
-#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/PDB/Native/HashTable.h"
 #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
-#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include 
 
 namespace llvm {
+class WritableBinaryStream;
+namespace codeview {
+struct GUID;
+}
+
 namespace msf {
 class MSFBuilder;
+struct MSFLayout;
 }
 namespace pdb {
+struct SrcHeaderBlockEntry;
 class DbiStreamBuilder;
 class InfoStreamBuilder;
 class GSIStreamBuilder;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h b/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h
index 1ff6ca173b2b0..7c5b6b9e1bdf6 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/SymbolCache.h
@@ -10,23 +10,29 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_SYMBOLCACHE_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/IntervalMap.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 #include 
 #include 
 
 namespace llvm {
+namespace codeview {
+class InlineSiteSym;
+struct FileChecksumEntry;
+} // namespace codeview
 namespace pdb {
+class IPDBSourceFile;
+class NativeSession;
+class PDBSymbol;
+class PDBSymbolCompiland;
 class DbiStream;
-class PDBFile;
 
 class SymbolCache {
   NativeSession &Session;
diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
index f18d38ae0b31e..9f320358144c4 100644
--- a/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
+++ b/llvm/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
@@ -10,12 +10,10 @@
 #define LLVM_DEBUGINFO_PDB_NATIVE_TPISTREAMBUILDER_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/Allocator.h"
-#include "llvm/Support/BinaryByteStream.h"
-#include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 
@@ -23,7 +21,7 @@
 
 namespace llvm {
 class BinaryByteStream;
-class WritableBinaryStreamRef;
+template  struct BinaryItemTraits;
 
 template <> struct BinaryItemTraits {
   static size_t length(const codeview::CVType &Item) { return Item.length(); }
@@ -32,16 +30,11 @@ template <> struct BinaryItemTraits {
   }
 };
 
-namespace codeview {
-class TypeRecord;
-}
 namespace msf {
 class MSFBuilder;
 struct MSFLayout;
 }
 namespace pdb {
-class PDBFile;
-class TpiStream;
 struct TpiStreamHeader;
 
 class TpiStreamBuilder {
@@ -88,7 +81,7 @@ class TpiStreamBuilder {
   const TpiStreamHeader *Header;
   uint32_t Idx;
 };
-}
+} // namespace pdb
 }
 
 #endif
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h
index 24cf1e459f92a..4e34b75b6117e 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbol.h
@@ -9,11 +9,9 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOL_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOL_H
 
-#include "ConcreteSymbolEnumerator.h"
 #include "IPDBRawSymbol.h"
 #include "PDBExtras.h"
 #include "PDBTypes.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 
 #define FORWARD_SYMBOL_METHOD(MethodName)                                      \
@@ -43,6 +41,9 @@ class raw_ostream;
 
 namespace pdb {
 class IPDBSession;
+class PDBSymDumper;
+class PDBSymbol;
+template  class ConcreteSymbolEnumerator;
 
 #define DECLARE_PDB_SYMBOL_CONCRETE_TYPE(TagValue)                             \
 private:                                                                       \
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
index c76466a97b66d..c8d3d0b7bb963 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolAnnotation.h
@@ -13,7 +13,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolAnnotation : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
index cf471450d9891..09142227b0176 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
@@ -13,8 +13,6 @@
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
 class PDBSymbolBlock : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
index dbd8ba5a63ff7..46c1592685338 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolCompilandDetails : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
index 61607a03593d8..cba082f2ff196 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 class PDBSymbolCompilandEnv : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::CompilandEnv)
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
index 75a86411643a6..c78b47ce99248 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolCustom.h
@@ -15,8 +15,6 @@
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 /// PDBSymbolCustom represents symbols that are compiler-specific and do not
 /// fit anywhere else in the lexical hierarchy.
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
index 7e9b69d7cf4bb..61e67d1368a88 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolData.h
@@ -9,16 +9,16 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLDATA_H
 
-#include "IPDBLineNumber.h"
 #include "PDBSymbol.h"
 #include "PDBTypes.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
+class PDBSymDumper;
+
 class PDBSymbolData : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Data)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
index f50057c68406e..bfc7f76897182 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
@@ -9,17 +9,20 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
 
-#include "IPDBLineNumber.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
 #include "PDBSymbol.h"
-#include "PDBSymbolTypeFunctionSig.h"
 #include "PDBTypes.h"
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
+class PDBSymDumper;
+class PDBSymbolData;
+class PDBSymbolTypeFunctionSig;
+template  class IPDBEnumChildren;
+
 class PDBSymbolFunc : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Function)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
index 1cdc1811bb1a2..09c6f47289608 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
@@ -14,8 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
 class PDBSymbolFuncDebugEnd : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
index 021f27c7f0f76..843a8348a2f0b 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolFuncDebugStart : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
index 33eb36696cc2d..148802a47cbcc 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolLabel : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
index f8dcb2ba9d5f1..a757cc02624b1 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolPublicSymbol : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
index a5f795cc1303d..2b81a63995e6e 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolThunk : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
index d4cd6e71423e6..496141e5fa68f 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeArray : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
index bd2dbc914725b..c74ac3fb9cce1 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
@@ -12,14 +12,14 @@
 #include "PDBSymbol.h"
 #include "PDBTypes.h"
 
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
+class PDBSymDumper;
+
 class PDBSymbolTypeBaseClass : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::BaseClass)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
index df6309b1545c6..b923983095f33 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeBuiltin : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
index 7bf0317ff1ca1..b15abf7bedfd7 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeCustom : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
index 5d742237bac4e..e7570b41dd21b 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeDimension : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
index 0aab910395093..ee1f736c17a04 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
@@ -9,16 +9,18 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
 
-#include "IPDBLineNumber.h"
 #include "PDBSymbol.h"
-#include "PDBSymbolTypeBuiltin.h"
 #include "PDBTypes.h"
 
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
+class PDBSymDumper;
+class PDBSymbolTypeBuiltin;
+
 class PDBSymbolTypeEnum : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Enum)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
index d56a90662dae8..9fde421162614 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeFriend : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
index 559ceec5aace4..71decff722a51 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeFunctionArg : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
index 5e7b83ce8004f..866bf520a3b26 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeManaged : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
index da25eab50f9bc..1b43ef9a21bd2 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypePointer : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
index 8dc29ca261929..3f37730cf1df4 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeTypedef : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
index 3e73ad7ac85a3..a3a49a4b619a5 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
@@ -9,18 +9,17 @@
 #ifndef LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEUDT_H
 
-#include "IPDBLineNumber.h"
-#include "IPDBSession.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
 #include "PDBSymbol.h"
-#include "PDBSymbolTypeBaseClass.h"
 #include "PDBTypes.h"
 
 namespace llvm {
 
-class raw_ostream;
-
 namespace pdb {
 
+class PDBSymDumper;
+
 class PDBSymbolTypeUDT : public PDBSymbol {
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::UDT)
 public:
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
index d08728dafa762..6223bee986707 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeVTable : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
index c7e2ac1485039..bec0a9970a9f2 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolTypeVTableShape : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
index 5b4909b800b99..a53af49bc9e01 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUnknown.h
@@ -13,7 +13,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolUnknown : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
index 19a8f414eb43d..dde25a023d000 100644
--- a/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
+++ b/llvm/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
@@ -14,7 +14,6 @@
 
 namespace llvm {
 
-class raw_ostream;
 namespace pdb {
 
 class PDBSymbolUsingNamespace : public PDBSymbol {
diff --git a/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h b/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h
index c67b093b63c0b..8631c412f1142 100644
--- a/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h
+++ b/llvm/include/llvm/DebugInfo/PDB/UDTLayout.h
@@ -18,7 +18,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
-#include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include 
 #include 
 #include 
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index cb5924784da81..6461f2ac031d2 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -9,7 +9,6 @@
 #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
-#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
 
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp b/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
index 00fc70ca5a545..94935d63452ed 100644
--- a/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
+++ b/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
@@ -8,7 +8,6 @@
 
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/MSF/MSFCommon.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
index 14e6f938dc018..c12ac38c2317f 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
@@ -10,12 +10,10 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/BinaryFormat/COFF.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
-#include "llvm/DebugInfo/MSF/MSFCommon.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
-#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/Support/BinaryStreamWriter.h"
@@ -25,6 +23,12 @@ using namespace llvm::codeview;
 using namespace llvm::msf;
 using namespace llvm::pdb;
 
+namespace llvm {
+namespace codeview {
+class DebugSubsection;
+}
+} // namespace llvm
+
 static uint32_t calculateDiSymbolStreamSize(uint32_t SymbolByteSize,
                                             uint32_t C13Size) {
   uint32_t Size = sizeof(uint32_t);   // Signature
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index c11fe2e3ea0ff..69db1279dae51 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -14,7 +14,6 @@
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
-#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Support/BinaryStreamWriter.h"
diff --git a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
index b048e0c8e4588..cf6790d72f8bd 100644
--- a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
@@ -14,7 +14,7 @@
 
 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
 #include "llvm/DebugInfo/CodeView/RecordName.h"
-#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
@@ -22,6 +22,7 @@
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Parallel.h"
diff --git a/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
index 42daa7cae7997..04bb26ab5e1d4 100644
--- a/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
@@ -10,9 +10,7 @@
 
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/BinaryStreamWriter.h"
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
index 7717f062eac11..d24364312b31c 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -9,8 +9,6 @@
 #include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
-#include "llvm/ADT/STLExtras.h"
-
 namespace llvm {
 namespace pdb {
 
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp
index 54646867bc5ff..b861fc2435b82 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumGlobals.cpp
@@ -8,13 +8,15 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h"
 
-#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp
index 1e4b076463351..5815d08e46687 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp
@@ -8,13 +8,10 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h"
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
-#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
index c6621924b5160..7108b8efff831 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
@@ -8,13 +8,10 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
 
-#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
-#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
 
 namespace llvm {
 namespace pdb {
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp
index feede1dbc9583..24fe2244cfc52 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumSymbols.cpp
@@ -8,11 +8,11 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h"
 
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp
index 2524e10cb6c53..b5548696fa1db 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp
@@ -8,13 +8,17 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
 
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
index 895f8943157a3..ae0f66c31fde1 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
@@ -8,14 +8,14 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
 
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp
index 55052839b3e88..b1caa5add5b33 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp
@@ -8,11 +8,15 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h"
 
+#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp
index fb51179cca784..fd79acf2d7e20 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeInlineSiteSymbol.cpp
@@ -12,8 +12,12 @@
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
index 155ed0cdb8284..aa7d6ac6f29d9 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
index 739480e4a57bd..339af61080099 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
@@ -9,8 +9,7 @@
 #include "llvm/DebugInfo/PDB/Native/NativePublicSymbol.h"
 
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
index 2ad552470b617..89f9f9836fec1 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
@@ -10,7 +10,6 @@
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
-#include "llvm/Support/FormatVariadic.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
index 7212a0e650357..cf314c3bede3f 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -8,31 +8,33 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h"
-#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumInjectedSources.h"
-#include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
 #include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
-#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/Object/Binary.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 
@@ -45,6 +47,12 @@ using namespace llvm;
 using namespace llvm::msf;
 using namespace llvm::pdb;
 
+namespace llvm {
+namespace codeview {
+union DebugInfo;
+}
+} // namespace llvm
+
 static DbiStream *getDbiStreamPtr(PDBFile &File) {
   Expected DbiS = File.getPDBDbiStream();
   if (DbiS)
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp
index fd813dee6b9f8..8d6f8ebebf4c7 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp
@@ -8,6 +8,8 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp
index 24807a7b48650..a6e8cbf715481 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp
@@ -8,7 +8,7 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h"
 
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
 
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp
index 31fc0ac5f6196..e98f357ac485e 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp
@@ -8,9 +8,10 @@
 
 #include "llvm/DebugInfo/PDB/Native/NativeTypeArray.h"
 
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
-#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp
index 98a8475eee589..f347e0d22f56f 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp
@@ -9,8 +9,9 @@
 #include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
 
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
@@ -18,8 +19,6 @@
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 
-#include "llvm/Support/FormatVariadic.h"
-
 #include 
 
 using namespace llvm;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp
index 251ed58d250bf..7db3f1c631288 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeFunctionSig.cpp
@@ -10,9 +10,10 @@
 
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
-#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp
index ae153d6f4f23c..14b903ccef5ab 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypePointer.cpp
@@ -7,8 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NativeTypePointer.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
 
 #include 
 
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp
index 2ed23f5f54199..95fba1114ef20 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp
@@ -7,10 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NativeTypeUDT.h"
-
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
-
-#include 
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp
index b022345b9a757..63bb3f046e236 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeVTShape.cpp
@@ -1,4 +1,7 @@
 #include "llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index b23ce8cc60059..73106c2195ad3 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -7,20 +7,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/GUID.h"
+#include "llvm/DebugInfo/MSF/IMSFFile.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
-#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
-#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
-#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
-#include "llvm/Support/BinaryStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/CRC.h"
-#include "llvm/Support/Chrono.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/xxhash.h"
 
@@ -30,6 +32,10 @@ using namespace llvm::msf;
 using namespace llvm::pdb;
 using namespace llvm::support;
 
+namespace llvm {
+class WritableBinaryStream;
+}
+
 PDBFileBuilder::PDBFileBuilder(BumpPtrAllocator &Allocator)
     : Allocator(Allocator), InjectedSourceHashTraits(Strings),
       InjectedSourceTable(2) {}
diff --git a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
index f9e67014477e5..b3efbf97b4630 100644
--- a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
@@ -1,20 +1,25 @@
 #include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
 
-#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
 #include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
-#include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h"
+#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h"
-#include "llvm/DebugInfo/PDB/Native/NativeEnumSymbols.h"
 #include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
 #include "llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeInlineSiteSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
 #include "llvm/DebugInfo/PDB/Native/NativePublicSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
@@ -28,10 +33,12 @@
 #include "llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 
 using namespace llvm;
diff --git a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index 5f4f497690b6f..986e45e050c76 100644
--- a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -9,17 +9,13 @@
 #include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/BinaryByteStream.h"
-#include "llvm/Support/BinaryStreamArray.h"
-#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
diff --git a/llvm/lib/DebugInfo/PDB/PDB.cpp b/llvm/lib/DebugInfo/PDB/PDB.cpp
index e5b7731f6f4ac..d106ba8fefc16 100644
--- a/llvm/lib/DebugInfo/PDB/PDB.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDB.cpp
@@ -15,7 +15,6 @@
 #endif
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/MemoryBuffer.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBContext.cpp b/llvm/lib/DebugInfo/PDB/PDBContext.cpp
index 0ebb70e010d5a..a0a572ef6b661 100644
--- a/llvm/lib/DebugInfo/PDB/PDBContext.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBContext.cpp
@@ -14,6 +14,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/Object/COFF.h"
 
 using namespace llvm;
diff --git a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
index a6d7ca0da7a98..45503aecef3a6 100644
--- a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp
index d6bc7ee9c9514..4eb5af9bd2921 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
@@ -43,7 +44,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
-#include 
 #include 
 
 using namespace llvm;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
index 0fa83efb7ae0e..089f4de0f4227 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
index 9452282a8817b..49ee4937521b9 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
index 529100b23ba5a..bd60489b6bed6 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -9,10 +9,11 @@
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h"
-#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Path.h"
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
index 0d86dfe1e632a..f775ac949cd82 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
index 61f119405fd95..2c2ed59c1726a 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
@@ -10,9 +10,7 @@
 
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
index 6c9a4aa76c3d7..405b07c2b6890 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
@@ -10,9 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
index d2b82111ccd58..c604b5cd3a6ab 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
@@ -7,12 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSectionContrib.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp
index c85756c43e479..b9575e59f44d0 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp
@@ -8,11 +8,10 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
index cb0329bc0ed72..e976d9155efde 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
@@ -10,6 +10,7 @@
 
 #include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
index 66433dc17b49c..5c72e3f621213 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
index fe32c93c01211..fd537a9eeea4b 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
@@ -8,10 +8,8 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
 
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
index 1fffe69a0c830..896719a6a8e2e 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
index 08697683f6415..a00b1be40e18f 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
@@ -8,10 +8,8 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
 
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
index 6483858183e53..42502a55ef761 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
index a0d521abe43f3..bb4eb43f22e5f 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
index 08467059b5e14..539c3547a4b03 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
@@ -8,10 +8,8 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
 
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
index a0dd9ef601c0b..eca2a09c1f77b 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
index 6723894c90ea8..a616b4e26cb17 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
index 4a25a391f278e..2828ce4df3f80 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
@@ -10,9 +10,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
index b9fdf6aec8114..8f0b4d66a4a36 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
@@ -7,11 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
-
-#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 
-#include 
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
index 4ffea42cbb0a7..d4bd9996d7862 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
index 683e93548fb14..acda57f44e335 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
index e80e6c716572a..fa6e630e3c457 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
index 462fc315359b5..9e238c7caa37a 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
@@ -8,11 +8,8 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
 
-#include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
index 70749d9bf5f51..c2ce21c6ca691 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
index d302c29a3bec3..3e9aa9e67e84f 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
@@ -8,17 +8,10 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 
-#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
index 4e2a45116d512..a4d81888e4576 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
@@ -10,8 +10,6 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 
-#include 
-
 using namespace llvm;
 using namespace llvm::pdb;
 
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
index 78957620e0836..4d2ab29138964 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
@@ -8,10 +8,8 @@
 
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
 
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
index 650d01183171b..85294a4cded28 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
index 74afbdb180865..98aaaa9b10b92 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
@@ -9,9 +9,6 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
 
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-
-#include 
 
 using namespace llvm;
 using namespace llvm::pdb;
diff --git a/llvm/lib/DebugInfo/PDB/UDTLayout.cpp b/llvm/lib/DebugInfo/PDB/UDTLayout.cpp
index 55854bb49888a..53b4d9477a909 100644
--- a/llvm/lib/DebugInfo/PDB/UDTLayout.cpp
+++ b/llvm/lib/DebugInfo/PDB/UDTLayout.cpp
@@ -10,6 +10,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
@@ -17,6 +18,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
diff --git a/llvm/tools/llvm-pdbutil/LinePrinter.cpp b/llvm/tools/llvm-pdbutil/LinePrinter.cpp
index dd6ca5bf41b16..6e32105b157af 100644
--- a/llvm/tools/llvm-pdbutil/LinePrinter.cpp
+++ b/llvm/tools/llvm-pdbutil/LinePrinter.cpp
@@ -13,6 +13,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/MSF/MSFCommon.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/UDTLayout.h"
 #include "llvm/Support/BinaryStreamReader.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp
index b7eccac5988ca..37b3fc9706429 100644
--- a/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyClassDefinitionDumper.cpp
@@ -14,7 +14,9 @@
 
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 #include "llvm/DebugInfo/PDB/UDTLayout.h"
 
diff --git a/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp
index a522935e34f14..364fd3ff7b54b 100644
--- a/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyClassLayoutGraphicalDumper.cpp
@@ -17,8 +17,10 @@
 #include "PrettyVariableDumper.h"
 #include "llvm-pdbutil.h"
 
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 #include "llvm/DebugInfo/PDB/UDTLayout.h"
 #include "llvm/Support/Format.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp
index 9ed5893f252e8..cb96b55c4a833 100644
--- a/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyEnumDumper.cpp
@@ -12,6 +12,8 @@
 #include "PrettyBuiltinDumper.h"
 #include "llvm-pdbutil.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
index fede031ec0c09..14f241264e2f4 100644
--- a/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyExternalSymbolDumper.cpp
@@ -9,6 +9,7 @@
 #include "PrettyExternalSymbolDumper.h"
 #include "LinePrinter.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
 #include "llvm/Support/Format.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
index b820ca3339654..f9a5a801c28f4 100644
--- a/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyFunctionDumper.cpp
@@ -10,6 +10,8 @@
 #include "LinePrinter.h"
 #include "PrettyBuiltinDumper.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
@@ -17,6 +19,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp
index 2f7a39803ca5e..5730a0c494de2 100644
--- a/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyTypeDumper.cpp
@@ -16,6 +16,8 @@
 #include "PrettyTypedefDumper.h"
 #include "llvm-pdbutil.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
@@ -25,6 +27,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
 #include "llvm/DebugInfo/PDB/UDTLayout.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/FormatVariadic.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp
index ef73a8cdf9c48..09c591015c56b 100644
--- a/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyTypedefDumper.cpp
@@ -13,8 +13,10 @@
 #include "PrettyFunctionDumper.h"
 #include "PrettyTypeDumper.h"
 
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
diff --git a/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp
index 6dd7cc384cc9e..60fb0d2ae0bdb 100644
--- a/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp
+++ b/llvm/tools/llvm-pdbutil/PrettyVariableDumper.cpp
@@ -13,16 +13,19 @@
 #include "PrettyFunctionDumper.h"
 #include "llvm-pdbutil.h"
 
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 #include "llvm/Support/Format.h"
diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
index b152ebd6dccb2..39ca2e01789ff 100644
--- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -44,8 +44,10 @@
 #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
 #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
@@ -67,6 +69,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
diff --git a/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp b/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp
index 002dd6579a461..b010b5fce8675 100644
--- a/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp
+++ b/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp
@@ -7,10 +7,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDB.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/Support/Path.h"
 
 #include "llvm/Testing/Support/Error.h"
diff --git a/llvm/unittests/DebugInfo/PDB/NativeSymbolReuseTest.cpp b/llvm/unittests/DebugInfo/PDB/NativeSymbolReuseTest.cpp
index 91c7db4fc393f..daf375c8e5cd1 100644
--- a/llvm/unittests/DebugInfo/PDB/NativeSymbolReuseTest.cpp
+++ b/llvm/unittests/DebugInfo/PDB/NativeSymbolReuseTest.cpp
@@ -8,10 +8,13 @@
 
 #include "llvm/DebugInfo/PDB/PDB.h"
 
+#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
+#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/Support/Path.h"
 
 #include "llvm/Testing/Support/Error.h"

From 606320ed30fd8a8fc01afb71a7e107cd7f1f90da Mon Sep 17 00:00:00 2001
From: Simon Moll 
Date: Wed, 23 Feb 2022 10:07:50 +0100
Subject: [PATCH 598/748] [VE][NFC] Move functions to VVP module

Move the vector isel functions into the module they belong to. Keep the scalar
lowering and the calls into vector isel in VEISelLowering.
---
 llvm/lib/Target/VE/VEISelLowering.cpp  | 72 --------------------------
 llvm/lib/Target/VE/VVPISelLowering.cpp | 71 +++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 0e3f2eb522829..1f75dcc6324ce 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -2707,78 +2707,6 @@ bool VETargetLowering::hasAndNot(SDValue Y) const {
   return true;
 }
 
-SDValue VETargetLowering::splitMaskArithmetic(SDValue Op,
-                                              SelectionDAG &DAG) const {
-  VECustomDAG CDAG(DAG, Op);
-  SDValue AVL =
-      CDAG.getConstant(Op.getValueType().getVectorNumElements(), MVT::i32);
-  SDValue A = Op->getOperand(0);
-  SDValue B = Op->getOperand(1);
-  SDValue LoA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Lo, AVL);
-  SDValue HiA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Hi, AVL);
-  SDValue LoB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Lo, AVL);
-  SDValue HiB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Hi, AVL);
-  unsigned Opc = Op.getOpcode();
-  auto LoRes = CDAG.getNode(Opc, MVT::v256i1, {LoA, LoB});
-  auto HiRes = CDAG.getNode(Opc, MVT::v256i1, {HiA, HiB});
-  return CDAG.getPack(MVT::v512i1, LoRes, HiRes, AVL);
-}
-
-SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
-  // Can we represent this as a VVP node.
-  const unsigned Opcode = Op->getOpcode();
-  auto VVPOpcodeOpt = getVVPOpcode(Opcode);
-  if (!VVPOpcodeOpt.hasValue())
-    return SDValue();
-  unsigned VVPOpcode = VVPOpcodeOpt.getValue();
-  const bool FromVP = ISD::isVPOpcode(Opcode);
-
-  // The representative and legalized vector type of this operation.
-  VECustomDAG CDAG(DAG, Op);
-  EVT OpVecVT = Op.getValueType();
-  EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
-  auto Packing = getTypePacking(LegalVecVT.getSimpleVT());
-
-  SDValue AVL;
-  SDValue Mask;
-
-  if (FromVP) {
-    // All upstream VP SDNodes always have a mask and avl.
-    auto MaskIdx = ISD::getVPMaskIdx(Opcode);
-    auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode);
-    if (MaskIdx)
-      Mask = Op->getOperand(*MaskIdx);
-    if (AVLIdx)
-      AVL = Op->getOperand(*AVLIdx);
-
-  }
-
-  // Materialize default mask and avl.
-  if (!AVL)
-    AVL = CDAG.getConstant(OpVecVT.getVectorNumElements(), MVT::i32);
-  if (!Mask)
-    Mask = CDAG.getConstantMask(Packing, true);
-
-  if (isVVPBinaryOp(VVPOpcode)) {
-    assert(LegalVecVT.isSimple());
-    return CDAG.getNode(VVPOpcode, LegalVecVT,
-                        {Op->getOperand(0), Op->getOperand(1), Mask, AVL});
-  }
-  if (VVPOpcode == VEISD::VVP_SELECT) {
-    auto Mask = Op->getOperand(0);
-    auto OnTrue = Op->getOperand(1);
-    auto OnFalse = Op->getOperand(2);
-    return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
-  }
-  if (VVPOpcode == VEISD::VVP_SETCC) {
-    auto LHS = Op->getOperand(0);
-    auto RHS = Op->getOperand(1);
-    auto Pred = Op->getOperand(2);
-    return CDAG.getNode(VVPOpcode, LegalVecVT, {LHS, RHS, Pred, Mask, AVL});
-  }
-  llvm_unreachable("lowerToVVP called for unexpected SDNode.");
-}
-
 SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
index 54fdd9f3ac543..e3fba730e5ad4 100644
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -18,6 +18,77 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ve-lower"
 
+SDValue VETargetLowering::splitMaskArithmetic(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  VECustomDAG CDAG(DAG, Op);
+  SDValue AVL =
+      CDAG.getConstant(Op.getValueType().getVectorNumElements(), MVT::i32);
+  SDValue A = Op->getOperand(0);
+  SDValue B = Op->getOperand(1);
+  SDValue LoA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Lo, AVL);
+  SDValue HiA = CDAG.getUnpack(MVT::v256i1, A, PackElem::Hi, AVL);
+  SDValue LoB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Lo, AVL);
+  SDValue HiB = CDAG.getUnpack(MVT::v256i1, B, PackElem::Hi, AVL);
+  unsigned Opc = Op.getOpcode();
+  auto LoRes = CDAG.getNode(Opc, MVT::v256i1, {LoA, LoB});
+  auto HiRes = CDAG.getNode(Opc, MVT::v256i1, {HiA, HiB});
+  return CDAG.getPack(MVT::v512i1, LoRes, HiRes, AVL);
+}
+
+SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
+  // Can we represent this as a VVP node.
+  const unsigned Opcode = Op->getOpcode();
+  auto VVPOpcodeOpt = getVVPOpcode(Opcode);
+  if (!VVPOpcodeOpt.hasValue())
+    return SDValue();
+  unsigned VVPOpcode = VVPOpcodeOpt.getValue();
+  const bool FromVP = ISD::isVPOpcode(Opcode);
+
+  // The representative and legalized vector type of this operation.
+  VECustomDAG CDAG(DAG, Op);
+  EVT OpVecVT = Op.getValueType();
+  EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
+  auto Packing = getTypePacking(LegalVecVT.getSimpleVT());
+
+  SDValue AVL;
+  SDValue Mask;
+
+  if (FromVP) {
+    // All upstream VP SDNodes always have a mask and avl.
+    auto MaskIdx = ISD::getVPMaskIdx(Opcode);
+    auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode);
+    if (MaskIdx)
+      Mask = Op->getOperand(*MaskIdx);
+    if (AVLIdx)
+      AVL = Op->getOperand(*AVLIdx);
+  }
+
+  // Materialize default mask and avl.
+  if (!AVL)
+    AVL = CDAG.getConstant(OpVecVT.getVectorNumElements(), MVT::i32);
+  if (!Mask)
+    Mask = CDAG.getConstantMask(Packing, true);
+
+  if (isVVPBinaryOp(VVPOpcode)) {
+    assert(LegalVecVT.isSimple());
+    return CDAG.getNode(VVPOpcode, LegalVecVT,
+                        {Op->getOperand(0), Op->getOperand(1), Mask, AVL});
+  }
+  if (VVPOpcode == VEISD::VVP_SELECT) {
+    auto Mask = Op->getOperand(0);
+    auto OnTrue = Op->getOperand(1);
+    auto OnFalse = Op->getOperand(2);
+    return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
+  }
+  if (VVPOpcode == VEISD::VVP_SETCC) {
+    auto LHS = Op->getOperand(0);
+    auto RHS = Op->getOperand(1);
+    auto Pred = Op->getOperand(2);
+    return CDAG.getNode(VVPOpcode, LegalVecVT, {LHS, RHS, Pred, Mask, AVL});
+  }
+  llvm_unreachable("lowerToVVP called for unexpected SDNode.");
+}
+
 SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
                                                    SelectionDAG &DAG) const {
   VECustomDAG CDAG(DAG, Op);

From fa0a80e017ebd58a71bdb4e4493bb022f80fe791 Mon Sep 17 00:00:00 2001
From: Balazs Benics 
Date: Wed, 23 Feb 2022 10:37:03 +0100
Subject: [PATCH 599/748] Revert "Revert "[analyzer] Add failing test case
 demonstrating buggy taint propagation""

This reverts commit b8ae323cca61dc1edcd36e9ae18c7e4c3d76d52e.

Let's try `REQUIRES: asserts`.
---
 .../Checkers/GenericTaintChecker.cpp          | 25 +++++++++--
 ...nt-checker-callback-order-has-definition.c | 43 +++++++++++++++++++
 ...hecker-callback-order-without-definition.c | 36 ++++++++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Analysis/taint-checker-callback-order-has-definition.c
 create mode 100644 clang/test/Analysis/taint-checker-callback-order-without-definition.c

diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index e2209e3debfde..428778e6cfaa6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -32,6 +32,8 @@
 #include 
 #include 
 
+#define DEBUG_TYPE "taint-checker"
+
 using namespace clang;
 using namespace ento;
 using namespace taint;
@@ -691,6 +693,13 @@ void GenericTaintChecker::checkPostCall(const CallEvent &Call,
   if (TaintArgs.isEmpty())
     return;
 
+  LLVM_DEBUG(for (ArgIdxTy I
+                  : TaintArgs) {
+    llvm::dbgs() << "PostCall<";
+    Call.dump(llvm::dbgs());
+    llvm::dbgs() << "> actually wants to taint arg index: " << I << '\n';
+  });
+
   for (ArgIdxTy ArgNum : TaintArgs) {
     // Special handling for the tainted return value.
     if (ArgNum == ReturnValueIndex) {
@@ -768,15 +777,25 @@ void GenericTaintRule::process(const GenericTaintChecker &Checker,
 
   /// Propagate taint where it is necessary.
   ForEachCallArg(
-      [this, &State, WouldEscape](ArgIdxTy I, const Expr *E, SVal V) {
-        if (PropDstArgs.contains(I))
+      [this, &State, WouldEscape, &Call](ArgIdxTy I, const Expr *E, SVal V) {
+        if (PropDstArgs.contains(I)) {
+          LLVM_DEBUG(llvm::dbgs() << "PreCall<"; Call.dump(llvm::dbgs());
+                     llvm::dbgs()
+                     << "> prepares tainting arg index: " << I << '\n';);
           State = State->add(I);
+        }
 
         // TODO: We should traverse all reachable memory regions via the
         // escaping parameter. Instead of doing that we simply mark only the
         // referred memory region as tainted.
-        if (WouldEscape(V, E->getType()))
+        if (WouldEscape(V, E->getType())) {
+          LLVM_DEBUG(if (!State->contains(I)) {
+            llvm::dbgs() << "PreCall<";
+            Call.dump(llvm::dbgs());
+            llvm::dbgs() << "> prepares tainting arg index: " << I << '\n';
+          });
           State = State->add(I);
+        }
       });
 
   C.addTransition(State);
diff --git a/clang/test/Analysis/taint-checker-callback-order-has-definition.c b/clang/test/Analysis/taint-checker-callback-order-has-definition.c
new file mode 100644
index 0000000000000..82943ad46fbd8
--- /dev/null
+++ b/clang/test/Analysis/taint-checker-callback-order-has-definition.c
@@ -0,0 +1,43 @@
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=core,alpha.security.taint \
+// RUN:   -mllvm -debug-only=taint-checker \
+// RUN:   2>&1 | FileCheck %s
+
+// REQUIRES: asserts
+// FIXME: We should not crash.
+// XFAIL: *
+
+struct _IO_FILE;
+typedef struct _IO_FILE FILE;
+FILE *fopen(const char *fname, const char *mode);
+
+void nested_call(void) {}
+
+char *fgets(char *s, int n, FILE *fp) {
+  nested_call();   // no-crash: we should not try adding taint to a non-existent argument.
+  return (char *)0;
+}
+
+void top(const char *fname, char *buf) {
+  FILE *fp = fopen(fname, "r");
+  // CHECK:      PreCall prepares tainting arg index: -1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: -1
+
+  if (!fp)
+    return;
+
+  (void)fgets(buf, 42, fp); // Trigger taint propagation.
+  // CHECK-NEXT: PreCall prepares tainting arg index: -1
+  // CHECK-NEXT: PreCall prepares tainting arg index: 0
+  // CHECK-NEXT: PreCall prepares tainting arg index: 1
+  // CHECK-NEXT: PreCall prepares tainting arg index: 2
+
+  // FIXME: We should propagate taint from PreCall -> PostCall.
+  // CHECK-NEXT: PostCall actually wants to taint arg index: -1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 0
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 2
+
+  // FIXME: We should not crash.
+  // CHECK: PLEASE submit a bug report
+}
diff --git a/clang/test/Analysis/taint-checker-callback-order-without-definition.c b/clang/test/Analysis/taint-checker-callback-order-without-definition.c
new file mode 100644
index 0000000000000..dba23f367fd66
--- /dev/null
+++ b/clang/test/Analysis/taint-checker-callback-order-without-definition.c
@@ -0,0 +1,36 @@
+// RUN: %clang_analyze_cc1 %s \
+// RUN:   -analyzer-checker=core,alpha.security.taint \
+// RUN:   -mllvm -debug-only=taint-checker \
+// RUN:   2>&1 | FileCheck %s
+
+// REQUIRES: asserts
+
+struct _IO_FILE;
+typedef struct _IO_FILE FILE;
+FILE *fopen(const char *fname, const char *mode);
+
+char *fgets(char *s, int n, FILE *fp); // no-definition
+
+void top(const char *fname, char *buf) {
+  FILE *fp = fopen(fname, "r"); // Introduce taint.
+  // CHECK:      PreCall prepares tainting arg index: -1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: -1
+
+  if (!fp)
+    return;
+
+  (void)fgets(buf, 42, fp); // Trigger taint propagation.
+
+  // FIXME: Why is the arg index 1 prepared for taint?
+  // Before the call it wasn't tainted, and it also shouldn't be tainted after the call.
+
+  // CHECK-NEXT: PreCall prepares tainting arg index: -1
+  // CHECK-NEXT: PreCall prepares tainting arg index: 0
+  // CHECK-NEXT: PreCall prepares tainting arg index: 1
+  // CHECK-NEXT: PreCall prepares tainting arg index: 2
+  //
+  // CHECK-NEXT: PostCall actually wants to taint arg index: -1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 0
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 2
+}

From ea249489f5f5c770475587010833b8c3ecb581e7 Mon Sep 17 00:00:00 2001
From: Anton Afanasyev 
Date: Wed, 15 Sep 2021 09:18:51 +0300
Subject: [PATCH 600/748] [Test][AggressiveInstCombine] Add test for `phi`
 instruction

---
 .../AggressiveInstCombine/trunc_phi.ll        | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll

diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
new file mode 100644
index 0000000000000..46bdb60fada6c
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -aggressive-instcombine -S | FileCheck %s
+
+define i16 @trunc_phi(i8 %x) {
+; CHECK-LABEL: @trunc_phi(
+; CHECK-NEXT:  LoopHeader:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       Loop:
+; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOPHEADER]] ], [ [[I:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SHL]] = shl i32 [[ZEXT2]], 1
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16
+; CHECK-NEXT:    [[I]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOPEND:%.*]], label [[LOOP]]
+; CHECK:       LoopEnd:
+; CHECK-NEXT:    ret i16 [[TRUNC]]
+;
+LoopHeader:
+  %zext = zext i8 %x to i32
+  br label %Loop
+
+Loop:
+  %zext2 = phi i32 [%zext, %LoopHeader], [%shl, %Loop]
+  %j = phi i32 [0, %LoopHeader], [%i, %Loop]
+  %shl = shl i32 %zext2, 1
+  %trunc = trunc i32 %shl to i16
+  %i = add i32 %j, 1
+  %cmp = icmp eq i32 %i, 10
+  br i1 %cmp, label %LoopEnd, label %Loop
+
+LoopEnd:
+  ret i16 %trunc
+}
+
+define i16 @trunc_phi2(i8 %x, i32 %sw) {
+; CHECK-LABEL: @trunc_phi2(
+; CHECK-NEXT:  LoopHeader:
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    switch i32 [[SW:%.*]], label [[LOOPEND:%.*]] [
+; CHECK-NEXT:    i32 0, label [[LOOP:%.*]]
+; CHECK-NEXT:    i32 1, label [[LOOP]]
+; CHECK-NEXT:    ]
+; CHECK:       Loop:
+; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[ZEXT]], [[LOOPHEADER]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOPHEADER]] ], [ 0, [[LOOPHEADER]] ], [ [[I:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SHL]] = shl i32 [[ZEXT2]], 1
+; CHECK-NEXT:    [[I]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOPEND]], label [[LOOP]]
+; CHECK:       LoopEnd:
+; CHECK-NEXT:    [[ZEXT3:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER]] ], [ [[ZEXT2]], [[LOOP]] ]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ZEXT3]] to i16
+; CHECK-NEXT:    ret i16 [[TRUNC]]
+;
+LoopHeader:
+  %zext = zext i8 %x to i32
+  switch i32 %sw, label %LoopEnd [ i32 0, label %Loop
+  i32 1, label %Loop ]
+
+Loop:
+  %zext2 = phi i32 [%zext, %LoopHeader], [%zext, %LoopHeader], [%shl, %Loop]
+  %j = phi i32 [0, %LoopHeader], [0, %LoopHeader], [%i, %Loop]
+  %shl = shl i32 %zext2, 1
+  %i = add i32 %j, 1
+  %cmp = icmp eq i32 %i, 10
+  br i1 %cmp, label %LoopEnd, label %Loop
+
+LoopEnd:
+  %zext3 = phi i32 [%zext, %LoopHeader], [%zext2, %Loop]
+  %trunc = trunc i32 %zext3 to i16
+  ret i16 %trunc
+}

From 0fc11418a793a4b71008ff21687acf9ea78add59 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel 
Date: Wed, 23 Feb 2022 10:52:24 +0100
Subject: [PATCH 601/748] [mlir][Bazel] Fix wrong dependency for GPUBaseIncGen.

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index bb61bb6a9c7b8..05ae5cd80c962 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3242,7 +3242,7 @@ gentbl_cc_library(
     ],
     tblgen = ":mlir-tblgen",
     td_file = "include/mlir/Dialect/GPU/GPUBase.td",
-    deps = [":GPUOpsTdFiles"],
+    deps = [":OpBaseTdFiles"],
 )
 
 gentbl_cc_library(

From 25d7b4fb446b6511e9dcd582e05dbb7f7f936c01 Mon Sep 17 00:00:00 2001
From: Alexey Lapshin 
Date: Tue, 22 Feb 2022 22:56:22 +0300
Subject: [PATCH 602/748] [objcopy][NFC] Rename files to avoid clashing of
 archive members.

libtool uses base file names to name the members of a static library.
Files located in different directories but having matching names
would therefore end up with the same name inside an archive. This is
not a problem for ld, but it may be a problem for ar. This patch
renames the files of the ObjCopy library to avoid such name clashes.

See https://reviews.llvm.org/D88827#3335814

Differential Revision: https://reviews.llvm.org/D120345
---
 llvm/lib/ObjCopy/CMakeLists.txt                  | 16 ++++++++--------
 llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp            |  6 +++---
 .../ObjCopy/COFF/{Object.cpp => COFFObject.cpp}  |  4 ++--
 llvm/lib/ObjCopy/COFF/{Object.h => COFFObject.h} |  8 ++++----
 .../ObjCopy/COFF/{Reader.cpp => COFFReader.cpp}  |  6 +++---
 llvm/lib/ObjCopy/COFF/{Reader.h => COFFReader.h} |  8 ++++----
 .../ObjCopy/COFF/{Writer.cpp => COFFWriter.cpp}  |  6 +++---
 llvm/lib/ObjCopy/COFF/{Writer.h => COFFWriter.h} |  8 ++++----
 llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp              |  2 +-
 .../ObjCopy/ELF/{Object.cpp => ELFObject.cpp}    |  4 ++--
 llvm/lib/ObjCopy/ELF/{Object.h => ELFObject.h}   |  8 ++++----
 llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h      |  2 +-
 .../MachO/{Object.cpp => MachOObject.cpp}        |  4 ++--
 .../ObjCopy/MachO/{Object.h => MachOObject.h}    |  8 ++++----
 llvm/lib/ObjCopy/MachO/MachOReader.cpp           |  2 +-
 llvm/lib/ObjCopy/MachO/MachOReader.h             |  2 +-
 llvm/lib/ObjCopy/MachO/MachOWriter.cpp           |  2 +-
 llvm/lib/ObjCopy/MachO/MachOWriter.h             |  2 +-
 llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp            |  6 +++---
 .../ObjCopy/wasm/{Object.cpp => WasmObject.cpp}  |  4 ++--
 llvm/lib/ObjCopy/wasm/{Object.h => WasmObject.h} |  8 ++++----
 .../ObjCopy/wasm/{Reader.cpp => WasmReader.cpp}  |  4 ++--
 llvm/lib/ObjCopy/wasm/{Reader.h => WasmReader.h} | 10 +++++-----
 .../ObjCopy/wasm/{Writer.cpp => WasmWriter.cpp}  |  4 ++--
 llvm/lib/ObjCopy/wasm/{Writer.h => WasmWriter.h} | 10 +++++-----
 25 files changed, 72 insertions(+), 72 deletions(-)
 rename llvm/lib/ObjCopy/COFF/{Object.cpp => COFFObject.cpp} (97%)
 rename llvm/lib/ObjCopy/COFF/{Object.h => COFFObject.h} (97%)
 rename llvm/lib/ObjCopy/COFF/{Reader.cpp => COFFReader.cpp} (98%)
 rename llvm/lib/ObjCopy/COFF/{Reader.h => COFFReader.h} (83%)
 rename llvm/lib/ObjCopy/COFF/{Writer.cpp => COFFWriter.cpp} (99%)
 rename llvm/lib/ObjCopy/COFF/{Writer.h => COFFWriter.h} (88%)
 rename llvm/lib/ObjCopy/ELF/{Object.cpp => ELFObject.cpp} (99%)
 rename llvm/lib/ObjCopy/ELF/{Object.h => ELFObject.h} (99%)
 rename llvm/lib/ObjCopy/MachO/{Object.cpp => MachOObject.cpp} (98%)
 rename llvm/lib/ObjCopy/MachO/{Object.h => MachOObject.h} (98%)
 rename llvm/lib/ObjCopy/wasm/{Object.cpp => WasmObject.cpp} (91%)
 rename llvm/lib/ObjCopy/wasm/{Object.h => WasmObject.h} (86%)
 rename llvm/lib/ObjCopy/wasm/{Reader.cpp => WasmReader.cpp} (91%)
 rename llvm/lib/ObjCopy/wasm/{Reader.h => WasmReader.h} (74%)
 rename llvm/lib/ObjCopy/wasm/{Writer.cpp => WasmWriter.cpp} (96%)
 rename llvm/lib/ObjCopy/wasm/{Writer.h => WasmWriter.h} (86%)

diff --git a/llvm/lib/ObjCopy/CMakeLists.txt b/llvm/lib/ObjCopy/CMakeLists.txt
index ec1160e331c9a..9b8365cd2f89c 100644
--- a/llvm/lib/ObjCopy/CMakeLists.txt
+++ b/llvm/lib/ObjCopy/CMakeLists.txt
@@ -34,19 +34,19 @@ add_llvm_component_library(LLVMObjCopy
   ObjCopy.cpp
   ConfigManager.cpp
   COFF/COFFObjcopy.cpp
-  COFF/Object.cpp
-  COFF/Reader.cpp
-  COFF/Writer.cpp
+  COFF/COFFObject.cpp
+  COFF/COFFReader.cpp
+  COFF/COFFWriter.cpp
   ELF/ELFObjcopy.cpp
-  ELF/Object.cpp
+  ELF/ELFObject.cpp
   MachO/MachOObjcopy.cpp
   MachO/MachOReader.cpp
   MachO/MachOWriter.cpp
   MachO/MachOLayoutBuilder.cpp
-  MachO/Object.cpp
-  wasm/Object.cpp
-  wasm/Reader.cpp
-  wasm/Writer.cpp
+  MachO/MachOObject.cpp
+  wasm/WasmObject.cpp
+  wasm/WasmReader.cpp
+  wasm/WasmWriter.cpp
   wasm/WasmObjcopy.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
index 31801231e46be..16b57703391bd 100644
--- a/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFObjcopy.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ObjCopy/COFF/COFFObjcopy.h"
-#include "Object.h"
-#include "Reader.h"
-#include "Writer.h"
+#include "COFFObject.h"
+#include "COFFReader.h"
+#include "COFFWriter.h"
 #include "llvm/ObjCopy/COFF/COFFConfig.h"
 #include "llvm/ObjCopy/CommonConfig.h"
 
diff --git a/llvm/lib/ObjCopy/COFF/Object.cpp b/llvm/lib/ObjCopy/COFF/COFFObject.cpp
similarity index 97%
rename from llvm/lib/ObjCopy/COFF/Object.cpp
rename to llvm/lib/ObjCopy/COFF/COFFObject.cpp
index ec2628c7eca9e..1d27b7eaa8918 100644
--- a/llvm/lib/ObjCopy/COFF/Object.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFObject.cpp
@@ -1,4 +1,4 @@
-//===- Object.cpp ---------------------------------------------------------===//
+//===- COFFObject.cpp -----------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Object.h"
+#include "COFFObject.h"
 #include "llvm/ADT/DenseSet.h"
 #include 
 
diff --git a/llvm/lib/ObjCopy/COFF/Object.h b/llvm/lib/ObjCopy/COFF/COFFObject.h
similarity index 97%
rename from llvm/lib/ObjCopy/COFF/Object.h
rename to llvm/lib/ObjCopy/COFF/COFFObject.h
index 2f4d8af41fdee..66c0a19429cee 100644
--- a/llvm/lib/ObjCopy/COFF/Object.h
+++ b/llvm/lib/ObjCopy/COFF/COFFObject.h
@@ -1,4 +1,4 @@
-//===- Object.h -------------------------------------------------*- C++ -*-===//
+//===- COFFObject.h ---------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_COFF_OBJECT_H
-#define LLVM_LIB_OBJCOPY_COFF_OBJECT_H
+#ifndef LLVM_LIB_OBJCOPY_COFF_COFFOBJECT_H
+#define LLVM_LIB_OBJCOPY_COFF_COFFOBJECT_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -209,4 +209,4 @@ void copyPeHeader(PeHeader1Ty &Dest, const PeHeader2Ty &Src) {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_COFF_OBJECT_H
+#endif // LLVM_LIB_OBJCOPY_COFF_COFFOBJECT_H
diff --git a/llvm/lib/ObjCopy/COFF/Reader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
similarity index 98%
rename from llvm/lib/ObjCopy/COFF/Reader.cpp
rename to llvm/lib/ObjCopy/COFF/COFFReader.cpp
index d1beacb3bd67e..44bf303078dd0 100644
--- a/llvm/lib/ObjCopy/COFF/Reader.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
@@ -1,4 +1,4 @@
-//===- Reader.cpp ---------------------------------------------------------===//
+//===- COFFReader.cpp -----------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Reader.h"
-#include "Object.h"
+#include "COFFReader.h"
+#include "COFFObject.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/COFF.h"
diff --git a/llvm/lib/ObjCopy/COFF/Reader.h b/llvm/lib/ObjCopy/COFF/COFFReader.h
similarity index 83%
rename from llvm/lib/ObjCopy/COFF/Reader.h
rename to llvm/lib/ObjCopy/COFF/COFFReader.h
index 9e4d5124829c7..b4957f8443924 100644
--- a/llvm/lib/ObjCopy/COFF/Reader.h
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.h
@@ -1,4 +1,4 @@
-//===- Reader.h -------------------------------------------------*- C++ -*-===//
+//===- COFFReader.h ---------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_COFF_READER_H
-#define LLVM_LIB_OBJCOPY_COFF_READER_H
+#ifndef LLVM_LIB_OBJCOPY_COFF_COFFREADER_H
+#define LLVM_LIB_OBJCOPY_COFF_COFFREADER_H
 
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/Object/COFF.h"
@@ -38,4 +38,4 @@ class COFFReader {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_COFF_READER_H
+#endif // LLVM_LIB_OBJCOPY_COFF_COFFREADER_H
diff --git a/llvm/lib/ObjCopy/COFF/Writer.cpp b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp
similarity index 99%
rename from llvm/lib/ObjCopy/COFF/Writer.cpp
rename to llvm/lib/ObjCopy/COFF/COFFWriter.cpp
index fcbfef96d8609..88eb4d14ba25f 100644
--- a/llvm/lib/ObjCopy/COFF/Writer.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFWriter.cpp
@@ -1,4 +1,4 @@
-//===- Writer.cpp ---------------------------------------------------------===//
+//===- COFFWriter.cpp -----------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Writer.h"
-#include "Object.h"
+#include "COFFWriter.h"
+#include "COFFObject.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/COFF.h"
diff --git a/llvm/lib/ObjCopy/COFF/Writer.h b/llvm/lib/ObjCopy/COFF/COFFWriter.h
similarity index 88%
rename from llvm/lib/ObjCopy/COFF/Writer.h
rename to llvm/lib/ObjCopy/COFF/COFFWriter.h
index 95e7f5da1ad4b..b7dca69e9a81a 100644
--- a/llvm/lib/ObjCopy/COFF/Writer.h
+++ b/llvm/lib/ObjCopy/COFF/COFFWriter.h
@@ -1,4 +1,4 @@
-//===- Writer.h -------------------------------------------------*- C++ -*-===//
+//===- COFFWriter.h ---------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_COFF_WRITER_H
-#define LLVM_LIB_OBJCOPY_COFF_WRITER_H
+#ifndef LLVM_LIB_OBJCOPY_COFF_COFFWRITER_H
+#define LLVM_LIB_OBJCOPY_COFF_COFFWRITER_H
 
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/Error.h"
@@ -60,4 +60,4 @@ class COFFWriter {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_COFF_WRITER_H
+#endif // LLVM_LIB_OBJCOPY_COFF_COFFWRITER_H
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 9e41a04919522..9059e4a17afd4 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ObjCopy/ELF/ELFObjcopy.h"
-#include "Object.h"
+#include "ELFObject.h"
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Optional.h"
diff --git a/llvm/lib/ObjCopy/ELF/Object.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
similarity index 99%
rename from llvm/lib/ObjCopy/ELF/Object.cpp
rename to llvm/lib/ObjCopy/ELF/ELFObject.cpp
index be255470ebc8e..522804acac785 100644
--- a/llvm/lib/ObjCopy/ELF/Object.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -1,4 +1,4 @@
-//===- Object.cpp ---------------------------------------------------------===//
+//===- ELFObject.cpp ------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Object.h"
+#include "ELFObject.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
diff --git a/llvm/lib/ObjCopy/ELF/Object.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
similarity index 99%
rename from llvm/lib/ObjCopy/ELF/Object.h
rename to llvm/lib/ObjCopy/ELF/ELFObject.h
index b14f7f2c72384..37134c849a15b 100644
--- a/llvm/lib/ObjCopy/ELF/Object.h
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -1,4 +1,4 @@
-//===- Object.h -------------------------------------------------*- C++ -*-===//
+//===- ELFObject.h ----------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_ELF_OBJECT_H
-#define LLVM_LIB_OBJCOPY_ELF_OBJECT_H
+#ifndef LLVM_LIB_OBJCOPY_ELF_ELFOBJECT_H
+#define LLVM_LIB_OBJCOPY_ELF_ELFOBJECT_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
@@ -1110,4 +1110,4 @@ class Object {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_ELF_OBJECT_H
+#endif // LLVM_LIB_OBJCOPY_ELF_ELFOBJECT_H
diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h
index 709534306fda9..bf4a9f14bf741 100644
--- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h
+++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H
 #define LLVM_LIB_OBJCOPY_MACHO_MACHOLAYOUTBUILDER_H
 
-#include "Object.h"
+#include "MachOObject.h"
 #include "llvm/ObjCopy/MachO/MachOObjcopy.h"
 
 namespace llvm {
diff --git a/llvm/lib/ObjCopy/MachO/Object.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
similarity index 98%
rename from llvm/lib/ObjCopy/MachO/Object.cpp
rename to llvm/lib/ObjCopy/MachO/MachOObject.cpp
index 6312adbbc9f73..56f31e456198f 100644
--- a/llvm/lib/ObjCopy/MachO/Object.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
@@ -1,4 +1,4 @@
-//===- Object.cpp - Mach-O object file model --------------------*- C++ -*-===//
+//===- MachOObject.cpp - Mach-O object file model ---------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Object.h"
+#include "MachOObject.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include 
 
diff --git a/llvm/lib/ObjCopy/MachO/Object.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
similarity index 98%
rename from llvm/lib/ObjCopy/MachO/Object.h
rename to llvm/lib/ObjCopy/MachO/MachOObject.h
index bb7f1fa81800f..df9261b76e4d5 100644
--- a/llvm/lib/ObjCopy/MachO/Object.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -1,4 +1,4 @@
-//===- Object.h - Mach-O object file model ----------------------*- C++ -*-===//
+//===- MachOObject.h - Mach-O object file model -----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_MACHO_OBJECT_H
-#define LLVM_LIB_OBJCOPY_MACHO_OBJECT_H
+#ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
+#define LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
 
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringRef.h"
@@ -371,4 +371,4 @@ struct Object {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_MACHO_OBJECT_H
+#endif // LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
index d68d1692997ad..94459a436094f 100644
--- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MachOReader.h"
-#include "Object.h"
+#include "MachOObject.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Support/Errc.h"
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.h b/llvm/lib/ObjCopy/MachO/MachOReader.h
index fee2112845a5e..ef374aa9efae5 100644
--- a/llvm/lib/ObjCopy/MachO/MachOReader.h
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H
 #define LLVM_LIB_OBJCOPY_MACHO_MACHOREADER_H
 
-#include "Object.h"
+#include "MachOObject.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/ObjCopy/MachO/MachOObjcopy.h"
 #include "llvm/Object/MachO.h"
diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
index 52f20794cc574..2a2eda45db390 100644
--- a/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOWriter.cpp
@@ -8,7 +8,7 @@
 
 #include "MachOWriter.h"
 #include "MachOLayoutBuilder.h"
-#include "Object.h"
+#include "MachOObject.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Object/MachO.h"
diff --git a/llvm/lib/ObjCopy/MachO/MachOWriter.h b/llvm/lib/ObjCopy/MachO/MachOWriter.h
index 2898df6c4bf0f..a54c102942462 100644
--- a/llvm/lib/ObjCopy/MachO/MachOWriter.h
+++ b/llvm/lib/ObjCopy/MachO/MachOWriter.h
@@ -10,7 +10,7 @@
 #define LLVM_LIB_OBJCOPY_MACHO_MACHOWRITER_H
 
 #include "MachOLayoutBuilder.h"
-#include "Object.h"
+#include "MachOObject.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/ObjCopy/MachO/MachOObjcopy.h"
 #include "llvm/Object/MachO.h"
diff --git a/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
index 69b5e6fe8bee4..ff56df82eaf74 100644
--- a/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
+++ b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ObjCopy/wasm/WasmObjcopy.h"
-#include "Object.h"
-#include "Reader.h"
-#include "Writer.h"
+#include "WasmObject.h"
+#include "WasmReader.h"
+#include "WasmWriter.h"
 #include "llvm/ObjCopy/CommonConfig.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileOutputBuffer.h"
diff --git a/llvm/lib/ObjCopy/wasm/Object.cpp b/llvm/lib/ObjCopy/wasm/WasmObject.cpp
similarity index 91%
rename from llvm/lib/ObjCopy/wasm/Object.cpp
rename to llvm/lib/ObjCopy/wasm/WasmObject.cpp
index e7a2956fedcaf..28a2de6e6e4f1 100644
--- a/llvm/lib/ObjCopy/wasm/Object.cpp
+++ b/llvm/lib/ObjCopy/wasm/WasmObject.cpp
@@ -1,4 +1,4 @@
-//===- Object.cpp ---------------------------------------------------------===//
+//===- WasmObject.cpp -----------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Object.h"
+#include "WasmObject.h"
 
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/ObjCopy/wasm/Object.h b/llvm/lib/ObjCopy/wasm/WasmObject.h
similarity index 86%
rename from llvm/lib/ObjCopy/wasm/Object.h
rename to llvm/lib/ObjCopy/wasm/WasmObject.h
index e58fc8c454962..9bc5831926c6c 100644
--- a/llvm/lib/ObjCopy/wasm/Object.h
+++ b/llvm/lib/ObjCopy/wasm/WasmObject.h
@@ -1,4 +1,4 @@
-//===- Object.h -------------------------------------------------*- C++ -*-===//
+//===- WasmObject.h ---------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_WASM_OBJECT_H
-#define LLVM_LIB_OBJCOPY_WASM_OBJECT_H
+#ifndef LLVM_LIB_OBJCOPY_WASM_WASMOBJECT_H
+#define LLVM_LIB_OBJCOPY_WASM_WASMOBJECT_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
@@ -44,4 +44,4 @@ struct Object {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_WASM_OBJECT_H
+#endif // LLVM_LIB_OBJCOPY_WASM_WASMOBJECT_H
diff --git a/llvm/lib/ObjCopy/wasm/Reader.cpp b/llvm/lib/ObjCopy/wasm/WasmReader.cpp
similarity index 91%
rename from llvm/lib/ObjCopy/wasm/Reader.cpp
rename to llvm/lib/ObjCopy/wasm/WasmReader.cpp
index 13fa84ad80201..b998571472656 100644
--- a/llvm/lib/ObjCopy/wasm/Reader.cpp
+++ b/llvm/lib/ObjCopy/wasm/WasmReader.cpp
@@ -1,4 +1,4 @@
-//===- Reader.cpp ---------------------------------------------------------===//
+//===- WasmReader.cpp -----------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Reader.h"
+#include "WasmReader.h"
 
 namespace llvm {
 namespace objcopy {
diff --git a/llvm/lib/ObjCopy/wasm/Reader.h b/llvm/lib/ObjCopy/wasm/WasmReader.h
similarity index 74%
rename from llvm/lib/ObjCopy/wasm/Reader.h
rename to llvm/lib/ObjCopy/wasm/WasmReader.h
index d8dd541894541..d71660fa2b657 100644
--- a/llvm/lib/ObjCopy/wasm/Reader.h
+++ b/llvm/lib/ObjCopy/wasm/WasmReader.h
@@ -1,4 +1,4 @@
-//===- Reader.h -------------------------------------------------*- C++ -*-===//
+//===- WasmReader.h ---------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_WASM_READER_H
-#define LLVM_LIB_OBJCOPY_WASM_READER_H
+#ifndef LLVM_LIB_OBJCOPY_WASM_WASMREADER_H
+#define LLVM_LIB_OBJCOPY_WASM_WASMREADER_H
 
-#include "Object.h"
+#include "WasmObject.h"
 
 namespace llvm {
 namespace objcopy {
@@ -28,4 +28,4 @@ class Reader {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_WASM_READER_H
+#endif // LLVM_LIB_OBJCOPY_WASM_WASMREADER_H
diff --git a/llvm/lib/ObjCopy/wasm/Writer.cpp b/llvm/lib/ObjCopy/wasm/WasmWriter.cpp
similarity index 96%
rename from llvm/lib/ObjCopy/wasm/Writer.cpp
rename to llvm/lib/ObjCopy/wasm/WasmWriter.cpp
index 2fad9e60c50f4..fdcd441cc798c 100644
--- a/llvm/lib/ObjCopy/wasm/Writer.cpp
+++ b/llvm/lib/ObjCopy/wasm/WasmWriter.cpp
@@ -1,4 +1,4 @@
-//===- Writer.cpp ---------------------------------------------------------===//
+//===- WasmWriter.cpp -----------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "Writer.h"
+#include "WasmWriter.h"
 #include "llvm/BinaryFormat/Wasm.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
diff --git a/llvm/lib/ObjCopy/wasm/Writer.h b/llvm/lib/ObjCopy/wasm/WasmWriter.h
similarity index 86%
rename from llvm/lib/ObjCopy/wasm/Writer.h
rename to llvm/lib/ObjCopy/wasm/WasmWriter.h
index 332b96e892516..14bbcf88875ec 100644
--- a/llvm/lib/ObjCopy/wasm/Writer.h
+++ b/llvm/lib/ObjCopy/wasm/WasmWriter.h
@@ -1,4 +1,4 @@
-//===- Writer.h -------------------------------------------------*- C++ -*-===//
+//===- WasmWriter.h ---------------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_OBJCOPY_WASM_WRITER_H
-#define LLVM_LIB_OBJCOPY_WASM_WRITER_H
+#ifndef LLVM_LIB_OBJCOPY_WASM_WASMWRITER_H
+#define LLVM_LIB_OBJCOPY_WASM_WASMWRITER_H
 
-#include "Object.h"
+#include "WasmObject.h"
 #include 
 #include 
 
@@ -46,4 +46,4 @@ class Writer {
 } // end namespace objcopy
 } // end namespace llvm
 
-#endif // LLVM_LIB_OBJCOPY_WASM_WRITER_H
+#endif // LLVM_LIB_OBJCOPY_WASM_WASMWRITER_H

From 16a91a1cbe98268d5e0343e313d848650f5f3541 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski 
Date: Thu, 17 Feb 2022 13:32:32 +0000
Subject: [PATCH 603/748] [flang][driver] Make `flang-new` always generate
 run-time type info

Currently, the driver generates the tables with "run-time type
information for derived types" only when specific actions are run.
However, the corresponding data might be required by the subsequent
compilation stages (e.g. lowering, code-gen) and should be generated
unconditionally. Note that this is only possible once the semantic
checks have been run.

Note that when generating these tables, extra semantic errors might be
generated. The driver will always report these and in most cases such
semantic errors will cause the driver to exit immediately. The only
exception are actions inheriting from `PrescanAndSemaDebugAction`.
Currently, there's only one such action: `DebugDumpAllAction`
(corresponds to `-fdebug-dump-all` command-line flag). I've updated the
comments for this action to clarify this.

This change will mostly affect lowering, which currently is only
available for the most basic examples (e.g. empty programs). I wasn't able
to find a working case that would demonstrate the new behaviour. I
hope that this change is straightforward enough and am submitting it
without a test.

Differential Revision: https://reviews.llvm.org/D120051
---
 .../include/flang/Frontend/CompilerInstance.h | 13 +++++++
 flang/include/flang/Frontend/FrontendAction.h |  4 ++
 .../include/flang/Frontend/FrontendActions.h  |  7 +++-
 flang/lib/Frontend/FrontendAction.cpp         | 15 ++++++++
 flang/lib/Frontend/FrontendActions.cpp        | 38 ++++++++-----------
 5 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/flang/include/flang/Frontend/CompilerInstance.h b/flang/include/flang/Frontend/CompilerInstance.h
index 29a1bc7b9a5c7..e2ebfd3265661 100644
--- a/flang/include/flang/Frontend/CompilerInstance.h
+++ b/flang/include/flang/Frontend/CompilerInstance.h
@@ -13,6 +13,7 @@
 #include "flang/Frontend/PreprocessorOptions.h"
 #include "flang/Parser/parsing.h"
 #include "flang/Parser/provenance.h"
+#include "flang/Semantics/runtime-type-info.h"
 #include "flang/Semantics/semantics.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -47,6 +48,8 @@ class CompilerInstance {
 
   std::unique_ptr semantics_;
 
+  std::unique_ptr rtTyTables_;
+
   /// The stream for diagnostics from Semantics
   llvm::raw_ostream *semaOutputStream_ = &llvm::errs();
 
@@ -129,6 +132,16 @@ class CompilerInstance {
     semantics_ = std::move(semantics);
   }
 
+  void setRtTyTables(
+      std::unique_ptr tables) {
+    rtTyTables_ = std::move(tables);
+  }
+
+  Fortran::semantics::RuntimeDerivedTypeTables &getRtTyTables() {
+    assert(rtTyTables_ && "Missing runtime derived type tables!");
+    return *rtTyTables_;
+  }
+
   /// }
   /// @name High-Level Operations
   /// {
diff --git a/flang/include/flang/Frontend/FrontendAction.h b/flang/include/flang/Frontend/FrontendAction.h
index aac1fcf268a08..db053952291e6 100644
--- a/flang/include/flang/Frontend/FrontendAction.h
+++ b/flang/include/flang/Frontend/FrontendAction.h
@@ -112,6 +112,10 @@ class FrontendAction {
   // Run semantic checks for the current input file. Return False if fatal
   // errors are reported, True otherwise.
   bool RunSemanticChecks();
+  // Generate run-time type information for derived types. This may lead to new
+  // semantic errors. Return False if fatal errors are reported, True
+  // otherwise.
+  bool GenerateRtTypeTables();
 
   // Report fatal semantic errors. Return True if present, false otherwise.
   bool reportFatalSemanticErrors();
diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h
index 6a9afd1afc5c0..3ccd39fdd8606 100644
--- a/flang/include/flang/Frontend/FrontendActions.h
+++ b/flang/include/flang/Frontend/FrontendActions.h
@@ -133,7 +133,12 @@ class PluginParseTreeAction : public PrescanAndSemaAction {
 // PrescanAndSemaDebug Actions
 //
 // These actions will parse the input, run the semantic checks and execute
-// their actions regardless of whether any semantic errors are found.
+// their actions _regardless of_ whether any semantic errors have been found.
+// This can be useful when adding new language features and when you wish to
+// investigate compiler output (e.g. the parse tree) despite any semantic
+// errors.
+//
+// NOTE: Use with care and for development only!
 //===----------------------------------------------------------------------===//
 class PrescanAndSemaDebugAction : public FrontendAction {
 
diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp
index 762cbc8fc47b5..ce838c45dc1a1 100644
--- a/flang/lib/Frontend/FrontendAction.cpp
+++ b/flang/lib/Frontend/FrontendAction.cpp
@@ -180,6 +180,21 @@ bool FrontendAction::RunSemanticChecks() {
   return true;
 }
 
+bool FrontendAction::GenerateRtTypeTables() {
+  instance().setRtTyTables(
+      std::make_unique(
+          BuildRuntimeDerivedTypeTables(
+              instance().invocation().semanticsContext())));
+
+  // The runtime derived type information table builder may find additional
+  // semantic errors. Report them.
+  if (reportFatalSemanticErrors()) {
+    return false;
+  }
+
+  return true;
+}
+
 template 
 bool FrontendAction::reportFatalErrors(const char (&message)[N]) {
   if (!instance_->parsing().messages().empty() &&
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 43ab3f689522d..d787add5dcfe2 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -47,12 +47,18 @@ bool PrescanAndParseAction::BeginSourceFileAction() {
 }
 
 bool PrescanAndSemaAction::BeginSourceFileAction() {
-  return RunPrescan() && RunParse() && RunSemanticChecks();
+  return RunPrescan() && RunParse() && RunSemanticChecks() &&
+      GenerateRtTypeTables();
 }
 
 bool PrescanAndSemaDebugAction::BeginSourceFileAction() {
-  // Semantic checks are made to succeed unconditionally.
-  return RunPrescan() && RunParse() && (RunSemanticChecks() || true);
+  // This is a "debug" action for development purposes. To facilitate this, the
+  // semantic checks are made to succeed unconditionally to prevent this action
+  // from exiting early (i.e. in the presence of semantic errors). We should
+  // never do this in actions intended for end-users or otherwise regular
+  // compiler workflows!
+  return RunPrescan() && RunParse() && (RunSemanticChecks() || true) &&
+      (GenerateRtTypeTables() || true);
 }
 
 bool CodeGenAction::BeginSourceFileAction() {
@@ -218,25 +224,18 @@ void DebugUnparseWithSymbolsAction::ExecuteAction() {
 
 void DebugDumpSymbolsAction::ExecuteAction() {
   CompilerInstance &ci = this->instance();
-  auto &semantics = ci.semantics();
 
-  auto tables{Fortran::semantics::BuildRuntimeDerivedTypeTables(
-      instance().invocation().semanticsContext())};
-  // The runtime derived type information table builder may find and report
-  // semantic errors. So it is important that we report them _after_
-  // BuildRuntimeDerivedTypeTables is run.
-  reportFatalSemanticErrors();
-
-  if (!tables.schemata) {
+  if (!ci.getRtTyTables().schemata) {
     unsigned DiagID =
         ci.diagnostics().getCustomDiagID(clang::DiagnosticsEngine::Error,
             "could not find module file for __fortran_type_info");
     ci.diagnostics().Report(DiagID);
     llvm::errs() << "\n";
+    return;
   }
 
   // Dump symbols
-  semantics.DumpSymbols(llvm::outs());
+  ci.semantics().DumpSymbols(llvm::outs());
 }
 
 void DebugDumpAllAction::ExecuteAction() {
@@ -250,27 +249,20 @@ void DebugDumpAllAction::ExecuteAction() {
   Fortran::parser::DumpTree(
       llvm::outs(), parseTree, &ci.invocation().asFortran());
 
-  auto &semantics = ci.semantics();
-  auto tables{Fortran::semantics::BuildRuntimeDerivedTypeTables(
-      instance().invocation().semanticsContext())};
-  // The runtime derived type information table builder may find and report
-  // semantic errors. So it is important that we report them _after_
-  // BuildRuntimeDerivedTypeTables is run.
-  reportFatalSemanticErrors();
-
-  if (!tables.schemata) {
+  if (!ci.getRtTyTables().schemata) {
     unsigned DiagID =
         ci.diagnostics().getCustomDiagID(clang::DiagnosticsEngine::Error,
             "could not find module file for __fortran_type_info");
     ci.diagnostics().Report(DiagID);
     llvm::errs() << "\n";
+    return;
   }
 
   // Dump symbols
   llvm::outs() << "=====================";
   llvm::outs() << " Flang: symbols dump ";
   llvm::outs() << "=====================\n";
-  semantics.DumpSymbols(llvm::outs());
+  ci.semantics().DumpSymbols(llvm::outs());
 }
 
 void DebugDumpParseTreeNoSemaAction::ExecuteAction() {

From 9d91e03b7d075239483e9a5793e1e5db9152bdf1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Wed, 23 Feb 2022 10:09:40 +0000
Subject: [PATCH 604/748] [clang] CIndex.cpp - use cast<> instead of dyn_cast<>
 to avoid dereference of nullptr

The pointers are used immediately, so assert the cast is correct instead of returning nullptr
---
 clang/tools/libclang/CIndex.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 53494ecc7ae9d..e77b85d63c53e 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -4010,7 +4010,7 @@ static const ExprEvalResult *evaluateExpr(Expr *expr, CXCursor C) {
   }
 
   if (expr->getStmtClass() == Stmt::ImplicitCastExprClass) {
-    const ImplicitCastExpr *I = dyn_cast(expr);
+    const auto *I = cast(expr);
     auto *subExpr = I->getSubExprAsWritten();
     if (subExpr->getStmtClass() == Stmt::StringLiteralClass ||
         subExpr->getStmtClass() == Stmt::ObjCStringLiteralClass) {
@@ -6740,8 +6740,8 @@ void clang_getDefinitionSpellingAndExtent(
     CXCursor C, const char **startBuf, const char **endBuf, unsigned *startLine,
     unsigned *startColumn, unsigned *endLine, unsigned *endColumn) {
   assert(getCursorDecl(C) && "CXCursor has null decl");
-  const FunctionDecl *FD = dyn_cast(getCursorDecl(C));
-  CompoundStmt *Body = dyn_cast(FD->getBody());
+  const auto *FD = cast(getCursorDecl(C));
+  const auto *Body = cast(FD->getBody());
 
   SourceManager &SM = FD->getASTContext().getSourceManager();
   *startBuf = SM.getCharacterData(Body->getLBracLoc());
@@ -8289,7 +8289,7 @@ unsigned clang_Cursor_getObjCPropertyAttributes(CXCursor C, unsigned reserved) {
     return CXObjCPropertyAttr_noattr;
 
   unsigned Result = CXObjCPropertyAttr_noattr;
-  const ObjCPropertyDecl *PD = dyn_cast(getCursorDecl(C));
+  const auto *PD = cast(getCursorDecl(C));
   ObjCPropertyAttribute::Kind Attr = PD->getPropertyAttributesAsWritten();
 
 #define SET_CXOBJCPROP_ATTR(A)                                                 \
@@ -8317,7 +8317,7 @@ CXString clang_Cursor_getObjCPropertyGetterName(CXCursor C) {
   if (C.kind != CXCursor_ObjCPropertyDecl)
     return cxstring::createNull();
 
-  const ObjCPropertyDecl *PD = dyn_cast(getCursorDecl(C));
+  const auto *PD = cast(getCursorDecl(C));
   Selector sel = PD->getGetterName();
   if (sel.isNull())
     return cxstring::createNull();
@@ -8329,7 +8329,7 @@ CXString clang_Cursor_getObjCPropertySetterName(CXCursor C) {
   if (C.kind != CXCursor_ObjCPropertyDecl)
     return cxstring::createNull();
 
-  const ObjCPropertyDecl *PD = dyn_cast(getCursorDecl(C));
+  const auto *PD = cast(getCursorDecl(C));
   Selector sel = PD->getSetterName();
   if (sel.isNull())
     return cxstring::createNull();

From 7021b5a2fa58e85fb2adbf57c722aef271310262 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot 
Date: Wed, 23 Feb 2022 10:12:15 +0000
Subject: [PATCH 605/748] [gn build] Port 25d7b4fb446b

---
 .../gn/secondary/llvm/lib/ObjCopy/BUILD.gn    | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn
index 78fceb2e92362..4e0825d78e8d5 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ObjCopy/BUILD.gn
@@ -1,29 +1,29 @@
 static_library("ObjCopy") {
   output_name = "LLVMObjCopy"
   deps = [
-    "//llvm/lib/Object",
     "//llvm/lib/MC",
+    "//llvm/lib/Object",
     "//llvm/lib/Support",
   ]
   include_dirs = [ "." ]
   sources = [
     "Archive.cpp",
-    "ObjCopy.cpp",
-    "ConfigManager.cpp",
     "COFF/COFFObjcopy.cpp",
-    "COFF/Object.cpp",
-    "COFF/Reader.cpp",
-    "COFF/Writer.cpp",
+    "COFF/COFFObject.cpp",
+    "COFF/COFFReader.cpp",
+    "COFF/COFFWriter.cpp",
+    "ConfigManager.cpp",
     "ELF/ELFObjcopy.cpp",
-    "ELF/Object.cpp",
+    "ELF/ELFObject.cpp",
+    "MachO/MachOLayoutBuilder.cpp",
     "MachO/MachOObjcopy.cpp",
+    "MachO/MachOObject.cpp",
     "MachO/MachOReader.cpp",
     "MachO/MachOWriter.cpp",
-    "MachO/MachOLayoutBuilder.cpp",
-    "MachO/Object.cpp",
-    "wasm/Object.cpp",
-    "wasm/Reader.cpp",
-    "wasm/Writer.cpp",
+    "ObjCopy.cpp",
     "wasm/WasmObjcopy.cpp",
+    "wasm/WasmObject.cpp",
+    "wasm/WasmReader.cpp",
+    "wasm/WasmWriter.cpp",
   ]
 }

From 5a74e6a21c9520e0619c98a66815fc8b5117e321 Mon Sep 17 00:00:00 2001
From: Iain Sandoe 
Date: Sat, 12 Feb 2022 13:43:32 +0000
Subject: [PATCH 606/748] [Modules] Add module structure output to
 -module-file-info.

It is useful to be able to visualise the C++20 modules content of a PCM file
both for inspection and for testing.  In particular, when adding more module
types to support C++20 Partitions and Header Units, we would like to be able
to confirm that the output PCM has the intended structure.

The existing scheme for dumping data is restricted to the content of the AST
file control block, which does not include structural data beyond imports.

The change here makes use of the AST unit that is set up by BeginSourceFile
to query for the information on the primary and sub-modules.  We can then
inspect each of these in turn, accounting for Global, Private, Imported and
Exported modules/fragments and then showing the sub-structure of the main
module(s).

The disadvantage of this mechanism is that it has no easy method to control
the granularity of the output.  Perhaps more detailed inspection would be
better handled by a stand-alone module inspection tool.

Differential Revision: https://reviews.llvm.org/D119823
---
 clang/lib/Frontend/FrontendActions.cpp        | 103 +++++++++++++++++-
 clang/test/Modules/cxx20-module-file-info.cpp |  64 +++++++++++
 2 files changed, 165 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Modules/cxx20-module-file-info.cpp

diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
index ad2e6039477f8..baf3ac34db620 100644
--- a/clang/lib/Frontend/FrontendActions.cpp
+++ b/clang/lib/Frontend/FrontendActions.cpp
@@ -11,6 +11,7 @@
 #include "clang/AST/Decl.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/LangStandard.h"
+#include "clang/Basic/Module.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Frontend/ASTConsumers.h"
 #include "clang/Frontend/CompilerInstance.h"
@@ -24,6 +25,7 @@
 #include "clang/Sema/TemplateInstCallback.h"
 #include "clang/Serialization/ASTReader.h"
 #include "clang/Serialization/ASTWriter.h"
+#include "clang/Serialization/ModuleFile.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -806,7 +808,25 @@ bool DumpModuleInfoAction::BeginInvocation(CompilerInstance &CI) {
   return true;
 }
 
+static StringRef ModuleKindName(Module::ModuleKind MK) {
+  switch (MK) {
+  case Module::ModuleMapModule:
+    return "Module Map Module";
+  case Module::ModuleInterfaceUnit:
+    return "Interface Unit";
+  case Module::ModulePartitionInterface:
+    return "Partition Interface";
+  case Module::ModulePartitionImplementation:
+    return "Partition Implementation";
+  case Module::GlobalModuleFragment:
+    return "Global Module Fragment";
+  case Module::PrivateModuleFragment:
+    return "Private Module Fragment";
+  }
+}
+
 void DumpModuleInfoAction::ExecuteAction() {
+  assert(isCurrentFileAST() && "dumping non-AST?");
   // Set up the output file.
   std::unique_ptr OutFile;
   StringRef OutputFileName = getCompilerInstance().getFrontendOpts().OutputFile;
@@ -827,8 +847,87 @@ void DumpModuleInfoAction::ExecuteAction() {
 
   Preprocessor &PP = getCompilerInstance().getPreprocessor();
   DumpModuleInfoListener Listener(Out);
-  HeaderSearchOptions &HSOpts =
-      PP.getHeaderSearchInfo().getHeaderSearchOpts();
+  HeaderSearchOptions &HSOpts = PP.getHeaderSearchInfo().getHeaderSearchOpts();
+
+  // The FrontendAction::BeginSourceFile() method loads the AST so that much
+  // of the information is already available and modules should have been
+  // loaded.
+
+  const LangOptions &LO = getCurrentASTUnit().getLangOpts();
+  if (LO.CPlusPlusModules && !LO.CurrentModule.empty()) {
+
+    ASTReader *R = getCurrentASTUnit().getASTReader().get();
+    unsigned SubModuleCount = R->getTotalNumSubmodules();
+    serialization::ModuleFile &MF = R->getModuleManager().getPrimaryModule();
+    Out << "  ====== C++20 Module structure ======\n";
+
+    if (MF.ModuleName != LO.CurrentModule)
+      Out << "  Mismatched module names : " << MF.ModuleName << " and "
+          << LO.CurrentModule << "\n";
+
+    struct SubModInfo {
+      unsigned Idx;
+      Module *Mod;
+      Module::ModuleKind Kind;
+      std::string &Name;
+      bool Seen;
+    };
+    std::map SubModMap;
+    auto PrintSubMapEntry = [&](std::string Name, Module::ModuleKind Kind) {
+      Out << "    " << ModuleKindName(Kind) << " '" << Name << "'";
+      auto I = SubModMap.find(Name);
+      if (I == SubModMap.end())
+        Out << " was not found in the sub modules!\n";
+      else {
+        I->second.Seen = true;
+        Out << " is at index #" << I->second.Idx << "\n";
+      }
+    };
+    Module *Primary = nullptr;
+    for (unsigned Idx = 0; Idx <= SubModuleCount; ++Idx) {
+      Module *M = R->getModule(Idx);
+      if (!M)
+        continue;
+      if (M->Name == LO.CurrentModule) {
+        Primary = M;
+        Out << "  " << ModuleKindName(M->Kind) << " '" << LO.CurrentModule
+            << "' is the Primary Module at index #" << Idx << "\n";
+        SubModMap.insert({M->Name, {Idx, M, M->Kind, M->Name, true}});
+      } else
+        SubModMap.insert({M->Name, {Idx, M, M->Kind, M->Name, false}});
+    }
+    if (Primary) {
+      if (!Primary->submodules().empty())
+        Out << "   Sub Modules:\n";
+      for (auto MI : Primary->submodules()) {
+        PrintSubMapEntry(MI->Name, MI->Kind);
+      }
+      if (!Primary->Imports.empty())
+        Out << "   Imports:\n";
+      for (auto IMP : Primary->Imports) {
+        PrintSubMapEntry(IMP->Name, IMP->Kind);
+      }
+      if (!Primary->Exports.empty())
+        Out << "   Exports:\n";
+      for (unsigned MN = 0, N = Primary->Exports.size(); MN != N; ++MN) {
+        if (Module *M = Primary->Exports[MN].getPointer()) {
+          PrintSubMapEntry(M->Name, M->Kind);
+        }
+      }
+    }
+    // Now let's print out any modules we did not see as part of the Primary.
+    for (auto SM : SubModMap) {
+      if (!SM.second.Seen && SM.second.Mod) {
+        Out << "  " << ModuleKindName(SM.second.Kind) << " '" << SM.first
+            << "' at index #" << SM.second.Idx
+            << " has no direct reference in the Primary\n";
+      }
+    }
+    Out << "  ====== ======\n";
+  }
+
+  // The remainder of the output is produced from the listener as the AST
+  // FileControlBlock is (re-)parsed.
   ASTReader::readASTFileControlBlock(
       getCurrentFile(), FileMgr, getCompilerInstance().getPCHContainerReader(),
       /*FindModuleFileExtensions=*/true, Listener,
diff --git a/clang/test/Modules/cxx20-module-file-info.cpp b/clang/test/Modules/cxx20-module-file-info.cpp
new file mode 100644
index 0000000000000..99a215645e8fe
--- /dev/null
+++ b/clang/test/Modules/cxx20-module-file-info.cpp
@@ -0,0 +1,64 @@
+// Test output from -module-file-info about C++20 modules.
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/mod-info-tu1.cpp \
+// RUN:  -o %t/A.pcm
+
+// RUN: %clang_cc1 -std=c++20 -module-file-info %t/A.pcm | FileCheck \
+// RUN:  --check-prefix=CHECK-A %s
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/mod-info-tu2.cpp \
+// RUN:  -o %t/B.pcm
+
+// RUN: %clang_cc1 -std=c++20 -module-file-info %t/B.pcm | FileCheck \
+// RUN:  --check-prefix=CHECK-B %s
+
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/mod-info-tu3.cpp \
+// RUN:  -fmodule-file=%t/A.pcm -fmodule-file=%t/B.pcm -o %t/Foo.pcm
+
+// RUN: %clang_cc1 -std=c++20 -module-file-info %t/Foo.pcm | FileCheck \
+// RUN:  --check-prefix=CHECK-FOO %s
+
+// expected-no-diagnostics
+
+//--- mod-info-tu1.cpp
+export module A;
+
+void a();
+
+// CHECK-A: ====== C++20
+// CHECK-A-NEXT: Interface Unit 'A' is the Primary Module at index #1
+
+//--- mod-info-tu2.cpp
+export module B;
+
+void b();
+
+// CHECK-B: ====== C++20
+// CHECK-B-NEXT: Interface Unit 'B' is the Primary Module at index #1
+
+//--- mod-info-tu3.cpp
+module;
+
+export module Foo;
+
+import A;
+export import B;
+
+namespace hello {
+export void say(const char *);
+}
+
+void foo() {}
+
+// CHECK-FOO: ====== C++20
+// CHECK-FOO-NEXT:  Interface Unit 'Foo' is the Primary Module at index #3
+// CHECK-FOO-NEXT:   Sub Modules:
+// CHECK-FOO-NEXT:    Global Module Fragment '' is at index #4
+// CHECK-FOO-NEXT:   Imports:
+// CHECK-FOO-NEXT:    Interface Unit 'A' is at index #1
+// CHECK-FOO-NEXT:   Exports:
+// CHECK-FOO-NEXT:    Interface Unit 'B' is at index #2

From 169e1aba55bed9f7ffa000f9f170ab2defbc40b2 Mon Sep 17 00:00:00 2001
From: Stanislav Gatev 
Date: Wed, 23 Feb 2022 10:32:17 +0000
Subject: [PATCH 607/748] Revert "[clang][dataflow] Add support for global
 storage values"

This reverts commit 7ea103de140b59a64fc884fa90afd2213619384d.
---
 .../FlowSensitive/DataflowEnvironment.h       |   5 -
 .../FlowSensitive/DataflowEnvironment.cpp     |  46 -----
 clang/lib/Analysis/FlowSensitive/Transfer.cpp |  23 ---
 .../Analysis/FlowSensitive/TransferTest.cpp   | 167 ------------------
 4 files changed, 241 deletions(-)

diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
index bab20418a016a..af613c95bb8dc 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -49,11 +49,6 @@ enum class SkipPast {
 };
 
 /// Holds the state of the program (store and heap) at a given program point.
-///
-/// WARNING: Symbolic values that are created by the environment for static
-/// local and global variables are not currently invalidated on function calls.
-/// This is unsound and should be taken into account when designing dataflow
-/// analyses.
 class Environment {
 public:
   /// Supplements `Environment` with non-standard comparison and join
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index f20c747c56c2d..eca58b313761b 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -22,7 +22,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/ErrorHandling.h"
-#include 
 #include 
 #include 
 
@@ -57,55 +56,10 @@ static bool equivalentValues(QualType Type, Value *Val1, Value *Val2,
   return Model.compareEquivalent(Type, *Val1, *Val2);
 }
 
-/// Initializes a global storage value.
-static void initGlobalVar(const VarDecl &D, Environment &Env) {
-  if (!D.hasGlobalStorage() ||
-      Env.getStorageLocation(D, SkipPast::None) != nullptr)
-    return;
-
-  auto &Loc = Env.createStorageLocation(D);
-  Env.setStorageLocation(D, Loc);
-  if (auto *Val = Env.createValue(D.getType()))
-    Env.setValue(Loc, *Val);
-}
-
-/// Initializes a global storage value.
-static void initGlobalVar(const Decl &D, Environment &Env) {
-  if (auto *V = dyn_cast(&D))
-    initGlobalVar(*V, Env);
-}
-
-/// Initializes global storage values that are declared or referenced from
-/// sub-statements of `S`.
-// FIXME: Add support for resetting globals after function calls to enable
-// the implementation of sound analyses.
-static void initGlobalVars(const Stmt &S, Environment &Env) {
-  for (auto *Child : S.children()) {
-    if (Child != nullptr)
-      initGlobalVars(*Child, Env);
-  }
-
-  if (auto *DS = dyn_cast(&S)) {
-    if (DS->isSingleDecl()) {
-      const auto &D = *cast(DS->getSingleDecl());
-      initGlobalVar(D, Env);
-    } else {
-      for (auto *D : DS->getDeclGroup())
-        initGlobalVar(*D, Env);
-    }
-  } else if (auto *E = dyn_cast(&S)) {
-    initGlobalVar(*E->getDecl(), Env);
-  } else if (auto *E = dyn_cast(&S)) {
-    initGlobalVar(*E->getMemberDecl(), Env);
-  }
-}
-
 Environment::Environment(DataflowAnalysisContext &DACtx,
                          const DeclContext &DeclCtx)
     : Environment(DACtx) {
   if (const auto *FuncDecl = dyn_cast(&DeclCtx)) {
-    assert(FuncDecl->getBody() != nullptr);
-    initGlobalVars(*FuncDecl->getBody(), *this);
     for (const auto *ParamDecl : FuncDecl->parameters()) {
       assert(ParamDecl != nullptr);
       auto &ParamLoc = createStorageLocation(*ParamDecl);
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 4b5d23593a4bd..cd9b8b0e454e4 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -136,11 +136,6 @@ class TransferVisitor : public ConstStmtVisitor {
     // Group decls are converted into single decls in the CFG so the cast below
     // is safe.
     const auto &D = *cast(S->getSingleDecl());
-
-    // Static local vars are already initialized in `Environment`.
-    if (D.hasGlobalStorage())
-      return;
-
     auto &Loc = Env.createStorageLocation(D);
     Env.setStorageLocation(D, Loc);
 
@@ -296,24 +291,6 @@ class TransferVisitor : public ConstStmtVisitor {
     if (Member->isFunctionOrFunctionTemplate())
       return;
 
-    if (auto *D = dyn_cast(Member)) {
-      if (D->hasGlobalStorage()) {
-        auto *VarDeclLoc = Env.getStorageLocation(*D, SkipPast::None);
-        if (VarDeclLoc == nullptr)
-          return;
-
-        if (VarDeclLoc->getType()->isReferenceType()) {
-          Env.setStorageLocation(*S, *VarDeclLoc);
-        } else {
-          auto &Loc = Env.createStorageLocation(*S);
-          Env.setStorageLocation(*S, Loc);
-          Env.setValue(Loc, Env.takeOwnership(
-                                std::make_unique(*VarDeclLoc)));
-        }
-        return;
-      }
-    }
-
     // The receiver can be either a value or a pointer to a value. Skip past the
     // indirection to handle both cases.
     auto *BaseLoc = cast_or_null(
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index fda4af435c4a7..83ccba1a25382 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -2153,171 +2153,4 @@ TEST_F(TransferTest, AssignFromBoolNegation) {
               });
 }
 
-TEST_F(TransferTest, StaticIntSingleVarDecl) {
-  std::string Code = R"(
-    void target() {
-      static int Foo;
-      // [[p]]
-    }
-  )";
-  runDataflow(Code,
-              [](llvm::ArrayRef<
-                     std::pair>>
-                     Results,
-                 ASTContext &ASTCtx) {
-                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
-                const Environment &Env = Results[0].second.Env;
-
-                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
-                ASSERT_THAT(FooDecl, NotNull());
-
-                const StorageLocation *FooLoc =
-                    Env.getStorageLocation(*FooDecl, SkipPast::None);
-                ASSERT_TRUE(isa_and_nonnull(FooLoc));
-
-                const Value *FooVal = Env.getValue(*FooLoc);
-                EXPECT_TRUE(isa_and_nonnull(FooVal));
-              });
-}
-
-TEST_F(TransferTest, StaticIntGroupVarDecl) {
-  std::string Code = R"(
-    void target() {
-      static int Foo, Bar;
-      (void)0;
-      // [[p]]
-    }
-  )";
-  runDataflow(Code,
-              [](llvm::ArrayRef<
-                     std::pair>>
-                     Results,
-                 ASTContext &ASTCtx) {
-                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
-                const Environment &Env = Results[0].second.Env;
-
-                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
-                ASSERT_THAT(FooDecl, NotNull());
-
-                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
-                ASSERT_THAT(BarDecl, NotNull());
-
-                const StorageLocation *FooLoc =
-                    Env.getStorageLocation(*FooDecl, SkipPast::None);
-                ASSERT_TRUE(isa_and_nonnull(FooLoc));
-
-                const StorageLocation *BarLoc =
-                    Env.getStorageLocation(*BarDecl, SkipPast::None);
-                ASSERT_TRUE(isa_and_nonnull(BarLoc));
-
-                const Value *FooVal = Env.getValue(*FooLoc);
-                EXPECT_TRUE(isa_and_nonnull(FooVal));
-
-                const Value *BarVal = Env.getValue(*BarLoc);
-                EXPECT_TRUE(isa_and_nonnull(BarVal));
-
-                EXPECT_NE(FooVal, BarVal);
-              });
-}
-
-TEST_F(TransferTest, GlobalIntVarDecl) {
-  std::string Code = R"(
-    static int Foo;
-
-    void target() {
-      int Bar = Foo;
-      int Baz = Foo;
-      // [[p]]
-    }
-  )";
-  runDataflow(Code,
-              [](llvm::ArrayRef<
-                     std::pair>>
-                     Results,
-                 ASTContext &ASTCtx) {
-                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
-                const Environment &Env = Results[0].second.Env;
-
-                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
-                ASSERT_THAT(BarDecl, NotNull());
-
-                const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz");
-                ASSERT_THAT(BazDecl, NotNull());
-
-                const Value *BarVal =
-                    cast(Env.getValue(*BarDecl, SkipPast::None));
-                const Value *BazVal =
-                    cast(Env.getValue(*BazDecl, SkipPast::None));
-                EXPECT_EQ(BarVal, BazVal);
-              });
-}
-
-TEST_F(TransferTest, StaticMemberIntVarDecl) {
-  std::string Code = R"(
-    struct A {
-      static int Foo;
-    };
-
-    void target(A a) {
-      int Bar = a.Foo;
-      int Baz = a.Foo;
-      // [[p]]
-    }
-  )";
-  runDataflow(Code,
-              [](llvm::ArrayRef<
-                     std::pair>>
-                     Results,
-                 ASTContext &ASTCtx) {
-                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
-                const Environment &Env = Results[0].second.Env;
-
-                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
-                ASSERT_THAT(BarDecl, NotNull());
-
-                const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz");
-                ASSERT_THAT(BazDecl, NotNull());
-
-                const Value *BarVal =
-                    cast(Env.getValue(*BarDecl, SkipPast::None));
-                const Value *BazVal =
-                    cast(Env.getValue(*BazDecl, SkipPast::None));
-                EXPECT_EQ(BarVal, BazVal);
-              });
-}
-
-TEST_F(TransferTest, StaticMemberRefVarDecl) {
-  std::string Code = R"(
-    struct A {
-      static int &Foo;
-    };
-
-    void target(A a) {
-      int Bar = a.Foo;
-      int Baz = a.Foo;
-      // [[p]]
-    }
-  )";
-  runDataflow(Code,
-              [](llvm::ArrayRef<
-                     std::pair>>
-                     Results,
-                 ASTContext &ASTCtx) {
-                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
-                const Environment &Env = Results[0].second.Env;
-
-                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
-                ASSERT_THAT(BarDecl, NotNull());
-
-                const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz");
-                ASSERT_THAT(BazDecl, NotNull());
-
-                const Value *BarVal =
-                    cast(Env.getValue(*BarDecl, SkipPast::None));
-                const Value *BazVal =
-                    cast(Env.getValue(*BazDecl, SkipPast::None));
-                EXPECT_EQ(BarVal, BazVal);
-              });
-}
-
 } // namespace

From 152325d2f3b619cfa617c3f65d70444eb2d15033 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin 
Date: Tue, 22 Feb 2022 10:36:26 +0300
Subject: [PATCH 608/748] =?UTF-8?q?[ArgPromotion]=20Regenerate=20test=20ch?=
 =?UTF-8?q?ecks=20for=20crash.ll=20=E2=80=93=20restored=20ALL=5FOLDPM=20pr?=
 =?UTF-8?q?efix,=20add=20=E2=80=93allow-unused-prefixes.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This test has two runs that differ in what functions are left after the inliner,
for example: barney exists on the OLDPM path but doesn't exist on the NEWPM path.
I restored prefixes this test had had after automatic checks were introduced
for this test.

For now there are no checks left for ALL_NEWPM path, but the behavior seem to
change over time so I added --allow-unused-prefixes to ease following check updates.

Renamed %tmp => %temp IR values to avoid update warning.

Differential revision: https://reviews.llvm.org/D120207
---
 .../Transforms/ArgumentPromotion/crash.ll     | 62 ++++++++++---------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/llvm/test/Transforms/ArgumentPromotion/crash.ll b/llvm/test/Transforms/ArgumentPromotion/crash.ll
index d55f4624e0c34..d10fab463d692 100644
--- a/llvm/test/Transforms/ArgumentPromotion/crash.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/crash.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes
-; RUN: opt -S < %s -inline -argpromotion | FileCheck %s --check-prefix=ARGPROMOTION
-; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s --check-prefixes=ARGPROMOTION,ALL_NEWPM
+; RUN: opt -S < %s -inline -argpromotion | FileCheck %s --check-prefixes=ARGPROMOTION,ALL_OLDPM --allow-unused-prefixes
+; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s --check-prefixes=ARGPROMOTION,ALL_NEWPM --allow-unused-prefixes
 
 %S = type { %S* }
 
 ; Inlining should nuke the invoke (and any inlined calls) here even with
 ; argument promotion running along with it.
 define void @zot() personality i32 (...)* @wibble {
-; ARGPROMOTION-LABEL: define {{[^@]+}}@zot() personality i32 (...)* @wibble
+; ARGPROMOTION-LABEL: define {{[^@]+}}@zot() personality i32 (...)* @wibble {
 ; ARGPROMOTION-NEXT:  bb:
 ; ARGPROMOTION-NEXT:    unreachable
 ; ARGPROMOTION:       hoge.exit:
@@ -15,7 +15,7 @@ define void @zot() personality i32 (...)* @wibble {
 ; ARGPROMOTION:       bb1:
 ; ARGPROMOTION-NEXT:    unreachable
 ; ARGPROMOTION:       bb2:
-; ARGPROMOTION-NEXT:    [[TMP:%.*]] = landingpad { i8*, i32 }
+; ARGPROMOTION-NEXT:    [[TEMP:%.*]] = landingpad { i8*, i32 }
 ; ARGPROMOTION-NEXT:    cleanup
 ; ARGPROMOTION-NEXT:    unreachable
 ;
@@ -27,15 +27,15 @@ bb1:
   unreachable
 
 bb2:
-  %tmp = landingpad { i8*, i32 }
+  %temp = landingpad { i8*, i32 }
   cleanup
   unreachable
 }
 
 define internal void @hoge() {
 bb:
-  %tmp = call fastcc i8* @spam(i1 (i8*)* @eggs)
-  %tmp1 = call fastcc i8* @spam(i1 (i8*)* @barney)
+  %temp = call fastcc i8* @spam(i1 (i8*)* @eggs)
+  %temp1 = call fastcc i8* @spam(i1 (i8*)* @barney)
   unreachable
 }
 
@@ -45,54 +45,58 @@ bb:
 }
 
 define internal i1 @eggs(i8* %arg) {
-; ALL_NEWPM-LABEL: define {{[^@]+}}@eggs()
-; ALL_NEWPM-NEXT:  bb:
-; ALL_NEWPM-NEXT:    unreachable
+; ARGPROMOTION-LABEL: define {{[^@]+}}@eggs() {
+; ARGPROMOTION-NEXT:  bb:
+; ARGPROMOTION-NEXT:    unreachable
 ;
 bb:
-  %tmp = call zeroext i1 @barney(i8* %arg)
+  %temp = call zeroext i1 @barney(i8* %arg)
   unreachable
 }
 
 define internal i1 @barney(i8* %arg) {
+; ALL_OLDPM-LABEL: define {{[^@]+}}@barney() {
+; ALL_OLDPM-NEXT:  bb:
+; ALL_OLDPM-NEXT:    ret i1 undef
+;
 bb:
   ret i1 undef
 }
 
 define i32 @test_inf_promote_caller(i32 %arg) {
 ; ARGPROMOTION-LABEL: define {{[^@]+}}@test_inf_promote_caller
-; ARGPROMOTION-SAME: (i32 [[ARG:%.*]])
+; ARGPROMOTION-SAME: (i32 [[ARG:%.*]]) {
 ; ARGPROMOTION-NEXT:  bb:
-; ARGPROMOTION-NEXT:    [[TMP:%.*]] = alloca [[S:%.*]]
-; ARGPROMOTION-NEXT:    [[TMP1:%.*]] = alloca [[S]]
-; ARGPROMOTION-NEXT:    [[TMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TMP]], %S* [[TMP1]])
+; ARGPROMOTION-NEXT:    [[TEMP:%.*]] = alloca [[S:%.*]], align 8
+; ARGPROMOTION-NEXT:    [[TEMP1:%.*]] = alloca [[S]], align 8
+; ARGPROMOTION-NEXT:    [[TEMP2:%.*]] = call i32 @test_inf_promote_callee(%S* [[TEMP]], %S* [[TEMP1]])
 ; ARGPROMOTION-NEXT:    ret i32 0
 ;
 bb:
-  %tmp = alloca %S
-  %tmp1 = alloca %S
-  %tmp2 = call i32 @test_inf_promote_callee(%S* %tmp, %S* %tmp1)
+  %temp = alloca %S
+  %temp1 = alloca %S
+  %temp2 = call i32 @test_inf_promote_callee(%S* %temp, %S* %temp1)
 
   ret i32 0
 }
 
 define internal i32 @test_inf_promote_callee(%S* %arg, %S* %arg1) {
 ; ARGPROMOTION-LABEL: define {{[^@]+}}@test_inf_promote_callee
-; ARGPROMOTION-SAME: (%S* [[ARG:%.*]], %S* [[ARG1:%.*]])
+; ARGPROMOTION-SAME: (%S* [[ARG:%.*]], %S* [[ARG1:%.*]]) {
 ; ARGPROMOTION-NEXT:  bb:
-; ARGPROMOTION-NEXT:    [[TMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0
-; ARGPROMOTION-NEXT:    [[TMP2:%.*]] = load %S*, %S** [[TMP]]
-; ARGPROMOTION-NEXT:    [[TMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0
-; ARGPROMOTION-NEXT:    [[TMP4:%.*]] = load %S*, %S** [[TMP3]]
-; ARGPROMOTION-NEXT:    [[TMP5:%.*]] = call i32 @test_inf_promote_callee(%S* [[TMP4]], %S* [[TMP2]])
+; ARGPROMOTION-NEXT:    [[TEMP:%.*]] = getelementptr [[S:%.*]], %S* [[ARG1]], i32 0, i32 0
+; ARGPROMOTION-NEXT:    [[TEMP2:%.*]] = load %S*, %S** [[TEMP]], align 8
+; ARGPROMOTION-NEXT:    [[TEMP3:%.*]] = getelementptr [[S]], %S* [[ARG]], i32 0, i32 0
+; ARGPROMOTION-NEXT:    [[TEMP4:%.*]] = load %S*, %S** [[TEMP3]], align 8
+; ARGPROMOTION-NEXT:    [[TEMP5:%.*]] = call i32 @test_inf_promote_callee(%S* [[TEMP4]], %S* [[TEMP2]])
 ; ARGPROMOTION-NEXT:    ret i32 0
 ;
 bb:
-  %tmp = getelementptr %S, %S* %arg1, i32 0, i32 0
-  %tmp2 = load %S*, %S** %tmp
-  %tmp3 = getelementptr %S, %S* %arg, i32 0, i32 0
-  %tmp4 = load %S*, %S** %tmp3
-  %tmp5 = call i32 @test_inf_promote_callee(%S* %tmp4, %S* %tmp2)
+  %temp = getelementptr %S, %S* %arg1, i32 0, i32 0
+  %temp2 = load %S*, %S** %temp
+  %temp3 = getelementptr %S, %S* %arg, i32 0, i32 0
+  %temp4 = load %S*, %S** %temp3
+  %temp5 = call i32 @test_inf_promote_callee(%S* %temp4, %S* %temp2)
 
   ret i32 0
 }

From 4b5261e10f752e682a3e7ade385e927c2e7ee1d3 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 11:49:18 +0100
Subject: [PATCH 609/748] [InstCombine] Add tests for sub of umin intrinsic
 (NFC)

We should be converting these into usub.sat.
---
 .../test/Transforms/InstCombine/sub-minmax.ll | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/sub-minmax.ll b/llvm/test/Transforms/InstCombine/sub-minmax.ll
index efc5c07601660..5dbf02c14528c 100644
--- a/llvm/test/Transforms/InstCombine/sub-minmax.ll
+++ b/llvm/test/Transforms/InstCombine/sub-minmax.ll
@@ -567,5 +567,75 @@ define i8 @umax_sub_op1_use(i8 %x, i8 %y) {
   ret i8 %r
 }
 
+define i8 @umin_sub_op1(i8 %x, i8 %y) {
+; CHECK-LABEL: @umin_sub_op1(
+; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y]], [[U]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %u = call i8 @llvm.umin.i8(i8 %y, i8 %x)
+  %r = sub i8 %y, %u
+  ret i8 %r
+}
+
+define i8 @umin_sub_op1_commute(i8 %x, i8 %y) {
+; CHECK-LABEL: @umin_sub_op1_commute(
+; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y]], [[U]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %u = call i8 @llvm.umin.i8(i8 %x, i8 %y)
+  %r = sub i8 %y, %u
+  ret i8 %r
+}
+
+define i8 @umin_sub_op0(i8 %x, i8 %y) {
+; CHECK-LABEL: @umin_sub_op0(
+; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[U]], [[Y]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %u = call i8 @llvm.umin.i8(i8 %y, i8 %x)
+  %r = sub i8 %u, %y
+  ret i8 %r
+}
+
+define i8 @umin_sub_op0_commute(i8 %x, i8 %y) {
+; CHECK-LABEL: @umin_sub_op0_commute(
+; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[U]], [[Y]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %u = call i8 @llvm.umin.i8(i8 %x, i8 %y)
+  %r = sub i8 %u, %y
+  ret i8 %r
+}
+
+define i8 @umin_sub_op1_use(i8 %x, i8 %y) {
+; CHECK-LABEL: @umin_sub_op1_use(
+; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    call void @use8(i8 [[U]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y]], [[U]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %u = call i8 @llvm.umin.i8(i8 %y, i8 %x)
+  call void @use8(i8 %u)
+  %r = sub i8 %y, %u
+  ret i8 %r
+}
+
+define i8 @umin_sub_op0_use(i8 %x, i8 %y) {
+; CHECK-LABEL: @umin_sub_op0_use(
+; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    call void @use8(i8 [[U]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 [[U]], [[Y]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %u = call i8 @llvm.umin.i8(i8 %y, i8 %x)
+  call void @use8(i8 %u)
+  %r = sub i8 %u, %y
+  ret i8 %r
+}
+
 declare void @use8(i8)
 declare void @use32(i32 %u)

From e2f627e5e3855309f3a7421f6786b401efb6b7c7 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 11:56:34 +0100
Subject: [PATCH 610/748] [InstCombine] Fold sub of umin to usub.sat

We were handling sub of umax, but not the conjugated umin case.

https://alive2.llvm.org/ce/z/4fdZfy
https://alive2.llvm.org/ce/z/BhUQBM
---
 .../InstCombine/InstCombineAddSub.cpp         | 11 ++++++++
 .../test/Transforms/InstCombine/sub-minmax.ll | 28 ++++++++-----------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0598f751febe2..8a881924ab09f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2187,12 +2187,23 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
     return replaceInstUsesWith(
         I, Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {X, Op1}));
 
+  // Op0 - umin(X, Op0) --> usub.sat(Op0, X)
+  if (match(Op1, m_OneUse(m_c_UMin(m_Value(X), m_Specific(Op0)))))
+    return replaceInstUsesWith(
+        I, Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {Op0, X}));
+
   // Op0 - umax(X, Op0) --> 0 - usub.sat(X, Op0)
   if (match(Op1, m_OneUse(m_c_UMax(m_Value(X), m_Specific(Op0))))) {
     Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {X, Op0});
     return BinaryOperator::CreateNeg(USub);
   }
 
+  // umin(X, Op1) - Op1 --> 0 - usub.sat(Op1, X)
+  if (match(Op0, m_OneUse(m_c_UMin(m_Value(X), m_Specific(Op1))))) {
+    Value *USub = Builder.CreateIntrinsic(Intrinsic::usub_sat, {Ty}, {Op1, X});
+    return BinaryOperator::CreateNeg(USub);
+  }
+
   // C - ctpop(X) => ctpop(~X) if C is bitwidth
   if (match(Op0, m_SpecificInt(Ty->getScalarSizeInBits())) &&
       match(Op1, m_OneUse(m_Intrinsic(m_Value(X)))))
diff --git a/llvm/test/Transforms/InstCombine/sub-minmax.ll b/llvm/test/Transforms/InstCombine/sub-minmax.ll
index 5dbf02c14528c..339dd733a9684 100644
--- a/llvm/test/Transforms/InstCombine/sub-minmax.ll
+++ b/llvm/test/Transforms/InstCombine/sub-minmax.ll
@@ -39,9 +39,8 @@ define i32 @na_minus_max_na_b(i32 %A, i32 %B) {
 
 define i5 @sub_umin(i5 %a, i5 %b) {
 ; CHECK-LABEL: @sub_umin(
-; CHECK-NEXT:    [[UMIN:%.*]] = call i5 @llvm.umin.i5(i5 [[A:%.*]], i5 [[B:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = sub i5 [[A]], [[UMIN]]
-; CHECK-NEXT:    ret i5 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i5 @llvm.usub.sat.i5(i5 [[A:%.*]], i5 [[B:%.*]])
+; CHECK-NEXT:    ret i5 [[TMP1]]
 ;
   %umin = call i5 @llvm.umin.i5(i5 %a, i5 %b)
   %r = sub i5 %a, %umin
@@ -50,9 +49,8 @@ define i5 @sub_umin(i5 %a, i5 %b) {
 
 define <2 x i8> @sub_umin_commute_vec(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK-LABEL: @sub_umin_commute_vec(
-; CHECK-NEXT:    [[UMIN:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[B:%.*]], <2 x i8> [[A:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = sub <2 x i8> [[B]], [[UMIN]]
-; CHECK-NEXT:    ret <2 x i8> [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> [[B:%.*]], <2 x i8> [[A:%.*]])
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
 ;
   %umin = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %b, <2 x i8> %a)
   %r = sub <2 x i8> %b, %umin
@@ -569,9 +567,8 @@ define i8 @umax_sub_op1_use(i8 %x, i8 %y) {
 
 define i8 @umin_sub_op1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @umin_sub_op1(
-; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y]], [[U]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
   %u = call i8 @llvm.umin.i8(i8 %y, i8 %x)
   %r = sub i8 %y, %u
@@ -580,9 +577,8 @@ define i8 @umin_sub_op1(i8 %x, i8 %y) {
 
 define i8 @umin_sub_op1_commute(i8 %x, i8 %y) {
 ; CHECK-LABEL: @umin_sub_op1_commute(
-; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = sub i8 [[Y]], [[U]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
   %u = call i8 @llvm.umin.i8(i8 %x, i8 %y)
   %r = sub i8 %y, %u
@@ -591,8 +587,8 @@ define i8 @umin_sub_op1_commute(i8 %x, i8 %y) {
 
 define i8 @umin_sub_op0(i8 %x, i8 %y) {
 ; CHECK-LABEL: @umin_sub_op0(
-; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = sub i8 [[U]], [[Y]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 0, [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %u = call i8 @llvm.umin.i8(i8 %y, i8 %x)
@@ -602,8 +598,8 @@ define i8 @umin_sub_op0(i8 %x, i8 %y) {
 
 define i8 @umin_sub_op0_commute(i8 %x, i8 %y) {
 ; CHECK-LABEL: @umin_sub_op0_commute(
-; CHECK-NEXT:    [[U:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = sub i8 [[U]], [[Y]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[Y:%.*]], i8 [[X:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = sub i8 0, [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %u = call i8 @llvm.umin.i8(i8 %x, i8 %y)

From f84d732f8c1737940afab71824134f41f37a048b Mon Sep 17 00:00:00 2001
From: Anton Afanasyev 
Date: Wed, 15 Sep 2021 09:19:53 +0300
Subject: [PATCH 611/748] [AggressiveInstCombine] Add `phi` nodes support to
 `TruncInstCombine`

Expand `TruncInstCombine` to handle loops by adding `phi` nodes
to expression graph.

Reviewed by: RKSimon, lebedev.ri

Differential Revision: https://reviews.llvm.org/D109817
---
 .../AggressiveInstCombineInternal.h           | 42 ++++-----
 .../TruncInstCombine.cpp                      | 87 ++++++++++++++-----
 .../AggressiveInstCombine/trunc_phi.ll        | 20 ++---
 3 files changed, 94 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
index 6c73645b20f20..9fc103d45d985 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -23,14 +23,14 @@
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
-// TruncInstCombine - looks for expression dags dominated by trunc instructions
-// and for each eligible dag, it will create a reduced bit-width expression and
-// replace the old expression with this new one and remove the old one.
-// Eligible expression dag is such that:
+// TruncInstCombine - looks for expression graphs dominated by trunc
+// instructions and for each eligible graph, it will create a reduced bit-width
+// expression and replace the old expression with this new one and remove the
+// old one. Eligible expression graph is such that:
 //   1. Contains only supported instructions.
 //   2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
 //   3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
-//   4. All instructions in the dag must not have users outside the dag.
+//   4. All instructions in the graph must not have users outside the graph.
 //      Only exception is for {ZExt, SExt}Inst with operand type equal to the
 //      new reduced type chosen in (3).
 //
@@ -63,7 +63,7 @@ class TruncInstCombine {
   /// Current processed TruncInst instruction.
   TruncInst *CurrentTruncInst = nullptr;
 
-  /// Information per each instruction in the expression dag.
+  /// Information per each instruction in the expression graph.
   struct Info {
     /// Number of LSBs that are needed to generate a valid expression.
     unsigned ValidBitWidth = 0;
@@ -72,10 +72,10 @@ class TruncInstCombine {
     /// The reduced value generated to replace the old instruction.
     Value *NewValue = nullptr;
   };
-  /// An ordered map representing expression dag post-dominated by current
-  /// processed TruncInst. It maps each instruction in the dag to its Info
+  /// An ordered map representing expression graph post-dominated by current
+  /// processed TruncInst. It maps each instruction in the graph to its Info
   /// structure. The map is ordered such that each instruction appears before
-  /// all other instructions in the dag that uses it.
+  /// all other instructions in the graph that uses it.
   MapVector InstInfoMap;
 
 public:
@@ -87,11 +87,11 @@ class TruncInstCombine {
   bool run(Function &F);
 
 private:
-  /// Build expression dag dominated by the /p CurrentTruncInst and append it to
-  /// the InstInfoMap container.
+  /// Build expression graph dominated by the /p CurrentTruncInst and append it
+  /// to the InstInfoMap container.
   ///
-  /// \return true only if succeed to generate an eligible sub expression dag.
-  bool buildTruncExpressionDag();
+  /// \return true only if succeed to generate an eligible sub expression graph.
+  bool buildTruncExpressionGraph();
 
   /// Calculate the minimal allowed bit-width of the chain ending with the
   /// currently visited truncate's operand.
@@ -100,12 +100,12 @@ class TruncInstCombine {
   /// truncate's operand can be shrunk to.
   unsigned getMinBitWidth();
 
-  /// Build an expression dag dominated by the current processed TruncInst and
+  /// Build an expression graph dominated by the current processed TruncInst and
   /// Check if it is eligible to be reduced to a smaller type.
   ///
   /// \return the scalar version of the new type to be used for the reduced
-  ///         expression dag, or nullptr if the expression dag is not eligible
-  ///         to be reduced.
+  ///         expression graph, or nullptr if the expression graph is not
+  ///         eligible to be reduced.
   Type *getBestTruncatedType();
 
   KnownBits computeKnownBits(const Value *V) const {
@@ -128,12 +128,12 @@ class TruncInstCombine {
   /// \return the new reduced value.
   Value *getReducedOperand(Value *V, Type *SclTy);
 
-  /// Create a new expression dag using the reduced /p SclTy type and replace
-  /// the old expression dag with it. Also erase all instructions in the old
-  /// dag, except those that are still needed outside the dag.
+  /// Create a new expression graph using the reduced /p SclTy type and replace
+  /// the old expression graph with it. Also erase all instructions in the old
+  /// graph, except those that are still needed outside the graph.
   ///
-  /// \param SclTy scalar version of new type to reduce expression dag into.
-  void ReduceExpressionDag(Type *SclTy);
+  /// \param SclTy scalar version of new type to reduce expression graph into.
+  void ReduceExpressionGraph(Type *SclTy);
 };
 } // end namespace llvm.
 
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 4624b735bef8c..71f3d76c0ba78 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TruncInstCombine - looks for expression dags post-dominated by TruncInst and
-// for each eligible dag, it will create a reduced bit-width expression, replace
-// the old expression with this new one and remove the old expression.
-// Eligible expression dag is such that:
+// TruncInstCombine - looks for expression graphs post-dominated by TruncInst
+// and for each eligible graph, it will create a reduced bit-width expression,
+// replace the old expression with this new one and remove the old expression.
+// Eligible expression graph is such that:
 //   1. Contains only supported instructions.
 //   2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
 //   3. Can be evaluated into type with reduced legal bit-width.
-//   4. All instructions in the dag must not have users outside the dag.
+//   4. All instructions in the graph must not have users outside the graph.
 //      The only exception is for {ZExt, SExt}Inst with operand type equal to
 //      the new reduced type evaluated in (3).
 //
@@ -39,14 +39,13 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aggressive-instcombine"
 
-STATISTIC(
-    NumDAGsReduced,
-    "Number of truncations eliminated by reducing bit width of expression DAG");
+STATISTIC(NumExprsReduced, "Number of truncations eliminated by reducing bit "
+                           "width of expression graph");
 STATISTIC(NumInstrsReduced,
           "Number of instructions whose bit width was reduced");
 
 /// Given an instruction and a container, it fills all the relevant operands of
-/// that instruction, with respect to the Trunc expression dag optimizaton.
+/// that instruction, with respect to the Trunc expression graph optimizaton.
 static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) {
   unsigned Opc = I->getOpcode();
   switch (Opc) {
@@ -78,15 +77,19 @@ static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) {
     Ops.push_back(I->getOperand(1));
     Ops.push_back(I->getOperand(2));
     break;
+  case Instruction::PHI:
+    for (Value *V : cast(I)->incoming_values())
+      Ops.push_back(V);
+    break;
   default:
     llvm_unreachable("Unreachable!");
   }
 }
 
-bool TruncInstCombine::buildTruncExpressionDag() {
+bool TruncInstCombine::buildTruncExpressionGraph() {
   SmallVector Worklist;
   SmallVector Stack;
-  // Clear old expression dag.
+  // Clear old instructions info.
   InstInfoMap.clear();
 
   Worklist.push_back(CurrentTruncInst->getOperand(0));
@@ -150,11 +153,19 @@ bool TruncInstCombine::buildTruncExpressionDag() {
       append_range(Worklist, Operands);
       break;
     }
+    case Instruction::PHI: {
+      SmallVector Operands;
+      getRelevantOperands(I, Operands);
+      // Add only operands not in Stack to prevent cycle
+      for (auto *Op : Operands)
+        if (all_of(Stack, [Op](Value *V) { return Op != V; }))
+          Worklist.push_back(Op);
+      break;
+    }
     default:
       // TODO: Can handle more cases here:
       // 1. shufflevector
       // 2. sdiv, srem
-      // 3. phi node(and loop handling)
       // ...
       return false;
     }
@@ -254,7 +265,7 @@ unsigned TruncInstCombine::getMinBitWidth() {
 }
 
 Type *TruncInstCombine::getBestTruncatedType() {
-  if (!buildTruncExpressionDag())
+  if (!buildTruncExpressionGraph())
     return nullptr;
 
   // We don't want to duplicate instructions, which isn't profitable. Thus, we
@@ -367,8 +378,10 @@ Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
   return Entry.NewValue;
 }
 
-void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
+void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
   NumInstrsReduced += InstInfoMap.size();
+  // Pairs of old and new phi-nodes
+  SmallVector, 2> OldNewPHINodes;
   for (auto &Itr : InstInfoMap) { // Forward
     Instruction *I = Itr.first;
     TruncInstCombine::Info &NodeInfo = Itr.second;
@@ -451,6 +464,12 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
       Res = Builder.CreateSelect(Op0, LHS, RHS);
       break;
     }
+    case Instruction::PHI: {
+      Res = Builder.CreatePHI(getReducedType(I, SclTy), I->getNumOperands());
+      OldNewPHINodes.push_back(
+          std::make_pair(cast(I), cast(Res)));
+      break;
+    }
     default:
       llvm_unreachable("Unhandled instruction");
     }
@@ -460,6 +479,14 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
       ResI->takeName(I);
   }
 
+  for (auto &Node : OldNewPHINodes) {
+    PHINode *OldPN = Node.first;
+    PHINode *NewPN = Node.second;
+    for (auto Incoming : zip(OldPN->incoming_values(), OldPN->blocks()))
+      NewPN->addIncoming(getReducedOperand(std::get<0>(Incoming), SclTy),
+                         std::get<1>(Incoming));
+  }
+
   Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
   Type *DstTy = CurrentTruncInst->getType();
   if (Res->getType() != DstTy) {
@@ -470,17 +497,31 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
   }
   CurrentTruncInst->replaceAllUsesWith(Res);
 
-  // Erase old expression dag, which was replaced by the reduced expression dag.
-  // We iterate backward, which means we visit the instruction before we visit
-  // any of its operands, this way, when we get to the operand, we already
-  // removed the instructions (from the expression dag) that uses it.
+  // Erase old expression graph, which was replaced by the reduced expression
+  // graph.
   CurrentTruncInst->eraseFromParent();
+  // First, erase old phi-nodes and its uses
+  for (auto &Node : OldNewPHINodes) {
+    PHINode *OldPN = Node.first;
+    OldPN->replaceAllUsesWith(PoisonValue::get(OldPN->getType()));
+    OldPN->eraseFromParent();
+  }
+  // Now we have expression graph turned into dag.
+  // We iterate backward, which means we visit the instruction before we
+  // visit any of its operands, this way, when we get to the operand, we already
+  // removed the instructions (from the expression dag) that uses it.
   for (auto &I : llvm::reverse(InstInfoMap)) {
+    // Skip phi-nodes since they were erased before
+    if (isa(I.first))
+      continue;
     // We still need to check that the instruction has no users before we erase
     // it, because {SExt, ZExt}Inst Instruction might have other users that was
     // not reduced, in such case, we need to keep that instruction.
     if (I.first->use_empty())
       I.first->eraseFromParent();
+    else
+      assert((isa(I.first) || isa(I.first)) &&
+             "Only {SExt, ZExt}Inst might have unreduced users");
   }
 }
 
@@ -498,18 +539,18 @@ bool TruncInstCombine::run(Function &F) {
   }
 
   // Process all TruncInst in the Worklist, for each instruction:
-  //   1. Check if it dominates an eligible expression dag to be reduced.
-  //   2. Create a reduced expression dag and replace the old one with it.
+  //   1. Check if it dominates an eligible expression graph to be reduced.
+  //   2. Create a reduced expression graph and replace the old one with it.
   while (!Worklist.empty()) {
     CurrentTruncInst = Worklist.pop_back_val();
 
     if (Type *NewDstSclTy = getBestTruncatedType()) {
       LLVM_DEBUG(
-          dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
+          dbgs() << "ICE: TruncInstCombine reducing type of expression graph "
                     "dominated by: "
                  << CurrentTruncInst << '\n');
-      ReduceExpressionDag(NewDstSclTy);
-      ++NumDAGsReduced;
+      ReduceExpressionGraph(NewDstSclTy);
+      ++NumExprsReduced;
       MadeIRChange = true;
     }
   }
diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
index 46bdb60fada6c..01103a1a5afbf 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
@@ -4,18 +4,17 @@
 define i16 @trunc_phi(i8 %x) {
 ; CHECK-LABEL: @trunc_phi(
 ; CHECK-NEXT:  LoopHeader:
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       Loop:
-; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i16 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[SHL:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOPHEADER]] ], [ [[I:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[SHL]] = shl i32 [[ZEXT2]], 1
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16
+; CHECK-NEXT:    [[SHL]] = shl i16 [[ZEXT2]], 1
 ; CHECK-NEXT:    [[I]] = add i32 [[J]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I]], 10
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOPEND:%.*]], label [[LOOP]]
 ; CHECK:       LoopEnd:
-; CHECK-NEXT:    ret i16 [[TRUNC]]
+; CHECK-NEXT:    ret i16 [[SHL]]
 ;
 LoopHeader:
   %zext = zext i8 %x to i32
@@ -37,22 +36,21 @@ LoopEnd:
 define i16 @trunc_phi2(i8 %x, i32 %sw) {
 ; CHECK-LABEL: @trunc_phi2(
 ; CHECK-NEXT:  LoopHeader:
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
 ; CHECK-NEXT:    switch i32 [[SW:%.*]], label [[LOOPEND:%.*]] [
 ; CHECK-NEXT:    i32 0, label [[LOOP:%.*]]
 ; CHECK-NEXT:    i32 1, label [[LOOP]]
 ; CHECK-NEXT:    ]
 ; CHECK:       Loop:
-; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[ZEXT]], [[LOOPHEADER]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i16 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[ZEXT]], [[LOOPHEADER]] ], [ [[SHL:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOPHEADER]] ], [ 0, [[LOOPHEADER]] ], [ [[I:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[SHL]] = shl i32 [[ZEXT2]], 1
+; CHECK-NEXT:    [[SHL]] = shl i16 [[ZEXT2]], 1
 ; CHECK-NEXT:    [[I]] = add i32 [[J]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I]], 10
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOPEND]], label [[LOOP]]
 ; CHECK:       LoopEnd:
-; CHECK-NEXT:    [[ZEXT3:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER]] ], [ [[ZEXT2]], [[LOOP]] ]
-; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ZEXT3]] to i16
-; CHECK-NEXT:    ret i16 [[TRUNC]]
+; CHECK-NEXT:    [[ZEXT3:%.*]] = phi i16 [ [[ZEXT]], [[LOOPHEADER]] ], [ [[ZEXT2]], [[LOOP]] ]
+; CHECK-NEXT:    ret i16 [[ZEXT3]]
 ;
 LoopHeader:
   %zext = zext i8 %x to i32

From c5bcfb983e47167a8a1826c1a64d7aa1849add06 Mon Sep 17 00:00:00 2001
From: Alex Bradbury 
Date: Wed, 23 Feb 2022 11:05:46 +0000
Subject: [PATCH 612/748] [RISCV] Avoid infinite loop between
 DAGCombiner::visitMUL and RISCVISelLowering::transformAddImmMulImm

See https://github.com/llvm/llvm-project/issues/53831 for a full discussion.

The basic issue is that DAGCombiner::visitMUL and
RISCVISelLowering::transformAddImmMulImm get stuck in a loop, as the
current checks in transformAddImmMulImm aren't sufficient to avoid all
cases where DAGCombiner::isMulAddWithConstProfitable might trigger a
transformation. This patch makes transformAddImmMulImm bail out if C0
(the constant used for multiplication) has more than one use.

Differential Revision: https://reviews.llvm.org/D120332
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  5 +++++
 llvm/test/CodeGen/RISCV/addimm-mulimm.ll    | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a54ae084cfe6b..fbe767a4897d9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7477,6 +7477,11 @@ static SDValue transformAddImmMulImm(SDNode *N, SelectionDAG &DAG,
   auto *N1C = dyn_cast(N->getOperand(1));
   if (!N0C || !N1C)
     return SDValue();
+  // If N0C has multiple uses it's possible one of the cases in
+  // DAGCombiner::isMulAddWithConstProfitable will be true, which would result
+  // in an infinite loop.
+  if (!N0C->hasOneUse())
+    return SDValue();
   int64_t C0 = N0C->getSExtValue();
   int64_t C1 = N1C->getSExtValue();
   int64_t CA, CB;
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index 4706f3904701d..adf0b98742e1b 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -872,3 +872,16 @@ define i64 @mulneg3000_sub8990_c(i64 %x) {
   %tmp1 = add i64 %tmp0, -8990
   ret i64 %tmp1
 }
+
+; This test case previously caused an infinite loop between transformations
+; performed in RISCVISelLowering;:transformAddImmMulImm and
+; DAGCombiner::visitMUL.
+define i1 @pr53831(i32 %x) {
+  %tmp0 = add i32 %x, 1
+  %tmp1 = mul i32 %tmp0, 24
+  %tmp2 = add i32 %tmp1, 1
+  %tmp3 = mul i32 %x, 24
+  %tmp4 = add i32 %tmp3, 2048
+  %tmp5 = icmp eq i32 %tmp4, %tmp2
+	ret i1 %tmp5
+}

From e0f1dd018e0f94a7d694bc615975c3a7d26d9e50 Mon Sep 17 00:00:00 2001
From: iains 
Date: Tue, 22 Feb 2022 20:58:26 +0000
Subject: [PATCH 613/748] [C++20][Modules] Rework testcase to use split file
 [NFC].

This switches the testcase committed for initial C++20 modules import tracking to
use split-file rather than preprocessor directives.

Differential Revision: https://reviews.llvm.org/D120352
---
 .../Modules/cxx20-import-diagnostics-a.cpp    | 49 +++++++++----------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/clang/test/Modules/cxx20-import-diagnostics-a.cpp b/clang/test/Modules/cxx20-import-diagnostics-a.cpp
index fd4085bcb4713..8e2940a432e6d 100644
--- a/clang/test/Modules/cxx20-import-diagnostics-a.cpp
+++ b/clang/test/Modules/cxx20-import-diagnostics-a.cpp
@@ -1,42 +1,43 @@
 // RUN: rm -rf %t
 // RUN: mkdir -p %t
+// RUN: split-file %s %t
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=0 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/import-diags-tu1.cpp \
 // RUN:  -o %t/B.pcm
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=1 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/import-diags-tu2.cpp \
 // RUN:  -o %t/C.pcm
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=2 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/import-diags-tu3.cpp \
 // RUN:  -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/AOK1.pcm
 
-// RUN: %clang_cc1 -std=c++20 -S -D TU=3 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -S %t/import-diags-tu4.cpp \
 // RUN:  -fmodule-file=%t/AOK1.pcm -o %t/tu_3.s -verify
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=4 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/import-diags-tu5.cpp \
 // RUN:  -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/BC.pcm -verify
 
-// RUN: %clang_cc1 -std=c++20 -S -D TU=5 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -S %t/import-diags-tu6.cpp \
 // RUN:  -fmodule-file=%t/B.pcm -fmodule-file=%t/C.pcm -o %t/tu_5.s -verify
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=6 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/import-diags-tu7.cpp \
 // RUN:  -fmodule-file=%t/B.pcm -o %t/D.pcm -verify
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=7 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/import-diags-tu8.cpp \
 // RUN:  -fmodule-file=%t/B.pcm -o %t/D.pcm -verify
 
-// RUN: %clang_cc1 -std=c++20 -S -D TU=8 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -S %t/import-diags-tu9.cpp \
 // RUN:  -fmodule-file=%t/B.pcm -o %t/tu_8.s -verify
 
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface -D TU=9 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/import-diags-tu10.cpp \
 // RUN:  -o %t/B.pcm -verify
 
-// RUN: %clang_cc1 -std=c++20 -emit-obj -D TU=10 -x c++ %s \
+// RUN: %clang_cc1 -std=c++20 -emit-obj %t/import-diags-tu11.cpp \
 // RUN:  -fmodule-file=%t/C.pcm  -o %t/impl.o
 
 // Test diagnostics for incorrect module import sequences.
 
-#if TU == 0
+//--- import-diags-tu1.cpp
 
 export module B;
 
@@ -44,7 +45,7 @@ int foo ();
 
 // expected-no-diagnostics
 
-#elif TU == 1
+//--- import-diags-tu2.cpp
 
 export module C;
 
@@ -52,7 +53,7 @@ int bar ();
 
 // expected-no-diagnostics
 
-#elif TU == 2
+//--- import-diags-tu3.cpp
 
 export module AOK1;
 
@@ -63,7 +64,7 @@ export int theAnswer ();
 
 // expected-no-diagnostics
 
-#elif TU == 3
+//--- import-diags-tu4.cpp
 
 module;
 
@@ -73,7 +74,7 @@ export import C; // expected-error {{export declaration can only be used within
 
 int theAnswer () { return 42; }
 
-#elif TU == 4
+//--- import-diags-tu5.cpp
 
 export module BC;
 
@@ -83,7 +84,7 @@ int foo () { return 10; }
 
 import C; // expected-error {{imports must immediately follow the module declaration}}
 
-#elif TU == 5
+//--- import-diags-tu6.cpp
 
 module B; // implicitly imports B.
 
@@ -91,7 +92,7 @@ int foo () { return 10; }
 
 import C; // expected-error {{imports must immediately follow the module declaration}}
 
-#elif TU == 6
+//--- import-diags-tu7.cpp
 
 module;
 // We can only have preprocessor commands here, which could include an include
@@ -103,7 +104,7 @@ export module D;
 
 int delta ();
 
-#elif TU == 7
+//--- import-diags-tu8.cpp
 
 export module D;
 
@@ -113,19 +114,19 @@ module :private;
 
 import B; // expected-error {{module imports cannot be in the private module fragment}}
 
-#elif TU == 8
+//--- import-diags-tu9.cpp
 
 module B;
 
 import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}}
 
-#elif TU == 9
+//--- import-diags-tu10.cpp
 
 export module B;
 
 import B; // expected-error {{import of module 'B' appears within same top-level module 'B'}}
 
-#elif TU == 10
+//--- import-diags-tu11.cpp
 
 int x;
 
@@ -134,7 +135,3 @@ import C;
 int baz() { return 6174; }
 
 // expected-no-diagnostics
-
-#else
-#error "no MODE set"
-#endif

From d6e008089c2d758dabefe71ae7cf18e46928b5ef Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 12:21:42 +0100
Subject: [PATCH 614/748] [InstCombine] Add tests for add of clamp pattern
 (NFC)

Add intrinsic versions of existing SPF tests.
---
 .../Transforms/InstCombine/max_known_bits.ll  | 71 +++++++++++++++++--
 1 file changed, 67 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/max_known_bits.ll b/llvm/test/Transforms/InstCombine/max_known_bits.ll
index da581888aeaf0..a9f5795f9e8c9 100644
--- a/llvm/test/Transforms/InstCombine/max_known_bits.ll
+++ b/llvm/test/Transforms/InstCombine/max_known_bits.ll
@@ -61,8 +61,8 @@ define i32 @min_max_clamp_3(i16 %x) {
 ; CHECK-NEXT:    [[B:%.*]] = select i1 [[A]], i16 [[X]], i16 -2048
 ; CHECK-NEXT:    [[C:%.*]] = icmp slt i16 [[B]], 2047
 ; CHECK-NEXT:    [[D:%.*]] = select i1 [[C]], i16 [[B]], i16 2047
-; CHECK-NEXT:    [[G:%.*]] = sext i16 [[D]] to i32
-; CHECK-NEXT:    ret i32 [[G]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[D]] to i32
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %a = icmp sgt i16 %x, -2048
   %b = select i1 %a, i16 %x, i16 -2048
@@ -81,8 +81,8 @@ define i32 @min_max_clamp_4(i16 %x) {
 ; CHECK-NEXT:    [[B:%.*]] = select i1 [[A]], i16 [[X]], i16 2047
 ; CHECK-NEXT:    [[C:%.*]] = icmp sgt i16 [[B]], -2048
 ; CHECK-NEXT:    [[D:%.*]] = select i1 [[C]], i16 [[B]], i16 -2048
-; CHECK-NEXT:    [[G:%.*]] = sext i16 [[D]] to i32
-; CHECK-NEXT:    ret i32 [[G]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[D]] to i32
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %a = icmp slt i16 %x, 2047
   %b = select i1 %a, i16 %x, i16 2047
@@ -93,3 +93,66 @@ define i32 @min_max_clamp_4(i16 %x) {
   %g = add i32 %f, -1
   ret i32 %g
 }
+
+; Intrinsic versions of the above tests.
+
+declare i16 @llvm.smin.i16(i16, i16)
+declare i16 @llvm.smax.i16(i16, i16)
+
+define i16 @min_max_clamp_intrinsic(i16 %x) {
+; CHECK-LABEL: @min_max_clamp_intrinsic(
+; CHECK-NEXT:    [[A:%.*]] = call i16 @llvm.smax.i16(i16 [[X:%.*]], i16 -2048)
+; CHECK-NEXT:    [[B:%.*]] = call i16 @llvm.smin.i16(i16 [[A]], i16 2047)
+; CHECK-NEXT:    [[C:%.*]] = add nsw i16 [[B]], 1
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = call i16 @llvm.smax.i16(i16 %x, i16 -2048)
+  %b = call i16 @llvm.smin.i16(i16 %a, i16 2047)
+  %c = add i16 %b, 1
+  ret i16 %c
+}
+
+define i16 @min_max_clamp_intrinsic_2(i16 %x) {
+; CHECK-LABEL: @min_max_clamp_intrinsic_2(
+; CHECK-NEXT:    [[A:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047)
+; CHECK-NEXT:    [[B:%.*]] = call i16 @llvm.smax.i16(i16 [[A]], i16 -2048)
+; CHECK-NEXT:    [[C:%.*]] = add i16 [[B]], 1
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = call i16 @llvm.smin.i16(i16 %x, i16 2047)
+  %b = call i16 @llvm.smax.i16(i16 %a, i16 -2048)
+  %c = add i16 %b, 1
+  ret i16 %c
+}
+
+define i32 @min_max_clamp_intrinsic_3(i16 %x) {
+; CHECK-LABEL: @min_max_clamp_intrinsic_3(
+; CHECK-NEXT:    [[A:%.*]] = call i16 @llvm.smax.i16(i16 [[X:%.*]], i16 -2048)
+; CHECK-NEXT:    [[B:%.*]] = call i16 @llvm.smin.i16(i16 [[A]], i16 2047)
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[B]] to i32
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %a = call i16 @llvm.smax.i16(i16 %x, i16 -2048)
+  %b = call i16 @llvm.smin.i16(i16 %a, i16 2047)
+  %c = add i16 %b, 1
+  %d = sext i16 %c to i32
+  %e = add i32 %d, -1
+  ret i32 %e
+}
+
+define i32 @min_max_clamp_intrinsic_4(i16 %x) {
+; CHECK-LABEL: @min_max_clamp_intrinsic_4(
+; CHECK-NEXT:    [[A:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047)
+; CHECK-NEXT:    [[B:%.*]] = call i16 @llvm.smax.i16(i16 [[A]], i16 -2048)
+; CHECK-NEXT:    [[C:%.*]] = add i16 [[B]], 1
+; CHECK-NEXT:    [[D:%.*]] = sext i16 [[C]] to i32
+; CHECK-NEXT:    [[E:%.*]] = add nsw i32 [[D]], -1
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %a = call i16 @llvm.smin.i16(i16 %x, i16 2047)
+  %b = call i16 @llvm.smax.i16(i16 %a, i16 -2048)
+  %c = add i16 %b, 1
+  %d = sext i16 %c to i32
+  %e = add i32 %d, -1
+  ret i32 %e
+}

From 6777ec9e4df79d88f179593e20d6cb58a3effba4 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 12:40:31 +0100
Subject: [PATCH 615/748] [ValueTracking] Support signed intrinsic clamp

This is the same special logic we apply for SPF signed clamps
when computing the number of sign bits, just for intrinsics.

This just uses the same logic as the select case, but there are
multiple directions in which this could be improved: we could also use
the number of sign bits from the clamped value, we could do this during
constant range calculation, and there are probably unsigned analogues
for the constant range case at least.
---
 llvm/lib/Analysis/ValueTracking.cpp           | 24 +++++++++++++++++++
 .../Transforms/InstCombine/max_known_bits.ll  |  8 +++----
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index ef84e0c69b3c8..b8ff6beccae51 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2902,6 +2902,24 @@ static bool isSignedMinMaxClamp(const Value *Select, const Value *&In,
   return CLow->sle(*CHigh);
 }
 
+static bool isSignedMinMaxIntrinsicClamp(const IntrinsicInst *II,
+                                         const APInt *&CLow,
+                                         const APInt *&CHigh) {
+  assert((II->getIntrinsicID() == Intrinsic::smin ||
+          II->getIntrinsicID() == Intrinsic::smax) && "Must be smin/smax");
+
+  Intrinsic::ID InverseID = getInverseMinMaxIntrinsic(II->getIntrinsicID());
+  auto *InnerII = dyn_cast(II->getArgOperand(0));
+  if (!InnerII || InnerII->getIntrinsicID() != InverseID ||
+      !match(II->getArgOperand(1), m_APInt(CLow)) ||
+      !match(InnerII->getArgOperand(1), m_APInt(CHigh)))
+    return false;
+
+  if (II->getIntrinsicID() == Intrinsic::smin)
+    std::swap(CLow, CHigh);
+  return CLow->sle(*CHigh);
+}
+
 /// For vector constants, loop over the elements and find the constant with the
 /// minimum number of sign bits. Return 0 if the value is not a vector constant
 /// or if any element was not analyzed; otherwise, return the count for the
@@ -3242,6 +3260,12 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
 
           // Absolute value reduces number of sign bits by at most 1.
           return Tmp - 1;
+        case Intrinsic::smin:
+        case Intrinsic::smax: {
+          const APInt *CLow, *CHigh;
+          if (isSignedMinMaxIntrinsicClamp(II, CLow, CHigh))
+            return std::min(CLow->getNumSignBits(), CHigh->getNumSignBits());
+        }
         }
       }
     }
diff --git a/llvm/test/Transforms/InstCombine/max_known_bits.ll b/llvm/test/Transforms/InstCombine/max_known_bits.ll
index a9f5795f9e8c9..7ae179d843c15 100644
--- a/llvm/test/Transforms/InstCombine/max_known_bits.ll
+++ b/llvm/test/Transforms/InstCombine/max_known_bits.ll
@@ -116,7 +116,7 @@ define i16 @min_max_clamp_intrinsic_2(i16 %x) {
 ; CHECK-LABEL: @min_max_clamp_intrinsic_2(
 ; CHECK-NEXT:    [[A:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047)
 ; CHECK-NEXT:    [[B:%.*]] = call i16 @llvm.smax.i16(i16 [[A]], i16 -2048)
-; CHECK-NEXT:    [[C:%.*]] = add i16 [[B]], 1
+; CHECK-NEXT:    [[C:%.*]] = add nsw i16 [[B]], 1
 ; CHECK-NEXT:    ret i16 [[C]]
 ;
   %a = call i16 @llvm.smin.i16(i16 %x, i16 2047)
@@ -144,10 +144,8 @@ define i32 @min_max_clamp_intrinsic_4(i16 %x) {
 ; CHECK-LABEL: @min_max_clamp_intrinsic_4(
 ; CHECK-NEXT:    [[A:%.*]] = call i16 @llvm.smin.i16(i16 [[X:%.*]], i16 2047)
 ; CHECK-NEXT:    [[B:%.*]] = call i16 @llvm.smax.i16(i16 [[A]], i16 -2048)
-; CHECK-NEXT:    [[C:%.*]] = add i16 [[B]], 1
-; CHECK-NEXT:    [[D:%.*]] = sext i16 [[C]] to i32
-; CHECK-NEXT:    [[E:%.*]] = add nsw i32 [[D]], -1
-; CHECK-NEXT:    ret i32 [[E]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[B]] to i32
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %a = call i16 @llvm.smin.i16(i16 %x, i16 2047)
   %b = call i16 @llvm.smax.i16(i16 %a, i16 -2048)

From fbe38a784e2852b22f5a44ad417e071ff583d57d Mon Sep 17 00:00:00 2001
From: Dawid Jurczak 
Date: Tue, 22 Feb 2022 12:46:56 +0100
Subject: [PATCH 616/748] [NFC][Lexer] Make access to LangOpts more consistent

Before this change, Lexer::LangOpts was sometimes accessed through the getter and at other times read directly in Lexer functions, without any good reason for the inconsistency.
Since getLangOpts() is a bit more verbose, prefer direct access to the LangOpts member when possible.

Differential Revision: https://reviews.llvm.org/D120333
---
 clang/lib/Lex/Lexer.cpp | 42 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index a180bba365cf8..4f8910e7ac9ef 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1881,7 +1881,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
     if (!LangOpts.C99) {
       if (!isHexaLiteral(BufferPtr, LangOpts))
         IsHexFloat = false;
-      else if (!getLangOpts().CPlusPlus17 &&
+      else if (!LangOpts.CPlusPlus17 &&
                std::find(BufferPtr, CurPtr, '_') != CurPtr)
         IsHexFloat = false;
     }
@@ -1890,12 +1890,12 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
   }
 
   // If we have a digit separator, continue.
-  if (C == '\'' && (getLangOpts().CPlusPlus14 || getLangOpts().C2x)) {
+  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C2x)) {
     unsigned NextSize;
-    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
+    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
     if (isAsciiIdentifierContinue(Next)) {
       if (!isLexingRawMode())
-        Diag(CurPtr, getLangOpts().CPlusPlus
+        Diag(CurPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx11_compat_digit_separator
                          : diag::warn_c2x_compat_digit_separator);
       CurPtr = ConsumeChar(CurPtr, Size, Result);
@@ -1921,7 +1921,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
 /// in C++11, or warn on a ud-suffix in C++98.
 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                                bool IsStringLiteral) {
-  assert(getLangOpts().CPlusPlus);
+  assert(LangOpts.CPlusPlus);
 
   // Maximally munch an identifier.
   unsigned Size;
@@ -1937,7 +1937,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
       return CurPtr;
   }
 
-  if (!getLangOpts().CPlusPlus11) {
+  if (!LangOpts.CPlusPlus11) {
     if (!isLexingRawMode())
       Diag(CurPtr,
            C == '_' ? diag::warn_cxx11_compat_user_defined_literal
@@ -1955,7 +1955,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
     bool IsUDSuffix = false;
     if (C == '_')
       IsUDSuffix = true;
-    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
+    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
       // In C++1y, we need to look ahead a few characters to see if this is a
       // valid suffix for a string literal or a numeric literal (this could be
       // the 'operator""if' defining a numeric literal operator).
@@ -1965,13 +1965,12 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
       unsigned Chars = 1;
       while (true) {
         unsigned NextSize;
-        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
-                                         getLangOpts());
+        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
         if (!isAsciiIdentifierContinue(Next)) {
           // End of suffix. Check whether this is on the allowed list.
           const StringRef CompleteSuffix(Buffer, Chars);
-          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
-                                                            CompleteSuffix);
+          IsUDSuffix =
+              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
           break;
         }
 
@@ -1986,10 +1985,10 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
 
     if (!IsUDSuffix) {
       if (!isLexingRawMode())
-        Diag(CurPtr, getLangOpts().MSVCCompat
+        Diag(CurPtr, LangOpts.MSVCCompat
                          ? diag::ext_ms_reserved_user_defined_literal
                          : diag::ext_reserved_user_defined_literal)
-          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
       return CurPtr;
     }
 
@@ -2022,9 +2021,8 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
       (Kind == tok::utf8_string_literal ||
        Kind == tok::utf16_string_literal ||
        Kind == tok::utf32_string_literal))
-    Diag(BufferPtr, getLangOpts().CPlusPlus
-           ? diag::warn_cxx98_compat_unicode_literal
-           : diag::warn_c99_compat_unicode_literal);
+    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
+                                       : diag::warn_c99_compat_unicode_literal);
 
   char C = getAndAdvanceChar(CurPtr, Result);
   while (C != '"') {
@@ -2058,7 +2056,7 @@ bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
   }
 
   // If we are in C++11, lex the optional ud-suffix.
-  if (getLangOpts().CPlusPlus)
+  if (LangOpts.CPlusPlus)
     CurPtr = LexUDSuffix(Result, CurPtr, true);
 
   // If a nul character existed in the string, warn about it.
@@ -2142,7 +2140,7 @@ bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
   }
 
   // If we are in C++11, lex the optional ud-suffix.
-  if (getLangOpts().CPlusPlus)
+  if (LangOpts.CPlusPlus)
     CurPtr = LexUDSuffix(Result, CurPtr, true);
 
   // Update the location of token as well as BufferPtr.
@@ -2238,7 +2236,7 @@ bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
 
   if (!isLexingRawMode()) {
     if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
-      Diag(BufferPtr, getLangOpts().CPlusPlus
+      Diag(BufferPtr, LangOpts.CPlusPlus
                           ? diag::warn_cxx98_compat_unicode_literal
                           : diag::warn_c99_compat_unicode_literal);
     else if (Kind == tok::utf8_char_constant)
@@ -2280,7 +2278,7 @@ bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
   }
 
   // If we are in C++11, lex the optional ud-suffix.
-  if (getLangOpts().CPlusPlus)
+  if (LangOpts.CPlusPlus)
     CurPtr = LexUDSuffix(Result, CurPtr, false);
 
   // If a nul character existed in the character, warn about it.
@@ -3841,7 +3839,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
     } else if (Char == '=') {
       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
       if (After == '>') {
-        if (getLangOpts().CPlusPlus20) {
+        if (LangOpts.CPlusPlus20) {
           if (!isLexingRawMode())
             Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
           CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
@@ -3851,7 +3849,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
         }
         // Suggest adding a space between the '<=' and the '>' to avoid a
         // change in semantics if this turns up in C++ <=17 mode.
-        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
+        if (LangOpts.CPlusPlus && !isLexingRawMode()) {
           Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
             << FixItHint::CreateInsertion(
                    getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");

From a848a5cf2f2f2f8a621bdc5a12e0fe49dc743176 Mon Sep 17 00:00:00 2001
From: Balazs Benics 
Date: Wed, 23 Feb 2022 12:53:07 +0100
Subject: [PATCH 617/748] Revert "Revert "[analyzer] Fix taint propagation by
 remembering to the location context""

This reverts commit d16c5f4192c30d53468a472c6820163a81192825.

Let's try `REQUIRES: asserts`.
---
 .../Checkers/GenericTaintChecker.cpp          | 39 +++++++++++++------
 ...nt-checker-callback-order-has-definition.c | 14 ++-----
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index 428778e6cfaa6..66143f78932c3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -38,6 +38,8 @@ using namespace clang;
 using namespace ento;
 using namespace taint;
 
+using llvm::ImmutableSet;
+
 namespace {
 
 class GenericTaintChecker;
@@ -434,7 +436,9 @@ template <> struct ScalarEnumerationTraits {
 /// to the call post-visit. The values are signed integers, which are either
 /// ReturnValueIndex, or indexes of the pointer/reference argument, which
 /// points to data, which should be tainted on return.
-REGISTER_SET_WITH_PROGRAMSTATE(TaintArgsOnPostVisit, ArgIdxTy)
+REGISTER_MAP_WITH_PROGRAMSTATE(TaintArgsOnPostVisit, const LocationContext *,
+                               ImmutableSet)
+REGISTER_SET_FACTORY_WITH_PROGRAMSTATE(ArgIdxFactory, ArgIdxTy)
 
 void GenericTaintRuleParser::validateArgVector(const std::string &Option,
                                                const ArgVecTy &Args) const {
@@ -685,22 +689,26 @@ void GenericTaintChecker::checkPostCall(const CallEvent &Call,
   // Set the marked values as tainted. The return value only accessible from
   // checkPostStmt.
   ProgramStateRef State = C.getState();
+  const StackFrameContext *CurrentFrame = C.getStackFrame();
 
   // Depending on what was tainted at pre-visit, we determined a set of
   // arguments which should be tainted after the function returns. These are
   // stored in the state as TaintArgsOnPostVisit set.
-  TaintArgsOnPostVisitTy TaintArgs = State->get();
-  if (TaintArgs.isEmpty())
+  TaintArgsOnPostVisitTy TaintArgsMap = State->get();
+
+  const ImmutableSet *TaintArgs = TaintArgsMap.lookup(CurrentFrame);
+  if (!TaintArgs)
     return;
+  assert(!TaintArgs->isEmpty());
 
   LLVM_DEBUG(for (ArgIdxTy I
-                  : TaintArgs) {
+                  : *TaintArgs) {
     llvm::dbgs() << "PostCall<";
     Call.dump(llvm::dbgs());
     llvm::dbgs() << "> actually wants to taint arg index: " << I << '\n';
   });
 
-  for (ArgIdxTy ArgNum : TaintArgs) {
+  for (ArgIdxTy ArgNum : *TaintArgs) {
     // Special handling for the tainted return value.
     if (ArgNum == ReturnValueIndex) {
       State = addTaint(State, Call.getReturnValue());
@@ -714,7 +722,7 @@ void GenericTaintChecker::checkPostCall(const CallEvent &Call,
   }
 
   // Clear up the taint info from the state.
-  State = State->remove();
+  State = State->remove(CurrentFrame);
   C.addTransition(State);
 }
 
@@ -776,28 +784,33 @@ void GenericTaintRule::process(const GenericTaintChecker &Checker,
   };
 
   /// Propagate taint where it is necessary.
+  auto &F = State->getStateManager().get_context();
+  ImmutableSet Result = F.getEmptySet();
   ForEachCallArg(
-      [this, &State, WouldEscape, &Call](ArgIdxTy I, const Expr *E, SVal V) {
+      [this, WouldEscape, &Call, &Result, &F](ArgIdxTy I, const Expr *E,
+                                              SVal V) {
         if (PropDstArgs.contains(I)) {
           LLVM_DEBUG(llvm::dbgs() << "PreCall<"; Call.dump(llvm::dbgs());
                      llvm::dbgs()
                      << "> prepares tainting arg index: " << I << '\n';);
-          State = State->add(I);
+          Result = F.add(Result, I);
         }
 
         // TODO: We should traverse all reachable memory regions via the
         // escaping parameter. Instead of doing that we simply mark only the
         // referred memory region as tainted.
         if (WouldEscape(V, E->getType())) {
-          LLVM_DEBUG(if (!State->contains(I)) {
+          LLVM_DEBUG(if (!Result.contains(I)) {
             llvm::dbgs() << "PreCall<";
             Call.dump(llvm::dbgs());
             llvm::dbgs() << "> prepares tainting arg index: " << I << '\n';
           });
-          State = State->add(I);
+          Result = F.add(Result, I);
         }
       });
 
+  if (!Result.isEmpty())
+    State = State->set(C.getStackFrame(), Result);
   C.addTransition(State);
 }
 
@@ -888,7 +901,11 @@ void GenericTaintChecker::taintUnsafeSocketProtocol(const CallEvent &Call,
   if (SafeProtocol)
     return;
 
-  C.addTransition(C.getState()->add(ReturnValueIndex));
+  ProgramStateRef State = C.getState();
+  auto &F = State->getStateManager().get_context();
+  ImmutableSet Result = F.add(F.getEmptySet(), ReturnValueIndex);
+  State = State->set(C.getStackFrame(), Result);
+  C.addTransition(State);
 }
 
 /// Checker registration
diff --git a/clang/test/Analysis/taint-checker-callback-order-has-definition.c b/clang/test/Analysis/taint-checker-callback-order-has-definition.c
index 82943ad46fbd8..f718fa5a49fc4 100644
--- a/clang/test/Analysis/taint-checker-callback-order-has-definition.c
+++ b/clang/test/Analysis/taint-checker-callback-order-has-definition.c
@@ -4,8 +4,6 @@
 // RUN:   2>&1 | FileCheck %s
 
 // REQUIRES: asserts
-// FIXME: We should not crash.
-// XFAIL: *
 
 struct _IO_FILE;
 typedef struct _IO_FILE FILE;
@@ -32,12 +30,8 @@ void top(const char *fname, char *buf) {
   // CHECK-NEXT: PreCall prepares tainting arg index: 1
   // CHECK-NEXT: PreCall prepares tainting arg index: 2
 
-  // FIXME: We should propagate taint from PreCall -> PostCall.
-  // CHECK-NEXT: PostCall actually wants to taint arg index: -1
-  // CHECK-NEXT: PostCall actually wants to taint arg index: 0
-  // CHECK-NEXT: PostCall actually wants to taint arg index: 1
-  // CHECK-NEXT: PostCall actually wants to taint arg index: 2
-
-  // FIXME: We should not crash.
-  // CHECK: PLEASE submit a bug report
+  // CHECK-NEXT: PostCall actually wants to taint arg index: -1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 0
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 1
+  // CHECK-NEXT: PostCall actually wants to taint arg index: 2
 }

From 7036413dc21254c8bf2f4ac62a3b087bc4b94ce8 Mon Sep 17 00:00:00 2001
From: Balazs Benics 
Date: Wed, 23 Feb 2022 12:55:31 +0100
Subject: [PATCH 618/748] Revert "Revert "[analyzer] Fix taint rule of fgets
 and setproctitle_init""

This reverts commit 2acead35c1289d2b3593a992b0639ca6427e481f.

Let's try `REQUIRES: asserts`.
---
 .../Checkers/GenericTaintChecker.cpp          |  4 ++--
 ...nt-checker-callback-order-has-definition.c |  4 +---
 ...hecker-callback-order-without-definition.c |  5 -----
 clang/test/Analysis/taint-generic.c           | 19 +++++++++++++++++++
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index 66143f78932c3..d15a4659a96e6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -559,7 +559,7 @@ void GenericTaintChecker::initTaintRules(CheckerContext &C) const {
       {{"atoll"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{"fgetc"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{"fgetln"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
-      {{"fgets"}, TR::Prop({{2}}, {{0}, ReturnValueIndex})},
+      {{"fgets"}, TR::Prop({{2}}, {{0, ReturnValueIndex}})},
       {{"fscanf"}, TR::Prop({{0}}, {{}, 2})},
       {{"sscanf"}, TR::Prop({{0}}, {{}, 2})},
       {{"getc"}, TR::Prop({{0}}, {{ReturnValueIndex}})},
@@ -632,7 +632,7 @@ void GenericTaintChecker::initTaintRules(CheckerContext &C) const {
   if (TR::UntrustedEnv(C)) {
     // void setproctitle_init(int argc, char *argv[], char *envp[])
     GlobalCRules.push_back(
-        {{{"setproctitle_init"}}, TR::Sink({{2}}, MsgCustomSink)});
+        {{{"setproctitle_init"}}, TR::Sink({{1, 2}}, MsgCustomSink)});
     GlobalCRules.push_back({{"getenv"}, TR::Source({{ReturnValueIndex}})});
   }
 
diff --git a/clang/test/Analysis/taint-checker-callback-order-has-definition.c b/clang/test/Analysis/taint-checker-callback-order-has-definition.c
index f718fa5a49fc4..eaf96cc675f06 100644
--- a/clang/test/Analysis/taint-checker-callback-order-has-definition.c
+++ b/clang/test/Analysis/taint-checker-callback-order-has-definition.c
@@ -27,11 +27,9 @@ void top(const char *fname, char *buf) {
   (void)fgets(buf, 42, fp); // Trigger taint propagation.
   // CHECK-NEXT: PreCall prepares tainting arg index: -1
   // CHECK-NEXT: PreCall prepares tainting arg index: 0
-  // CHECK-NEXT: PreCall prepares tainting arg index: 1
   // CHECK-NEXT: PreCall prepares tainting arg index: 2
-
+  //
   // CHECK-NEXT: PostCall actually wants to taint arg index: -1
   // CHECK-NEXT: PostCall actually wants to taint arg index: 0
-  // CHECK-NEXT: PostCall actually wants to taint arg index: 1
   // CHECK-NEXT: PostCall actually wants to taint arg index: 2
 }
diff --git a/clang/test/Analysis/taint-checker-callback-order-without-definition.c b/clang/test/Analysis/taint-checker-callback-order-without-definition.c
index dba23f367fd66..6de87f736926d 100644
--- a/clang/test/Analysis/taint-checker-callback-order-without-definition.c
+++ b/clang/test/Analysis/taint-checker-callback-order-without-definition.c
@@ -21,16 +21,11 @@ void top(const char *fname, char *buf) {
 
   (void)fgets(buf, 42, fp); // Trigger taint propagation.
 
-  // FIXME: Why is the arg index 1 prepared for taint?
-  // Before the call it wasn't tainted, and it also shouldn't be tainted after the call.
-
   // CHECK-NEXT: PreCall prepares tainting arg index: -1
   // CHECK-NEXT: PreCall prepares tainting arg index: 0
-  // CHECK-NEXT: PreCall prepares tainting arg index: 1
   // CHECK-NEXT: PreCall prepares tainting arg index: 2
   //
   // CHECK-NEXT: PostCall actually wants to taint arg index: -1
   // CHECK-NEXT: PostCall actually wants to taint arg index: 0
-  // CHECK-NEXT: PostCall actually wants to taint arg index: 1
   // CHECK-NEXT: PostCall actually wants to taint arg index: 2
 }
diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c
index 6979c06677646..0612e1b9f98bf 100644
--- a/clang/test/Analysis/taint-generic.c
+++ b/clang/test/Analysis/taint-generic.c
@@ -58,9 +58,11 @@ extern FILE *stdin;
 
 #define bool _Bool
 
+char *getenv(const char *name);
 int fscanf(FILE *restrict stream, const char *restrict format, ...);
 int sprintf(char *str, const char *format, ...);
 void setproctitle(const char *fmt, ...);
+void setproctitle_init(int argc, char *argv[], char *envp[]);
 typedef __typeof(sizeof(int)) size_t;
 
 // Define string functions. Use builtin for some of them. They all default to
@@ -404,3 +406,20 @@ void testConfigurationSinks(void) {
 void testUnknownFunction(void (*foo)(void)) {
   foo(); // no-crash
 }
+
+void testProctitleFalseNegative() {
+  char flag[80];
+  fscanf(stdin, "%79s", flag);
+  char *argv[] = {"myapp", flag};
+  // FIXME: We should have a warning below: Untrusted data passed to sink.
+  setproctitle_init(1, argv, 0);
+}
+
+void testProctitle2(char *real_argv[]) {
+  char *app = getenv("APP_NAME");
+  if (!app)
+    return;
+  char *argv[] = {app, "--foobar"};
+  setproctitle_init(1, argv, 0);         // expected-warning {{Untrusted data is passed to a user-defined sink}}
+  setproctitle_init(1, real_argv, argv); // expected-warning {{Untrusted data is passed to a user-defined sink}}
+}

From 5fb65557e36ab92d7b38606f7dffd1a3d73a2344 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 13:16:01 +0100
Subject: [PATCH 619/748] [InstCombine] Remove unused visitUDivOperand()
 argument (NFC)

This function only works on the RHS operand.
---
 .../InstCombine/InstCombineMulDivRem.cpp      | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 1aa10b550fc40..db239385aed06 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -972,22 +972,22 @@ static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
 // instruction, seeing through select instructions, to determine if we can
 // replace the udiv with something simpler.  If we find that an operand is not
 // able to simplify the udiv, we abort the entire transformation.
-static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
+static size_t visitUDivOperand(Value *Op, const BinaryOperator &I,
                                SmallVectorImpl &Actions,
                                unsigned Depth = 0) {
   // FIXME: assert that Op1 isn't/doesn't contain undef.
 
   // Check to see if this is an unsigned division with an exact power of 2,
   // if so, convert to a right shift.
-  if (match(Op1, m_Power2())) {
-    Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
+  if (match(Op, m_Power2())) {
+    Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op));
     return Actions.size();
   }
 
   // X udiv (C1 << N), where C1 is "1<  X >> (N+C2)
-  if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
-      match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
-    Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
+  if (match(Op, m_Shl(m_Power2(), m_Value())) ||
+      match(Op, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
+    Actions.push_back(UDivFoldAction(foldUDivShl, Op));
     return Actions.size();
   }
 
@@ -995,14 +995,13 @@ static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
   if (Depth++ == MaxDepth)
     return 0;
 
-  if (SelectInst *SI = dyn_cast(Op1))
+  if (SelectInst *SI = dyn_cast(Op))
     // FIXME: missed optimization: if one of the hands of select is/contains
     //        undef, just directly pick the other one.
     // FIXME: can both hands contain undef?
-    if (size_t LHSIdx =
-            visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
-      if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
-        Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
+    if (size_t LHSIdx = visitUDivOperand(SI->getOperand(1), I, Actions, Depth))
+      if (visitUDivOperand(SI->getOperand(2), I, Actions, Depth)) {
+        Actions.push_back(UDivFoldAction(nullptr, Op, LHSIdx - 1));
         return Actions.size();
       }
 
@@ -1108,7 +1107,7 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
 
   // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
   SmallVector UDivActions;
-  if (visitUDivOperand(Op0, Op1, I, UDivActions))
+  if (visitUDivOperand(Op1, I, UDivActions))
     for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
       FoldUDivOperandCb Action = UDivActions[i].FoldAction;
       Value *ActionOp1 = UDivActions[i].OperandToFold;

From aa9c2d19d9b73589d72114d6e0a4fb4ce42b922b Mon Sep 17 00:00:00 2001
From: Sven van Haastregt 
Date: Wed, 23 Feb 2022 12:22:09 +0000
Subject: [PATCH 620/748] [OpenCL] Align subgroup builtin guards

Until now, subgroup builtins are available with `opencl-c.h` when at
least one of `cl_intel_subgroups`, `cl_khr_subgroups`, or
`__opencl_c_subgroups` is defined.  With `-fdeclare-opencl-builtins`,
subgroup builtins are conditionalized on `cl_khr_subgroups` only.

Align `-fdeclare-opencl-builtins` to `opencl-c.h` by introducing the
internal `__opencl_subgroup_builtins` macro.

Differential Revision: https://reviews.llvm.org/D120254
---
 clang/lib/Headers/opencl-c-base.h                 | 5 +++++
 clang/lib/Headers/opencl-c.h                      | 4 ++--
 clang/lib/Sema/OpenCLBuiltins.td                  | 2 +-
 clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl | 5 +++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h
index 5191c41bcd057..d0a0d5bdbf4f5 100644
--- a/clang/lib/Headers/opencl-c-base.h
+++ b/clang/lib/Headers/opencl-c-base.h
@@ -80,6 +80,11 @@
 #define __opencl_c_named_address_space_builtins 1
 #endif // !defined(__opencl_c_generic_address_space)
 
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
+// Internal feature macro to provide subgroup builtins.
+#define __opencl_subgroup_builtins 1
+#endif
+
 // built-in scalar data types:
 
 /**
diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h
index 18c1c317e100f..172b2c192709f 100644
--- a/clang/lib/Headers/opencl-c.h
+++ b/clang/lib/Headers/opencl-c.h
@@ -16282,7 +16282,7 @@ queue_t __ovld get_default_queue(void);
 
 // OpenCL Extension v2.0 s9.17 - Sub-groups
 
-#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
+#if defined(__opencl_subgroup_builtins)
 // Shared Sub Group Functions
 uint    __ovld get_sub_group_size(void);
 uint    __ovld get_max_sub_group_size(void);
@@ -16381,7 +16381,7 @@ double  __ovld __conv sub_group_scan_inclusive_min(double x);
 double  __ovld __conv sub_group_scan_inclusive_max(double x);
 #endif //cl_khr_fp64
 
-#endif //cl_khr_subgroups cl_intel_subgroups __opencl_c_subgroups
+#endif // __opencl_subgroup_builtins
 
 #if defined(cl_khr_subgroup_extended_types)
 char __ovld __conv sub_group_broadcast( char value, uint index );
diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td
index e6da5e34f7091..ff23a1c52ff38 100644
--- a/clang/lib/Sema/OpenCLBuiltins.td
+++ b/clang/lib/Sema/OpenCLBuiltins.td
@@ -83,7 +83,7 @@ def AtomicFp64TypeExt : TypeExtension<"cl_khr_int64_base_atomics cl_khr_int64_ex
 
 // FunctionExtension definitions.
 def FuncExtNone                          : FunctionExtension<"">;
-def FuncExtKhrSubgroups                  : FunctionExtension<"cl_khr_subgroups">;
+def FuncExtKhrSubgroups                  : FunctionExtension<"__opencl_subgroup_builtins">;
 def FuncExtKhrSubgroupExtendedTypes      : FunctionExtension<"cl_khr_subgroup_extended_types">;
 def FuncExtKhrSubgroupNonUniformVote     : FunctionExtension<"cl_khr_subgroup_non_uniform_vote">;
 def FuncExtKhrSubgroupBallot             : FunctionExtension<"cl_khr_subgroup_ballot">;
diff --git a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl
index d2d7fff02efaa..89a4646839acb 100644
--- a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl
+++ b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL -fdeclare-opencl-builtins -DNO_HEADER
 // RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL -fdeclare-opencl-builtins -finclude-default-header
-// RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL1.2 -fdeclare-opencl-builtins -DNO_HEADER
-// RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL1.2 -fdeclare-opencl-builtins -finclude-default-header
+// RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL1.2 -fdeclare-opencl-builtins -DNO_HEADER -cl-ext=-cl_intel_subgroups
+// RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL1.2 -fdeclare-opencl-builtins -finclude-default-header -cl-ext=-cl_intel_subgroups
 // RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL2.0 -fdeclare-opencl-builtins -DNO_HEADER
 // RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL2.0 -fdeclare-opencl-builtins -finclude-default-header
 // RUN: %clang_cc1 %s -triple spir -verify -pedantic -Wconversion -Werror -fsyntax-only -cl-std=CL3.0 -fdeclare-opencl-builtins -finclude-default-header
@@ -79,6 +79,7 @@ typedef struct {int a;} ndrange_t;
 #define cl_khr_subgroup_non_uniform_arithmetic 1
 #define cl_khr_subgroup_clustered_reduce 1
 #define __opencl_c_read_write_images 1
+#define __opencl_subgroup_builtins 1
 #endif
 
 #if (__OPENCL_CPP_VERSION__ == 100 || __OPENCL_C_VERSION__ == 200)

From 5658d869ff4cefc5c3626f7e5658bb43b2b32063 Mon Sep 17 00:00:00 2001
From: Tim Northover 
Date: Wed, 23 Feb 2022 11:49:43 +0000
Subject: [PATCH 621/748] AArch64: clamp UBFX high-bit to 32-bits

We were producing invalid instructions like "ubfx w0, w0, #20, #16".
---
 llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp |  1 +
 llvm/test/CodeGen/AArch64/bitfield.ll           | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 770a5bbb0717f..c8a26247e228b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1852,6 +1852,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
     VT = Opd0->getValueType(0);
   } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
     Opd0 = Op0->getOperand(0);
+    ClampMSB = (VT == MVT::i32);
   } else if (BiggerPattern) {
     // Let's pretend a 0 shift right has been performed.
     // The resulting code will be at least as good as the original one
diff --git a/llvm/test/CodeGen/AArch64/bitfield.ll b/llvm/test/CodeGen/AArch64/bitfield.ll
index 2ea0a41466144..58fd0db036caa 100644
--- a/llvm/test/CodeGen/AArch64/bitfield.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield.ll
@@ -230,3 +230,16 @@ define dso_local i64 @test_sbfx64(i64* %addr) {
    %extended = ashr i64 %shifted, 1
    ret i64 %extended
 }
+
+define i32 @test_ubfx_mask(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: test_ubfx_mask:
+; CHECK: lsr w0, w1, #20
+  %mask = and i32 %lhs, 20
+  %i7 = add i32 %mask, 1
+  %i8 = xor i32 %lhs, 20
+  %i9 = xor i32 %i8, %i7
+  %i10 = and i32 %i9, 20
+  %shift = lshr i32 %rhs, %i10
+  %shift.masked = and i32 %shift, 65535
+  ret i32 %shift.masked
+}

From 8ad6d5e465bba198c883e699c28690b0ea79400d Mon Sep 17 00:00:00 2001
From: Anton Afanasyev 
Date: Wed, 23 Feb 2022 15:55:06 +0300
Subject: [PATCH 622/748] Revert "[AggressiveInstCombine] Add `phi` nodes
 support to `TruncInstCombine`"

This reverts commit f84d732f8c1737940afab71824134f41f37a048b.
Breakage of "sanitizer-x86_64-linux-fast"
---
 .../AggressiveInstCombineInternal.h           | 42 ++++-----
 .../TruncInstCombine.cpp                      | 87 +++++--------------
 .../AggressiveInstCombine/trunc_phi.ll        | 20 +++--
 3 files changed, 55 insertions(+), 94 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
index 9fc103d45d985..6c73645b20f20 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -23,14 +23,14 @@
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
-// TruncInstCombine - looks for expression graphs dominated by trunc
-// instructions and for each eligible graph, it will create a reduced bit-width
-// expression and replace the old expression with this new one and remove the
-// old one. Eligible expression graph is such that:
+// TruncInstCombine - looks for expression dags dominated by trunc instructions
+// and for each eligible dag, it will create a reduced bit-width expression and
+// replace the old expression with this new one and remove the old one.
+// Eligible expression dag is such that:
 //   1. Contains only supported instructions.
 //   2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
 //   3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
-//   4. All instructions in the graph must not have users outside the graph.
+//   4. All instructions in the dag must not have users outside the dag.
 //      Only exception is for {ZExt, SExt}Inst with operand type equal to the
 //      new reduced type chosen in (3).
 //
@@ -63,7 +63,7 @@ class TruncInstCombine {
   /// Current processed TruncInst instruction.
   TruncInst *CurrentTruncInst = nullptr;
 
-  /// Information per each instruction in the expression graph.
+  /// Information per each instruction in the expression dag.
   struct Info {
     /// Number of LSBs that are needed to generate a valid expression.
     unsigned ValidBitWidth = 0;
@@ -72,10 +72,10 @@ class TruncInstCombine {
     /// The reduced value generated to replace the old instruction.
     Value *NewValue = nullptr;
   };
-  /// An ordered map representing expression graph post-dominated by current
-  /// processed TruncInst. It maps each instruction in the graph to its Info
+  /// An ordered map representing expression dag post-dominated by current
+  /// processed TruncInst. It maps each instruction in the dag to its Info
   /// structure. The map is ordered such that each instruction appears before
-  /// all other instructions in the graph that uses it.
+  /// all other instructions in the dag that uses it.
   MapVector InstInfoMap;
 
 public:
@@ -87,11 +87,11 @@ class TruncInstCombine {
   bool run(Function &F);
 
 private:
-  /// Build expression graph dominated by the /p CurrentTruncInst and append it
-  /// to the InstInfoMap container.
+  /// Build expression dag dominated by the /p CurrentTruncInst and append it to
+  /// the InstInfoMap container.
   ///
-  /// \return true only if succeed to generate an eligible sub expression graph.
-  bool buildTruncExpressionGraph();
+  /// \return true only if succeed to generate an eligible sub expression dag.
+  bool buildTruncExpressionDag();
 
   /// Calculate the minimal allowed bit-width of the chain ending with the
   /// currently visited truncate's operand.
@@ -100,12 +100,12 @@ class TruncInstCombine {
   /// truncate's operand can be shrunk to.
   unsigned getMinBitWidth();
 
-  /// Build an expression graph dominated by the current processed TruncInst and
+  /// Build an expression dag dominated by the current processed TruncInst and
   /// Check if it is eligible to be reduced to a smaller type.
   ///
   /// \return the scalar version of the new type to be used for the reduced
-  ///         expression graph, or nullptr if the expression graph is not
-  ///         eligible to be reduced.
+  ///         expression dag, or nullptr if the expression dag is not eligible
+  ///         to be reduced.
   Type *getBestTruncatedType();
 
   KnownBits computeKnownBits(const Value *V) const {
@@ -128,12 +128,12 @@ class TruncInstCombine {
   /// \return the new reduced value.
   Value *getReducedOperand(Value *V, Type *SclTy);
 
-  /// Create a new expression graph using the reduced /p SclTy type and replace
-  /// the old expression graph with it. Also erase all instructions in the old
-  /// graph, except those that are still needed outside the graph.
+  /// Create a new expression dag using the reduced /p SclTy type and replace
+  /// the old expression dag with it. Also erase all instructions in the old
+  /// dag, except those that are still needed outside the dag.
   ///
-  /// \param SclTy scalar version of new type to reduce expression graph into.
-  void ReduceExpressionGraph(Type *SclTy);
+  /// \param SclTy scalar version of new type to reduce expression dag into.
+  void ReduceExpressionDag(Type *SclTy);
 };
 } // end namespace llvm.
 
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 71f3d76c0ba78..4624b735bef8c 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TruncInstCombine - looks for expression graphs post-dominated by TruncInst
-// and for each eligible graph, it will create a reduced bit-width expression,
-// replace the old expression with this new one and remove the old expression.
-// Eligible expression graph is such that:
+// TruncInstCombine - looks for expression dags post-dominated by TruncInst and
+// for each eligible dag, it will create a reduced bit-width expression, replace
+// the old expression with this new one and remove the old expression.
+// Eligible expression dag is such that:
 //   1. Contains only supported instructions.
 //   2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
 //   3. Can be evaluated into type with reduced legal bit-width.
-//   4. All instructions in the graph must not have users outside the graph.
+//   4. All instructions in the dag must not have users outside the dag.
 //      The only exception is for {ZExt, SExt}Inst with operand type equal to
 //      the new reduced type evaluated in (3).
 //
@@ -39,13 +39,14 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aggressive-instcombine"
 
-STATISTIC(NumExprsReduced, "Number of truncations eliminated by reducing bit "
-                           "width of expression graph");
+STATISTIC(
+    NumDAGsReduced,
+    "Number of truncations eliminated by reducing bit width of expression DAG");
 STATISTIC(NumInstrsReduced,
           "Number of instructions whose bit width was reduced");
 
 /// Given an instruction and a container, it fills all the relevant operands of
-/// that instruction, with respect to the Trunc expression graph optimizaton.
+/// that instruction, with respect to the Trunc expression dag optimizaton.
 static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) {
   unsigned Opc = I->getOpcode();
   switch (Opc) {
@@ -77,19 +78,15 @@ static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) {
     Ops.push_back(I->getOperand(1));
     Ops.push_back(I->getOperand(2));
     break;
-  case Instruction::PHI:
-    for (Value *V : cast(I)->incoming_values())
-      Ops.push_back(V);
-    break;
   default:
     llvm_unreachable("Unreachable!");
   }
 }
 
-bool TruncInstCombine::buildTruncExpressionGraph() {
+bool TruncInstCombine::buildTruncExpressionDag() {
   SmallVector Worklist;
   SmallVector Stack;
-  // Clear old instructions info.
+  // Clear old expression dag.
   InstInfoMap.clear();
 
   Worklist.push_back(CurrentTruncInst->getOperand(0));
@@ -153,19 +150,11 @@ bool TruncInstCombine::buildTruncExpressionGraph() {
       append_range(Worklist, Operands);
       break;
     }
-    case Instruction::PHI: {
-      SmallVector Operands;
-      getRelevantOperands(I, Operands);
-      // Add only operands not in Stack to prevent cycle
-      for (auto *Op : Operands)
-        if (all_of(Stack, [Op](Value *V) { return Op != V; }))
-          Worklist.push_back(Op);
-      break;
-    }
     default:
       // TODO: Can handle more cases here:
       // 1. shufflevector
       // 2. sdiv, srem
+      // 3. phi node(and loop handling)
       // ...
       return false;
     }
@@ -265,7 +254,7 @@ unsigned TruncInstCombine::getMinBitWidth() {
 }
 
 Type *TruncInstCombine::getBestTruncatedType() {
-  if (!buildTruncExpressionGraph())
+  if (!buildTruncExpressionDag())
     return nullptr;
 
   // We don't want to duplicate instructions, which isn't profitable. Thus, we
@@ -378,10 +367,8 @@ Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
   return Entry.NewValue;
 }
 
-void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
+void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
   NumInstrsReduced += InstInfoMap.size();
-  // Pairs of old and new phi-nodes
-  SmallVector, 2> OldNewPHINodes;
   for (auto &Itr : InstInfoMap) { // Forward
     Instruction *I = Itr.first;
     TruncInstCombine::Info &NodeInfo = Itr.second;
@@ -464,12 +451,6 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
       Res = Builder.CreateSelect(Op0, LHS, RHS);
       break;
     }
-    case Instruction::PHI: {
-      Res = Builder.CreatePHI(getReducedType(I, SclTy), I->getNumOperands());
-      OldNewPHINodes.push_back(
-          std::make_pair(cast(I), cast(Res)));
-      break;
-    }
     default:
       llvm_unreachable("Unhandled instruction");
     }
@@ -479,14 +460,6 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
       ResI->takeName(I);
   }
 
-  for (auto &Node : OldNewPHINodes) {
-    PHINode *OldPN = Node.first;
-    PHINode *NewPN = Node.second;
-    for (auto Incoming : zip(OldPN->incoming_values(), OldPN->blocks()))
-      NewPN->addIncoming(getReducedOperand(std::get<0>(Incoming), SclTy),
-                         std::get<1>(Incoming));
-  }
-
   Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
   Type *DstTy = CurrentTruncInst->getType();
   if (Res->getType() != DstTy) {
@@ -497,31 +470,17 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
   }
   CurrentTruncInst->replaceAllUsesWith(Res);
 
-  // Erase old expression graph, which was replaced by the reduced expression
-  // graph.
-  CurrentTruncInst->eraseFromParent();
-  // First, erase old phi-nodes and its uses
-  for (auto &Node : OldNewPHINodes) {
-    PHINode *OldPN = Node.first;
-    OldPN->replaceAllUsesWith(PoisonValue::get(OldPN->getType()));
-    OldPN->eraseFromParent();
-  }
-  // Now we have expression graph turned into dag.
-  // We iterate backward, which means we visit the instruction before we
-  // visit any of its operands, this way, when we get to the operand, we already
+  // Erase old expression dag, which was replaced by the reduced expression dag.
+  // We iterate backward, which means we visit the instruction before we visit
+  // any of its operands, this way, when we get to the operand, we already
   // removed the instructions (from the expression dag) that uses it.
+  CurrentTruncInst->eraseFromParent();
   for (auto &I : llvm::reverse(InstInfoMap)) {
-    // Skip phi-nodes since they were erased before
-    if (isa(I.first))
-      continue;
     // We still need to check that the instruction has no users before we erase
     // it, because {SExt, ZExt}Inst Instruction might have other users that was
     // not reduced, in such case, we need to keep that instruction.
     if (I.first->use_empty())
       I.first->eraseFromParent();
-    else
-      assert((isa(I.first) || isa(I.first)) &&
-             "Only {SExt, ZExt}Inst might have unreduced users");
   }
 }
 
@@ -539,18 +498,18 @@ bool TruncInstCombine::run(Function &F) {
   }
 
   // Process all TruncInst in the Worklist, for each instruction:
-  //   1. Check if it dominates an eligible expression graph to be reduced.
-  //   2. Create a reduced expression graph and replace the old one with it.
+  //   1. Check if it dominates an eligible expression dag to be reduced.
+  //   2. Create a reduced expression dag and replace the old one with it.
   while (!Worklist.empty()) {
     CurrentTruncInst = Worklist.pop_back_val();
 
     if (Type *NewDstSclTy = getBestTruncatedType()) {
       LLVM_DEBUG(
-          dbgs() << "ICE: TruncInstCombine reducing type of expression graph "
+          dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
                     "dominated by: "
                  << CurrentTruncInst << '\n');
-      ReduceExpressionGraph(NewDstSclTy);
-      ++NumExprsReduced;
+      ReduceExpressionDag(NewDstSclTy);
+      ++NumDAGsReduced;
       MadeIRChange = true;
     }
   }
diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
index 01103a1a5afbf..46bdb60fada6c 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_phi.ll
@@ -4,17 +4,18 @@
 define i16 @trunc_phi(i8 %x) {
 ; CHECK-LABEL: @trunc_phi(
 ; CHECK-NEXT:  LoopHeader:
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       Loop:
-; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i16 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[SHL:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOPHEADER]] ], [ [[I:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[SHL]] = shl i16 [[ZEXT2]], 1
+; CHECK-NEXT:    [[SHL]] = shl i32 [[ZEXT2]], 1
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16
 ; CHECK-NEXT:    [[I]] = add i32 [[J]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I]], 10
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOPEND:%.*]], label [[LOOP]]
 ; CHECK:       LoopEnd:
-; CHECK-NEXT:    ret i16 [[SHL]]
+; CHECK-NEXT:    ret i16 [[TRUNC]]
 ;
 LoopHeader:
   %zext = zext i8 %x to i32
@@ -36,21 +37,22 @@ LoopEnd:
 define i16 @trunc_phi2(i8 %x, i32 %sw) {
 ; CHECK-LABEL: @trunc_phi2(
 ; CHECK-NEXT:  LoopHeader:
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
 ; CHECK-NEXT:    switch i32 [[SW:%.*]], label [[LOOPEND:%.*]] [
 ; CHECK-NEXT:    i32 0, label [[LOOP:%.*]]
 ; CHECK-NEXT:    i32 1, label [[LOOP]]
 ; CHECK-NEXT:    ]
 ; CHECK:       Loop:
-; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i16 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[ZEXT]], [[LOOPHEADER]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ZEXT2:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER:%.*]] ], [ [[ZEXT]], [[LOOPHEADER]] ], [ [[SHL:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOPHEADER]] ], [ 0, [[LOOPHEADER]] ], [ [[I:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[SHL]] = shl i16 [[ZEXT2]], 1
+; CHECK-NEXT:    [[SHL]] = shl i32 [[ZEXT2]], 1
 ; CHECK-NEXT:    [[I]] = add i32 [[J]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I]], 10
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOPEND]], label [[LOOP]]
 ; CHECK:       LoopEnd:
-; CHECK-NEXT:    [[ZEXT3:%.*]] = phi i16 [ [[ZEXT]], [[LOOPHEADER]] ], [ [[ZEXT2]], [[LOOP]] ]
-; CHECK-NEXT:    ret i16 [[ZEXT3]]
+; CHECK-NEXT:    [[ZEXT3:%.*]] = phi i32 [ [[ZEXT]], [[LOOPHEADER]] ], [ [[ZEXT2]], [[LOOP]] ]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ZEXT3]] to i16
+; CHECK-NEXT:    ret i16 [[TRUNC]]
 ;
 LoopHeader:
   %zext = zext i8 %x to i32

From 3c840e3c00e910c47a3f61f755fdc402d51e9fb6 Mon Sep 17 00:00:00 2001
From: Simon Atanasyan 
Date: Wed, 23 Feb 2022 15:53:41 +0300
Subject: [PATCH 623/748] [MIPS] Recognize DT_MIPS_XHASH dynamic table tag

LLVM tools do not emit `DT_MIPS_XHASH` dynamic table tag. But now
`llvm-objdump` and `llvm-readelf` recognize this tag and print it.

Fixes https://github.com/llvm/llvm-project/issues/53996
---
 llvm/include/llvm/BinaryFormat/DynamicTags.def            | 1 +
 .../ELF/dynamic-section-machine-specific.test             | 3 +++
 .../llvm-readobj/ELF/dynamic-tags-machine-specific.test   | 8 ++++++--
 .../tools/obj2yaml/ELF/dynamic-section-arch-tags.yaml     | 4 ++++
 llvm/tools/llvm-readobj/ELFDumper.cpp                     | 1 +
 5 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/DynamicTags.def b/llvm/include/llvm/BinaryFormat/DynamicTags.def
index 814d8b113ec4e..ae25ec53813c7 100644
--- a/llvm/include/llvm/BinaryFormat/DynamicTags.def
+++ b/llvm/include/llvm/BinaryFormat/DynamicTags.def
@@ -209,6 +209,7 @@ MIPS_DYNAMIC_TAG(MIPS_RWPLT, 0x70000034)        // Points to the base
                                                 // of a writable PLT.
 MIPS_DYNAMIC_TAG(MIPS_RLD_MAP_REL, 0x70000035)  // Relative offset of run time loader
                                                 // map, used for debugging.
+MIPS_DYNAMIC_TAG(MIPS_XHASH, 0x70000036)        // GNU-style hash table with xlat.
 
 // PPC specific dynamic table entries.
 PPC_DYNAMIC_TAG(PPC_GOT, 0x70000000) // Uses Secure PLT ABI.
diff --git a/llvm/test/tools/llvm-objdump/ELF/dynamic-section-machine-specific.test b/llvm/test/tools/llvm-objdump/ELF/dynamic-section-machine-specific.test
index 08d7d2e9c7c73..20219dd4893b7 100644
--- a/llvm/test/tools/llvm-objdump/ELF/dynamic-section-machine-specific.test
+++ b/llvm/test/tools/llvm-objdump/ELF/dynamic-section-machine-specific.test
@@ -86,6 +86,7 @@ ProgramHeaders:
 # MIPS-NEXT:  MIPS_PLTGOT                0x0000000000001000
 # MIPS-NEXT:  MIPS_RWPLT                 0x0000000000001000
 # MIPS-NEXT:  MIPS_RLD_MAP_REL           0x0000000000001000
+# MIPS-NEXT:  MIPS_XHASH                 0x0000000000002000
 
 --- !ELF
 FileHeader:
@@ -187,6 +188,8 @@ Sections:
         Value: 0x1000
       - Tag:   DT_MIPS_RLD_MAP_REL
         Value: 0x1000
+      - Tag:   DT_MIPS_XHASH
+        Value: 0x2000
       - Tag:   DT_NULL
         Value: 0
 ProgramHeaders:
diff --git a/llvm/test/tools/llvm-readobj/ELF/dynamic-tags-machine-specific.test b/llvm/test/tools/llvm-readobj/ELF/dynamic-tags-machine-specific.test
index 970edccd7777d..c32ea33b9b3cb 100644
--- a/llvm/test/tools/llvm-readobj/ELF/dynamic-tags-machine-specific.test
+++ b/llvm/test/tools/llvm-readobj/ELF/dynamic-tags-machine-specific.test
@@ -53,7 +53,7 @@ ProgramHeaders:
 # RUN: llvm-readelf --dynamic-table %t.mips \
 # RUN:  | FileCheck %s --strict-whitespace --match-full-lines --check-prefix=GNU-MIPS
 
-#      LLVM-MIPS:DynamicSection [ (47 entries)
+#      LLVM-MIPS:DynamicSection [ (48 entries)
 # LLVM-MIPS-NEXT:  Tag                Type                       Name/Value
 # LLVM-MIPS-NEXT:  0x0000000000000004 HASH                       0x1000
 # LLVM-MIPS-NEXT:  0x0000000070000001 MIPS_RLD_VERSION           305419896
@@ -101,10 +101,11 @@ ProgramHeaders:
 # LLVM-MIPS-NEXT:  0x0000000070000032 MIPS_PLTGOT                0x1000
 # LLVM-MIPS-NEXT:  0x0000000070000034 MIPS_RWPLT                 0x1000
 # LLVM-MIPS-NEXT:  0x0000000070000035 MIPS_RLD_MAP_REL           0x1000
+# LLVM-MIPS-NEXT:  0x0000000070000036 MIPS_XHASH                 0x2000
 # LLVM-MIPS-NEXT:  0x0000000000000000 NULL                       0x0
 # LLVM-MIPS-NEXT:]
 
-#      GNU-MIPS:Dynamic section at offset {{.*}} contains 47 entries:
+#      GNU-MIPS:Dynamic section at offset {{.*}} contains 48 entries:
 # GNU-MIPS-NEXT:  Tag                Type                         Name/Value
 # GNU-MIPS-NEXT:  0x0000000000000004 (HASH)                       0x1000
 # GNU-MIPS-NEXT:  0x0000000070000001 (MIPS_RLD_VERSION)           305419896
@@ -152,6 +153,7 @@ ProgramHeaders:
 # GNU-MIPS-NEXT:  0x0000000070000032 (MIPS_PLTGOT)                0x1000
 # GNU-MIPS-NEXT:  0x0000000070000034 (MIPS_RWPLT)                 0x1000
 # GNU-MIPS-NEXT:  0x0000000070000035 (MIPS_RLD_MAP_REL)           0x1000
+# GNU-MIPS-NEXT:  0x0000000070000036 (MIPS_XHASH)                 0x2000
 # GNU-MIPS-NEXT:  0x0000000000000000 (NULL)                       0x0
 
 --- !ELF
@@ -256,6 +258,8 @@ Sections:
         Value: 0x1000
       - Tag:   DT_MIPS_RLD_MAP_REL
         Value: 0x1000
+      - Tag:   DT_MIPS_XHASH
+        Value: 0x2000
       - Tag:   DT_NULL
         Value: 0
 ProgramHeaders:
diff --git a/llvm/test/tools/obj2yaml/ELF/dynamic-section-arch-tags.yaml b/llvm/test/tools/obj2yaml/ELF/dynamic-section-arch-tags.yaml
index 5523b2fee37df..538838ff7c1e8 100644
--- a/llvm/test/tools/obj2yaml/ELF/dynamic-section-arch-tags.yaml
+++ b/llvm/test/tools/obj2yaml/ELF/dynamic-section-arch-tags.yaml
@@ -97,6 +97,8 @@
 # MIPS-NEXT:   Value:           0x2D
 # MIPS-NEXT: - Tag:             DT_MIPS_RLD_MAP_REL
 # MIPS-NEXT:   Value:           0x2E
+# MIPS-NEXT: - Tag:             DT_MIPS_XHASH
+# MIPS-NEXT:   Value:           0x2F
 
 --- !ELF
 FileHeader:
@@ -200,6 +202,8 @@ Sections:
         Value:           0x000000000000002D
       - Tag:             DT_MIPS_RLD_MAP_REL
         Value:           0x000000000000002E
+      - Tag:             DT_MIPS_XHASH
+        Value:           0x000000000000002F
 
 ## Check we can handle Hexagon specific tags.
 # RUN: yaml2obj --docnum=2 %s -o %t2
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 6583897c77f8f..20264ad72b5d4 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -2266,6 +2266,7 @@ std::string ELFDumper::getDynamicEntry(uint64_t Type,
     case DT_MIPS_PLTGOT:
     case DT_MIPS_RWPLT:
     case DT_MIPS_RLD_MAP_REL:
+    case DT_MIPS_XHASH:
       return FormatHexValue(Value);
     case DT_MIPS_FLAGS:
       return FormatFlags(Value, makeArrayRef(ElfDynamicDTMipsFlags));

From 5dd0c396384624cdcae646e8cdd254f28e3e478e Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Wed, 23 Feb 2022 08:09:53 -0500
Subject: [PATCH 624/748] [Libomptarget][NFC] Fix missing newline in error
 message

---
 openmp/libomptarget/DeviceRTL/src/State.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index a530c5e0b2471..81a1bf2c6657a 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -138,7 +138,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
 
   if (config::isDebugMode(config::DebugKind::CommonIssues))
     PRINT("Shared memory stack full, fallback to dynamic allocation of global "
-          "memory will negatively impact performance.");
+          "memory will negatively impact performance.\n");
   void *GlobalMemory = memory::allocGlobal(
       AlignedBytes, "Slow path shared memory allocation, insufficient "
                     "shared memory stack memory!");

From 14536ce007b7040c93749a2213bbb291e54f8d05 Mon Sep 17 00:00:00 2001
From: Aaron Ballman 
Date: Wed, 23 Feb 2022 08:13:15 -0500
Subject: [PATCH 625/748] Add myself to the office hours; correct small typo in
 prose

---
 llvm/docs/GettingInvolved.rst | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 82d89e4e70470..06acc7b07c033 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -219,7 +219,7 @@ Office hours
 A number of experienced LLVM contributors make themselves available for a chat
 on a regular schedule, to anyone who is looking for some guidance. Please find
 the list of who is available when, through which medium, and what their area of
-expertise is. Don't by shy to dial in!
+expertise is. Don't be too shy to dial in!
 
 Of course, people take time off from time to time, so if you dial in and you
 don't find anyone present, chances are they happen to be off that day.
@@ -249,6 +249,13 @@ don't find anyone present, chances are they happen to be off that day.
       `gcal `__
     - `GoogleMeet `__
     - English, Romanian
+  * - Aaron Ballman
+    - Clang internals; clang-tidy; clang-query; AST matchers
+    - Monthly, 2nd Monday of the month at 10:00am Eastern/14:00 UTC, for 30 minutes.
+      `ics `__
+      `gcal `__
+    - `GoogleMeet `__
+    - English, Norwegian (not fluently)
 
 
 IRC

From 40f908195807953f19322a8d1a193d243dc5bfb6 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme 
Date: Wed, 23 Feb 2022 13:24:06 +0000
Subject: [PATCH 626/748] [LAA] Add missing newline in debug print

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 37f867e1c2e61..2a62c46a05c47 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2291,7 +2291,7 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
                   "at most once.\n");
     return;
   }
-  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
+  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");
 
   SymbolicStrides[Ptr] = Stride;
   StrideSet.insert(Stride);

From d0810779b1f310d99176467d5d5b5aa4e26d7eb5 Mon Sep 17 00:00:00 2001
From: Pavel Labath 
Date: Mon, 21 Feb 2022 11:07:38 +0100
Subject: [PATCH 627/748] [lldb] Modernize ThreadLauncher

Accept a function object instead of a raw pointer. This avoids a bunch
of boilerplate typically needed to pass arguments to the thread
functions.

Differential Revision: https://reviews.llvm.org/D120321
---
 lldb/include/lldb/Core/Communication.h        |  4 +-
 lldb/include/lldb/Core/Debugger.h             |  6 +-
 lldb/include/lldb/Host/ThreadLauncher.h       | 13 ++-
 lldb/include/lldb/Target/Process.h            | 11 ---
 lldb/source/API/SBHostOS.cpp                  |  4 +-
 lldb/source/Core/Communication.cpp            | 33 +++----
 lldb/source/Core/Debugger.cpp                 | 20 ++--
 lldb/source/Host/common/Host.cpp              | 81 +++++++---------
 .../Host/common/HostNativeThreadBase.cpp      | 12 +--
 lldb/source/Host/common/ThreadLauncher.cpp    | 18 ++--
 lldb/source/Host/macosx/objcxx/Host.mm        |  5 +-
 .../Process/MacOSX-Kernel/ProcessKDP.cpp      | 32 +++----
 .../Process/MacOSX-Kernel/ProcessKDP.h        |  2 +-
 .../Process/Windows/Common/DebuggerThread.cpp | 52 ++---------
 .../Process/Windows/Common/DebuggerThread.h   |  2 -
 .../gdb-remote/GDBRemoteCommunication.cpp     | 15 ++-
 .../gdb-remote/GDBRemoteCommunication.h       |  2 +-
 .../Process/gdb-remote/ProcessGDBRemote.cpp   | 92 +++++++++----------
 .../Process/gdb-remote/ProcessGDBRemote.h     |  2 +-
 lldb/source/Target/Process.cpp                | 19 ++--
 lldb/unittests/Host/CMakeLists.txt            |  1 +
 lldb/unittests/Host/ThreadLauncherTest.cpp    | 29 ++++++
 22 files changed, 194 insertions(+), 261 deletions(-)
 create mode 100644 lldb/unittests/Host/ThreadLauncherTest.cpp

diff --git a/lldb/include/lldb/Core/Communication.h b/lldb/include/lldb/Core/Communication.h
index fdcb6c5fb9822..44b3a16a05269 100644
--- a/lldb/include/lldb/Core/Communication.h
+++ b/lldb/include/lldb/Core/Communication.h
@@ -277,7 +277,7 @@ class Communication : public Broadcaster {
   ///     \b True if the read thread is running, \b false otherwise.
   bool ReadThreadIsRunning();
 
-  /// The static read thread function. This function will call the "DoRead"
+  /// The read thread function. This function will call the "DoRead"
   /// function continuously and wait for data to become available. When data
   /// is received it will append the available data to the internal cache and
   /// broadcast a \b eBroadcastBitReadThreadGotBytes event.
@@ -289,7 +289,7 @@ class Communication : public Broadcaster {
   ///     \b NULL.
   ///
   /// \see void Communication::ReadThreadGotBytes (const uint8_t *, size_t);
-  static lldb::thread_result_t ReadThread(lldb::thread_arg_t comm_ptr);
+  lldb::thread_result_t ReadThread();
 
   void SetReadThreadBytesReceivedCallback(ReadThreadBytesReceived callback,
                                           void *callback_baton);
diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index f9a1f1eea54f2..d4fae0c5d09cb 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -439,8 +439,6 @@ class Debugger : public std::enable_shared_from_this,
 
   void StopEventHandlerThread();
 
-  static lldb::thread_result_t EventHandlerThread(lldb::thread_arg_t arg);
-
   void PushIOHandler(const lldb::IOHandlerSP &reader_sp,
                      bool cancel_top_handler = true);
 
@@ -454,9 +452,9 @@ class Debugger : public std::enable_shared_from_this,
 
   void JoinIOHandlerThread();
 
-  static lldb::thread_result_t IOHandlerThread(lldb::thread_arg_t arg);
+  lldb::thread_result_t IOHandlerThread();
 
-  void DefaultEventHandler();
+  lldb::thread_result_t DefaultEventHandler();
 
   void HandleBreakpointEvent(const lldb::EventSP &event_sp);
 
diff --git a/lldb/include/lldb/Host/ThreadLauncher.h b/lldb/include/lldb/Host/ThreadLauncher.h
index 00b42fa6a11d5..8bb6c79466a76 100644
--- a/lldb/include/lldb/Host/ThreadLauncher.h
+++ b/lldb/include/lldb/Host/ThreadLauncher.h
@@ -20,8 +20,8 @@ namespace lldb_private {
 class ThreadLauncher {
 public:
   static llvm::Expected
-  LaunchThread(llvm::StringRef name, lldb::thread_func_t thread_function,
-               lldb::thread_arg_t thread_arg,
+  LaunchThread(llvm::StringRef name,
+               std::function thread_function,
                size_t min_stack_byte_size = 0); // Minimum stack size in bytes,
                                                 // set stack size to zero for
                                                 // default platform thread stack
@@ -29,12 +29,11 @@ class ThreadLauncher {
 
   struct HostThreadCreateInfo {
     std::string thread_name;
-    lldb::thread_func_t thread_fptr;
-    lldb::thread_arg_t thread_arg;
+    std::function impl;
 
-    HostThreadCreateInfo(const char *name, lldb::thread_func_t fptr,
-                         lldb::thread_arg_t arg)
-        : thread_name(name ? name : ""), thread_fptr(fptr), thread_arg(arg) {}
+    HostThreadCreateInfo(std::string thread_name,
+                         std::function impl)
+        : thread_name(std::move(thread_name)), impl(std::move(impl)) {}
   };
 };
 }
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index adceec619ff04..23debbee705ea 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -2989,17 +2989,6 @@ void PruneThreadPlans();
   void ResumePrivateStateThread();
 
 private:
-  struct PrivateStateThreadArgs {
-    PrivateStateThreadArgs(Process *p, bool s)
-        : process(p), is_secondary_thread(s){};
-    Process *process;
-    bool is_secondary_thread;
-  };
-
-  // arg is a pointer to a new'ed PrivateStateThreadArgs structure.
-  // PrivateStateThread will free it for you.
-  static lldb::thread_result_t PrivateStateThread(void *arg);
-
   // The starts up the private state thread that will watch for events from the
   // debugee. Pass true for is_secondary_thread in the case where you have to
   // temporarily spin up a secondary state thread to handle events from a hand-
diff --git a/lldb/source/API/SBHostOS.cpp b/lldb/source/API/SBHostOS.cpp
index 06cf654031a1d..cb026fd9203b9 100644
--- a/lldb/source/API/SBHostOS.cpp
+++ b/lldb/source/API/SBHostOS.cpp
@@ -102,7 +102,9 @@ lldb::thread_t SBHostOS::ThreadCreate(const char *name,
                                       void *thread_arg, SBError *error_ptr) {
   LLDB_INSTRUMENT_VA(name, thread_function, thread_arg, error_ptr);
   llvm::Expected thread =
-      ThreadLauncher::LaunchThread(name, thread_function, thread_arg);
+      ThreadLauncher::LaunchThread(name, [thread_function, thread_arg] {
+        return thread_function(thread_arg);
+      });
   if (!thread) {
     if (error_ptr)
       error_ptr->SetError(Status(thread.takeError()));
diff --git a/lldb/source/Core/Communication.cpp b/lldb/source/Core/Communication.cpp
index 0e4d4fc49a429..f41ce46ede88f 100644
--- a/lldb/source/Core/Communication.cpp
+++ b/lldb/source/Core/Communication.cpp
@@ -213,7 +213,7 @@ bool Communication::StartReadThread(Status *error_ptr) {
   m_read_thread_enabled = true;
   m_read_thread_did_exit = false;
   auto maybe_thread = ThreadLauncher::LaunchThread(
-      thread_name, Communication::ReadThread, this);
+      thread_name, [this] { return ReadThread(); });
   if (maybe_thread) {
     m_read_thread = *maybe_thread;
   } else {
@@ -311,12 +311,10 @@ size_t Communication::ReadFromConnection(void *dst, size_t dst_len,
 
 bool Communication::ReadThreadIsRunning() { return m_read_thread_enabled; }
 
-lldb::thread_result_t Communication::ReadThread(lldb::thread_arg_t p) {
-  Communication *comm = (Communication *)p;
-
+lldb::thread_result_t Communication::ReadThread() {
   Log *log = GetLog(LLDBLog::Communication);
 
-  LLDB_LOGF(log, "%p Communication::ReadThread () thread starting...", p);
+  LLDB_LOG(log, "Communication({0}) thread starting...", this);
 
   uint8_t buf[1024];
 
@@ -324,11 +322,11 @@ lldb::thread_result_t Communication::ReadThread(lldb::thread_arg_t p) {
   ConnectionStatus status = eConnectionStatusSuccess;
   bool done = false;
   bool disconnect = false;
-  while (!done && comm->m_read_thread_enabled) {
-    size_t bytes_read = comm->ReadFromConnection(
+  while (!done && m_read_thread_enabled) {
+    size_t bytes_read = ReadFromConnection(
         buf, sizeof(buf), std::chrono::seconds(5), status, &error);
     if (bytes_read > 0 || status == eConnectionStatusEndOfFile)
-      comm->AppendBytesToCache(buf, bytes_read, true, status);
+      AppendBytesToCache(buf, bytes_read, true, status);
 
     switch (status) {
     case eConnectionStatusSuccess:
@@ -336,12 +334,12 @@ lldb::thread_result_t Communication::ReadThread(lldb::thread_arg_t p) {
 
     case eConnectionStatusEndOfFile:
       done = true;
-      disconnect = comm->GetCloseOnEOF();
+      disconnect = GetCloseOnEOF();
       break;
     case eConnectionStatusError: // Check GetError() for details
       if (error.GetType() == eErrorTypePOSIX && error.GetError() == EIO) {
         // EIO on a pipe is usually caused by remote shutdown
-        disconnect = comm->GetCloseOnEOF();
+        disconnect = GetCloseOnEOF();
         done = true;
       }
       if (error.Fail())
@@ -352,7 +350,7 @@ lldb::thread_result_t Communication::ReadThread(lldb::thread_arg_t p) {
                                        // SynchronizeWithReadThread()
       // The connection returns eConnectionStatusInterrupted only when there is
       // no input pending to be read, so we can signal that.
-      comm->BroadcastEvent(eBroadcastBitNoMorePendingInput);
+      BroadcastEvent(eBroadcastBitNoMorePendingInput);
       break;
     case eConnectionStatusNoConnection:   // No connection
     case eConnectionStatusLostConnection: // Lost connection while connected to
@@ -367,26 +365,25 @@ lldb::thread_result_t Communication::ReadThread(lldb::thread_arg_t p) {
     }
   }
   log = GetLog(LLDBLog::Communication);
-  if (log)
-    LLDB_LOGF(log, "%p Communication::ReadThread () thread exiting...", p);
+  LLDB_LOG(log, "Communication({0}) thread exiting...", this);
 
   // Handle threads wishing to synchronize with us.
   {
     // Prevent new ones from showing up.
-    comm->m_read_thread_did_exit = true;
+    m_read_thread_did_exit = true;
 
     // Unblock any existing thread waiting for the synchronization signal.
-    comm->BroadcastEvent(eBroadcastBitNoMorePendingInput);
+    BroadcastEvent(eBroadcastBitNoMorePendingInput);
 
     // Wait for the thread to finish...
-    std::lock_guard guard(comm->m_synchronize_mutex);
+    std::lock_guard guard(m_synchronize_mutex);
     // ... and disconnect.
     if (disconnect)
-      comm->Disconnect();
+      Disconnect();
   }
 
   // Let clients know that this thread is exiting
-  comm->BroadcastEvent(eBroadcastBitReadThreadDidExit);
+  BroadcastEvent(eBroadcastBitReadThreadDidExit);
   return {};
 }
 
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index b4ef65c2a87f0..ae4fb93e6d4af 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -1576,7 +1576,7 @@ void Debugger::CancelForwardEvents(const ListenerSP &listener_sp) {
   m_forward_listener_sp.reset();
 }
 
-void Debugger::DefaultEventHandler() {
+lldb::thread_result_t Debugger::DefaultEventHandler() {
   ListenerSP listener_sp(GetListener());
   ConstString broadcaster_class_target(Target::GetStaticBroadcasterClass());
   ConstString broadcaster_class_process(Process::GetStaticBroadcasterClass());
@@ -1662,10 +1662,6 @@ void Debugger::DefaultEventHandler() {
       }
     }
   }
-}
-
-lldb::thread_result_t Debugger::EventHandlerThread(lldb::thread_arg_t arg) {
-  ((Debugger *)arg)->DefaultEventHandler();
   return {};
 }
 
@@ -1687,8 +1683,9 @@ bool Debugger::StartEventHandlerThread() {
 
     // Use larger 8MB stack for this thread
     llvm::Expected event_handler_thread =
-        ThreadLauncher::LaunchThread(thread_name, EventHandlerThread, this,
-                                     g_debugger_event_thread_stack_bytes);
+        ThreadLauncher::LaunchThread(
+            thread_name, [this] { return DefaultEventHandler(); },
+            g_debugger_event_thread_stack_bytes);
 
     if (event_handler_thread) {
       m_event_handler_thread = *event_handler_thread;
@@ -1716,10 +1713,9 @@ void Debugger::StopEventHandlerThread() {
   }
 }
 
-lldb::thread_result_t Debugger::IOHandlerThread(lldb::thread_arg_t arg) {
-  Debugger *debugger = (Debugger *)arg;
-  debugger->RunIOHandlers();
-  debugger->StopEventHandlerThread();
+lldb::thread_result_t Debugger::IOHandlerThread() {
+  RunIOHandlers();
+  StopEventHandlerThread();
   return {};
 }
 
@@ -1728,7 +1724,7 @@ bool Debugger::HasIOHandlerThread() { return m_io_handler_thread.IsJoinable(); }
 bool Debugger::StartIOHandlerThread() {
   if (!m_io_handler_thread.IsJoinable()) {
     llvm::Expected io_handler_thread = ThreadLauncher::LaunchThread(
-        "lldb.debugger.io-handler", IOHandlerThread, this,
+        "lldb.debugger.io-handler", [this] { return IOHandlerThread(); },
         8 * 1024 * 1024); // Use larger 8MB stack for this thread
     if (io_handler_thread) {
       m_io_handler_thread = *io_handler_thread;
diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp
index f6cb82408e3c5..8bfc54aa387ee 100644
--- a/lldb/source/Host/common/Host.cpp
+++ b/lldb/source/Host/common/Host.cpp
@@ -92,29 +92,22 @@ using namespace lldb;
 using namespace lldb_private;
 
 #if !defined(__APPLE__) && !defined(_WIN32)
-struct MonitorInfo {
-  lldb::pid_t pid; // The process ID to monitor
-  Host::MonitorChildProcessCallback
-      callback; // The callback function to call when "pid" exits or signals
-  bool monitor_signals; // If true, call the callback when "pid" gets signaled.
-};
-
-static thread_result_t MonitorChildProcessThreadFunction(void *arg);
+static thread_result_t
+MonitorChildProcessThreadFunction(::pid_t pid,
+                                  Host::MonitorChildProcessCallback callback,
+                                  bool monitor_signals);
 
 llvm::Expected Host::StartMonitoringChildProcess(
     const Host::MonitorChildProcessCallback &callback, lldb::pid_t pid,
     bool monitor_signals) {
-  MonitorInfo *info_ptr = new MonitorInfo();
-
-  info_ptr->pid = pid;
-  info_ptr->callback = callback;
-  info_ptr->monitor_signals = monitor_signals;
-
   char thread_name[256];
   ::snprintf(thread_name, sizeof(thread_name),
              "", pid);
-  return ThreadLauncher::LaunchThread(
-      thread_name, MonitorChildProcessThreadFunction, info_ptr, 0);
+  assert(pid <= UINT32_MAX);
+  return ThreadLauncher::LaunchThread(thread_name, [pid, callback,
+                                                    monitor_signals] {
+    return MonitorChildProcessThreadFunction(pid, callback, monitor_signals);
+  });
 }
 
 #ifndef __linux__
@@ -163,20 +156,14 @@ static bool CheckForMonitorCancellation() {
   return false;
 }
 
-static thread_result_t MonitorChildProcessThreadFunction(void *arg) {
+static thread_result_t
+MonitorChildProcessThreadFunction(::pid_t pid,
+                                  Host::MonitorChildProcessCallback callback,
+                                  bool monitor_signals) {
   Log *log = GetLog(LLDBLog::Process);
-  const char *function = __FUNCTION__;
-  LLDB_LOGF(log, "%s (arg = %p) thread starting...", function, arg);
-
-  MonitorInfo *info = (MonitorInfo *)arg;
+  LLDB_LOG(log, "pid = {0}, monitor_signals = {1}", pid, monitor_signals);
 
-  const Host::MonitorChildProcessCallback callback = info->callback;
-  const bool monitor_signals = info->monitor_signals;
-
-  assert(info->pid <= UINT32_MAX);
-  const ::pid_t pid = monitor_signals ? -1 * getpgid(info->pid) : info->pid;
-
-  delete info;
+  pid = monitor_signals ? -1 * getpgid(pid) : pid;
 
   int status = -1;
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__OpenBSD__)
@@ -194,8 +181,7 @@ static thread_result_t MonitorChildProcessThreadFunction(void *arg) {
 
   while (true) {
     log = GetLog(LLDBLog::Process);
-    LLDB_LOGF(log, "%s ::waitpid (pid = %" PRIi32 ", &status, options = %i)...",
-              function, pid, options);
+    LLDB_LOG(log, "::waitpid({0}, &status, {1})...", pid, options);
 
     if (CheckForMonitorCancellation())
       break;
@@ -211,8 +197,8 @@ static thread_result_t MonitorChildProcessThreadFunction(void *arg) {
         continue;
       else {
         LLDB_LOG(log,
-                 "arg = {0}, thread exiting because waitpid failed ({1})...",
-                 arg, llvm::sys::StrError());
+                 "pid = {0}, thread exiting because waitpid failed ({1})...",
+                 pid, llvm::sys::StrError());
         break;
       }
     } else if (wait_pid > 0) {
@@ -245,12 +231,11 @@ static thread_result_t MonitorChildProcessThreadFunction(void *arg) {
 #endif
 
         log = GetLog(LLDBLog::Process);
-        LLDB_LOGF(log,
-                  "%s ::waitpid (pid = %" PRIi32
-                  ", &status, options = %i) => pid = %" PRIi32
-                  ", status = 0x%8.8x (%s), signal = %i, exit_state = %i",
-                  function, pid, options, wait_pid, status, status_cstr, signal,
-                  exit_status);
+        LLDB_LOG(log,
+                 "::waitpid({0}, &status, {1}) => pid = {2}, status = {3:x} "
+                 "({4}), signal = {5}, exit_state = {6}",
+                 pid, options, wait_pid, status, status_cstr, signal,
+                 exit_status);
 
         if (exited || (signal != 0 && monitor_signals)) {
           bool callback_return = false;
@@ -259,18 +244,18 @@ static thread_result_t MonitorChildProcessThreadFunction(void *arg) {
 
           // If our process exited, then this thread should exit
           if (exited && wait_pid == abs(pid)) {
-            LLDB_LOGF(log,
-                      "%s (arg = %p) thread exiting because pid received "
-                      "exit signal...",
-                      __FUNCTION__, arg);
+            LLDB_LOG(
+                log,
+                "pid = {0} thread exiting because pid received exit signal...",
+                pid);
             break;
           }
           // If the callback returns true, it means this process should exit
           if (callback_return) {
-            LLDB_LOGF(log,
-                      "%s (arg = %p) thread exiting because callback "
-                      "returned true...",
-                      __FUNCTION__, arg);
+            LLDB_LOG(
+                log,
+                "pid = {0} thread exiting because callback returned true...",
+                pid);
             break;
           }
         }
@@ -278,9 +263,7 @@ static thread_result_t MonitorChildProcessThreadFunction(void *arg) {
     }
   }
 
-  log = GetLog(LLDBLog::Process);
-  LLDB_LOGF(log, "%s (arg = %p) thread exiting...", __FUNCTION__, arg);
-
+  LLDB_LOG(GetLog(LLDBLog::Process), "pid = {0} thread exiting...", pid);
   return nullptr;
 }
 
diff --git a/lldb/source/Host/common/HostNativeThreadBase.cpp b/lldb/source/Host/common/HostNativeThreadBase.cpp
index 4e69306e4d659..5814a7fd54bb5 100644
--- a/lldb/source/Host/common/HostNativeThreadBase.cpp
+++ b/lldb/source/Host/common/HostNativeThreadBase.cpp
@@ -52,16 +52,12 @@ lldb::thread_t HostNativeThreadBase::Release() {
 
 lldb::thread_result_t
 HostNativeThreadBase::ThreadCreateTrampoline(lldb::thread_arg_t arg) {
-  ThreadLauncher::HostThreadCreateInfo *info =
-      (ThreadLauncher::HostThreadCreateInfo *)arg;
-  llvm::set_thread_name(info->thread_name);
-
-  thread_func_t thread_fptr = info->thread_fptr;
-  thread_arg_t thread_arg = info->thread_arg;
+  std::unique_ptr info_up(
+      (ThreadLauncher::HostThreadCreateInfo *)arg);
+  llvm::set_thread_name(info_up->thread_name);
 
   Log *log = GetLog(LLDBLog::Thread);
   LLDB_LOGF(log, "thread created");
 
-  delete info;
-  return thread_fptr(thread_arg);
+  return info_up->impl();
 }
diff --git a/lldb/source/Host/common/ThreadLauncher.cpp b/lldb/source/Host/common/ThreadLauncher.cpp
index bd104b3e4aed2..28c90215f8747 100644
--- a/lldb/source/Host/common/ThreadLauncher.cpp
+++ b/lldb/source/Host/common/ThreadLauncher.cpp
@@ -21,17 +21,18 @@
 using namespace lldb;
 using namespace lldb_private;
 
-llvm::Expected ThreadLauncher::LaunchThread(
-    llvm::StringRef name, lldb::thread_func_t thread_function,
-    lldb::thread_arg_t thread_arg, size_t min_stack_byte_size) {
-  // Host::ThreadCreateTrampoline will delete this pointer for us.
-  HostThreadCreateInfo *info_ptr =
-      new HostThreadCreateInfo(name.data(), thread_function, thread_arg);
+llvm::Expected
+ThreadLauncher::LaunchThread(llvm::StringRef name,
+                             std::function impl,
+                             size_t min_stack_byte_size) {
+  // Host::ThreadCreateTrampoline will take ownership if thread creation is
+  // successful.
+  auto info_up = std::make_unique(name.str(), impl);
   lldb::thread_t thread;
 #ifdef _WIN32
   thread = (lldb::thread_t)::_beginthreadex(
       0, (unsigned)min_stack_byte_size,
-      HostNativeThread::ThreadCreateTrampoline, info_ptr, 0, NULL);
+      HostNativeThread::ThreadCreateTrampoline, info_up.get(), 0, NULL);
   if (thread == LLDB_INVALID_HOST_THREAD)
     return llvm::errorCodeToError(llvm::mapWindowsError(GetLastError()));
 #else
@@ -63,7 +64,7 @@ llvm::Expected ThreadLauncher::LaunchThread(
   }
   int err =
       ::pthread_create(&thread, thread_attr_ptr,
-                       HostNativeThread::ThreadCreateTrampoline, info_ptr);
+                       HostNativeThread::ThreadCreateTrampoline, info_up.get());
 
   if (destroy_attr)
     ::pthread_attr_destroy(&thread_attr);
@@ -73,5 +74,6 @@ llvm::Expected ThreadLauncher::LaunchThread(
         std::error_code(err, std::generic_category()));
 #endif
 
+  info_up.release();
   return HostThread(thread);
 }
diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm
index 969173e7122cd..ab2c295004e4c 100644
--- a/lldb/source/Host/macosx/objcxx/Host.mm
+++ b/lldb/source/Host/macosx/objcxx/Host.mm
@@ -136,8 +136,7 @@
 
 #if TARGET_OS_OSX
 
-static void *AcceptPIDFromInferior(void *arg) {
-  const char *connect_url = (const char *)arg;
+static void *AcceptPIDFromInferior(const char *connect_url) {
   ConnectionFileDescriptor file_conn;
   Status error;
   if (file_conn.Connect(connect_url, &error) == eConnectionStatusSuccess) {
@@ -286,7 +285,7 @@ repeat with the_window in (get windows)\n\
   // to the process that we wanted to launch. So when our process actually
   // gets launched, we will handshake with it and get the process ID for it.
   llvm::Expected accept_thread = ThreadLauncher::LaunchThread(
-      unix_socket_name, AcceptPIDFromInferior, connect_url);
+      unix_socket_name, [&] { return AcceptPIDFromInferior(connect_url); });
 
   if (!accept_thread)
     return Status(accept_thread.takeError());
diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp
index 65112275892a3..8843dc87e5cbf 100644
--- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp
+++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp
@@ -724,7 +724,7 @@ bool ProcessKDP::StartAsyncThread() {
     return true;
 
   llvm::Expected async_thread = ThreadLauncher::LaunchThread(
-      "", ProcessKDP::AsyncThread, this);
+      "", [this] { return AsyncThread(); });
   if (!async_thread) {
     LLDB_LOG_ERROR(GetLog(LLDBLog::Host), async_thread.takeError(),
                    "failed to launch host thread: {}");
@@ -746,25 +746,21 @@ void ProcessKDP::StopAsyncThread() {
     m_async_thread.Join(nullptr);
 }
 
-void *ProcessKDP::AsyncThread(void *arg) {
-  ProcessKDP *process = (ProcessKDP *)arg;
-
-  const lldb::pid_t pid = process->GetID();
+void *ProcessKDP::AsyncThread() {
+  const lldb::pid_t pid = GetID();
 
   Log *log = GetLog(KDPLog::Process);
   LLDB_LOGF(log,
-            "ProcessKDP::AsyncThread (arg = %p, pid = %" PRIu64
-            ") thread starting...",
-            arg, pid);
+            "ProcessKDP::AsyncThread(pid = %" PRIu64 ") thread starting...",
+            pid);
 
   ListenerSP listener_sp(Listener::MakeListener("ProcessKDP::AsyncThread"));
   EventSP event_sp;
   const uint32_t desired_event_mask =
       eBroadcastBitAsyncContinue | eBroadcastBitAsyncThreadShouldExit;
 
-  if (listener_sp->StartListeningForEvents(&process->m_async_broadcaster,
-                                           desired_event_mask) ==
-      desired_event_mask) {
+  if (listener_sp->StartListeningForEvents(
+          &m_async_broadcaster, desired_event_mask) == desired_event_mask) {
     bool done = false;
     while (!done) {
       LLDB_LOGF(log,
@@ -787,9 +783,9 @@ void *ProcessKDP::AsyncThread(void *arg) {
           switch (event_type) {
           case eBroadcastBitAsyncContinue: {
             is_running = true;
-            if (process->m_comm.WaitForPacketWithTimeoutMicroSeconds(
+            if (m_comm.WaitForPacketWithTimeoutMicroSeconds(
                     exc_reply_packet, 1 * USEC_PER_SEC)) {
-              ThreadSP thread_sp(process->GetKernelThread());
+              ThreadSP thread_sp(GetKernelThread());
               if (thread_sp) {
                 lldb::RegisterContextSP reg_ctx_sp(
                     thread_sp->GetRegisterContext());
@@ -801,7 +797,7 @@ void *ProcessKDP::AsyncThread(void *arg) {
 
               // TODO: parse the stop reply packet
               is_running = false;
-              process->SetPrivateState(eStateStopped);
+              SetPrivateState(eStateStopped);
             } else {
               // Check to see if we are supposed to exit. There is no way to
               // interrupt a running kernel, so all we can do is wait for an
@@ -843,12 +839,10 @@ void *ProcessKDP::AsyncThread(void *arg) {
     }
   }
 
-  LLDB_LOGF(log,
-            "ProcessKDP::AsyncThread (arg = %p, pid = %" PRIu64
-            ") thread exiting...",
-            arg, pid);
+  LLDB_LOGF(log, "ProcessKDP::AsyncThread(pid = %" PRIu64 ") thread exiting...",
+            pid);
 
-  process->m_async_thread.Reset();
+  m_async_thread.Reset();
   return NULL;
 }
 
diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h
index 3386354d0b4cb..a8cfb0ec9f870 100644
--- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h
+++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h
@@ -179,7 +179,7 @@ class ProcessKDP : public lldb_private::Process {
 
   void StopAsyncThread();
 
-  static void *AsyncThread(void *arg);
+  void *AsyncThread();
 
 private:
   // For ProcessKDP only
diff --git a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp
index a78ca2aabe134..e442650f0920e 100644
--- a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp
+++ b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.cpp
@@ -36,25 +36,6 @@
 using namespace lldb;
 using namespace lldb_private;
 
-namespace {
-struct DebugLaunchContext {
-  DebugLaunchContext(DebuggerThread *thread,
-                     const ProcessLaunchInfo &launch_info)
-      : m_thread(thread), m_launch_info(launch_info) {}
-  DebuggerThread *m_thread;
-  ProcessLaunchInfo m_launch_info;
-};
-
-struct DebugAttachContext {
-  DebugAttachContext(DebuggerThread *thread, lldb::pid_t pid,
-                     const ProcessAttachInfo &attach_info)
-      : m_thread(thread), m_pid(pid), m_attach_info(attach_info) {}
-  DebuggerThread *m_thread;
-  lldb::pid_t m_pid;
-  ProcessAttachInfo m_attach_info;
-};
-} // namespace
-
 DebuggerThread::DebuggerThread(DebugDelegateSP debug_delegate)
     : m_debug_delegate(debug_delegate), m_pid_to_detach(0),
       m_is_shutting_down(false) {
@@ -68,11 +49,9 @@ Status DebuggerThread::DebugLaunch(const ProcessLaunchInfo &launch_info) {
   LLDB_LOG(log, "launching '{0}'", launch_info.GetExecutableFile().GetPath());
 
   Status result;
-  DebugLaunchContext *context = new DebugLaunchContext(this, launch_info);
-
-  llvm::Expected secondary_thread =
-      ThreadLauncher::LaunchThread("lldb.plugin.process-windows.secondary[?]",
-                                   DebuggerThreadLaunchRoutine, context);
+  llvm::Expected secondary_thread = ThreadLauncher::LaunchThread(
+      "lldb.plugin.process-windows.secondary[?]",
+      [this, launch_info] { return DebuggerThreadLaunchRoutine(launch_info); });
   if (!secondary_thread) {
     result = Status(secondary_thread.takeError());
     LLDB_LOG(log, "couldn't launch debugger thread. {0}", result);
@@ -87,11 +66,10 @@ Status DebuggerThread::DebugAttach(lldb::pid_t pid,
   LLDB_LOG(log, "attaching to '{0}'", pid);
 
   Status result;
-  DebugAttachContext *context = new DebugAttachContext(this, pid, attach_info);
-
-  llvm::Expected secondary_thread =
-      ThreadLauncher::LaunchThread("lldb.plugin.process-windows.secondary[?]",
-                                   DebuggerThreadAttachRoutine, context);
+  llvm::Expected secondary_thread = ThreadLauncher::LaunchThread(
+      "lldb.plugin.process-windows.secondary[?]", [this, pid, attach_info] {
+        return DebuggerThreadAttachRoutine(pid, attach_info);
+      });
   if (!secondary_thread) {
     result = Status(secondary_thread.takeError());
     LLDB_LOG(log, "couldn't attach to process '{0}'. {1}", pid, result);
@@ -100,22 +78,6 @@ Status DebuggerThread::DebugAttach(lldb::pid_t pid,
   return result;
 }
 
-lldb::thread_result_t DebuggerThread::DebuggerThreadLaunchRoutine(void *data) {
-  DebugLaunchContext *context = static_cast(data);
-  lldb::thread_result_t result =
-      context->m_thread->DebuggerThreadLaunchRoutine(context->m_launch_info);
-  delete context;
-  return result;
-}
-
-lldb::thread_result_t DebuggerThread::DebuggerThreadAttachRoutine(void *data) {
-  DebugAttachContext *context = static_cast(data);
-  lldb::thread_result_t result = context->m_thread->DebuggerThreadAttachRoutine(
-      context->m_pid, context->m_attach_info);
-  delete context;
-  return result;
-}
-
 lldb::thread_result_t DebuggerThread::DebuggerThreadLaunchRoutine(
     const ProcessLaunchInfo &launch_info) {
   // Grab a shared_ptr reference to this so that we know it won't get deleted
diff --git a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.h b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.h
index 56701307fd757..e3439ff34584b 100644
--- a/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.h
+++ b/lldb/source/Plugins/Process/Windows/Common/DebuggerThread.h
@@ -91,10 +91,8 @@ class DebuggerThread : public std::enable_shared_from_this {
   // exit.
   bool m_detached = false;
 
-  static lldb::thread_result_t DebuggerThreadLaunchRoutine(void *data);
   lldb::thread_result_t
   DebuggerThreadLaunchRoutine(const ProcessLaunchInfo &launch_info);
-  static lldb::thread_result_t DebuggerThreadAttachRoutine(void *data);
   lldb::thread_result_t
   DebuggerThreadAttachRoutine(lldb::pid_t pid,
                               const ProcessAttachInfo &launch_info);
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 38d9e400978d2..d660a4d070421 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -843,7 +843,7 @@ Status GDBRemoteCommunication::StartListenThread(const char *hostname,
   m_listen_url = listen_url;
   SetConnection(std::make_unique());
   llvm::Expected listen_thread = ThreadLauncher::LaunchThread(
-      listen_url, GDBRemoteCommunication::ListenThread, this);
+      listen_url, [this] { return GDBRemoteCommunication::ListenThread(); });
   if (!listen_thread)
     return Status(listen_thread.takeError());
   m_listen_thread = *listen_thread;
@@ -857,23 +857,22 @@ bool GDBRemoteCommunication::JoinListenThread() {
   return true;
 }
 
-lldb::thread_result_t
-GDBRemoteCommunication::ListenThread(lldb::thread_arg_t arg) {
-  GDBRemoteCommunication *comm = (GDBRemoteCommunication *)arg;
+lldb::thread_result_t GDBRemoteCommunication::ListenThread() {
   Status error;
   ConnectionFileDescriptor *connection =
-      (ConnectionFileDescriptor *)comm->GetConnection();
+      (ConnectionFileDescriptor *)GetConnection();
 
   if (connection) {
     // Do the listen on another thread so we can continue on...
     if (connection->Connect(
-            comm->m_listen_url.c_str(), [comm](llvm::StringRef port_str) {
+            m_listen_url.c_str(),
+            [this](llvm::StringRef port_str) {
               uint16_t port = 0;
               llvm::to_integer(port_str, port, 10);
-              comm->m_port_promise.set_value(port);
+              m_port_promise.set_value(port);
             },
             &error) != eConnectionStatusSuccess)
-      comm->SetConnection(nullptr);
+      SetConnection(nullptr);
   }
   return {};
 }
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h
index afc7e740d4c96..a325baa59ba3e 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.h
@@ -218,7 +218,7 @@ class GDBRemoteCommunication : public Communication {
 
   bool JoinListenThread();
 
-  static lldb::thread_result_t ListenThread(lldb::thread_arg_t arg);
+  lldb::thread_result_t ListenThread();
 
 private:
   // Promise used to grab the port number from listening thread
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index c40e738cae515..3f5cb1606f21b 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -3541,8 +3541,10 @@ bool ProcessGDBRemote::StartAsyncThread() {
     // Create a thread that watches our internal state and controls which
     // events make it to clients (into the DCProcess event queue).
 
-    llvm::Expected async_thread = ThreadLauncher::LaunchThread(
-        "", ProcessGDBRemote::AsyncThread, this);
+    llvm::Expected async_thread =
+        ThreadLauncher::LaunchThread("", [this] {
+          return ProcessGDBRemote::AsyncThread();
+        });
     if (!async_thread) {
       LLDB_LOG_ERROR(GetLog(LLDBLog::Host), async_thread.takeError(),
                      "failed to launch host thread: {}");
@@ -3580,14 +3582,10 @@ void ProcessGDBRemote::StopAsyncThread() {
         __FUNCTION__);
 }
 
-thread_result_t ProcessGDBRemote::AsyncThread(void *arg) {
-  ProcessGDBRemote *process = (ProcessGDBRemote *)arg;
-
+thread_result_t ProcessGDBRemote::AsyncThread() {
   Log *log = GetLog(GDBRLog::Process);
-  LLDB_LOGF(log,
-            "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
-            ") thread starting...",
-            __FUNCTION__, arg, process->GetID());
+  LLDB_LOGF(log, "ProcessGDBRemote::%s(pid = %" PRIu64 ") thread starting...",
+            __FUNCTION__, GetID());
 
   EventSP event_sp;
 
@@ -3603,19 +3601,19 @@ thread_result_t ProcessGDBRemote::AsyncThread(void *arg) {
   // fetch loop.
 
   bool done = false;
-  while (!done && process->GetPrivateState() != eStateExited) {
+  while (!done && GetPrivateState() != eStateExited) {
     LLDB_LOGF(log,
-              "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
+              "ProcessGDBRemote::%s(pid = %" PRIu64
               ") listener.WaitForEvent (NULL, event_sp)...",
-              __FUNCTION__, arg, process->GetID());
+              __FUNCTION__, GetID());
 
-    if (process->m_async_listener_sp->GetEvent(event_sp, llvm::None)) {
+    if (m_async_listener_sp->GetEvent(event_sp, llvm::None)) {
       const uint32_t event_type = event_sp->GetType();
-      if (event_sp->BroadcasterIs(&process->m_async_broadcaster)) {
+      if (event_sp->BroadcasterIs(&m_async_broadcaster)) {
         LLDB_LOGF(log,
-                  "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
+                  "ProcessGDBRemote::%s(pid = %" PRIu64
                   ") Got an event of type: %d...",
-                  __FUNCTION__, arg, process->GetID(), event_type);
+                  __FUNCTION__, GetID(), event_type);
 
         switch (event_type) {
         case eBroadcastBitAsyncContinue: {
@@ -3627,39 +3625,39 @@ thread_result_t ProcessGDBRemote::AsyncThread(void *arg) {
                 (const char *)continue_packet->GetBytes();
             const size_t continue_cstr_len = continue_packet->GetByteSize();
             LLDB_LOGF(log,
-                      "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
+                      "ProcessGDBRemote::%s(pid = %" PRIu64
                       ") got eBroadcastBitAsyncContinue: %s",
-                      __FUNCTION__, arg, process->GetID(), continue_cstr);
+                      __FUNCTION__, GetID(), continue_cstr);
 
             if (::strstr(continue_cstr, "vAttach") == nullptr)
-              process->SetPrivateState(eStateRunning);
+              SetPrivateState(eStateRunning);
             StringExtractorGDBRemote response;
 
             StateType stop_state =
-                process->GetGDBRemote().SendContinuePacketAndWaitForResponse(
-                    *process, *process->GetUnixSignals(),
+                GetGDBRemote().SendContinuePacketAndWaitForResponse(
+                    *this, *GetUnixSignals(),
                     llvm::StringRef(continue_cstr, continue_cstr_len),
-                    process->GetInterruptTimeout(), response);
+                    GetInterruptTimeout(), response);
 
             // We need to immediately clear the thread ID list so we are sure
             // to get a valid list of threads. The thread ID list might be
             // contained within the "response", or the stop reply packet that
             // caused the stop. So clear it now before we give the stop reply
             // packet to the process using the
-            // process->SetLastStopPacket()...
-            process->ClearThreadIDList();
+            // SetLastStopPacket()...
+            ClearThreadIDList();
 
             switch (stop_state) {
             case eStateStopped:
             case eStateCrashed:
             case eStateSuspended:
-              process->SetLastStopPacket(response);
-              process->SetPrivateState(stop_state);
+              SetLastStopPacket(response);
+              SetPrivateState(stop_state);
               break;
 
             case eStateExited: {
-              process->SetLastStopPacket(response);
-              process->ClearThreadIDList();
+              SetLastStopPacket(response);
+              ClearThreadIDList();
               response.SetFilePos(1);
 
               int exit_status = response.GetHexU8();
@@ -3674,7 +3672,7 @@ thread_result_t ProcessGDBRemote::AsyncThread(void *arg) {
                   extractor.GetHexByteString(desc_string);
                 }
               }
-              process->SetExitStatus(exit_status, desc_string.c_str());
+              SetExitStatus(exit_status, desc_string.c_str());
               done = true;
               break;
             }
@@ -3685,20 +3683,20 @@ thread_result_t ProcessGDBRemote::AsyncThread(void *arg) {
               // helpful error message about why the attach failed.
               if (::strstr(continue_cstr, "vAttach") != nullptr &&
                   response.GetError() == 0x87) {
-                process->SetExitStatus(-1, "cannot attach to process due to "
-                                           "System Integrity Protection");
+                SetExitStatus(-1, "cannot attach to process due to "
+                                  "System Integrity Protection");
               } else if (::strstr(continue_cstr, "vAttach") != nullptr &&
                          response.GetStatus().Fail()) {
-                process->SetExitStatus(-1, response.GetStatus().AsCString());
+                SetExitStatus(-1, response.GetStatus().AsCString());
               } else {
-                process->SetExitStatus(-1, "lost connection");
+                SetExitStatus(-1, "lost connection");
               }
               done = true;
               break;
             }
 
             default:
-              process->SetPrivateState(stop_state);
+              SetPrivateState(stop_state);
               break;
             }   // switch(stop_state)
           }     // if (continue_packet)
@@ -3707,49 +3705,47 @@ thread_result_t ProcessGDBRemote::AsyncThread(void *arg) {
 
         case eBroadcastBitAsyncThreadShouldExit:
           LLDB_LOGF(log,
-                    "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
+                    "ProcessGDBRemote::%s(pid = %" PRIu64
                     ") got eBroadcastBitAsyncThreadShouldExit...",
-                    __FUNCTION__, arg, process->GetID());
+                    __FUNCTION__, GetID());
           done = true;
           break;
 
         default:
           LLDB_LOGF(log,
-                    "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
+                    "ProcessGDBRemote::%s(pid = %" PRIu64
                     ") got unknown event 0x%8.8x",
-                    __FUNCTION__, arg, process->GetID(), event_type);
+                    __FUNCTION__, GetID(), event_type);
           done = true;
           break;
         }
-      } else if (event_sp->BroadcasterIs(&process->m_gdb_comm)) {
+      } else if (event_sp->BroadcasterIs(&m_gdb_comm)) {
         switch (event_type) {
         case Communication::eBroadcastBitReadThreadDidExit:
-          process->SetExitStatus(-1, "lost connection");
+          SetExitStatus(-1, "lost connection");
           done = true;
           break;
 
         default:
           LLDB_LOGF(log,
-                    "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
+                    "ProcessGDBRemote::%s(pid = %" PRIu64
                     ") got unknown event 0x%8.8x",
-                    __FUNCTION__, arg, process->GetID(), event_type);
+                    __FUNCTION__, GetID(), event_type);
           done = true;
           break;
         }
       }
     } else {
       LLDB_LOGF(log,
-                "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
+                "ProcessGDBRemote::%s(pid = %" PRIu64
                 ") listener.WaitForEvent (NULL, event_sp) => false",
-                __FUNCTION__, arg, process->GetID());
+                __FUNCTION__, GetID());
       done = true;
     }
   }
 
-  LLDB_LOGF(log,
-            "ProcessGDBRemote::%s (arg = %p, pid = %" PRIu64
-            ") thread exiting...",
-            __FUNCTION__, arg, process->GetID());
+  LLDB_LOGF(log, "ProcessGDBRemote::%s(pid = %" PRIu64 ") thread exiting...",
+            __FUNCTION__, GetID());
 
   return {};
 }
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h
index dd907042608da..47b329eba731c 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h
@@ -342,7 +342,7 @@ class ProcessGDBRemote : public Process,
 
   void StopAsyncThread();
 
-  static lldb::thread_result_t AsyncThread(void *arg);
+  lldb::thread_result_t AsyncThread();
 
   static bool
   MonitorDebugserverProcess(std::weak_ptr process_wp,
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index c70bfcfc448de..cbb10a0ac8506 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -3509,12 +3509,13 @@ bool Process::StartPrivateStateThread(bool is_secondary_thread) {
                "", GetID());
   }
 
-  // Create the private state thread, and start it running.
-  PrivateStateThreadArgs *args_ptr =
-      new PrivateStateThreadArgs(this, is_secondary_thread);
   llvm::Expected private_state_thread =
-      ThreadLauncher::LaunchThread(thread_name, Process::PrivateStateThread,
-                                   (void *)args_ptr, 8 * 1024 * 1024);
+      ThreadLauncher::LaunchThread(
+          thread_name,
+          [this, is_secondary_thread] {
+            return RunPrivateStateThread(is_secondary_thread);
+          },
+          8 * 1024 * 1024);
   if (!private_state_thread) {
     LLDB_LOG(GetLog(LLDBLog::Host), "failed to launch host thread: {}",
              llvm::toString(private_state_thread.takeError()));
@@ -3729,14 +3730,6 @@ Status Process::HaltPrivate() {
   return error;
 }
 
-thread_result_t Process::PrivateStateThread(void *arg) {
-  std::unique_ptr args_up(
-      static_cast(arg));
-  thread_result_t result =
-      args_up->process->RunPrivateStateThread(args_up->is_secondary_thread);
-  return result;
-}
-
 thread_result_t Process::RunPrivateStateThread(bool is_secondary_thread) {
   bool control_only = true;
 
diff --git a/lldb/unittests/Host/CMakeLists.txt b/lldb/unittests/Host/CMakeLists.txt
index ae6afd592e547..bf14bf16e4e36 100644
--- a/lldb/unittests/Host/CMakeLists.txt
+++ b/lldb/unittests/Host/CMakeLists.txt
@@ -12,6 +12,7 @@ set (FILES
   SocketAddressTest.cpp
   SocketTest.cpp
   SocketTestUtilities.cpp
+  ThreadLauncherTest.cpp
   XMLTest.cpp
 )
 
diff --git a/lldb/unittests/Host/ThreadLauncherTest.cpp b/lldb/unittests/Host/ThreadLauncherTest.cpp
new file mode 100644
index 0000000000000..a19351a59b8dd
--- /dev/null
+++ b/lldb/unittests/Host/ThreadLauncherTest.cpp
@@ -0,0 +1,29 @@
+//===-- ThreadLauncherTest.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/ThreadLauncher.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace lldb_private;
+
+TEST(ThreadLauncherTest, LaunchThread) {
+  std::promise promise;
+  std::future future = promise.get_future();
+  llvm::Expected thread =
+      ThreadLauncher::LaunchThread("test", [&promise] {
+        promise.set_value(47);
+        return (lldb::thread_result_t)47;
+      });
+  ASSERT_THAT_EXPECTED(thread, llvm::Succeeded());
+  EXPECT_EQ(future.get(), 47);
+  lldb::thread_result_t result;
+  thread->Join(&result);
+  EXPECT_EQ(result, (lldb::thread_result_t)47);
+}

From f4568e12219f3a6ada20035ea40223680e274203 Mon Sep 17 00:00:00 2001
From: Pavel Labath 
Date: Mon, 21 Feb 2022 15:08:23 +0100
Subject: [PATCH 628/748] [lldb] Simplify HostThreadMacOSX

The class is using an incredibly elaborate setup to create and destroy
an NSAutoreleasePool object. We can do it in a much simpler way by
making those calls inside our thread startup function.

The only effect of this patch is that the pool gets released at the end
of the ThreadCreateTrampoline function, instead of slightly later, when
pthreads begin thread-specific cleanup. However, the key destruction
order is unspecified, so nothing should be relying on that.

I didn't find a specific reason in git history for why this would have
to be done that way. It seems that before D5198, thread-specific keys
were the only way an OS implementation (in Host::ThreadCreated) could
attach some value to a thread.

Differential Revision: https://reviews.llvm.org/D120322
---
 .../lldb/Host/macosx/HostThreadMacOSX.h       |  3 +-
 .../Host/macosx/objcxx/HostThreadMacOSX.mm    | 52 +------------------
 2 files changed, 3 insertions(+), 52 deletions(-)

diff --git a/lldb/include/lldb/Host/macosx/HostThreadMacOSX.h b/lldb/include/lldb/Host/macosx/HostThreadMacOSX.h
index 4e41119d97c6a..0299be3874085 100644
--- a/lldb/include/lldb/Host/macosx/HostThreadMacOSX.h
+++ b/lldb/include/lldb/Host/macosx/HostThreadMacOSX.h
@@ -17,8 +17,7 @@ class HostThreadMacOSX : public HostThreadPosix {
   friend class ThreadLauncher;
 
 public:
-  HostThreadMacOSX();
-  HostThreadMacOSX(lldb::thread_t thread);
+  using HostThreadPosix::HostThreadPosix;
 
 protected:
   static lldb::thread_result_t ThreadCreateTrampoline(lldb::thread_arg_t arg);
diff --git a/lldb/source/Host/macosx/objcxx/HostThreadMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostThreadMacOSX.mm
index 4f7e7ab248ad6..b24fe187b1a55 100644
--- a/lldb/source/Host/macosx/objcxx/HostThreadMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostThreadMacOSX.mm
@@ -7,62 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Host/macosx/HostThreadMacOSX.h"
-#include "lldb/Host/Host.h"
-
 #include 
 #include 
 
-#include 
-
 using namespace lldb_private;
 
-
-static pthread_once_t g_thread_create_once = PTHREAD_ONCE_INIT;
-static pthread_key_t g_thread_create_key = 0;
-
-namespace {
-class MacOSXDarwinThread {
-public:
-  MacOSXDarwinThread() { m_pool = [[NSAutoreleasePool alloc] init]; }
-
-  ~MacOSXDarwinThread() {
-    if (m_pool) {
-      [m_pool drain];
-      m_pool = nil;
-    }
-  }
-
-  static void PThreadDestructor(void *v) {
-    if (v)
-      delete static_cast(v);
-    ::pthread_setspecific(g_thread_create_key, NULL);
-  }
-
-protected:
-  NSAutoreleasePool *m_pool = nil;
-
-private:
-  MacOSXDarwinThread(const MacOSXDarwinThread &) = delete;
-  const MacOSXDarwinThread &operator=(const MacOSXDarwinThread &) = delete;
-};
-} // namespace
-
-static void InitThreadCreated() {
-  ::pthread_key_create(&g_thread_create_key,
-                       MacOSXDarwinThread::PThreadDestructor);
-}
-
-HostThreadMacOSX::HostThreadMacOSX() : HostThreadPosix() {}
-
-HostThreadMacOSX::HostThreadMacOSX(lldb::thread_t thread)
-    : HostThreadPosix(thread) {}
-
 lldb::thread_result_t
 HostThreadMacOSX::ThreadCreateTrampoline(lldb::thread_arg_t arg) {
-  ::pthread_once(&g_thread_create_once, InitThreadCreated);
-  if (g_thread_create_key) {
-    ::pthread_setspecific(g_thread_create_key, new MacOSXDarwinThread());
+  @autoreleasepool {
+    return HostThreadPosix::ThreadCreateTrampoline(arg);
   }
-
-  return HostThreadPosix::ThreadCreateTrampoline(arg);
 }

From 57c6012213b50804ed78530b89bae30c0ee4fe82 Mon Sep 17 00:00:00 2001
From: serge-sans-paille 
Date: Wed, 23 Feb 2022 14:28:56 +0100
Subject: [PATCH 629/748] Add missing  include

As a follow-up to eb4c8608115c1c9af0fc8cb5b1e9f2bc960014ef
Should fix http://45.33.8.238/win/53749/step_4.txt

Related to https://reviews.llvm.org/D120195
---
 llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index 73106c2195ad3..5ff84ca23cbd4 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -26,6 +26,8 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/xxhash.h"
 
+#include 
+
 using namespace llvm;
 using namespace llvm::codeview;
 using namespace llvm::msf;

From 2f300d34decba547dd07f5cd6034a6b2b2ca11a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= 
Date: Mon, 21 Feb 2022 16:01:13 +0100
Subject: [PATCH 630/748] [clang][driver][wasm] Fix libstdc++ target-dependent
 include dir

The triple goes after the gcc version, not before. Also add the
/backward version.

Differential Revision: https://reviews.llvm.org/D120251
---
 clang/lib/Driver/ToolChains/WebAssembly.cpp | 4 +++-
 clang/test/Driver/wasm-toolchain.cpp        | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index 292cf4d66971f..c5e4d569793c3 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -528,10 +528,12 @@ void WebAssembly::addLibStdCXXIncludePaths(
 
   // First add the per-target include path if the OS is known.
   if (IsKnownOs) {
-    std::string TargetDir = LibPath + "/" + MultiarchTriple + "/c++/" + Version;
+    std::string TargetDir = LibPath + "/c++/" + Version + "/" + MultiarchTriple;
     addSystemInclude(DriverArgs, CC1Args, TargetDir);
   }
 
   // Second add the generic one.
   addSystemInclude(DriverArgs, CC1Args, LibPath + "/c++/" + Version);
+  // Third the backward one.
+  addSystemInclude(DriverArgs, CC1Args, LibPath + "/c++/" + Version + "/backward");
 }
diff --git a/clang/test/Driver/wasm-toolchain.cpp b/clang/test/Driver/wasm-toolchain.cpp
index 4a95860413dd9..3ff6a2cd53282 100644
--- a/clang/test/Driver/wasm-toolchain.cpp
+++ b/clang/test/Driver/wasm-toolchain.cpp
@@ -80,8 +80,9 @@
 // COMPILE_STDCXX: clang{{.*}}" "-cc1"
 // COMPILE_STDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]*]]"
 // COMPILE_STDCXX: "-isysroot" "[[SYSROOT:[^"]+]]"
-// COMPILE_STDCXX: "-internal-isystem" "[[SYSROOT:[^"]+]]/include/wasm32-wasi/c++/4.8"
+// COMPILE_STDCXX: "-internal-isystem" "[[SYSROOT:[^"]+]]/include/c++/4.8/wasm32-wasi"
 // COMPILE_STDCXX: "-internal-isystem" "[[SYSROOT:[^"]+]]/include/c++/4.8"
+// COMPILE_STDCXX: "-internal-isystem" "[[SYSROOT:[^"]+]]/include/c++/4.8/backward"
 // COMPILE_STDCXX: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
 // COMPILE_STDCXX: "-internal-isystem" "[[SYSROOT:[^"]+]]/include/wasm32-wasi"
 // COMPILE_STDCXX: "-internal-isystem" "[[SYSROOT:[^"]+]]/include"

From 27d9a58407c44c8bb3fe7b94ff8d3b9bea25afc4 Mon Sep 17 00:00:00 2001
From: Jan Svoboda 
Date: Wed, 23 Feb 2022 14:15:47 +0100
Subject: [PATCH 631/748] [clang][modules] Infer framework modules in explicit
 builds

This patch enables inferring framework modules in explicit builds in all contexts. Until now, inferring framework modules only worked with `-fimplicit-module-maps` due to this block of code:

```
// HeaderSearch::loadFrameworkModule
  case LMM_InvalidModuleMap:
    // Try to infer a module map from the framework directory.
    if (HSOpts->ImplicitModuleMaps)
      ModMap.inferFrameworkModule(Dir, IsSystem, /*Parent=*/nullptr);
    break;
```

Reviewed By: Bigcheese

Differential Revision: https://reviews.llvm.org/D113880
---
 clang/include/clang/Lex/ModuleMap.h            | 6 ++++++
 clang/lib/Frontend/FrontendAction.cpp          | 9 +++++++++
 clang/test/Modules/explicit-build-inferred.cpp | 3 +--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Lex/ModuleMap.h b/clang/include/clang/Lex/ModuleMap.h
index 08c61a5dc5607..26169ae9cee95 100644
--- a/clang/include/clang/Lex/ModuleMap.h
+++ b/clang/include/clang/Lex/ModuleMap.h
@@ -584,6 +584,12 @@ class ModuleMap {
     return ModuleScopeIDs[ExistingModule] < CurrentModuleScopeID;
   }
 
+  /// Check whether a framework module can be inferred in the given directory.
+  bool canInferFrameworkModule(const DirectoryEntry *Dir) const {
+    auto It = InferredDirectories.find(Dir);
+    return It != InferredDirectories.end() && It->getSecond().InferModules;
+  }
+
   /// Retrieve the module map file containing the definition of the given
   /// module.
   ///
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 089f40b36089a..c5b9e80356db4 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -465,6 +465,15 @@ static bool loadModuleMapForModuleBuild(CompilerInstance &CI, bool IsSystem,
   if (SrcMgr.getBufferOrFake(ModuleMapID).getBufferSize() == Offset)
     Offset = 0;
 
+  // Infer framework module if possible.
+  if (HS.getModuleMap().canInferFrameworkModule(ModuleMap->getDir())) {
+    SmallString<128> InferredFrameworkPath = ModuleMap->getDir()->getName();
+    llvm::sys::path::append(InferredFrameworkPath,
+                            CI.getLangOpts().ModuleName + ".framework");
+    if (auto Dir = CI.getFileManager().getDirectory(InferredFrameworkPath))
+      (void)HS.getModuleMap().inferFrameworkModule(*Dir, IsSystem, nullptr);
+  }
+
   return false;
 }
 
diff --git a/clang/test/Modules/explicit-build-inferred.cpp b/clang/test/Modules/explicit-build-inferred.cpp
index 2ee585692a687..42a22fd136b7b 100644
--- a/clang/test/Modules/explicit-build-inferred.cpp
+++ b/clang/test/Modules/explicit-build-inferred.cpp
@@ -1,11 +1,10 @@
 // RUN: rm -rf %t && mkdir %t
 //
-// RUN: %clang_cc1 -fmodules -fno-implicit-modules -fimplicit-module-maps \
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules \
 // RUN:   -emit-module -x c++ %S/Inputs/explicit-build-inferred/frameworks/module.modulemap \
 // RUN:   -fmodule-name=Inferred -o %t/Inferred.pcm -F %S/Inputs/explicit-build-inferred/frameworks
 //
 // RUN: %clang_cc1 -fmodules -fno-implicit-modules -fsyntax-only %s \
-// RUN:   -fmodule-map-file=%S/Inputs/explicit-build-inferred/frameworks/module.modulemap \
 // RUN:   -fmodule-file=%t/Inferred.pcm -F %S/Inputs/explicit-build-inferred/frameworks
 
 #include 

From 82951cfb8a413aab9c4b8aeecbd7475dda8f1fb1 Mon Sep 17 00:00:00 2001
From: Pavel Labath 
Date: Wed, 23 Feb 2022 14:51:55 +0100
Subject: [PATCH 632/748] Fix HostProcessWindows for D120321

---
 .../lldb/Host/windows/HostProcessWindows.h    |  2 -
 .../Host/windows/HostProcessWindows.cpp       | 38 +++++++++----------
 2 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/lldb/include/lldb/Host/windows/HostProcessWindows.h b/lldb/include/lldb/Host/windows/HostProcessWindows.h
index dc27bdc46bb8f..4c69a2f434cf1 100644
--- a/lldb/include/lldb/Host/windows/HostProcessWindows.h
+++ b/lldb/include/lldb/Host/windows/HostProcessWindows.h
@@ -34,8 +34,6 @@ class HostProcessWindows : public HostNativeProcessBase {
                   bool monitor_signals) override;
 
 private:
-  static lldb::thread_result_t MonitorThread(void *thread_arg);
-
   void Close();
 
   bool m_owns_handle;
diff --git a/lldb/source/Host/windows/HostProcessWindows.cpp b/lldb/source/Host/windows/HostProcessWindows.cpp
index 741ec68d1d1ee..6ccb725ef56ee 100644
--- a/lldb/source/Host/windows/HostProcessWindows.cpp
+++ b/lldb/source/Host/windows/HostProcessWindows.cpp
@@ -63,38 +63,36 @@ bool HostProcessWindows::IsRunning() const {
   return (code == STILL_ACTIVE);
 }
 
+static lldb::thread_result_t
+MonitorThread(const Host::MonitorChildProcessCallback &callback,
+              HANDLE process_handle) {
+  DWORD exit_code;
+
+  ::WaitForSingleObject(process_handle, INFINITE);
+  ::GetExitCodeProcess(process_handle, &exit_code);
+  callback(::GetProcessId(process_handle), true, 0, exit_code);
+  ::CloseHandle(process_handle);
+  return {};
+}
+
 llvm::Expected HostProcessWindows::StartMonitoring(
     const Host::MonitorChildProcessCallback &callback, bool monitor_signals) {
-  MonitorInfo *info = new MonitorInfo;
-  info->callback = callback;
+  HANDLE process_handle;
 
   // Since the life of this HostProcessWindows instance and the life of the
   // process may be different, duplicate the handle so that the monitor thread
   // can have ownership over its own copy of the handle.
   if (::DuplicateHandle(GetCurrentProcess(), m_process, GetCurrentProcess(),
-                        &info->process_handle, 0, FALSE, DUPLICATE_SAME_ACCESS)) {
-    return ThreadLauncher::LaunchThread("ChildProcessMonitor",
-                                        HostProcessWindows::MonitorThread,
-                                        info);
+                        &process_handle, 0, FALSE, DUPLICATE_SAME_ACCESS)) {
+    return ThreadLauncher::LaunchThread(
+        "ChildProcessMonitor", [callback, process_handle] {
+          return MonitorThread(callback, process_handle);
+        });
   } else {
     return llvm::errorCodeToError(llvm::mapWindowsError(GetLastError()));
   }
 }
 
-lldb::thread_result_t HostProcessWindows::MonitorThread(void *thread_arg) {
-  DWORD exit_code;
-
-  MonitorInfo *info = static_cast(thread_arg);
-  if (info) {
-    ::WaitForSingleObject(info->process_handle, INFINITE);
-    ::GetExitCodeProcess(info->process_handle, &exit_code);
-    info->callback(::GetProcessId(info->process_handle), true, 0, exit_code);
-    ::CloseHandle(info->process_handle);
-    delete (info);
-  }
-  return {};
-}
-
 void HostProcessWindows::Close() {
   if (m_owns_handle && m_process != LLDB_INVALID_PROCESS)
     ::CloseHandle(m_process);

From 5ccb0582c2b199913829d75a1dbbc866a707f400 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 14:52:56 +0100
Subject: [PATCH 633/748] [InstCombine] Simplify udiv -> lshr folding

What we're really doing here is converting Op0 udiv Op1 into
Op0 lshr log2(Op1), so phrase it in that way. Actually pushing
the lshr into the log2(Op1) expression should be seen as a separate
transform.
---
 .../InstCombine/InstCombineMulDivRem.cpp      | 78 +++++++++----------
 llvm/test/Transforms/InstCombine/div-shift.ll | 22 +++---
 2 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index db239385aed06..36fb08f58221c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -902,9 +902,7 @@ static const unsigned MaxDepth = 6;
 
 namespace {
 
-using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
-                                           const BinaryOperator &I,
-                                           InstCombinerImpl &IC);
+using FoldUDivOperandCb = Value *(*)(Value *V, InstCombinerImpl &IC);
 
 /// Used to maintain state for visitUDivOperand().
 struct UDivFoldAction {
@@ -917,7 +915,7 @@ struct UDivFoldAction {
 
   union {
     /// The instruction returned when FoldAction is invoked.
-    Instruction *FoldResult;
+    Value *FoldResult;
 
     /// Stores the LHS action index if this action joins two actions together.
     size_t SelectLHSIdx;
@@ -931,26 +929,20 @@ struct UDivFoldAction {
 
 } // end anonymous namespace
 
-// X udiv 2^C -> X >> C
-static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
-                                    const BinaryOperator &I,
-                                    InstCombinerImpl &IC) {
-  Constant *C1 = ConstantExpr::getExactLogBase2(cast(Op1));
-  if (!C1)
+// log2(2^C) -> C
+static Value *foldUDivPow2Cst(Value *V, InstCombinerImpl &IC) {
+  Constant *C = ConstantExpr::getExactLogBase2(cast(V));
+  if (!C)
     llvm_unreachable("Failed to constant fold udiv -> logbase2");
-  BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
-  if (I.isExact())
-    LShr->setIsExact();
-  return LShr;
+  return C;
 }
 
-// X udiv (C1 << N), where C1 is "1<  X >> (N+C2)
-// X udiv (zext (C1 << N)), where C1 is "1<  X >> (N+C2)
-static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
-                                InstCombinerImpl &IC) {
+// log2(C1 << N) -> N+C2, where C1 is 1< zext(N+C2), where C1 is 1<getType());
-  BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
-  if (I.isExact())
-    LShr->setIsExact();
-  return LShr;
+  if (V != ShiftLeft)
+    N = IC.Builder.CreateZExt(N, V->getType());
+  return N;
 }
 
 // Recursively visits the possible right hand operands of a udiv
 // instruction, seeing through select instructions, to determine if we can
 // replace the udiv with something simpler.  If we find that an operand is not
 // able to simplify the udiv, we abort the entire transformation.
-static size_t visitUDivOperand(Value *Op, const BinaryOperator &I,
+static size_t visitUDivOperand(Value *Op,
                                SmallVectorImpl &Actions,
                                unsigned Depth = 0) {
   // FIXME: assert that Op1 isn't/doesn't contain undef.
@@ -999,8 +988,8 @@ static size_t visitUDivOperand(Value *Op, const BinaryOperator &I,
     // FIXME: missed optimization: if one of the hands of select is/contains
     //        undef, just directly pick the other one.
     // FIXME: can both hands contain undef?
-    if (size_t LHSIdx = visitUDivOperand(SI->getOperand(1), I, Actions, Depth))
-      if (visitUDivOperand(SI->getOperand(2), I, Actions, Depth)) {
+    if (size_t LHSIdx = visitUDivOperand(SI->getOperand(1), Actions, Depth))
+      if (visitUDivOperand(SI->getOperand(2), Actions, Depth)) {
         Actions.push_back(UDivFoldAction(nullptr, Op, LHSIdx - 1));
         return Actions.size();
       }
@@ -1105,15 +1094,15 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
       return BinaryOperator::CreateUDiv(A, X);
   }
 
-  // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
+  // Op1 udiv Op2 -> Op1 lshr log2(Op2), if log2() folds away.
   SmallVector UDivActions;
-  if (visitUDivOperand(Op1, I, UDivActions))
+  if (visitUDivOperand(Op1, UDivActions))
     for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
       FoldUDivOperandCb Action = UDivActions[i].FoldAction;
       Value *ActionOp1 = UDivActions[i].OperandToFold;
-      Instruction *Inst;
+      Value *Res;
       if (Action)
-        Inst = Action(Op0, ActionOp1, I, *this);
+        Res = Action(ActionOp1, *this);
       else {
         // This action joins two actions together.  The RHS of this action is
         // simply the last action we processed, we saved the LHS action index in
@@ -1122,18 +1111,19 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
         Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
         size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
         Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
-        Inst = SelectInst::Create(cast(ActionOp1)->getCondition(),
-                                  SelectLHS, SelectRHS);
+        Res = Builder.CreateSelect(cast(ActionOp1)->getCondition(),
+                                   SelectLHS, SelectRHS);
       }
 
       // If this is the last action to process, return it to the InstCombiner.
-      // Otherwise, we insert it before the UDiv and record it so that we may
-      // use it as part of a joining action (i.e., a SelectInst).
+      // Otherwise, record it so that we may use it as part of a joining action
+      // (i.e., a SelectInst).
       if (e - i != 1) {
-        Inst->insertBefore(&I);
-        UDivActions[i].FoldResult = Inst;
-      } else
-        return Inst;
+        UDivActions[i].FoldResult = Res;
+      } else {
+        return replaceInstUsesWith(
+            I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact()));
+      }
     }
 
   return nullptr;
@@ -1241,8 +1231,10 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
     if (match(Op1, m_NegatedPower2())) {
       // X sdiv (-(1 << C)) -> -(X sdiv (1 << C)) ->
       //                    -> -(X udiv (1 << C)) -> -(X u>> C)
-      return BinaryOperator::CreateNeg(Builder.Insert(foldUDivPow2Cst(
-          Op0, ConstantExpr::getNeg(cast(Op1)), I, *this)));
+      Constant *CNegLog2 = ConstantExpr::getExactLogBase2(
+          ConstantExpr::getNeg(cast(Op1)));
+      Value *Shr = Builder.CreateLShr(Op0, CNegLog2, I.getName(), I.isExact());
+      return BinaryOperator::CreateNeg(Shr);
     }
 
     if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
diff --git a/llvm/test/Transforms/InstCombine/div-shift.ll b/llvm/test/Transforms/InstCombine/div-shift.ll
index 8ee9063894d99..6d285ab2f0993 100644
--- a/llvm/test/Transforms/InstCombine/div-shift.ll
+++ b/llvm/test/Transforms/InstCombine/div-shift.ll
@@ -6,8 +6,8 @@ define i32 @t1(i16 zeroext %x, i32 %y) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[X:%.*]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[D:%.*]] = lshr i32 [[CONV]], [[TMP0]]
-; CHECK-NEXT:    ret i32 [[D]]
+; CHECK-NEXT:    [[D1:%.*]] = lshr i32 [[CONV]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[D1]]
 ;
 entry:
   %conv = zext i16 %x to i32
@@ -21,8 +21,8 @@ define <2 x i32> @t1vec(<2 x i16> %x, <2 x i32> %y) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONV:%.*]] = zext <2 x i16> [[X:%.*]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[Y:%.*]], 
-; CHECK-NEXT:    [[D:%.*]] = lshr <2 x i32> [[CONV]], [[TMP0]]
-; CHECK-NEXT:    ret <2 x i32> [[D]]
+; CHECK-NEXT:    [[D1:%.*]] = lshr <2 x i32> [[CONV]], [[TMP0]]
+; CHECK-NEXT:    ret <2 x i32> [[D1]]
 ;
 entry:
   %conv = zext <2 x i16> %x to <2 x i32>
@@ -61,9 +61,9 @@ define i64 @t3(i64 %x, i32 %y) {
 define i32 @t4(i32 %x, i32 %y) {
 ; CHECK-LABEL: @t4(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[Y:%.*]], 5
-; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[TMP1]], i32 [[Y]], i32 5
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[X:%.*]], [[DOTV]]
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[Y]], i32 5
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i32 [[X:%.*]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %1 = shl i32 1, %y
   %2 = icmp ult i32 %1, 32
@@ -74,10 +74,10 @@ define i32 @t4(i32 %x, i32 %y) {
 
 define i32 @t5(i1 %x, i1 %y, i32 %V) {
 ; CHECK-LABEL: @t5(
-; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[X:%.*]], i32 5, i32 6
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[V:%.*]], [[DOTV]]
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[Y:%.*]], i32 [[TMP1]], i32 0
-; CHECK-NEXT:    ret i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[X:%.*]], i32 5, i32 6
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[Y:%.*]], i32 [[TMP1]], i32 [[V:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i32 [[V]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
 ;
   %1 = shl i32 1, %V
   %2 = select i1 %x, i32 32, i32 64

From 03dff12197d15161ffc9ec7afeb9501551d6119e Mon Sep 17 00:00:00 2001
From: Stanislav Gatev 
Date: Wed, 23 Feb 2022 13:38:51 +0000
Subject: [PATCH 634/748] Revert "Revert "[clang][dataflow] Add support for
 global storage values""

This reverts commit 169e1aba55bed9f7ffa000f9f170ab2defbc40b2.

It also fixes an incorrect assumption in `initGlobalVars`.
---
 .../FlowSensitive/DataflowEnvironment.h       |   5 +
 .../FlowSensitive/DataflowEnvironment.cpp     |  45 +++++
 clang/lib/Analysis/FlowSensitive/Transfer.cpp |  23 +++
 .../Analysis/FlowSensitive/TransferTest.cpp   | 167 ++++++++++++++++++
 4 files changed, 240 insertions(+)

diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
index af613c95bb8dc..bab20418a016a 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -49,6 +49,11 @@ enum class SkipPast {
 };
 
 /// Holds the state of the program (store and heap) at a given program point.
+///
+/// WARNING: Symbolic values that are created by the environment for static
+/// local and global variables are not currently invalidated on function calls.
+/// This is unsound and should be taken into account when designing dataflow
+/// analyses.
 class Environment {
 public:
   /// Supplements `Environment` with non-standard comparison and join
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index eca58b313761b..0fb341fd0bb05 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <cassert>
 #include <memory>
 #include <utility>
 
@@ -56,10 +57,54 @@ static bool equivalentValues(QualType Type, Value *Val1, Value *Val2,
   return Model.compareEquivalent(Type, *Val1, *Val2);
 }
 
+/// Initializes a global storage value.
+static void initGlobalVar(const VarDecl &D, Environment &Env) {
+  if (!D.hasGlobalStorage() ||
+      Env.getStorageLocation(D, SkipPast::None) != nullptr)
+    return;
+
+  auto &Loc = Env.createStorageLocation(D);
+  Env.setStorageLocation(D, Loc);
+  if (auto *Val = Env.createValue(D.getType()))
+    Env.setValue(Loc, *Val);
+}
+
+/// Initializes a global storage value.
+static void initGlobalVar(const Decl &D, Environment &Env) {
+  if (auto *V = dyn_cast<VarDecl>(&D))
+    initGlobalVar(*V, Env);
+}
+
+/// Initializes global storage values that are declared or referenced from
+/// sub-statements of `S`.
+// FIXME: Add support for resetting globals after function calls to enable
+// the implementation of sound analyses.
+static void initGlobalVars(const Stmt &S, Environment &Env) {
+  for (auto *Child : S.children()) {
+    if (Child != nullptr)
+      initGlobalVars(*Child, Env);
+  }
+
+  if (auto *DS = dyn_cast<DeclStmt>(&S)) {
+    if (DS->isSingleDecl()) {
+      initGlobalVar(*DS->getSingleDecl(), Env);
+    } else {
+      for (auto *D : DS->getDeclGroup())
+        initGlobalVar(*D, Env);
+    }
+  } else if (auto *E = dyn_cast<DeclRefExpr>(&S)) {
+    initGlobalVar(*E->getDecl(), Env);
+  } else if (auto *E = dyn_cast<MemberExpr>(&S)) {
+    initGlobalVar(*E->getMemberDecl(), Env);
+  }
+}
+
 Environment::Environment(DataflowAnalysisContext &DACtx,
                          const DeclContext &DeclCtx)
     : Environment(DACtx) {
   if (const auto *FuncDecl = dyn_cast<FunctionDecl>(&DeclCtx)) {
+    assert(FuncDecl->getBody() != nullptr);
+    initGlobalVars(*FuncDecl->getBody(), *this);
     for (const auto *ParamDecl : FuncDecl->parameters()) {
       assert(ParamDecl != nullptr);
       auto &ParamLoc = createStorageLocation(*ParamDecl);
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index cd9b8b0e454e4..4b5d23593a4bd 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -136,6 +136,11 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
     // Group decls are converted into single decls in the CFG so the cast below
     // is safe.
     const auto &D = *cast<VarDecl>(S->getSingleDecl());
+
+    // Static local vars are already initialized in `Environment`.
+    if (D.hasGlobalStorage())
+      return;
+
     auto &Loc = Env.createStorageLocation(D);
     Env.setStorageLocation(D, Loc);
 
@@ -291,6 +296,24 @@ class TransferVisitor : public ConstStmtVisitor {
     if (Member->isFunctionOrFunctionTemplate())
       return;
 
+    if (auto *D = dyn_cast<VarDecl>(Member)) {
+      if (D->hasGlobalStorage()) {
+        auto *VarDeclLoc = Env.getStorageLocation(*D, SkipPast::None);
+        if (VarDeclLoc == nullptr)
+          return;
+
+        if (VarDeclLoc->getType()->isReferenceType()) {
+          Env.setStorageLocation(*S, *VarDeclLoc);
+        } else {
+          auto &Loc = Env.createStorageLocation(*S);
+          Env.setStorageLocation(*S, Loc);
+          Env.setValue(Loc, Env.takeOwnership(
+                                std::make_unique<ReferenceValue>(*VarDeclLoc)));
+        }
+        return;
+      }
+    }
+
     // The receiver can be either a value or a pointer to a value. Skip past the
     // indirection to handle both cases.
     auto *BaseLoc = cast_or_null<AggregateStorageLocation>(
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index 83ccba1a25382..fda4af435c4a7 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -2153,4 +2153,171 @@ TEST_F(TransferTest, AssignFromBoolNegation) {
               });
 }
 
+TEST_F(TransferTest, StaticIntSingleVarDecl) {
+  std::string Code = R"(
+    void target() {
+      static int Foo;
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                const StorageLocation *FooLoc =
+                    Env.getStorageLocation(*FooDecl, SkipPast::None);
+                ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+                const Value *FooVal = Env.getValue(*FooLoc);
+                EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+              });
+}
+
+TEST_F(TransferTest, StaticIntGroupVarDecl) {
+  std::string Code = R"(
+    void target() {
+      static int Foo, Bar;
+      (void)0;
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+                ASSERT_THAT(BarDecl, NotNull());
+
+                const StorageLocation *FooLoc =
+                    Env.getStorageLocation(*FooDecl, SkipPast::None);
+                ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+                const StorageLocation *BarLoc =
+                    Env.getStorageLocation(*BarDecl, SkipPast::None);
+                ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(BarLoc));
+
+                const Value *FooVal = Env.getValue(*FooLoc);
+                EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+
+                const Value *BarVal = Env.getValue(*BarLoc);
+                EXPECT_TRUE(isa_and_nonnull<IntegerValue>(BarVal));
+
+                EXPECT_NE(FooVal, BarVal);
+              });
+}
+
+TEST_F(TransferTest, GlobalIntVarDecl) {
+  std::string Code = R"(
+    static int Foo;
+
+    void target() {
+      int Bar = Foo;
+      int Baz = Foo;
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+                ASSERT_THAT(BarDecl, NotNull());
+
+                const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz");
+                ASSERT_THAT(BazDecl, NotNull());
+
+                const Value *BarVal =
+                    cast<IntegerValue>(Env.getValue(*BarDecl, SkipPast::None));
+                const Value *BazVal =
+                    cast<IntegerValue>(Env.getValue(*BazDecl, SkipPast::None));
+                EXPECT_EQ(BarVal, BazVal);
+              });
+}
+
+TEST_F(TransferTest, StaticMemberIntVarDecl) {
+  std::string Code = R"(
+    struct A {
+      static int Foo;
+    };
+
+    void target(A a) {
+      int Bar = a.Foo;
+      int Baz = a.Foo;
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+                ASSERT_THAT(BarDecl, NotNull());
+
+                const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz");
+                ASSERT_THAT(BazDecl, NotNull());
+
+                const Value *BarVal =
+                    cast<IntegerValue>(Env.getValue(*BarDecl, SkipPast::None));
+                const Value *BazVal =
+                    cast<IntegerValue>(Env.getValue(*BazDecl, SkipPast::None));
+                EXPECT_EQ(BarVal, BazVal);
+              });
+}
+
+TEST_F(TransferTest, StaticMemberRefVarDecl) {
+  std::string Code = R"(
+    struct A {
+      static int &Foo;
+    };
+
+    void target(A a) {
+      int Bar = a.Foo;
+      int Baz = a.Foo;
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+                ASSERT_THAT(BarDecl, NotNull());
+
+                const ValueDecl *BazDecl = findValueDecl(ASTCtx, "Baz");
+                ASSERT_THAT(BazDecl, NotNull());
+
+                const Value *BarVal =
+                    cast<IntegerValue>(Env.getValue(*BarDecl, SkipPast::None));
+                const Value *BazVal =
+                    cast<IntegerValue>(Env.getValue(*BazDecl, SkipPast::None));
+                EXPECT_EQ(BarVal, BazVal);
+              });
+}
+
 } // namespace

From 8386eb23bfe625bc3fe42586b4d47c8eb54c37d7 Mon Sep 17 00:00:00 2001
From: Jez Ng 
Date: Wed, 23 Feb 2022 08:57:52 -0500
Subject: [PATCH 635/748] [lld-macho][nfc] Move ICF-specific logic into ICF.cpp

This mirrors the code organization in `lld/ELF`.

Reviewed By: #lld-macho, thakis

Differential Revision: https://reviews.llvm.org/D120378
---
 lld/MachO/ICF.cpp          | 12 ++++++++---
 lld/MachO/InputSection.cpp | 42 --------------------------------------
 lld/MachO/InputSection.h   |  2 --
 3 files changed, 9 insertions(+), 47 deletions(-)

diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp
index fa018f4d3ce13..3efdeb4a5d8f4 100644
--- a/lld/MachO/ICF.cpp
+++ b/lld/MachO/ICF.cpp
@@ -15,6 +15,7 @@
 #include "lld/Common/CommonLinkerContext.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/xxhash.h"
 
 #include 
 
@@ -361,7 +362,8 @@ void macho::foldIdenticalSections() {
   for (ConcatInputSection *isec : inputSections) {
     // FIXME: consider non-code __text sections as hashable?
     bool isHashable = (isCodeSection(isec) || isCfStringSection(isec)) &&
-                      !isec->shouldOmitFromOutput() && isec->isHashableForICF();
+                      !isec->shouldOmitFromOutput() &&
+                      sectionType(isec->getFlags()) == MachO::S_REGULAR;
     if (isHashable) {
       hashable.push_back(isec);
       for (Defined *d : isec->symbols)
@@ -371,8 +373,12 @@ void macho::foldIdenticalSections() {
       isec->icfEqClass[0] = ++icfUniqueID;
     }
   }
-  parallelForEach(hashable,
-                  [](ConcatInputSection *isec) { isec->hashForICF(); });
+  parallelForEach(hashable, [](ConcatInputSection *isec) {
+    assert(isec->icfEqClass[0] == 0); // don't overwrite a unique ID!
+    // Turn-on the top bit to guarantee that valid hashes have no collisions
+    // with the small-integer unique IDs for ICF-ineligible sections
+    isec->icfEqClass[0] = xxHash64(isec->data) | (1ull << 63);
+  });
   // Now that every input section is either hashed or marked as unique, run the
   // segregation algorithm to detect foldable subsections.
   ICF(hashable).run();
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index 439b9fdd5f012..f6c2685d3ced5 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -77,48 +77,6 @@ std::string InputSection::getLocation(uint64_t off) const {
       .str();
 }
 
-// ICF needs to hash any section that might potentially be duplicated so
-// that it can match on content rather than identity.
-bool ConcatInputSection::isHashableForICF() const {
-  switch (sectionType(getFlags())) {
-  case S_REGULAR:
-    return true;
-  case S_CSTRING_LITERALS:
-  case S_4BYTE_LITERALS:
-  case S_8BYTE_LITERALS:
-  case S_16BYTE_LITERALS:
-  case S_LITERAL_POINTERS:
-    llvm_unreachable("found unexpected literal type in ConcatInputSection");
-  case S_ZEROFILL:
-  case S_GB_ZEROFILL:
-  case S_NON_LAZY_SYMBOL_POINTERS:
-  case S_LAZY_SYMBOL_POINTERS:
-  case S_SYMBOL_STUBS:
-  case S_MOD_INIT_FUNC_POINTERS:
-  case S_MOD_TERM_FUNC_POINTERS:
-  case S_COALESCED:
-  case S_INTERPOSING:
-  case S_DTRACE_DOF:
-  case S_LAZY_DYLIB_SYMBOL_POINTERS:
-  case S_THREAD_LOCAL_REGULAR:
-  case S_THREAD_LOCAL_ZEROFILL:
-  case S_THREAD_LOCAL_VARIABLES:
-  case S_THREAD_LOCAL_VARIABLE_POINTERS:
-  case S_THREAD_LOCAL_INIT_FUNCTION_POINTERS:
-    return false;
-  default:
-    llvm_unreachable("Section type");
-  }
-}
-
-void ConcatInputSection::hashForICF() {
-  assert(data.data()); // zeroFill section data has nullptr with non-zero size
-  assert(icfEqClass[0] == 0); // don't overwrite a unique ID!
-  // Turn-on the top bit to guarantee that valid hashes have no collisions
-  // with the small-integer unique IDs for ICF-ineligible sections
-  icfEqClass[0] = xxHash64(data) | (1ull << 63);
-}
-
 void ConcatInputSection::foldIdentical(ConcatInputSection *copy) {
   align = std::max(align, copy->align);
   copy->live = false;
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index 49fadc24a2d50..7d9ea8d74c6c6 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -100,8 +100,6 @@ class ConcatInputSection final : public InputSection {
   void markLive(uint64_t off) override { live = true; }
   bool isCoalescedWeak() const { return wasCoalesced && symbols.empty(); }
   bool shouldOmitFromOutput() const { return !live || isCoalescedWeak(); }
-  bool isHashableForICF() const;
-  void hashForICF();
   void writeTo(uint8_t *buf);
 
   void foldIdentical(ConcatInputSection *redundant);

From e42ad84ba02220f5954c6cbc6fa9f77120ac8c06 Mon Sep 17 00:00:00 2001
From: Jez Ng 
Date: Wed, 23 Feb 2022 08:57:54 -0500
Subject: [PATCH 636/748] [lld-macho][nfc] Refactor MarkLive

This mirrors the code structure in `lld/ELF`. It also paves the way for
an upcoming diff where I templatize things.

Reviewed By: #lld-macho, thakis

Differential Revision: https://reviews.llvm.org/D120376
---
 lld/MachO/MarkLive.cpp | 175 ++++++++++++++++++++++-------------------
 1 file changed, 94 insertions(+), 81 deletions(-)

diff --git a/lld/MachO/MarkLive.cpp b/lld/MachO/MarkLive.cpp
index 4269c8342c656..4790e55738651 100644
--- a/lld/MachO/MarkLive.cpp
+++ b/lld/MachO/MarkLive.cpp
@@ -21,44 +21,96 @@ namespace macho {
 using namespace llvm;
 using namespace llvm::MachO;
 
-// Set live bit on for each reachable chunk. Unmarked (unreachable)
-// InputSections will be ignored by Writer, so they will be excluded
-// from the final output.
-void markLive() {
-  TimeTraceScope timeScope("markLive");
-
-  // We build up a worklist of sections which have been marked as live. We only
-  // push into the worklist when we discover an unmarked section, and we mark
-  // as we push, so sections never appear twice in the list.
-  // Literal sections cannot contain references to other sections, so we only
-  // store ConcatInputSections in our worklist.
+class MarkLive {
+public:
+  void enqueue(InputSection *isec, uint64_t off);
+  void addSym(Symbol *s);
+  void markTransitively();
+
+private:
+  // We build up a worklist of sections which have been marked as live. We
+  // only push into the worklist when we discover an unmarked section, and we
+  // mark as we push, so sections never appear twice in the list. Literal
+  // sections cannot contain references to other sections, so we only store
+  // ConcatInputSections in our worklist.
   SmallVector<ConcatInputSection *, 256> worklist;
+};
+
+void MarkLive::enqueue(InputSection *isec, uint64_t off) {
+  if (isec->isLive(off))
+    return;
+  isec->markLive(off);
+  if (auto s = dyn_cast<ConcatInputSection>(isec)) {
+    assert(!s->isCoalescedWeak());
+    worklist.push_back(s);
+  }
+}
+
+void MarkLive::addSym(Symbol *s) {
+  if (s->used)
+    return;
+  s->used = true;
+  if (auto *d = dyn_cast<Defined>(s)) {
+    if (d->isec)
+      enqueue(d->isec, d->value);
+    if (d->unwindEntry)
+      enqueue(d->unwindEntry, 0);
+  }
+}
+
+void MarkLive::markTransitively() {
+  do {
+    // Mark things reachable from GC roots as live.
+    while (!worklist.empty()) {
+      ConcatInputSection *s = worklist.pop_back_val();
+      assert(s->live && "We mark as live when pushing onto the worklist!");
 
-  auto enqueue = [&](InputSection *isec, uint64_t off) {
-    if (isec->isLive(off))
-      return;
-    isec->markLive(off);
-    if (auto s = dyn_cast<ConcatInputSection>(isec)) {
-      assert(!s->isCoalescedWeak());
-      worklist.push_back(s);
+      // Mark all symbols listed in the relocation table for this section.
+      for (const Reloc &r : s->relocs) {
+        if (auto *s = r.referent.dyn_cast<Symbol *>())
+          addSym(s);
+        else
+          enqueue(r.referent.get<InputSection *>(), r.addend);
+      }
+      for (Defined *d : s->symbols)
+        addSym(d);
     }
-  };
-
-  auto addSym = [&](Symbol *s) {
-    if (s->used)
-      return;
-    s->used = true;
-    if (auto *d = dyn_cast<Defined>(s)) {
-      if (d->isec)
-        enqueue(d->isec, d->value);
-      if (d->unwindEntry)
-        enqueue(d->unwindEntry, 0);
+
+    // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live
+    // section. Process them in a second pass.
+    for (ConcatInputSection *isec : inputSections) {
+      // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
+      // separate vector and only walking that here is faster.
+      if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
+        continue;
+
+      for (const Reloc &r : isec->relocs) {
+        bool referentLive;
+        if (auto *s = r.referent.dyn_cast<Symbol *>())
+          referentLive = s->isLive();
+        else
+          referentLive = r.referent.get<InputSection *>()->isLive(r.addend);
+        if (referentLive)
+          enqueue(isec, 0);
+      }
     }
-  };
 
+    // S_ATTR_LIVE_SUPPORT could have marked additional sections live,
+    // which in turn could mark additional S_ATTR_LIVE_SUPPORT sections live.
+    // Iterate. In practice, the second iteration won't mark additional
+    // S_ATTR_LIVE_SUPPORT sections live.
+  } while (!worklist.empty());
+}
+
+// Set live bit on for each reachable chunk. Unmarked (unreachable)
+// InputSections will be ignored by Writer, so they will be excluded
+// from the final output.
+void markLive() {
+  TimeTraceScope timeScope("markLive");
+  MarkLive marker;
   // Add GC roots.
   if (config->entry)
-    addSym(config->entry);
+    marker.addSym(config->entry);
   for (Symbol *sym : symtab->getSymbols()) {
     if (auto *defined = dyn_cast<Defined>(sym)) {
       // -exported_symbol(s_list)
@@ -69,17 +121,18 @@ void markLive() {
         // explicitUndefineds code below would handle this automatically.
         assert(!defined->privateExtern &&
                "should have been rejected by driver");
-        addSym(defined);
+        marker.addSym(defined);
         continue;
       }
 
       // public symbols explicitly marked .no_dead_strip
       if (defined->referencedDynamically || defined->noDeadStrip) {
-        addSym(defined);
+        marker.addSym(defined);
         continue;
       }
 
-      // FIXME: When we implement these flags, make symbols from them GC roots:
+      // FIXME: When we implement these flags, make symbols from them GC
+      // roots:
       // * -reexported_symbol(s_list)
       // * -alias(-list)
       // * -init
@@ -89,80 +142,40 @@ void markLive() {
       bool externsAreRoots =
           config->outputType != MH_EXECUTE || config->exportDynamic;
       if (externsAreRoots && !defined->privateExtern) {
-        addSym(defined);
+        marker.addSym(defined);
         continue;
       }
     }
   }
   // -u symbols
   for (Symbol *sym : config->explicitUndefineds)
-    addSym(sym);
+    marker.addSym(sym);
   // local symbols explicitly marked .no_dead_strip
   for (const InputFile *file : inputFiles)
     if (auto *objFile = dyn_cast<ObjFile>(file))
       for (Symbol *sym : objFile->symbols)
         if (auto *defined = dyn_cast_or_null<Defined>(sym))
           if (!defined->isExternal() && defined->noDeadStrip)
-            addSym(defined);
+            marker.addSym(defined);
   if (auto *stubBinder =
           dyn_cast_or_null<Undefined>(symtab->find("dyld_stub_binder")))
-    addSym(stubBinder);
+    marker.addSym(stubBinder);
   for (ConcatInputSection *isec : inputSections) {
     // Sections marked no_dead_strip
     if (isec->getFlags() & S_ATTR_NO_DEAD_STRIP) {
-      enqueue(isec, 0);
+      marker.enqueue(isec, 0);
       continue;
     }
 
     // mod_init_funcs, mod_term_funcs sections
     if (sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS ||
         sectionType(isec->getFlags()) == S_MOD_TERM_FUNC_POINTERS) {
-      enqueue(isec, 0);
+      marker.enqueue(isec, 0);
       continue;
     }
   }
 
-  do {
-    // Mark things reachable from GC roots as live.
-    while (!worklist.empty()) {
-      ConcatInputSection *s = worklist.pop_back_val();
-      assert(s->live && "We mark as live when pushing onto the worklist!");
-
-      // Mark all symbols listed in the relocation table for this section.
-      for (const Reloc &r : s->relocs) {
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          addSym(s);
-        else
-          enqueue(r.referent.get<InputSection *>(), r.addend);
-      }
-      for (Defined *d : s->symbols)
-        addSym(d);
-    }
-
-    // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live section.
-    // Process them in a second pass.
-    for (ConcatInputSection *isec : inputSections) {
-      // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
-      // separate vector and only walking that here is faster.
-      if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
-        continue;
-
-      for (const Reloc &r : isec->relocs) {
-        bool referentLive;
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          referentLive = s->isLive();
-        else
-          referentLive = r.referent.get<InputSection *>()->isLive(r.addend);
-        if (referentLive)
-          enqueue(isec, 0);
-      }
-    }
-
-    // S_ATTR_LIVE_SUPPORT could have marked additional sections live,
-    // which in turn could mark additional S_ATTR_LIVE_SUPPORT sections live.
-    // Iterate. In practice, the second iteration won't mark additional
-    // S_ATTR_LIVE_SUPPORT sections live.
-  } while (!worklist.empty());
+  marker.markTransitively();
 }
 
 } // namespace macho

From 841355c1e4e35fc02b5b171419979f5f9af0ebc8 Mon Sep 17 00:00:00 2001
From: Aaron Ballman 
Date: Wed, 23 Feb 2022 09:07:54 -0500
Subject: [PATCH 637/748] Remove unused function; NFC

---
 clang/include/clang/AST/ASTContext.h |  8 -------
 clang/lib/AST/ASTContext.cpp         | 35 ----------------------------
 2 files changed, 43 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 510c63962053b..f2d7060e6f3d3 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2758,14 +2758,6 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// long double and double on AArch64 will return 0).
   int getFloatingTypeSemanticOrder(QualType LHS, QualType RHS) const;
 
-  /// Return a real floating point or a complex type (based on
-  /// \p typeDomain/\p typeSize).
-  ///
-  /// \param typeDomain a real floating point or complex type.
-  /// \param typeSize a real floating point or complex type.
-  QualType getFloatingTypeOfSizeWithinDomain(QualType typeSize,
-                                             QualType typeDomain) const;
-
   unsigned getTargetAddressSpace(QualType T) const;
 
   unsigned getTargetAddressSpace(Qualifiers Q) const;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index f29e90c05713c..f99c2b91b9232 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -6798,41 +6798,6 @@ static FloatingRank getFloatingRank(QualType T) {
   }
 }
 
-/// getFloatingTypeOfSizeWithinDomain - Returns a real floating
-/// point or a complex type (based on typeDomain/typeSize).
-/// 'typeDomain' is a real floating point or complex type.
-/// 'typeSize' is a real floating point or complex type.
-QualType ASTContext::getFloatingTypeOfSizeWithinDomain(QualType Size,
-                                                       QualType Domain) const {
-  FloatingRank EltRank = getFloatingRank(Size);
-  if (Domain->isComplexType()) {
-    switch (EltRank) {
-    case BFloat16Rank: llvm_unreachable("Complex bfloat16 is not supported");
-    case Float16Rank:
-    case HalfRank: llvm_unreachable("Complex half is not supported");
-    case Ibm128Rank:     return getComplexType(Ibm128Ty);
-    case FloatRank:      return getComplexType(FloatTy);
-    case DoubleRank:     return getComplexType(DoubleTy);
-    case LongDoubleRank: return getComplexType(LongDoubleTy);
-    case Float128Rank:   return getComplexType(Float128Ty);
-    }
-  }
-
-  assert(Domain->isRealFloatingType() && "Unknown domain!");
-  switch (EltRank) {
-  case Float16Rank:    return HalfTy;
-  case BFloat16Rank:   return BFloat16Ty;
-  case HalfRank:       return HalfTy;
-  case FloatRank:      return FloatTy;
-  case DoubleRank:     return DoubleTy;
-  case LongDoubleRank: return LongDoubleTy;
-  case Float128Rank:   return Float128Ty;
-  case Ibm128Rank:
-    return Ibm128Ty;
-  }
-  llvm_unreachable("getFloatingRank(): illegal value for rank");
-}
-
 /// getFloatingTypeOrder - Compare the rank of the two specified floating
 /// point types, ignoring the domain of the type (i.e. 'double' ==
 /// '_Complex double').  If LHS > RHS, return 1.  If LHS == RHS, return 0. If

From fc3b34c50803274b8ba3b8a30df9177b7d29063c Mon Sep 17 00:00:00 2001
From: Sanjay Patel 
Date: Wed, 23 Feb 2022 09:06:11 -0500
Subject: [PATCH 638/748] [InstSimplify] remove shift that is redundant with
 part of funnel shift

In D111530, I suggested that we add some relatively basic pattern-matching
folds for shifts and funnel shifts and avoid a more specialized solution
if possible.

We can start by implementing at least one of these in IR because it's
easier to write the code and verify with Alive2:
https://alive2.llvm.org/ce/z/qHpmNn

This will need to be adapted/extended for SDAG to handle the motivating
bug ( #49541 ) because the patterns only appear later with that example
(added some tests: bb850d422b64)

This can be extended within InstSimplify to handle cases where we 'and'
with a shift too (in that case, kill the funnel shift).
We could also handle patterns where the shift and funnel shift directions
are inverted, but I think it's better to canonicalize that instead to
avoid pattern-match case explosion.

Differential Revision: https://reviews.llvm.org/D120253
---
 llvm/lib/Analysis/InstructionSimplify.cpp | 25 ++++++++++++++++++++
 llvm/test/Transforms/InstSimplify/or.ll   | 28 ++++++++++-------------
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index b3459b5ffb013..641098eaa9ef9 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -2329,6 +2329,31 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
     }
   }
 
+  // A funnel shift (rotate) can be decomposed into simpler shifts. See if we
+  // are mixing in another shift that is redundant with the funnel shift.
+
+  // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y
+  // (shl X, Y) | (fshl X, ?, Y) --> fshl X, ?, Y
+  if (match(Op0,
+            m_Intrinsic<Intrinsic::fshl>(m_Value(X), m_Value(), m_Value(Y))) &&
+      match(Op1, m_Shl(m_Specific(X), m_Specific(Y))))
+    return Op0;
+  if (match(Op1,
+            m_Intrinsic<Intrinsic::fshl>(m_Value(X), m_Value(), m_Value(Y))) &&
+      match(Op0, m_Shl(m_Specific(X), m_Specific(Y))))
+    return Op1;
+
+  // (fshr ?, X, Y) | (lshr X, Y) --> fshr ?, X, Y
+  // (lshr X, Y) | (fshr ?, X, Y) --> fshr ?, X, Y
+  if (match(Op0,
+            m_Intrinsic<Intrinsic::fshr>(m_Value(), m_Value(X), m_Value(Y))) &&
+      match(Op1, m_LShr(m_Specific(X), m_Specific(Y))))
+    return Op0;
+  if (match(Op1,
+            m_Intrinsic<Intrinsic::fshr>(m_Value(), m_Value(X), m_Value(Y))) &&
+      match(Op0, m_LShr(m_Specific(X), m_Specific(Y))))
+    return Op1;
+
   if (Value *V = simplifyAndOrOfCmps(Q, Op0, Op1, false))
     return V;
 
diff --git a/llvm/test/Transforms/InstSimplify/or.ll b/llvm/test/Transforms/InstSimplify/or.ll
index 07910443ad759..a279cab582d93 100644
--- a/llvm/test/Transforms/InstSimplify/or.ll
+++ b/llvm/test/Transforms/InstSimplify/or.ll
@@ -1045,10 +1045,8 @@ declare i32 @llvm.fshr.i32 (i32, i32, i32)
 
 define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: @or_shl_fshl(
-; CHECK-NEXT:    [[SHY:%.*]] = shl i32 [[Y:%.*]], [[S:%.*]]
-; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshl.i32(i32 [[Y]], i32 [[X:%.*]], i32 [[S]])
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[FUN]], [[SHY]]
-; CHECK-NEXT:    ret i32 [[OR]]
+; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshl.i32(i32 [[Y:%.*]], i32 [[X:%.*]], i32 [[S:%.*]])
+; CHECK-NEXT:    ret i32 [[FUN]]
 ;
   %shy = shl i32 %y, %s
   %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s)
@@ -1058,10 +1056,8 @@ define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) {
 
 define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: @or_shl_fshl_commute(
-; CHECK-NEXT:    [[SHY:%.*]] = shl i32 [[Y:%.*]], [[S:%.*]]
-; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshl.i32(i32 [[Y]], i32 [[X:%.*]], i32 [[S]])
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHY]], [[FUN]]
-; CHECK-NEXT:    ret i32 [[OR]]
+; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshl.i32(i32 [[Y:%.*]], i32 [[X:%.*]], i32 [[S:%.*]])
+; CHECK-NEXT:    ret i32 [[FUN]]
 ;
   %shy = shl i32 %y, %s
   %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s)
@@ -1069,6 +1065,8 @@ define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) {
   ret i32 %or
 }
 
+; negative test - fshl operands are not commutative
+
 define i32 @or_shl_fshl_wrong_order(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: @or_shl_fshl_wrong_order(
 ; CHECK-NEXT:    [[SHY:%.*]] = shl i32 [[Y:%.*]], [[S:%.*]]
@@ -1084,10 +1082,8 @@ define i32 @or_shl_fshl_wrong_order(i32 %x, i32 %y, i32 %s) {
 
 define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: @or_lshr_fshr(
-; CHECK-NEXT:    [[SHY:%.*]] = lshr i32 [[Y:%.*]], [[S:%.*]]
-; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[Y]], i32 [[S]])
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[FUN]], [[SHY]]
-; CHECK-NEXT:    ret i32 [[OR]]
+; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[S:%.*]])
+; CHECK-NEXT:    ret i32 [[FUN]]
 ;
   %shy = lshr i32 %y, %s
   %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s)
@@ -1097,10 +1093,8 @@ define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) {
 
 define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: @or_lshr_fshr_commute(
-; CHECK-NEXT:    [[SHY:%.*]] = lshr i32 [[Y:%.*]], [[S:%.*]]
-; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[Y]], i32 [[S]])
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHY]], [[FUN]]
-; CHECK-NEXT:    ret i32 [[OR]]
+; CHECK-NEXT:    [[FUN:%.*]] = call i32 @llvm.fshr.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 [[S:%.*]])
+; CHECK-NEXT:    ret i32 [[FUN]]
 ;
   %shy = lshr i32 %y, %s
   %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s)
@@ -1108,6 +1102,8 @@ define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) {
   ret i32 %or
 }
 
+; negative test - fshr operands are not commutative
+
 define i32 @or_lshr_fshr_wrong_order(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: @or_lshr_fshr_wrong_order(
 ; CHECK-NEXT:    [[SHY:%.*]] = lshr i32 [[Y:%.*]], [[S:%.*]]

From b1a8dcf8c186ddfeeb062ab8475fe30365557955 Mon Sep 17 00:00:00 2001
From: Aaron Ballman 
Date: Wed, 23 Feb 2022 09:11:34 -0500
Subject: [PATCH 639/748] Silence some "not all control paths return a value"
 warnings; NFC

---
 clang/lib/Frontend/FrontendActions.cpp      | 1 +
 clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
index baf3ac34db620..65cbc946179f5 100644
--- a/clang/lib/Frontend/FrontendActions.cpp
+++ b/clang/lib/Frontend/FrontendActions.cpp
@@ -823,6 +823,7 @@ static StringRef ModuleKindName(Module::ModuleKind MK) {
   case Module::PrivateModuleFragment:
     return "Private Module Fragment";
   }
+  llvm_unreachable("unknown module kind!");
 }
 
 void DumpModuleInfoAction::ExecuteAction() {
diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
index 2ecb9b1cd2ce2..4a817a527f2fa 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
@@ -31,6 +31,7 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
   case LRTable::Action::Sentinel:
     llvm_unreachable("unexpected Sentinel action kind!");
   }
+  llvm_unreachable("unexpected action kind!");
 }
 
 std::string LRTable::dumpStatistics() const {

From 03e6efb8c23f489e45353b6b6d941628d9c49ca2 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 15:24:44 +0100
Subject: [PATCH 640/748] [InstCombine] Further simplify udiv -> lshr folding

Rather than queuing up actions, have one function that does the
log2() fold in the obvious way, but with a flag that allows us
to check whether the fold will succeed without actually performing
it.
---
 .../InstCombine/InstCombineMulDivRem.cpp      | 159 +++++-------------
 1 file changed, 44 insertions(+), 115 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 36fb08f58221c..aeae25476db61 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -900,101 +900,55 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
 
 static const unsigned MaxDepth = 6;
 
-namespace {
-
-using FoldUDivOperandCb = Value *(*)(Value *V, InstCombinerImpl &IC);
-
-/// Used to maintain state for visitUDivOperand().
-struct UDivFoldAction {
-  /// Informs visitUDiv() how to fold this operand.  This can be zero if this
-  /// action joins two actions together.
-  FoldUDivOperandCb FoldAction;
-
-  /// Which operand to fold.
-  Value *OperandToFold;
-
-  union {
-    /// The instruction returned when FoldAction is invoked.
-    Value *FoldResult;
-
-    /// Stores the LHS action index if this action joins two actions together.
-    size_t SelectLHSIdx;
+// Take the exact integer log2 of the value. If DoFold is true, create the
+// actual instructions, otherwise return a non-null dummy value. Return nullptr
+// on failure.
+static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth,
+                       bool DoFold) {
+  auto IfFold = [DoFold](function_ref<Value *()> Fn) {
+    if (!DoFold)
+      return reinterpret_cast<Value *>(-1);
+    return Fn();
   };
 
-  UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
-      : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {}
-  UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
-      : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
-};
-
-} // end anonymous namespace
-
-// log2(2^C) -> C
-static Value *foldUDivPow2Cst(Value *V, InstCombinerImpl &IC) {
-  Constant *C = ConstantExpr::getExactLogBase2(cast<Constant>(V));
-  if (!C)
-    llvm_unreachable("Failed to constant fold udiv -> logbase2");
-  return C;
-}
-
-// log2(C1 << N) -> N+C2, where C1 is 1<<C2
-// log2(zext(C1 << N)) -> zext(N+C2), where C1 is 1<<C2
-static Value *foldUDivShl(Value *V, InstCombinerImpl &IC) {
-  Value *ShiftLeft;
-  if (!match(V, m_ZExt(m_Value(ShiftLeft))))
-    ShiftLeft = V;
-
-  Constant *CI;
-  Value *N;
-  if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
-    llvm_unreachable("match should never fail here!");
-  Constant *LogBase2 = ConstantExpr::getExactLogBase2(CI);
-  if (!LogBase2)
-    llvm_unreachable("getLogBase2 should never fail here!");
-  N = IC.Builder.CreateAdd(N, LogBase2);
-  if (V != ShiftLeft)
-    N = IC.Builder.CreateZExt(N, V->getType());
-  return N;
-}
-
-// Recursively visits the possible right hand operands of a udiv
-// instruction, seeing through select instructions, to determine if we can
-// replace the udiv with something simpler.  If we find that an operand is not
-// able to simplify the udiv, we abort the entire transformation.
-static size_t visitUDivOperand(Value *Op,
-                               SmallVectorImpl<UDivFoldAction> &Actions,
-                               unsigned Depth = 0) {
   // FIXME: assert that Op1 isn't/doesn't contain undef.
 
-  // Check to see if this is an unsigned division with an exact power of 2,
-  // if so, convert to a right shift.
-  if (match(Op, m_Power2())) {
-    Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op));
-    return Actions.size();
-  }
-
-  // X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
-  if (match(Op, m_Shl(m_Power2(), m_Value())) ||
-      match(Op, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
-    Actions.push_back(UDivFoldAction(foldUDivShl, Op));
-    return Actions.size();
-  }
+  // log2(2^C) -> C
+  if (match(Op, m_Power2()))
+    return IfFold([&]() {
+      Constant *C = ConstantExpr::getExactLogBase2(cast<Constant>(Op));
+      if (!C)
+        llvm_unreachable("Failed to constant fold udiv -> logbase2");
+      return C;
+    });
 
   // The remaining tests are all recursive, so bail out if we hit the limit.
   if (Depth++ == MaxDepth)
-    return 0;
+    return nullptr;
 
+  // log2(zext X) -> zext log2(X)
+  Value *X, *Y;
+  if (match(Op, m_ZExt(m_Value(X))))
+    if (Value *LogX = takeLog2(Builder, X, Depth, DoFold))
+      return IfFold([&]() { return Builder.CreateZExt(LogX, Op->getType()); });
+
+  // log2(X << Y) -> log2(X) + Y
+  if (match(Op, m_Shl(m_Value(X), m_Value(Y))))
+    if (Value *LogX = takeLog2(Builder, X, Depth, DoFold))
+      return IfFold([&]() { return Builder.CreateAdd(LogX, Y); });
+
+  // log2(Cond ? X : Y) -> Cond ? log2(X) : log2(Y)
+  // FIXME: missed optimization: if one of the hands of select is/contains
+  //        undef, just directly pick the other one.
+  // FIXME: can both hands contain undef?
   if (SelectInst *SI = dyn_cast<SelectInst>(Op))
-    // FIXME: missed optimization: if one of the hands of select is/contains
-    //        undef, just directly pick the other one.
-    // FIXME: can both hands contain undef?
-    if (size_t LHSIdx = visitUDivOperand(SI->getOperand(1), Actions, Depth))
-      if (visitUDivOperand(SI->getOperand(2), Actions, Depth)) {
-        Actions.push_back(UDivFoldAction(nullptr, Op, LHSIdx - 1));
-        return Actions.size();
-      }
+    if (Value *LogX = takeLog2(Builder, SI->getOperand(1), Depth, DoFold))
+      if (Value *LogY = takeLog2(Builder, SI->getOperand(2), Depth, DoFold))
+        return IfFold([&]() {
+          return Builder.CreateSelect(SI->getOperand(0), LogX, LogY);
+        });
 
-  return 0;
+  return nullptr;
 }
 
 /// If we have zero-extended operands of an unsigned div or rem, we may be able
@@ -1095,36 +1049,11 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
   }
 
   // Op1 udiv Op2 -> Op1 lshr log2(Op2), if log2() folds away.
-  SmallVector<UDivFoldAction, 6> UDivActions;
-  if (visitUDivOperand(Op1, UDivActions))
-    for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
-      FoldUDivOperandCb Action = UDivActions[i].FoldAction;
-      Value *ActionOp1 = UDivActions[i].OperandToFold;
-      Value *Res;
-      if (Action)
-        Res = Action(ActionOp1, *this);
-      else {
-        // This action joins two actions together.  The RHS of this action is
-        // simply the last action we processed, we saved the LHS action index in
-        // the joining action.
-        size_t SelectRHSIdx = i - 1;
-        Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
-        size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
-        Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
-        Res = Builder.CreateSelect(cast<SelectInst>(ActionOp1)->getCondition(),
-                                   SelectLHS, SelectRHS);
-      }
-
-      // If this is the last action to process, return it to the InstCombiner.
-      // Otherwise, record it so that we may use it as part of a joining action
-      // (i.e., a SelectInst).
-      if (e - i != 1) {
-        UDivActions[i].FoldResult = Res;
-      } else {
-        return replaceInstUsesWith(
-            I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact()));
-      }
-    }
+  if (takeLog2(Builder, Op1, /*Depth*/0, /*DoFold*/false)) {
+    Value *Res = takeLog2(Builder, Op1, /*Depth*/0, /*DoFold*/true);
+    return replaceInstUsesWith(
+        I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact()));
+  }
 
   return nullptr;
 }

From 42e391e4ca848e152bf8cdb072ed3ca2a394da2b Mon Sep 17 00:00:00 2001
From: Rainer Orth 
Date: Wed, 23 Feb 2022 15:41:43 +0100
Subject: [PATCH 641/748] [ELF] Use SHF_SUNW_NODISCARD instead of
 SHF_GNU_RETAIN on Solaris

Instead of the GNU extension `SHF_GNU_RETAIN`, Solaris provides equivalent
functionality with `SHF_SUNW_NODISCARD`. This patch implements the necessary
support.

Tested on `sparcv9-sun-solaris2.11`, `amd64-pc-solaris2.11`, and
`x86_64-pc-linux-gnu`.

Differential Revision: https://reviews.llvm.org/D107955
---
 llvm/include/llvm/BinaryFormat/ELF.h          |  3 +
 llvm/include/llvm/ObjectYAML/ELFYAML.h        |  1 +
 llvm/lib/ObjectYAML/ELFYAML.cpp               | 11 ++-
 .../ELF/section-flags-solaris.test            | 36 +++++++++
 .../tools/obj2yaml/ELF/retain-section.yaml    | 36 +++++++++
 .../tools/yaml2obj/ELF/retain-section.yaml    | 74 +++++++++++++++++++
 llvm/tools/llvm-readobj/ELFDumper.cpp         | 32 ++++++--
 7 files changed, 186 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/tools/llvm-readobj/ELF/section-flags-solaris.test
 create mode 100644 llvm/test/tools/obj2yaml/ELF/retain-section.yaml
 create mode 100644 llvm/test/tools/yaml2obj/ELF/retain-section.yaml

diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index bc4b677c75a15..473fa41e4bd11 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1042,6 +1042,9 @@ enum : unsigned {
 
   SHF_MASKOS = 0x0ff00000,
 
+  // Solaris equivalent of SHF_GNU_RETAIN.
+  SHF_SUNW_NODISCARD = 0x00100000,
+
   // Bits indicating processor-specific flags.
   SHF_MASKPROC = 0xf0000000,
 
diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h
index 92a9f78ce7bfd..4ef0bdd1df688 100644
--- a/llvm/include/llvm/ObjectYAML/ELFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h
@@ -720,6 +720,7 @@ struct Object {
     llvm_unreachable("the section header table chunk must always be present");
   }
 
+  ELF_ELFOSABI getOSAbi() const;
   unsigned getMachine() const;
 };
 
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 48803cae1bd9c..96c13ea1544fc 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -29,6 +29,8 @@ namespace llvm {
 ELFYAML::Chunk::~Chunk() = default;
 
 namespace ELFYAML {
+ELF_ELFOSABI Object::getOSAbi() const { return Header.OSABI; }
+
 unsigned Object::getMachine() const {
   if (Header.Machine)
     return *Header.Machine;
@@ -706,7 +708,14 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
   BCase(SHF_GROUP);
   BCase(SHF_TLS);
   BCase(SHF_COMPRESSED);
-  BCase(SHF_GNU_RETAIN);
+  switch (Object->getOSAbi()) {
+  case ELF::ELFOSABI_SOLARIS:
+    BCase(SHF_SUNW_NODISCARD);
+    break;
+  default:
+    BCase(SHF_GNU_RETAIN);
+    break;
+  }
   switch (Object->getMachine()) {
   case ELF::EM_ARM:
     BCase(SHF_ARM_PURECODE);
diff --git a/llvm/test/tools/llvm-readobj/ELF/section-flags-solaris.test b/llvm/test/tools/llvm-readobj/ELF/section-flags-solaris.test
new file mode 100644
index 0000000000000..2539383b1af8e
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/section-flags-solaris.test
@@ -0,0 +1,36 @@
+## Here we test how Solaris specific flags are dumped.
+
+# RUN: yaml2obj %s -o %t.o
+# RUN: llvm-readobj -S %t.o | FileCheck %s
+
+# CHECK:      Name: .os.flags.low
+# CHECK-NEXT: Type: SHT_PROGBITS
+# CHECK-NEXT: Flags [ (0x100000)
+# CHECK-NEXT:   SHF_SUNW_NODISCARD (0x100000)
+# CHECK-NEXT: ]
+# CHECK:      Name: .os.flags.high
+# CHECK-NEXT: Type: SHT_PROGBITS
+# CHECK-NEXT: Flags [ (0xFE00000)
+# CHECK-NEXT: ]
+# CHECK:      Name: .os.flags.mask
+# CHECK-NEXT: Type: SHT_PROGBITS
+# CHECK-NEXT: Flags [ (0xFF00000)
+# CHECK-NEXT:   SHF_SUNW_NODISCARD (0x100000)
+# CHECK-NEXT: ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  OSABI: ELFOSABI_SOLARIS
+  Type:  ET_REL
+Sections:
+  - Name:    .os.flags.low
+    Type:    SHT_PROGBITS
+    ShFlags: 0x00100000
+  - Name:    .os.flags.high
+    Type:    SHT_PROGBITS
+    ShFlags: 0x0FE00000
+  - Name:    .os.flags.mask
+    Type:    SHT_PROGBITS
+    ShFlags: 0x0FF00000
diff --git a/llvm/test/tools/obj2yaml/ELF/retain-section.yaml b/llvm/test/tools/obj2yaml/ELF/retain-section.yaml
new file mode 100644
index 0000000000000..a8509fabab6cb
--- /dev/null
+++ b/llvm/test/tools/obj2yaml/ELF/retain-section.yaml
@@ -0,0 +1,36 @@
+## Check handling of SHF_GNU_RETAIN and SHF_SUNW_NODISCARD section flags.
+
+# RUN: yaml2obj --docnum=1 %s -o %t1
+# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=GNU
+
+# GNU:      Name: .gnu.retain
+# GNU-NEXT: Type: SHT_PROGBITS
+# GNU-NEXT: Flags: [ SHF_GNU_RETAIN ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_REL
+Sections:
+  - Name:  .gnu.retain
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_GNU_RETAIN ]
+
+# RUN: yaml2obj --docnum=2 %s -o %t2
+# RUN: obj2yaml %t2 | FileCheck %s --check-prefix=SOLARIS
+
+# SOLARIS:      Name: .sunw.nodiscard
+# SOLARIS-NEXT: Type: SHT_PROGBITS
+# SOLARIS-NEXT: Flags: [ SHF_SUNW_NODISCARD ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  OSABI: ELFOSABI_SOLARIS
+  Type:  ET_REL
+Sections:
+  - Name:  .sunw.nodiscard
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_SUNW_NODISCARD ]
diff --git a/llvm/test/tools/yaml2obj/ELF/retain-section.yaml b/llvm/test/tools/yaml2obj/ELF/retain-section.yaml
new file mode 100644
index 0000000000000..26cc1daea7f24
--- /dev/null
+++ b/llvm/test/tools/yaml2obj/ELF/retain-section.yaml
@@ -0,0 +1,74 @@
+## Check how yaml2obj handles retain (SHF_GNU_RETAIN and
+## SHF_SUNW_NODISCARD) section flags.
+
+# RUN: yaml2obj --docnum=1 %s -o %t1
+# RUN: llvm-readobj --sections %t1 | FileCheck %s --check-prefix=GNU
+
+# GNU:      Name: .gnu.retain
+# GNU-NEXT: Type: SHT_PROGBITS (0x1)
+# GNU-NEXT: Flags [ (0x200000)
+# GNU-NEXT:   SHF_GNU_RETAIN (0x200000)
+# GNU-NEXT: ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_REL
+Sections:
+  - Name:  .gnu.retain
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_GNU_RETAIN ]
+
+# RUN: yaml2obj --docnum=2 %s -o %t2
+# RUN: llvm-readobj --sections %t2 | FileCheck %s --check-prefix=SOLARIS
+
+# SOLARIS:      Name: .sunw.nodiscard
+# SOLARIS-NEXT: Type: SHT_PROGBITS (0x1)
+# SOLARIS-NEXT: Flags [ (0x100000)
+# SOLARIS-NEXT:   SHF_SUNW_NODISCARD (0x100000)
+# SOLARIS-NEXT: ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  OSABI: ELFOSABI_SOLARIS
+  Type:  ET_REL
+Sections:
+  - Name:  .sunw.nodiscard
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_SUNW_NODISCARD ]
+
+# RUN: not yaml2obj --docnum=3 %s 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=SOLARIS-GNU-ERR
+
+# SOLARIS-GNU-ERR:      error: unknown bit value
+# SOLARIS-GNU-ERR-NEXT: Flags: [ SHF_GNU_RETAIN ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  OSABI: ELFOSABI_SOLARIS
+  Type:  ET_REL
+Sections:
+  - Name:  .sunw.retain
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_GNU_RETAIN ]
+
+# RUN: not yaml2obj --docnum=4 %s 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=GNU-SOLARIS-ERR
+
+# GNU-SOLARIS-ERR:      error: unknown bit value
+# GNU-SOLARIS-ERR-NEXT: Flags: [ SHF_SUNW_NODISCARD ]
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data:  ELFDATA2LSB
+  Type:  ET_REL
+Sections:
+  - Name:  .gnu.nodiscard
+    Type:  SHT_PROGBITS
+    Flags: [ SHF_SUNW_NODISCARD ]
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 20264ad72b5d4..e1f69b4f45aa7 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1242,10 +1242,17 @@ const EnumEntry<unsigned> ElfSectionFlags[] = {
   ENUM_ENT(SHF_GROUP,            "G"),
   ENUM_ENT(SHF_TLS,              "T"),
   ENUM_ENT(SHF_COMPRESSED,       "C"),
-  ENUM_ENT(SHF_GNU_RETAIN,       "R"),
   ENUM_ENT(SHF_EXCLUDE,          "E"),
 };
 
+const EnumEntry<unsigned> ElfGNUSectionFlags[] = {
+  ENUM_ENT(SHF_GNU_RETAIN, "R")
+};
+
+const EnumEntry<unsigned> ElfSolarisSectionFlags[] = {
+  ENUM_ENT(SHF_SUNW_NODISCARD, "R")
+};
+
 const EnumEntry<unsigned> ElfXCoreSectionFlags[] = {
   ENUM_ENT(XCORE_SHF_CP_SECTION, ""),
   ENUM_ENT(XCORE_SHF_DP_SECTION, "")
@@ -1275,9 +1282,19 @@ const EnumEntry<unsigned> ElfX86_64SectionFlags[] = {
 };
 
 static std::vector<EnumEntry<unsigned>>
-getSectionFlagsForTarget(unsigned EMachine) {
+getSectionFlagsForTarget(unsigned EOSAbi, unsigned EMachine) {
   std::vector<EnumEntry<unsigned>> Ret(std::begin(ElfSectionFlags),
                                        std::end(ElfSectionFlags));
+  switch (EOSAbi) {
+  case ELFOSABI_SOLARIS:
+    Ret.insert(Ret.end(), std::begin(ElfSolarisSectionFlags),
+               std::end(ElfSolarisSectionFlags));
+    break;
+  default:
+    Ret.insert(Ret.end(), std::begin(ElfGNUSectionFlags),
+               std::end(ElfGNUSectionFlags));
+    break;
+  }
   switch (EMachine) {
   case EM_ARM:
     Ret.insert(Ret.end(), std::begin(ElfARMSectionFlags),
@@ -1305,7 +1322,8 @@ getSectionFlagsForTarget(unsigned EMachine) {
   return Ret;
 }
 
-static std::string getGNUFlags(unsigned EMachine, uint64_t Flags) {
+static std::string getGNUFlags(unsigned EOSAbi, unsigned EMachine,
+                               uint64_t Flags) {
   // Here we are trying to build the flags string in the same way as GNU does.
   // It is not that straightforward. Imagine we have sh_flags == 0x90000000.
   // SHF_EXCLUDE ("E") has a value of 0x80000000 and SHF_MASKPROC is 0xf0000000.
@@ -1316,7 +1334,7 @@ static std::string getGNUFlags(unsigned EMachine, uint64_t Flags) {
   bool HasOSFlag = false;
   bool HasProcFlag = false;
   std::vector<EnumEntry<unsigned>> FlagsList =
-      getSectionFlagsForTarget(EMachine);
+      getSectionFlagsForTarget(EOSAbi, EMachine);
   while (Flags) {
     // Take the least significant bit as a flag.
     uint64_t Flag = Flags & -Flags;
@@ -3682,7 +3700,8 @@ template <class ELFT> void GNUELFDumper<ELFT>::printSectionHeaders() {
     Fields[4].Str = to_string(format_hex_no_prefix(Sec.sh_offset, 6));
     Fields[5].Str = to_string(format_hex_no_prefix(Sec.sh_size, 6));
     Fields[6].Str = to_string(format_hex_no_prefix(Sec.sh_entsize, 2));
-    Fields[7].Str = getGNUFlags(this->Obj.getHeader().e_machine, Sec.sh_flags);
+    Fields[7].Str = getGNUFlags(this->Obj.getHeader().e_ident[ELF::EI_OSABI],
+                                this->Obj.getHeader().e_machine, Sec.sh_flags);
     Fields[8].Str = to_string(Sec.sh_link);
     Fields[9].Str = to_string(Sec.sh_info);
     Fields[10].Str = to_string(Sec.sh_addralign);
@@ -6503,7 +6522,8 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printSectionHeaders() {
 
   int SectionIndex = -1;
   std::vector<EnumEntry<unsigned>> FlagsList =
-      getSectionFlagsForTarget(this->Obj.getHeader().e_machine);
+      getSectionFlagsForTarget(this->Obj.getHeader().e_ident[ELF::EI_OSABI],
+                               this->Obj.getHeader().e_machine);
   for (const Elf_Shdr &Sec : cantFail(this->Obj.sections())) {
     DictScope SectionD(W, "Section");
     W.printNumber("Index", ++SectionIndex);

From 365be7ac72a3cde3e9c138b4a8dba0af57e16341 Mon Sep 17 00:00:00 2001
From: Rainer Orth 
Date: Wed, 23 Feb 2022 15:43:12 +0100
Subject: [PATCH 642/748] [MC][ELF] Use SHF_SUNW_NODISCARD instead of
 SHF_GNU_RETAIN on Solaris

As requested in D107955 <https://reviews.llvm.org/D107955>, this patch
splits off the `MC` and `CodeGen` parts and adds a testcase.

Tested on `sparcv9-sun-solaris2.11`, `amd64-pc-solaris2.11`, and
`x86_64-pc-linux-gnu`.

Differential Revision: https://reviews.llvm.org/D120318
---
 .../CodeGen/TargetLoweringObjectFileImpl.cpp  | 22 +++++++++++--------
 llvm/lib/MC/MCParser/ELFAsmParser.cpp         | 11 +++++++---
 llvm/lib/MC/MCSectionELF.cpp                  |  5 +++++
 llvm/test/CodeGen/X86/elf-retain.ll           |  2 ++
 llvm/test/MC/ELF/section-gnu.s                | 19 +++++++++-------
 5 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 348470bd7687d..0853c7a34354b 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -682,9 +682,10 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName,
   }
 
   if (Retain) {
-    if ((Ctx.getAsmInfo()->useIntegratedAssembler() ||
-         Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) &&
-        !TM.getTargetTriple().isOSSolaris())
+    if (TM.getTargetTriple().isOSSolaris())
+      Flags |= ELF::SHF_SUNW_NODISCARD;
+    else if (Ctx.getAsmInfo()->useIntegratedAssembler() ||
+             Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36))
       Flags |= ELF::SHF_GNU_RETAIN;
     return NextUniqueID++;
   }
@@ -861,12 +862,15 @@ static MCSection *selectELFSectionForGlobal(
     EmitUniqueSection = true;
     Flags |= ELF::SHF_LINK_ORDER;
   }
-  if (Retain &&
-      (Ctx.getAsmInfo()->useIntegratedAssembler() ||
-       Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) &&
-      !TM.getTargetTriple().isOSSolaris()) {
-    EmitUniqueSection = true;
-    Flags |= ELF::SHF_GNU_RETAIN;
+  if (Retain) {
+    if (TM.getTargetTriple().isOSSolaris()) {
+      EmitUniqueSection = true;
+      Flags |= ELF::SHF_SUNW_NODISCARD;
+    } else if (Ctx.getAsmInfo()->useIntegratedAssembler() ||
+               Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) {
+      EmitUniqueSection = true;
+      Flags |= ELF::SHF_GNU_RETAIN;
+    }
   }
 
   MCSectionELF *Section = selectELFSectionForGlobal(
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 2d76b1c12c952..34f11eb25dbe4 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -282,7 +282,8 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
   return false;
 }
 
-static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
+static unsigned parseSectionFlags(const Triple &TT, StringRef flagsStr,
+                                  bool *UseLastGroup) {
   unsigned flags = 0;
 
   // If a valid numerical value is set for the section flag, use it verbatim
@@ -331,7 +332,10 @@ static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
       flags |= ELF::SHF_GROUP;
       break;
     case 'R':
-      flags |= ELF::SHF_GNU_RETAIN;
+      if (TT.isOSSolaris())
+        flags |= ELF::SHF_SUNW_NODISCARD;
+      else
+        flags |= ELF::SHF_GNU_RETAIN;
       break;
     case '?':
       *UseLastGroup = true;
@@ -569,7 +573,8 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
     } else {
       StringRef FlagsStr = getTok().getStringContents();
       Lex();
-      extraFlags = parseSectionFlags(FlagsStr, &UseLastGroup);
+      extraFlags = parseSectionFlags(getContext().getTargetTriple(), FlagsStr,
+                                     &UseLastGroup);
     }
 
     if (extraFlags == -1U)
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp
index d18876507cd7a..7480a68be8ec8 100644
--- a/llvm/lib/MC/MCSectionELF.cpp
+++ b/llvm/lib/MC/MCSectionELF.cpp
@@ -105,6 +105,11 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
   if (Flags & ELF::SHF_GNU_RETAIN)
     OS << 'R';
 
+  // If there are os-specific flags, print them.
+  if (T.isOSSolaris())
+    if (Flags & ELF::SHF_SUNW_NODISCARD)
+      OS << 'R';
+
   // If there are target-specific flags, print them.
   Triple::ArchType Arch = T.getArch();
   if (Arch == Triple::xcore) {
diff --git a/llvm/test/CodeGen/X86/elf-retain.ll b/llvm/test/CodeGen/X86/elf-retain.ll
index 95d0414f36ed7..fface61e6b824 100644
--- a/llvm/test/CodeGen/X86/elf-retain.ll
+++ b/llvm/test/CodeGen/X86/elf-retain.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=x86_64 -data-sections=1 < %s | FileCheck %s
 ; RUN: llc -mtriple=x86_64 -no-integrated-as -binutils-version=2.36 < %s | FileCheck %s
 ; RUN: llc -mtriple=x86_64 -no-integrated-as -binutils-version=2.35 < %s | FileCheck %s --check-prefix=OLDGAS
+;; Solaris uses the equivalent SHF_SUNW_NODISCARD flag, also represented as "R".
+; RUN: llc -mtriple=x86_64-solaris < %s | FileCheck %s
 
 ; RUN: llc -mtriple=x86_64 -data-sections=1 -unique-section-names=0 < %s | FileCheck %s --check-prefix=NOUNIQUE
 
diff --git a/llvm/test/MC/ELF/section-gnu.s b/llvm/test/MC/ELF/section-gnu.s
index cb4fc9d3d2b5c..573153bf7f4ef 100644
--- a/llvm/test/MC/ELF/section-gnu.s
+++ b/llvm/test/MC/ELF/section-gnu.s
@@ -1,19 +1,22 @@
 # REQUIRES: aarch64-registered-target
 # RUN: llvm-mc -triple=x86_64 %s | FileCheck %s --check-prefix=ASM
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s | llvm-readobj -hS - | FileCheck %s --check-prefixes=GNU,OBJ
-# RUN: llvm-mc -filetype=obj -triple=aarch64-freebsd %s | llvm-readobj -hS - | FileCheck %s --check-prefixes=FREEBSD,OBJ
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s | llvm-readobj -hS - | FileCheck %s --check-prefixes=GNU,OBJ,OBJ-GNU
+# RUN: llvm-mc -filetype=obj -triple=aarch64-freebsd %s | llvm-readobj -hS - | FileCheck %s --check-prefixes=FREEBSD,OBJ,OBJ-GNU
+# RUN: llvm-mc -filetype=obj -triple=x86_64-solaris %s | llvm-readobj -hS - | FileCheck %s --check-prefixes=SOLARIS,OBJ,OBJ-SOLARIS
 
 # ASM: .section retain,"aR",@progbits
 
 ## ELFOSABI_NONE is changed to ELFOSABI_GNU. Other OSABI values are unchanged.
 # GNU:      OS/ABI: GNU/Linux
 # FREEBSD:  OS/ABI: FreeBSD
+# SOLARIS:  OS/ABI: Solaris
 
-# OBJ:      Name: retain
-# OBJ-NEXT: Type: SHT_PROGBITS
-# OBJ-NEXT: Flags [
-# OBJ-NEXT:   SHF_ALLOC
-# OBJ-NEXT:   SHF_GNU_RETAIN
-# OBJ-NEXT: ]
+# OBJ:              Name: retain
+# OBJ-NEXT:         Type: SHT_PROGBITS
+# OBJ-NEXT:         Flags [
+# OBJ-NEXT:           SHF_ALLOC
+# OBJ-GNU-NEXT:       SHF_GNU_RETAIN
+# OBJ-SOLARIS-NEXT:   SHF_SUNW_NODISCARD
+# OBJ-NEXT:         ]
 
 .section retain,"aR",@progbits

From 80a696898cd57f00297e06714bd5118ce7308f3e Mon Sep 17 00:00:00 2001
From: Jan Svoboda 
Date: Wed, 23 Feb 2022 15:18:00 +0100
Subject: [PATCH 643/748] [clang][deps] NFC: Update documentation

In D113473, the dependency scanner stopped emitting "-fmodule-map-file=" arguments. Potential build systems are expected to not add any such arguments on their own. This commit removes mentions of such arguments to avoid confusion.
---
 .../DependencyScanning/DependencyScanningTool.h      | 12 +++++-------
 .../Tooling/DependencyScanning/ModuleDepCollector.h  |  6 ++----
 .../DependencyScanning/DependencyScanningTool.cpp    |  6 +++---
 3 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
index 2eb7a35b27b91..54c3c9543dedd 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
@@ -16,9 +16,9 @@
 #include "llvm/ADT/StringSet.h"
 #include <string>
 
-namespace clang{
-namespace tooling{
-namespace dependencies{
+namespace clang {
+namespace tooling {
+namespace dependencies {
 
 /// The full dependencies and module graph for a specific input.
 struct FullDependencies {
@@ -51,15 +51,13 @@ struct FullDependencies {
   ///                      be located.
   /// \param LookupModuleDeps This function is called to collect the full
   ///                         transitive set of dependencies for this
-  ///                         compilation and fill in "-fmodule-map-file="
-  ///                         arguments.
+  ///                         compilation.
   std::vector<std::string> getAdditionalArgs(
       std::function<StringRef(ModuleID)> LookupPCMPath,
       std::function<const ModuleDeps &(ModuleID)> LookupModuleDeps) const;
 
   /// Get additional arguments suitable for appending to the original Clang
-  /// command line, excluding arguments containing modules-related paths:
-  /// "-fmodule-file=", "-fmodule-map-file=".
+  /// command line, excluding "-fmodule-file=" arguments.
   std::vector<std::string> getAdditionalArgsWithoutModulePaths() const;
 };
 
diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
index d1a7aab8c24b1..c2e9541db68e0 100644
--- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
+++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h
@@ -111,15 +111,13 @@ struct ModuleDeps {
   ///                      be located.
   /// \param LookupModuleDeps This function is called to collect the full
   ///                         transitive set of dependencies for this
-  ///                         compilation and fill in "-fmodule-map-file="
-  ///                         arguments.
+  ///                         compilation.
   std::vector getCanonicalCommandLine(
       std::function LookupPCMPath,
       std::function LookupModuleDeps) const;
 
   /// Gets the canonical command line suitable for passing to clang, excluding
-  /// arguments containing modules-related paths: "-fmodule-file=", "-o",
-  /// "-fmodule-map-file=".
+  /// "-fmodule-file=" and "-o" arguments.
   std::vector getCanonicalCommandLineWithoutModulePaths() const;
 };
 
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
index 739712baadd06..26f91961c2bbd 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
@@ -9,9 +9,9 @@
 #include "clang/Tooling/DependencyScanning/DependencyScanningTool.h"
 #include "clang/Frontend/Utils.h"
 
-namespace clang{
-namespace tooling{
-namespace dependencies{
+namespace clang {
+namespace tooling {
+namespace dependencies {
 
 std::vector FullDependencies::getAdditionalArgs(
     std::function LookupPCMPath,

From 19017c2435d76fe453a2500eeafd045ba92ece67 Mon Sep 17 00:00:00 2001
From: Jan Svoboda 
Date: Wed, 23 Feb 2022 14:51:40 +0100
Subject: [PATCH 644/748] [clang][deps] Return the whole TU command line

The dependency scanner already generates canonical -cc1 command lines that can be used to compile discovered modular dependencies.

For translation unit command lines, the scanner only generates additional driver arguments the build system is expected to append to the original command line.

While this works most of the time, there are situations where that's not the case. For example with `-Wunused-command-line-argument`, Clang will complain about the `-fmodules-cache-path=` argument that's not being used in explicit modular builds. Combine that with `-Werror` and the build outright fails.

To prevent such failures, this patch changes the dependency scanner to return the full driver command line to compile the original translation unit. This gives us more opportunities to massage the arguments into something reasonable.

Reviewed By: Bigcheese

Differential Revision: https://reviews.llvm.org/D118986
---
 .../DependencyScanningTool.h                  | 11 +++++---
 .../DependencyScanningTool.cpp                | 25 ++++++++++++++++---
 clang/test/ClangScanDeps/diagnostics.c        |  2 +-
 .../test/ClangScanDeps/modules-context-hash.c |  4 +--
 .../modules-fmodule-name-no-module-built.m    |  2 +-
 clang/test/ClangScanDeps/modules-full.cpp     |  8 +++---
 .../modules-inferred-explicit-build.m         |  4 +--
 clang/test/ClangScanDeps/modules-inferred.m   |  2 +-
 .../modules-pch-common-submodule.c            | 10 +++-----
 .../modules-pch-common-via-submodule.c        | 10 +++-----
 clang/test/ClangScanDeps/modules-pch.c        | 15 +++++------
 clang/test/ClangScanDeps/modules-symlink.c    |  3 +--
 clang/tools/clang-scan-deps/ClangScanDeps.cpp | 19 +++++++-------
 13 files changed, 63 insertions(+), 52 deletions(-)

diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
index 54c3c9543dedd..36447dd2e38e6 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
@@ -42,8 +42,10 @@ struct FullDependencies {
   /// determined that the differences are benign for this compilation.
   std::vector ClangModuleDeps;
 
-  /// Get additional arguments suitable for appending to the original Clang
-  /// command line.
+  /// The original command line of the TU (excluding the compiler executable).
+  std::vector OriginalCommandLine;
+
+  /// Get the full command line.
   ///
   /// \param LookupPCMPath This function is called to fill in "-fmodule-file="
   ///                      arguments and the "-o" argument. It needs to return
@@ -52,10 +54,13 @@ struct FullDependencies {
   /// \param LookupModuleDeps This function is called to collect the full
   ///                         transitive set of dependencies for this
   ///                         compilation.
-  std::vector getAdditionalArgs(
+  std::vector getCommandLine(
       std::function LookupPCMPath,
       std::function LookupModuleDeps) const;
 
+  /// Get the full command line, excluding "-fmodule-file=" arguments.
+  std::vector getCommandLineWithoutModulePaths() const;
+
   /// Get additional arguments suitable for appending to the original Clang
   /// command line, excluding "-fmodule-file=" arguments.
   std::vector getAdditionalArgsWithoutModulePaths() const;
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
index 26f91961c2bbd..2723d4742819a 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningTool.cpp
@@ -13,10 +13,10 @@ namespace clang {
 namespace tooling {
 namespace dependencies {
 
-std::vector FullDependencies::getAdditionalArgs(
+std::vector FullDependencies::getCommandLine(
     std::function LookupPCMPath,
     std::function LookupModuleDeps) const {
-  std::vector Ret = getAdditionalArgsWithoutModulePaths();
+  std::vector Ret = getCommandLineWithoutModulePaths();
 
   std::vector PCMPaths;
   std::vector ModMapPaths;
@@ -28,6 +28,19 @@ std::vector FullDependencies::getAdditionalArgs(
   return Ret;
 }
 
+std::vector
+FullDependencies::getCommandLineWithoutModulePaths() const {
+  std::vector Args = OriginalCommandLine;
+
+  std::vector AdditionalArgs =
+      getAdditionalArgsWithoutModulePaths();
+  Args.insert(Args.end(), AdditionalArgs.begin(), AdditionalArgs.end());
+
+  // TODO: Filter out implicit modules leftovers (e.g. "-fmodules-cache-path=").
+
+  return Args;
+}
+
 std::vector
 FullDependencies::getAdditionalArgsWithoutModulePaths() const {
   std::vector Args{
@@ -138,9 +151,13 @@ DependencyScanningTool::getFullDependencies(
       ContextHash = std::move(Hash);
     }
 
-    FullDependenciesResult getFullDependencies() const {
+    FullDependenciesResult getFullDependencies(
+        const std::vector &OriginalCommandLine) const {
       FullDependencies FD;
 
+      FD.OriginalCommandLine =
+          ArrayRef(OriginalCommandLine).slice(1);
+
       FD.ID.ContextHash = std::move(ContextHash);
 
       FD.FileDeps.assign(Dependencies.begin(), Dependencies.end());
@@ -181,7 +198,7 @@ DependencyScanningTool::getFullDependencies(
       Worker.computeDependencies(CWD, CommandLine, Consumer, ModuleName);
   if (Result)
     return std::move(Result);
-  return Consumer.getFullDependencies();
+  return Consumer.getFullDependencies(CommandLine);
 }
 
 } // end namespace dependencies
diff --git a/clang/test/ClangScanDeps/diagnostics.c b/clang/test/ClangScanDeps/diagnostics.c
index ce4eff79dc02b..0dcac47afc8d7 100644
--- a/clang/test/ClangScanDeps/diagnostics.c
+++ b/clang/test/ClangScanDeps/diagnostics.c
@@ -38,7 +38,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules"
+// CHECK:              "-fno-implicit-modules"
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "file-deps": [
diff --git a/clang/test/ClangScanDeps/modules-context-hash.c b/clang/test/ClangScanDeps/modules-context-hash.c
index dfa3328335ef6..10c7d18ba36e6 100644
--- a/clang/test/ClangScanDeps/modules-context-hash.c
+++ b/clang/test/ClangScanDeps/modules-context-hash.c
@@ -50,7 +50,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules",
+// CHECK:              "-fno-implicit-modules",
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "file-deps": [
@@ -91,7 +91,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules",
+// CHECK:              "-fno-implicit-modules",
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "file-deps": [
diff --git a/clang/test/ClangScanDeps/modules-fmodule-name-no-module-built.m b/clang/test/ClangScanDeps/modules-fmodule-name-no-module-built.m
index f9d4d89211e3b..8469b49d0af67 100644
--- a/clang/test/ClangScanDeps/modules-fmodule-name-no-module-built.m
+++ b/clang/test/ClangScanDeps/modules-fmodule-name-no-module-built.m
@@ -43,7 +43,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules"
+// CHECK:              "-fno-implicit-modules"
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NEXT:         "-fmodule-file=[[PREFIX]]/module-cache{{(_clangcl)?}}/[[HASH_H2]]/header2-{{[A-Z0-9]+}}.pcm"
 // CHECK-NEXT:       ],
diff --git a/clang/test/ClangScanDeps/modules-full.cpp b/clang/test/ClangScanDeps/modules-full.cpp
index d7cdf4cce7ad0..2a7b6625bf0f1 100644
--- a/clang/test/ClangScanDeps/modules-full.cpp
+++ b/clang/test/ClangScanDeps/modules-full.cpp
@@ -103,7 +103,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules"
+// CHECK:              "-fno-implicit-modules"
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NO-ABS-NOT:   "-fmodule-file={{.*}}"
 // CHECK-ABS-NEXT:     "-fmodule-file=[[PREFIX]]/module-cache{{(_clangcl)?}}/[[HASH_H1]]/header1-{{[A-Z0-9]+}}.pcm"
@@ -123,7 +123,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules"
+// CHECK:              "-fno-implicit-modules"
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NO-ABS-NOT:   "-fmodule-file={{.*}},
 // CHECK-ABS-NEXT:     "-fmodule-file=[[PREFIX]]/module-cache{{(_clangcl)?}}/[[HASH_H1]]/header1-{{[A-Z0-9]+}}.pcm"
@@ -143,7 +143,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules"
+// CHECK:              "-fno-implicit-modules"
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NO-ABS-NOT:   "-fmodule-file={{.*}}"
 // CHECK-ABS-NEXT:     "-fmodule-file=[[PREFIX]]/module-cache{{(_clangcl)?}}/[[HASH_H1]]/header1-{{[A-Z0-9]+}}.pcm"
@@ -163,7 +163,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules"
+// CHECK:              "-fno-implicit-modules"
 // CHECK-NEXT:         "-fno-implicit-module-maps"
 // CHECK-NO-ABS-NOT:   "-fmodule-file={{.*}}"
 // CHECK-ABS-NEXT:     "-fmodule-file=[[PREFIX]]/module-cache{{(_clangcl)?}}/[[HASH_H2_DINCLUDE]]/header2-{{[A-Z0-9]+}}.pcm"
diff --git a/clang/test/ClangScanDeps/modules-inferred-explicit-build.m b/clang/test/ClangScanDeps/modules-inferred-explicit-build.m
index 4471eb38fdb74..09101ff89a5e7 100644
--- a/clang/test/ClangScanDeps/modules-inferred-explicit-build.m
+++ b/clang/test/ClangScanDeps/modules-inferred-explicit-build.m
@@ -12,9 +12,7 @@
 // RUN: %python %S/../../utils/module-deps-to-rsp.py %t.db --tu-index=0 > %t.tu.rsp
 // RUN: %clang @%t.inferred.cc1.rsp -pedantic -Werror
 // RUN: %clang @%t.system.cc1.rsp -pedantic -Werror
-// RUN: %clang -x objective-c -fsyntax-only %t.dir/modules_cdb_input.cpp \
-// RUN:   -F%S/Inputs/frameworks -fmodules -fimplicit-module-maps \
-// RUN:   -pedantic -Werror @%t.tu.rsp
+// RUN: %clang @%t.tu.rsp -pedantic -Werror -Wno-unused-command-line-argument
 
 #include 
 #include 
diff --git a/clang/test/ClangScanDeps/modules-inferred.m b/clang/test/ClangScanDeps/modules-inferred.m
index 15e7aa321ae7a..3fdf4edf5aa1a 100644
--- a/clang/test/ClangScanDeps/modules-inferred.m
+++ b/clang/test/ClangScanDeps/modules-inferred.m
@@ -47,7 +47,7 @@
 // CHECK-NEXT:         }
 // CHECK-NEXT:       ],
 // CHECK-NEXT:       "command-line": [
-// CHECK-NEXT:         "-fno-implicit-modules",
+// CHECK:              "-fno-implicit-modules",
 // CHECK-NEXT:         "-fno-implicit-module-maps",
 // CHECK-NEXT:         "-fmodule-file=[[PREFIX]]/module-cache/[[HASH_INFERRED]]/Inferred-{{[A-Z0-9]+}}.pcm"
 // CHECK-NEXT:       ],
diff --git a/clang/test/ClangScanDeps/modules-pch-common-submodule.c b/clang/test/ClangScanDeps/modules-pch-common-submodule.c
index c7f3e76cf0a15..574100d550485 100644
--- a/clang/test/ClangScanDeps/modules-pch-common-submodule.c
+++ b/clang/test/ClangScanDeps/modules-pch-common-submodule.c
@@ -51,7 +51,7 @@
 // CHECK-PCH-NEXT:         }
 // CHECK-PCH-NEXT:       ],
 // CHECK-PCH-NEXT:       "command-line": [
-// CHECK-PCH-NEXT:         "-fno-implicit-modules"
+// CHECK-PCH:              "-fno-implicit-modules"
 // CHECK-PCH-NEXT:         "-fno-implicit-module-maps"
 // CHECK-PCH-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_COMMON]]/ModCommon-{{.*}}.pcm"
 // CHECK-PCH-NEXT:       ],
@@ -72,8 +72,7 @@
 // RUN:   --tu-index=0 > %t/pch.rsp
 //
 // RUN: %clang @%t/mod_common.cc1.rsp
-// RUN: %clang -x c-header %t/pch.h -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -o %t/pch.h.gch @%t/pch.rsp
+// RUN: %clang @%t/pch.rsp
 
 // Scan dependencies of the TU:
 //
@@ -115,7 +114,7 @@
 // CHECK-TU-NEXT:         }
 // CHECK-TU-NEXT:       ],
 // CHECK-TU-NEXT:       "command-line": [
-// CHECK-TU-NEXT:         "-fno-implicit-modules",
+// CHECK-TU:              "-fno-implicit-modules",
 // CHECK-TU-NEXT:         "-fno-implicit-module-maps",
 // CHECK-TU-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_TU:.*]]/ModTU-{{.*}}.pcm"
 // CHECK-TU-NEXT:       ],
@@ -137,5 +136,4 @@
 // RUN:   --tu-index=0 > %t/tu.rsp
 //
 // RUN: %clang @%t/mod_tu.cc1.rsp
-// RUN: %clang -fsyntax-only %t/tu.c -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -include %t/pch.h -o %t/tu.o @%t/tu.rsp
+// RUN: %clang @%t/tu.rsp
diff --git a/clang/test/ClangScanDeps/modules-pch-common-via-submodule.c b/clang/test/ClangScanDeps/modules-pch-common-via-submodule.c
index e63e310b22d09..4d1a702e87e7a 100644
--- a/clang/test/ClangScanDeps/modules-pch-common-via-submodule.c
+++ b/clang/test/ClangScanDeps/modules-pch-common-via-submodule.c
@@ -48,7 +48,7 @@
 // CHECK-PCH-NEXT:         }
 // CHECK-PCH-NEXT:       ],
 // CHECK-PCH-NEXT:       "command-line": [
-// CHECK-PCH-NEXT:         "-fno-implicit-modules"
+// CHECK-PCH:              "-fno-implicit-modules"
 // CHECK-PCH-NEXT:         "-fno-implicit-module-maps"
 // CHECK-PCH-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_COMMON]]/ModCommon-{{.*}}.pcm"
 // CHECK-PCH-NEXT:       ],
@@ -69,8 +69,7 @@
 // RUN:   --tu-index=0 > %t/pch.rsp
 //
 // RUN: %clang @%t/mod_common.cc1.rsp
-// RUN: %clang -x c-header %t/pch.h -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -o %t/pch.h.gch @%t/pch.rsp
+// RUN: %clang @%t/pch.rsp
 
 // Scan dependencies of the TU:
 //
@@ -113,7 +112,7 @@
 // CHECK-TU-NEXT:         }
 // CHECK-TU-NEXT:       ],
 // CHECK-TU-NEXT:       "command-line": [
-// CHECK-TU-NEXT:         "-fno-implicit-modules",
+// CHECK-TU:              "-fno-implicit-modules",
 // CHECK-TU-NEXT:         "-fno-implicit-module-maps",
 // CHECK-TU-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_TU:.*]]/ModTU-{{.*}}.pcm"
 // CHECK-TU-NEXT:       ],
@@ -135,5 +134,4 @@
 // RUN:   --tu-index=0 > %t/tu.rsp
 //
 // RUN: %clang @%t/mod_tu.cc1.rsp
-// RUN: %clang -fsyntax-only %t/tu.c -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -include %t/pch.h -o %t/tu.o @%t/tu.rsp
+// RUN: %clang @%t/tu.rsp
diff --git a/clang/test/ClangScanDeps/modules-pch.c b/clang/test/ClangScanDeps/modules-pch.c
index 2be17743a64b9..89b6b6f9b0980 100644
--- a/clang/test/ClangScanDeps/modules-pch.c
+++ b/clang/test/ClangScanDeps/modules-pch.c
@@ -91,7 +91,7 @@
 // CHECK-PCH-NEXT:         }
 // CHECK-PCH-NEXT:       ],
 // CHECK-PCH-NEXT:       "command-line": [
-// CHECK-PCH-NEXT:         "-fno-implicit-modules",
+// CHECK-PCH:              "-fno-implicit-modules",
 // CHECK-PCH-NEXT:         "-fno-implicit-module-maps",
 // CHECK-PCH-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_COMMON_1]]/ModCommon1-{{.*}}.pcm",
 // CHECK-PCH-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_COMMON_2]]/ModCommon2-{{.*}}.pcm",
@@ -120,8 +120,7 @@
 // RUN: %clang @%t/mod_common_1.cc1.rsp
 // RUN: %clang @%t/mod_common_2.cc1.rsp
 // RUN: %clang @%t/mod_pch.cc1.rsp
-// RUN: %clang -x c-header %t/pch.h -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -o %t/pch.h.gch @%t/pch.rsp
+// RUN: %clang @%t/pch.rsp
 
 // Scan dependencies of the TU:
 //
@@ -161,7 +160,7 @@
 // CHECK-TU-NEXT:         }
 // CHECK-TU-NEXT:       ],
 // CHECK-TU-NEXT:       "command-line": [
-// CHECK-TU-NEXT:         "-fno-implicit-modules",
+// CHECK-TU:              "-fno-implicit-modules",
 // CHECK-TU-NEXT:         "-fno-implicit-module-maps",
 // CHECK-TU-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_TU]]/ModTU-{{.*}}.pcm"
 // CHECK-TU-NEXT:       ],
@@ -183,8 +182,7 @@
 // RUN:   --tu-index=0 > %t/tu.rsp
 //
 // RUN: %clang @%t/mod_tu.cc1.rsp
-// RUN: %clang -fsyntax-only %t/tu.c -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -include %t/pch.h -o %t/tu.o @%t/tu.rsp
+// RUN: %clang @%t/tu.rsp
 
 // Scan dependencies of the TU that has common modules with the PCH:
 //
@@ -225,7 +223,7 @@
 // CHECK-TU-WITH-COMMON-NEXT:         }
 // CHECK-TU-WITH-COMMON-NEXT:       ],
 // CHECK-TU-WITH-COMMON-NEXT:       "command-line": [
-// CHECK-TU-WITH-COMMON-NEXT:         "-fno-implicit-modules",
+// CHECK-TU-WITH-COMMON:              "-fno-implicit-modules",
 // CHECK-TU-WITH-COMMON-NEXT:         "-fno-implicit-module-maps",
 // CHECK-TU-WITH-COMMON-NEXT:         "-fmodule-file=[[PREFIX]]/build/{{.*}}/ModCommon2-{{.*}}.pcm",
 // CHECK-TU-WITH-COMMON-NEXT:         "-fmodule-file=[[PREFIX]]/build/[[HASH_MOD_TU_WITH_COMMON]]/ModTUWithCommon-{{.*}}.pcm"
@@ -248,5 +246,4 @@
 // RUN:   --tu-index=0 > %t/tu_with_common.rsp
 //
 // RUN: %clang @%t/mod_tu_with_common.cc1.rsp
-// RUN: %clang -fsyntax-only %t/tu_with_common.c -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -include %t/pch.h -o %t/tu_with_common.o @%t/tu_with_common.rsp
+// RUN: %clang @%t/tu_with_common.rsp
diff --git a/clang/test/ClangScanDeps/modules-symlink.c b/clang/test/ClangScanDeps/modules-symlink.c
index 5b628175560d2..14bc811588f04 100644
--- a/clang/test/ClangScanDeps/modules-symlink.c
+++ b/clang/test/ClangScanDeps/modules-symlink.c
@@ -49,8 +49,7 @@ static int foo = MACRO; // Macro usage that will trigger
 // RUN:   --tu-index=0 > %t/pch.rsp
 //
 // RUN: %clang @%t/mod.cc1.rsp
-// RUN: %clang -x c-header %t/pch.h -fmodules -gmodules -fimplicit-module-maps \
-// RUN:   -fmodules-cache-path=%t/cache -o %t/pch.h.gch -I %t @%t/pch.rsp
+// RUN: %clang @%t/pch.rsp
 
 // RUN: sed -e "s|DIR|%/t|g" %t/cdb_tu.json > %t/cdb.json
 // RUN: clang-scan-deps -compilation-database %t/cdb.json -format experimental-full \
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index 49cc97b27046f..e88290d479ff5 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -296,14 +296,13 @@ class FullDeps {
       Modules.insert(I, {{MD.ID, InputIndex}, std::move(MD)});
     }
 
-    ID.AdditionalCommandLine =
-        GenerateModulesPathArgs
-            ? FD.getAdditionalArgs(
-                  [&](ModuleID MID) { return lookupPCMPath(MID); },
-                  [&](ModuleID MID) -> const ModuleDeps & {
-                    return lookupModuleDeps(MID);
-                  })
-            : FD.getAdditionalArgsWithoutModulePaths();
+    ID.CommandLine = GenerateModulesPathArgs
+                         ? FD.getCommandLine(
+                               [&](ModuleID MID) { return lookupPCMPath(MID); },
+                               [&](ModuleID MID) -> const ModuleDeps & {
+                                 return lookupModuleDeps(MID);
+                               })
+                         : FD.getCommandLineWithoutModulePaths();
 
     Inputs.push_back(std::move(ID));
   }
@@ -353,7 +352,7 @@ class FullDeps {
           {"clang-context-hash", I.ContextHash},
           {"file-deps", I.FileDeps},
           {"clang-module-deps", toJSONSorted(I.ModuleDeps)},
-          {"command-line", I.AdditionalCommandLine},
+          {"command-line", I.CommandLine},
       };
       TUs.push_back(std::move(O));
     }
@@ -415,7 +414,7 @@ class FullDeps {
     std::string ContextHash;
     std::vector FileDeps;
     std::vector ModuleDeps;
-    std::vector AdditionalCommandLine;
+    std::vector CommandLine;
   };
 
   std::mutex Lock;

From eb1c5a9862b62fbb20fa52542aef0497ed107f65 Mon Sep 17 00:00:00 2001
From: Stefan Pintilie 
Date: Tue, 22 Feb 2022 15:17:18 -0600
Subject: [PATCH 645/748] [PowerPC] Add the Power10 LXVKQ instruction.

Add the Power 10 instruction LXVKQ.

This patch was taken from an original patch by: Yi-Hong Lyu

Reviewed By: lei

Differential Revision: https://reviews.llvm.org/D117507
---
 llvm/lib/Target/PowerPC/P10InstrResources.td   |  1 +
 llvm/lib/Target/PowerPC/PPCInstrP10.td         | 18 ++++++++++++++++++
 .../PowerPC/ppc64-encoding-ISA31.txt           |  3 +++
 llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s    |  3 +++
 4 files changed, 25 insertions(+)

diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
index c4f0a24e221cb..7aaac73ec9632 100644
--- a/llvm/lib/Target/PowerPC/P10InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -1625,6 +1625,7 @@ def : InstRW<[P10W_PM_4C, P10W_DISP_ANY, P10PM_Read],
       (instrs
     LVSL,
     LVSR,
+    LXVKQ,
     MFVSRLD,
     MTVSRWS,
     VCLZLSBB,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 205653274a48c..4a7483c4fdd8a 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -655,6 +655,22 @@ class XForm_AT3 opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
   let Inst{31} = 0;
 }
 
+// X-Form: [ PO T EO UIM XO TX ]
+class XForm_XT6_IMM5 opcode, bits<5> eo, bits<10> xo, dag OOL, dag IOL,
+                     string asmstr, InstrItinClass itin, list pattern>
+  : I {
+  bits<6> XT;
+  bits<5> UIM;
+
+  let Pattern = pattern;
+
+  let Inst{6-10} = XT{4-0};
+  let Inst{11-15} = eo;
+  let Inst{16-20} = UIM;
+  let Inst{21-30} = xo;
+  let Inst{31} = XT{5};
+}
+
 class XX3Form_AT3_XAB6 opcode, bits<8> xo, dag OOL, dag IOL,
                            string asmstr, InstrItinClass itin,
                            list pattern>
@@ -2393,6 +2409,8 @@ let Predicates = [IsISA3_1] in {
   def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>;
   def XSCVUQQP : X_VT5_XO5_VB5<63, 3, 836, "xscvuqqp", []>;
   def XSCVSQQP : X_VT5_XO5_VB5<63, 11, 836, "xscvsqqp", []>;
+  def LXVKQ : XForm_XT6_IMM5<60, 31, 360, (outs vsrc:$XT), (ins u5imm:$UIM),
+                             "lxvkq $XT, $UIM", IIC_VecGeneral, []>;
 }
 
 let Predicates = [IsISA3_1, HasVSX] in {
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt
index 1ac3c26521014..730b4b8ed3d68 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt
@@ -364,6 +364,9 @@
 # CHECK: xxeval 32, 1, 2, 3, 2
 0x05 0x00 0x00 0x02 0x88 0x01 0x10 0xd1
 
+# CHECK: lxvkq 63, 31
+0xf3 0xff 0xfa 0xd1
+
 # CHECK: vclzdm 1, 2, 3
 0x10 0x22 0x1f 0x84
 
diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s
index 27638ae3c4803..06c5b4900d989 100644
--- a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s
+++ b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s
@@ -528,6 +528,9 @@
 # CHECK-LE: xxeval 32, 1, 2, 3, 2                 # encoding: [0x02,0x00,0x00,0x05,
 # CHECK-LE-SAME:                                               0xd1,0x10,0x01,0x88]
             xxeval 32, 1, 2, 3, 2
+# CHECK-BE: lxvkq 63, 31                          # encoding: [0xf3,0xff,0xfa,0xd1]
+# CHECK-LE: lxvkq 63, 31                          # encoding: [0xd1,0xfa,0xff,0xf3]
+            lxvkq 63, 31
 # CHECK-BE: vclzdm 1, 2, 3                        # encoding: [0x10,0x22,0x1f,0x84]
 # CHECK-LE: vclzdm 1, 2, 3                        # encoding: [0x84,0x1f,0x22,0x10]
             vclzdm 1, 2, 3

From 2824a65c1ff5800879a0665eba12f7446ce2b653 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 15:44:37 +0100
Subject: [PATCH 646/748] [InstCombine] Add tests for udiv->lshr fold with
 min/max intrinsics (NFC)

---
 llvm/test/Transforms/InstCombine/div-shift.ll | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/div-shift.ll b/llvm/test/Transforms/InstCombine/div-shift.ll
index 6d285ab2f0993..a2399df69d9bd 100644
--- a/llvm/test/Transforms/InstCombine/div-shift.ll
+++ b/llvm/test/Transforms/InstCombine/div-shift.ll
@@ -1,6 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
+declare void @use(i8)
+
+declare i8 @llvm.umin.i8(i8, i8)
+declare i8 @llvm.umax.i8(i8, i8)
+declare i8 @llvm.smin.i8(i8, i8)
+declare i8 @llvm.smax.i8(i8, i8)
+
 define i32 @t1(i16 zeroext %x, i32 %y) {
 ; CHECK-LABEL: @t1(
 ; CHECK-NEXT:  entry:
@@ -99,6 +106,100 @@ define i32 @t6(i32 %x, i32 %z) {
   ret i32 %y
 }
 
+define i8 @udiv_umin(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @udiv_umin(
+; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
+; CHECK-NEXT:    [[Z2:%.*]] = shl i8 1, [[Z:%.*]]
+; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[Y2]], i8 [[Z2]])
+; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    ret i8 [[D]]
+;
+  %y2 = shl i8 1, %y
+  %z2 = shl i8 1, %z
+  %m = call i8 @llvm.umin.i8(i8 %y2, i8 %z2)
+  %d = udiv i8 %x, %m
+  ret i8 %d
+}
+
+define i8 @udiv_umax(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @udiv_umax(
+; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
+; CHECK-NEXT:    [[Z2:%.*]] = shl i8 1, [[Z:%.*]]
+; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.umax.i8(i8 [[Y2]], i8 [[Z2]])
+; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    ret i8 [[D]]
+;
+  %y2 = shl i8 1, %y
+  %z2 = shl i8 1, %z
+  %m = call i8 @llvm.umax.i8(i8 %y2, i8 %z2)
+  %d = udiv i8 %x, %m
+  ret i8 %d
+}
+
+; Negative test, cannot take exact log2
+define i8 @udiv_umin_(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @udiv_umin_(
+; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
+; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[Y2]], i8 [[Z:%.*]])
+; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    ret i8 [[D]]
+;
+  %y2 = shl i8 1, %y
+  %m = call i8 @llvm.umin.i8(i8 %y2, i8 %z)
+  %d = udiv i8 %x, %m
+  ret i8 %d
+}
+
+; Negative test, extra use
+define i8 @udiv_umin_extra_use(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @udiv_umin_extra_use(
+; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
+; CHECK-NEXT:    [[Z2:%.*]] = shl i8 1, [[Z:%.*]]
+; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[Y2]], i8 [[Z2]])
+; CHECK-NEXT:    call void @use(i8 [[M]])
+; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    ret i8 [[D]]
+;
+  %y2 = shl i8 1, %y
+  %z2 = shl i8 1, %z
+  %m = call i8 @llvm.umin.i8(i8 %y2, i8 %z2)
+  call void @use(i8 %m)
+  %d = udiv i8 %x, %m
+  ret i8 %d
+}
+
+; Negative test, signed min/max
+define i8 @udiv_smin(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @udiv_smin(
+; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
+; CHECK-NEXT:    [[Z2:%.*]] = shl i8 1, [[Z:%.*]]
+; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[Y2]], i8 [[Z2]])
+; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    ret i8 [[D]]
+;
+  %y2 = shl i8 1, %y
+  %z2 = shl i8 1, %z
+  %m = call i8 @llvm.smin.i8(i8 %y2, i8 %z2)
+  %d = udiv i8 %x, %m
+  ret i8 %d
+}
+
+; Negative test, signed min/max
+define i8 @udiv_smax(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @udiv_smax(
+; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
+; CHECK-NEXT:    [[Z2:%.*]] = shl i8 1, [[Z:%.*]]
+; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[Y2]], i8 [[Z2]])
+; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    ret i8 [[D]]
+;
+  %y2 = shl i8 1, %y
+  %z2 = shl i8 1, %z
+  %m = call i8 @llvm.smax.i8(i8 %y2, i8 %z2)
+  %d = udiv i8 %x, %m
+  ret i8 %d
+}
+
 ; (X << C1) / X -> 1 << C1 optimizations
 
 define i32 @t7(i32 %x) {

From 587c7ff15c26d3a751fb6a10b2af60d4a25640c9 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 15:49:12 +0100
Subject: [PATCH 647/748] [InstCombine] Support min/max intrinsics in
 udiv->lshr fold

This complements the existing fold for selects. This fold is a bit
more conservative, requiring one-use. The other folds here should
probably also be subjected to a one-use restriction.

https://alive2.llvm.org/ce/z/Q9eCDU
https://alive2.llvm.org/ce/z/8YK2CJ
---
 .../InstCombine/InstCombineMulDivRem.cpp         | 14 ++++++++++++++
 llvm/test/Transforms/InstCombine/div-shift.ll    | 16 ++++++----------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index aeae25476db61..dd866eaa07bb6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -927,12 +927,14 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth,
     return nullptr;
 
   // log2(zext X) -> zext log2(X)
+  // FIXME: Require one use?
   Value *X, *Y;
   if (match(Op, m_ZExt(m_Value(X))))
     if (Value *LogX = takeLog2(Builder, X, Depth, DoFold))
       return IfFold([&]() { return Builder.CreateZExt(LogX, Op->getType()); });
 
   // log2(X << Y) -> log2(X) + Y
+  // FIXME: Require one use unless X is 1?
   if (match(Op, m_Shl(m_Value(X), m_Value(Y))))
     if (Value *LogX = takeLog2(Builder, X, Depth, DoFold))
       return IfFold([&]() { return Builder.CreateAdd(LogX, Y); });
@@ -941,6 +943,7 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth,
   // FIXME: missed optimization: if one of the hands of select is/contains
   //        undef, just directly pick the other one.
   // FIXME: can both hands contain undef?
+  // FIXME: Require one use?
   if (SelectInst *SI = dyn_cast(Op))
     if (Value *LogX = takeLog2(Builder, SI->getOperand(1), Depth, DoFold))
       if (Value *LogY = takeLog2(Builder, SI->getOperand(2), Depth, DoFold))
@@ -948,6 +951,17 @@ static Value *takeLog2(IRBuilderBase &Builder, Value *Op, unsigned Depth,
           return Builder.CreateSelect(SI->getOperand(0), LogX, LogY);
         });
 
+  // log2(umin(X, Y)) -> umin(log2(X), log2(Y))
+  // log2(umax(X, Y)) -> umax(log2(X), log2(Y))
+  auto *MinMax = dyn_cast(Op);
+  if (MinMax && MinMax->hasOneUse() && !MinMax->isSigned())
+    if (Value *LogX = takeLog2(Builder, MinMax->getLHS(), Depth, DoFold))
+      if (Value *LogY = takeLog2(Builder, MinMax->getRHS(), Depth, DoFold))
+        return IfFold([&]() {
+          return Builder.CreateBinaryIntrinsic(
+              MinMax->getIntrinsicID(), LogX, LogY);
+        });
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstCombine/div-shift.ll b/llvm/test/Transforms/InstCombine/div-shift.ll
index a2399df69d9bd..ec7ee56d7f5a7 100644
--- a/llvm/test/Transforms/InstCombine/div-shift.ll
+++ b/llvm/test/Transforms/InstCombine/div-shift.ll
@@ -108,11 +108,9 @@ define i32 @t6(i32 %x, i32 %z) {
 
 define i8 @udiv_umin(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @udiv_umin(
-; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
-; CHECK-NEXT:    [[Z2:%.*]] = shl i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[Y2]], i8 [[Z2]])
-; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
-; CHECK-NEXT:    ret i8 [[D]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[Y:%.*]], i8 [[Z:%.*]])
+; CHECK-NEXT:    [[D1:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret i8 [[D1]]
 ;
   %y2 = shl i8 1, %y
   %z2 = shl i8 1, %z
@@ -123,11 +121,9 @@ define i8 @udiv_umin(i8 %x, i8 %y, i8 %z) {
 
 define i8 @udiv_umax(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @udiv_umax(
-; CHECK-NEXT:    [[Y2:%.*]] = shl i8 1, [[Y:%.*]]
-; CHECK-NEXT:    [[Z2:%.*]] = shl i8 1, [[Z:%.*]]
-; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.umax.i8(i8 [[Y2]], i8 [[Z2]])
-; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
-; CHECK-NEXT:    ret i8 [[D]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[Y:%.*]], i8 [[Z:%.*]])
+; CHECK-NEXT:    [[D1:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret i8 [[D1]]
 ;
   %y2 = shl i8 1, %y
   %z2 = shl i8 1, %z

From e66b1b73854e43f7e822fbb3be9e9875e6780761 Mon Sep 17 00:00:00 2001
From: Jay Foad 
Date: Wed, 23 Feb 2022 13:35:34 +0000
Subject: [PATCH 648/748] [AMDGPU] Split fp min/max atomics test. NFC.

Split out f32 buffer, f64 buffer and image atomics. This just makes
it easier to test subtargets that only have some of these
instructions.

Differential Revision: https://reviews.llvm.org/D120407
---
 .../test/CodeGen/AMDGPU/fp-min-max-atomics.ll | 1244 -----------------
 .../AMDGPU/fp-min-max-buffer-atomics.ll       |  587 ++++++++
 .../AMDGPU/fp-min-max-image-atomics.ll        |  120 ++
 .../AMDGPU/fp64-min-max-buffer-atomics.ll     |  558 ++++++++
 4 files changed, 1265 insertions(+), 1244 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll

diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
deleted file mode 100644
index b27943195857e..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
+++ /dev/null
@@ -1,1244 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=GFX7
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
-
-; RUN: llc < %s -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
-
-declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
-declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
-
-declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg)
-declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg)
-
-declare float @llvm.amdgcn.image.atomic.fmin.1d.f32.f32(float, i32, <8 x i32>, i32, i32)
-declare float @llvm.amdgcn.image.atomic.fmax.1d.f32.f32(float, i32, <8 x i32>, i32, i32)
-
-
-define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_min_noret_f32:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_min_noret_f32:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_min_noret_f32:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_min_noret_f32:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    v_mov_b32_e32 v0, s4
-; G_SI-NEXT:    v_mov_b32_e32 v1, s5
-; G_SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_clause 0x1
-; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  ret void
-}
-
-define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_min_noret_f64:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dword s6, s[0:1], 0xf
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_SI-NEXT:    s_load_dword s6, s[0:1], 0xf
-; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    v_mov_b32_e32 v0, s4
-; G_SI-NEXT:    v_mov_b32_e32 v1, s5
-; G_SI-NEXT:    v_mov_b32_e32 v2, s6
-; G_SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
-; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x2
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
-; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; G_GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_clause 0x2
-; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  ret void
-}
-
-define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_min_rtn_f32:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[0:1], v0, off
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_mov_b32 s2, -1
-; G_SI-NEXT:    s_mov_b32 s3, 0xf000
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_mov_b32 s2, -1
-; G_GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v[0:1], v0, off
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    global_store_dword v[0:1], v0, off
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store float %ret, float addrspace(1)* undef
-  ret void
-}
-
-define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_min_rtn_f64:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    ds_write_b64 v0, v[0:1]
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_min_rtn_f64:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_min_rtn_f64:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_min_rtn_f64:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    ds_write_b64 v0, v[0:1]
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f64:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    ds_write_b64 v0, v[0:1]
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f64:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    ds_write_b64 v0, v[0:1]
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store double %ret, double addrspace(3)* undef
-  ret void
-}
-
-define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(3)* %out) {
-; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xf
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    ds_write_b32 v1, v0
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0xf
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    ds_write_b32 v1, v0
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x3c
-; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ds_write_b32 v1, v0
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030-NEXT:    s_load_dword s0, s[0:1], 0x3c
-; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ds_write_b32 v1, v0
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; G_SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; G_SI-NEXT:    s_load_dword s0, s[0:1], 0xf
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    v_mov_b32_e32 v0, s2
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
-; G_SI-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; G_SI-NEXT:    v_mov_b32_e32 v1, s0
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    ds_write_b32 v1, v0
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; G_GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; G_GFX7-NEXT:    s_load_dword s0, s[0:1], 0xf
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    ds_write_b32 v1, v0
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    s_load_dword s0, s[0:1], 0x3c
-; G_GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    ds_write_b32 v1, v0
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_clause 0x2
-; G_GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX1030-NEXT:    s_load_dword s0, s[0:1], 0x3c
-; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s0
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    ds_write_b32 v1, v0
-; G_GFX1030-NEXT:    s_endpgm
-; GFX1010-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
-main_body:
-  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store float %ret, float addrspace(3)* %out, align 8
-  ret void
-}
-
-define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
-; SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    ds_write_b64 v3, v[0:1]
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    ds_write_b64 v3, v[0:1]
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ds_write_b64 v3, v[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ds_write_b64 v3, v[0:1]
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    ds_write_b64 v3, v[0:1]
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    ds_write_b64 v3, v[0:1]
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    ds_write_b64 v3, v[0:1]
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    ds_write_b64 v3, v[0:1]
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store double %ret, double addrspace(3)* %out, align 8
-  ret void
-}
-
-define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_max_noret_f32:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_max_noret_f32:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_max_noret_f32:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_max_noret_f32:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    v_mov_b32_e32 v0, s4
-; G_SI-NEXT:    v_mov_b32_e32 v1, s5
-; G_SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_clause 0x1
-; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  ret void
-}
-
-define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_max_noret_f64:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dword s6, s[0:1], 0xf
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_SI-NEXT:    s_load_dword s6, s[0:1], 0xf
-; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    v_mov_b32_e32 v0, s4
-; G_SI-NEXT:    v_mov_b32_e32 v1, s5
-; G_SI-NEXT:    v_mov_b32_e32 v2, s6
-; G_SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
-; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x2
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
-; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_clause 0x2
-; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  ret void
-}
-
-define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_max_rtn_f32:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    global_store_dword v[0:1], v0, off
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_mov_b32 s2, -1
-; G_SI-NEXT:    s_mov_b32 s3, 0xf000
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_mov_b32 s2, -1
-; G_GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v[0:1], v0, off
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    global_store_dword v[0:1], v0, off
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store float %ret, float addrspace(1)* undef
-  ret void
-}
-
-define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
-; SI-LABEL: raw_buffer_atomic_max_rtn_f64:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    ds_write_b64 v0, v[0:1]
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    ds_write_b64 v0, v[0:1]
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    ds_write_b64 v0, v[0:1]
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    ds_write_b64 v0, v[0:1]
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store double %ret, double addrspace(3)* undef
-  ret void
-}
-
-define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(1)* %out) {
-; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
-; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
-; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; G_SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; G_SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    v_mov_b32_e32 v0, s2
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
-; G_SI-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; G_SI-NEXT:    s_mov_b32 s2, -1
-; G_SI-NEXT:    s_mov_b32 s3, 0xf000
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; G_GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; G_GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; G_GFX7-NEXT:    s_mov_b32 s2, -1
-; G_GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x1
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
-; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_clause 0x2
-; G_GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
-; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, 0
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    global_store_dword v1, v0, s[0:1]
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store float %ret, float addrspace(1)* %out, align 8
-  ret void
-}
-
-define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
-; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; SI-NEXT:    s_load_dword s8, s[0:1], 0xf
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0x10
-; SI-NEXT:    s_mov_b32 m0, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_mov_b32_e32 v2, s8
-; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; SI-NEXT:    v_mov_b32_e32 v2, s0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    ds_write_b64 v2, v[0:1]
-; SI-NEXT:    s_endpgm
-;
-; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xf
-; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX7-NEXT:    v_mov_b32_e32 v2, s7
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
-; GFX7-NEXT:    s_endpgm
-;
-; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
-; GFX10-NEXT:    s_endpgm
-;
-; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    s_clause 0x2
-; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
-; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX1030-NEXT:    v_mov_b32_e32 v2, s7
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
-; GFX1030-NEXT:    s_endpgm
-;
-; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; G_SI-NEXT:    s_load_dword s8, s[0:1], 0xf
-; G_SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; G_SI-NEXT:    s_load_dword s0, s[0:1], 0x10
-; G_SI-NEXT:    s_mov_b32 m0, -1
-; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
-; G_SI-NEXT:    v_mov_b32_e32 v0, s2
-; G_SI-NEXT:    v_mov_b32_e32 v1, s3
-; G_SI-NEXT:    v_mov_b32_e32 v2, s8
-; G_SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; G_SI-NEXT:    v_mov_b32_e32 v2, s0
-; G_SI-NEXT:    s_waitcnt vmcnt(0)
-; G_SI-NEXT:    ds_write_b64 v2, v[0:1]
-; G_SI-NEXT:    s_endpgm
-;
-; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; G_GFX7-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xf
-; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; G_GFX7-NEXT:    s_mov_b32 m0, -1
-; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX7-NEXT:    v_mov_b32_e32 v2, s7
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    ds_write_b64 v2, v[0:1]
-; G_GFX7-NEXT:    s_endpgm
-;
-; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    s_clause 0x2
-; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
-; G_GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
-; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
-; G_GFX10-NEXT:    s_endpgm
-;
-; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    s_clause 0x2
-; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
-; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
-; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s7
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
-; G_GFX1030-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store double %ret, double addrspace(3)* %out, align 8
-  ret void
-}
-
-define amdgpu_ps float @atomic_fmin_1d(<8 x i32> inreg %rsrc, float %data, i32 %s) {
-; SI-LABEL: atomic_fmin_1d:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: atomic_fmin_1d:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: atomic_fmin_1d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX1030-LABEL: atomic_fmin_1d:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ; return to shader part epilog
-;
-; G_SI-LABEL: atomic_fmin_1d:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
-; G_SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; G_SI-NEXT:    ; return to shader part epilog
-;
-; G_GFX7-LABEL: atomic_fmin_1d:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    ; return to shader part epilog
-;
-; G_GFX10-LABEL: atomic_fmin_1d:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    ; return to shader part epilog
-;
-; G_GFX1030-LABEL: atomic_fmin_1d:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    ; return to shader part epilog
-main_body:
-  %v = call float @llvm.amdgcn.image.atomic.fmin.1d.f32.f32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-  ret float %v
-}
-
-define amdgpu_ps float @atomic_fmax_1d(<8 x i32> inreg %rsrc, float %data, i32 %s) {
-; SI-LABEL: atomic_fmax_1d:
-; SI:       ; %bb.0: ; %main_body
-; SI-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    ; return to shader part epilog
-;
-; GFX7-LABEL: atomic_fmax_1d:
-; GFX7:       ; %bb.0: ; %main_body
-; GFX7-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX10-LABEL: atomic_fmax_1d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX1030-LABEL: atomic_fmax_1d:
-; GFX1030:       ; %bb.0: ; %main_body
-; GFX1030-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    ; return to shader part epilog
-;
-; G_SI-LABEL: atomic_fmax_1d:
-; G_SI:       ; %bb.0: ; %main_body
-; G_SI-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
-; G_SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; G_SI-NEXT:    ; return to shader part epilog
-;
-; G_GFX7-LABEL: atomic_fmax_1d:
-; G_GFX7:       ; %bb.0: ; %main_body
-; G_GFX7-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
-; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX7-NEXT:    ; return to shader part epilog
-;
-; G_GFX10-LABEL: atomic_fmax_1d:
-; G_GFX10:       ; %bb.0: ; %main_body
-; G_GFX10-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX10-NEXT:    ; return to shader part epilog
-;
-; G_GFX1030-LABEL: atomic_fmax_1d:
-; G_GFX1030:       ; %bb.0: ; %main_body
-; G_GFX1030-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
-; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; G_GFX1030-NEXT:    ; return to shader part epilog
-main_body:
-  %v = call float @llvm.amdgcn.image.atomic.fmax.1d.f32.f32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-  ret float %v
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
new file mode 100644
index 0000000000000..2e63b837bd765
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -0,0 +1,587 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=GFX7
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
+
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
+
+declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg)
+declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg)
+
+
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_noret_f32:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_noret_f32:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_noret_f32:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    s_clause 0x1
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
+; G_SI-NEXT:    v_mov_b32_e32 v0, s4
+; G_SI-NEXT:    v_mov_b32_e32 v1, s5
+; G_SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    s_clause 0x1
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    s_clause 0x1
+; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f32:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_mov_b32 s2, -1
+; G_SI-NEXT:    s_mov_b32 s3, 0xf000
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_mov_b32 s2, -1
+; G_GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    global_store_dword v[0:1], v0, off
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    global_store_dword v[0:1], v0, off
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store float %ret, float addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(3)* %out) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dword s0, s[0:1], 0xf
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    ds_write_b32 v1, v0
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0xf
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x3c
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ds_write_b32 v1, v0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    s_clause 0x2
+; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dword s0, s[0:1], 0x3c
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ds_write_b32 v1, v0
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT:    s_load_dword s0, s[0:1], 0xf
+; G_SI-NEXT:    s_mov_b32 m0, -1
+; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
+; G_SI-NEXT:    v_mov_b32_e32 v0, s2
+; G_SI-NEXT:    v_mov_b32_e32 v1, s3
+; G_SI-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_SI-NEXT:    v_mov_b32_e32 v1, s0
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    ds_write_b32 v1, v0
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT:    s_load_dword s0, s[0:1], 0xf
+; G_GFX7-NEXT:    s_mov_b32 m0, -1
+; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    ds_write_b32 v1, v0
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    s_clause 0x1
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    s_load_dword s0, s[0:1], 0x3c
+; G_GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    ds_write_b32 v1, v0
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    s_clause 0x2
+; G_GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dword s0, s[0:1], 0x3c
+; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s0
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    ds_write_b32 v1, v0
+; G_GFX1030-NEXT:    s_endpgm
+; GFX1010-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+main_body:
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store float %ret, float addrspace(3)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_noret_f32:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_noret_f32:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_noret_f32:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    s_clause 0x1
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
+; G_SI-NEXT:    v_mov_b32_e32 v0, s4
+; G_SI-NEXT:    v_mov_b32_e32 v1, s5
+; G_SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    s_clause 0x1
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    s_clause 0x1
+; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f32:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    global_store_dword v[0:1], v0, off
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_mov_b32 s2, -1
+; G_SI-NEXT:    s_mov_b32 s3, 0xf000
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_mov_b32 s2, -1
+; G_GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    global_store_dword v[0:1], v0, off
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    global_store_dword v[0:1], v0, off
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store float %ret, float addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(1)* %out) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    s_clause 0x2
+; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
+; G_SI-NEXT:    v_mov_b32_e32 v0, s2
+; G_SI-NEXT:    v_mov_b32_e32 v1, s3
+; G_SI-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_SI-NEXT:    s_mov_b32 s2, -1
+; G_SI-NEXT:    s_mov_b32 s3, 0xf000
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
+; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX7-NEXT:    s_mov_b32 s2, -1
+; G_GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    s_clause 0x1
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; G_GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; G_GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    s_clause 0x2
+; G_GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, 0
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    global_store_dword v1, v0, s[0:1]
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store float %ret, float addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
new file mode 100644
index 0000000000000..7ddeebe1b7672
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-image-atomics.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=GFX7
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
+
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
+
+declare float @llvm.amdgcn.image.atomic.fmin.1d.f32.f32(float, i32, <8 x i32>, i32, i32)
+declare float @llvm.amdgcn.image.atomic.fmax.1d.f32.f32(float, i32, <8 x i32>, i32, i32)
+
+
+define amdgpu_ps float @atomic_fmin_1d(<8 x i32> inreg %rsrc, float %data, i32 %s) {
+; SI-LABEL: atomic_fmin_1d:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: atomic_fmin_1d:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: atomic_fmin_1d:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX1030-LABEL: atomic_fmin_1d:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; G_SI-LABEL: atomic_fmin_1d:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; G_SI-NEXT:    ; return to shader part epilog
+;
+; G_GFX7-LABEL: atomic_fmin_1d:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    ; return to shader part epilog
+;
+; G_GFX10-LABEL: atomic_fmin_1d:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    ; return to shader part epilog
+;
+; G_GFX1030-LABEL: atomic_fmin_1d:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call float @llvm.amdgcn.image.atomic.fmin.1d.f32.f32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret float %v
+}
+
+define amdgpu_ps float @atomic_fmax_1d(<8 x i32> inreg %rsrc, float %data, i32 %s) {
+; SI-LABEL: atomic_fmax_1d:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: atomic_fmax_1d:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: atomic_fmax_1d:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX1030-LABEL: atomic_fmax_1d:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ; return to shader part epilog
+;
+; G_SI-LABEL: atomic_fmax_1d:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; G_SI-NEXT:    ; return to shader part epilog
+;
+; G_GFX7-LABEL: atomic_fmax_1d:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    ; return to shader part epilog
+;
+; G_GFX10-LABEL: atomic_fmax_1d:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    ; return to shader part epilog
+;
+; G_GFX1030-LABEL: atomic_fmax_1d:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    ; return to shader part epilog
+main_body:
+  %v = call float @llvm.amdgcn.image.atomic.fmax.1d.f32.f32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret float %v
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
new file mode 100644
index 0000000000000..ad30576a46049
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -0,0 +1,558 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=GFX7
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
+
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s  -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
+
+declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
+
+
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_noret_f64:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    s_load_dword s6, s[0:1], 0xf
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    s_clause 0x2
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_SI-NEXT:    s_load_dword s6, s[0:1], 0xf
+; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
+; G_SI-NEXT:    v_mov_b32_e32 v0, s4
+; G_SI-NEXT:    v_mov_b32_e32 v1, s5
+; G_SI-NEXT:    v_mov_b32_e32 v2, s6
+; G_SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
+; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    s_clause 0x2
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; G_GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    s_clause 0x2
+; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
+; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f64:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    ds_write_b64 v0, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_mov_b32 m0, -1
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    ds_write_b64 v0, v[0:1]
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_mov_b32 m0, -1
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    ds_write_b64 v0, v[0:1]
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    ds_write_b64 v0, v[0:1]
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double addrspace(3)* undef
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    ds_write_b64 v3, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ds_write_b64 v3, v[0:1]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ds_write_b64 v3, v[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ds_write_b64 v3, v[0:1]
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_SI-NEXT:    s_mov_b32 m0, -1
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    ds_write_b64 v3, v[0:1]
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX7-NEXT:    s_mov_b32 m0, -1
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    ds_write_b64 v3, v[0:1]
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    ds_write_b64 v3, v[0:1]
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    ds_write_b64 v3, v[0:1]
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(3)* %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_noret_f64:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    s_load_dword s6, s[0:1], 0xf
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    s_clause 0x2
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_SI-NEXT:    s_load_dword s6, s[0:1], 0xf
+; G_SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
+; G_SI-NEXT:    v_mov_b32_e32 v0, s4
+; G_SI-NEXT:    v_mov_b32_e32 v1, s5
+; G_SI-NEXT:    v_mov_b32_e32 v2, s6
+; G_SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_GFX7-NEXT:    s_load_dword s6, s[0:1], 0xf
+; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    s_clause 0x2
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT:    s_load_dword s8, s[0:1], 0x3c
+; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    s_clause 0x2
+; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT:    s_load_dword s6, s[0:1], 0x3c
+; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f64:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    ds_write_b64 v0, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_SI-NEXT:    s_mov_b32 m0, -1
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    ds_write_b64 v0, v[0:1]
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX7-NEXT:    s_mov_b32 m0, -1
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    ds_write_b64 v0, v[0:1]
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    ds_write_b64 v0, v[0:1]
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    ds_write_b64 v0, v[0:1]
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+  store double %ret, double addrspace(3)* undef
+  ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT:    s_load_dword s8, s[0:1], 0xf
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x10
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX7:       ; %bb.0: ; %main_body
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xf
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX7-NEXT:    s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX10-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX1030:       ; %bb.0: ; %main_body
+; GFX1030-NEXT:    s_clause 0x2
+; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s7
+; GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX1030-NEXT:    s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_SI:       ; %bb.0: ; %main_body
+; G_SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT:    s_load_dword s8, s[0:1], 0xf
+; G_SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT:    s_load_dword s0, s[0:1], 0x10
+; G_SI-NEXT:    s_mov_b32 m0, -1
+; G_SI-NEXT:    s_waitcnt lgkmcnt(0)
+; G_SI-NEXT:    v_mov_b32_e32 v0, s2
+; G_SI-NEXT:    v_mov_b32_e32 v1, s3
+; G_SI-NEXT:    v_mov_b32_e32 v2, s8
+; G_SI-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_SI-NEXT:    v_mov_b32_e32 v2, s0
+; G_SI-NEXT:    s_waitcnt vmcnt(0)
+; G_SI-NEXT:    ds_write_b64 v2, v[0:1]
+; G_SI-NEXT:    s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_GFX7:       ; %bb.0: ; %main_body
+; G_GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; G_GFX7-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xf
+; G_GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; G_GFX7-NEXT:    s_mov_b32 m0, -1
+; G_GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX7-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX7-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX7-NEXT:    ds_write_b64 v2, v[0:1]
+; G_GFX7-NEXT:    s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_GFX10:       ; %bb.0: ; %main_body
+; G_GFX10-NEXT:    s_clause 0x2
+; G_GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
+; G_GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; G_GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, s9
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
+; G_GFX10-NEXT:    s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_GFX1030:       ; %bb.0: ; %main_body
+; G_GFX1030-NEXT:    s_clause 0x2
+; G_GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x3c
+; G_GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT:    v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT:    v_mov_b32_e32 v2, s7
+; G_GFX1030-NEXT:    s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT:    ds_write_b64 v2, v[0:1]
+; G_GFX1030-NEXT:    s_endpgm
+main_body:
+  %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+  store double %ret, double addrspace(3)* %out, align 8
+  ret void
+}

From 4b86d55997cfac5b7a6fb0d31f4bad1b8bdf5ca5 Mon Sep 17 00:00:00 2001
From: Arjun P 
Date: Wed, 23 Feb 2022 15:00:17 +0000
Subject: [PATCH 649/748] [MLIR][Presburger] unittests: use an MLIRContext
 declared in parsePoly

Use an `MLIRContext` declared in a single place in the `parsePoly` function that almost all Presburger unit tests use for parsing sets. This function is only used in tests.

This saves us from having to declare and pass a new `MLIRContext` in every test.

Reviewed By: bondhugula, mehdi_amini

Differential Revision: https://reviews.llvm.org/D119251
---
 .../Presburger/IntegerPolyhedronTest.cpp      | 194 ++++------
 .../Analysis/Presburger/PWMAFunctionTest.cpp  |   6 +-
 .../Analysis/Presburger/PresburgerSetTest.cpp | 330 +++++++-----------
 .../Analysis/Presburger/SimplexTest.cpp       |  10 +-
 mlir/unittests/Analysis/Presburger/Utils.h    |   5 +-
 5 files changed, 191 insertions(+), 354 deletions(-)

diff --git a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp
index e403ddd013ad1..395889cf4afca 100644
--- a/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/IntegerPolyhedronTest.cpp
@@ -195,47 +195,38 @@ TEST(IntegerPolyhedronTest, removeIdRange) {
 
 TEST(IntegerPolyhedronTest, FindSampleTest) {
   // Bounded sets with only inequalities.
-
-  MLIRContext context;
-
   // 0 <= 7x <= 5
-  checkSample(true, parsePoly("(x) : (7 * x >= 0, -7 * x + 5 >= 0)", &context));
+  checkSample(true, parsePoly("(x) : (7 * x >= 0, -7 * x + 5 >= 0)"));
 
   // 1 <= 5x and 5x <= 4 (no solution).
-  checkSample(false,
-              parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 4 >= 0)", &context));
+  checkSample(false, parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 4 >= 0)"));
 
   // 1 <= 5x and 5x <= 9 (solution: x = 1).
-  checkSample(true,
-              parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 9 >= 0)", &context));
+  checkSample(true, parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 9 >= 0)"));
 
   // Bounded sets with equalities.
   // x >= 8 and 40 >= y and x = y.
-  checkSample(true, parsePoly("(x,y) : (x - 8 >= 0, -y + 40 >= 0, x - y == 0)",
-                              &context));
+  checkSample(true,
+              parsePoly("(x,y) : (x - 8 >= 0, -y + 40 >= 0, x - y == 0)"));
 
   // x <= 10 and y <= 10 and 10 <= z and x + 2y = 3z.
   // solution: x = y = z = 10.
   checkSample(true, parsePoly("(x,y,z) : (-x + 10 >= 0, -y + 10 >= 0, "
-                              "z - 10 >= 0, x + 2 * y - 3 * z == 0)",
-                              &context));
+                              "z - 10 >= 0, x + 2 * y - 3 * z == 0)"));
 
   // x <= 10 and y <= 10 and 11 <= z and x + 2y = 3z.
   // This implies x + 2y >= 33 and x + 2y <= 30, which has no solution.
   checkSample(false, parsePoly("(x,y,z) : (-x + 10 >= 0, -y + 10 >= 0, "
-                               "z - 11 >= 0, x + 2 * y - 3 * z == 0)",
-                               &context));
+                               "z - 11 >= 0, x + 2 * y - 3 * z == 0)"));
 
   // 0 <= r and r <= 3 and 4q + r = 7.
   // Solution: q = 1, r = 3.
-  checkSample(
-      true,
-      parsePoly("(q,r) : (r >= 0, -r + 3 >= 0, 4 * q + r - 7 == 0)", &context));
+  checkSample(true,
+              parsePoly("(q,r) : (r >= 0, -r + 3 >= 0, 4 * q + r - 7 == 0)"));
 
   // 4q + r = 7 and r = 0.
   // Solution: q = 1, r = 3.
-  checkSample(false,
-              parsePoly("(q,r) : (4 * q + r - 7 == 0, r == 0)", &context));
+  checkSample(false, parsePoly("(q,r) : (4 * q + r - 7 == 0, r == 0)"));
 
   // The next two sets are large sets that should take a long time to sample
   // with a naive branch and bound algorithm but can be sampled efficiently with
@@ -244,8 +235,7 @@ TEST(IntegerPolyhedronTest, FindSampleTest) {
   // This is a triangle with vertices at (1/3, 0), (2/3, 0) and (10000, 10000).
   checkSample(true, parsePoly("(x,y) : (y >= 0, "
                               "300000 * x - 299999 * y - 100000 >= 0, "
-                              "-300000 * x + 299998 * y + 200000 >= 0)",
-                              &context));
+                              "-300000 * x + 299998 * y + 200000 >= 0)"));
 
   // This is a tetrahedron with vertices at
   // (1/3, 0, 0), (2/3, 0, 0), (2/3, 0, 10000), and (10000, 10000, 10000).
@@ -268,8 +258,7 @@ TEST(IntegerPolyhedronTest, FindSampleTest) {
       parsePoly("(a,b,c,d,e) : (b + d - e >= 0, -b + c - d + e >= 0, "
                 "300000 * a - 299998 * b - c - 9 * d + 21 * e - 112000 >= 0, "
                 "-150000 * a + 149999 * b - 15 * d + 47 * e + 68000 >= 0, "
-                "d - e == 0, d + e - 2000 == 0)",
-                &context));
+                "d - e == 0, d + e - 2000 == 0)"));
 
   // This is a tetrahedron with vertices at
   // (1/3, 0, 0), (2/3, 0, 0), (2/3, 0, 100), (100, 100 - 1/3, 100).
@@ -288,24 +277,20 @@ TEST(IntegerPolyhedronTest, FindSampleTest) {
   // This is a line segment from (0, 1/3) to (100, 100 + 1/3).
   checkSample(
       false,
-      parsePoly("(x,y) : (x >= 0, -x + 100 >= 0, 3 * x - 3 * y + 1 == 0)",
-                &context));
+      parsePoly("(x,y) : (x >= 0, -x + 100 >= 0, 3 * x - 3 * y + 1 == 0)"));
 
   // A thin parallelogram. 0 <= x <= 100 and x + 1/3 <= y <= x + 2/3.
   checkSample(false,
               parsePoly("(x,y) : (x >= 0, -x + 100 >= 0, "
-                        "3 * x - 3 * y + 2 >= 0, -3 * x + 3 * y - 1 >= 0)",
-                        &context));
+                        "3 * x - 3 * y + 2 >= 0, -3 * x + 3 * y - 1 >= 0)"));
 
   checkSample(true, parsePoly("(x,y) : (2 * x >= 0, -2 * x + 99 >= 0, "
-                              "2 * y >= 0, -2 * y + 99 >= 0)",
-                              &context));
+                              "2 * y >= 0, -2 * y + 99 >= 0)"));
 
   // 2D cone with apex at (10000, 10000) and
   // edges passing through (1/3, 0) and (2/3, 0).
   checkSample(true, parsePoly("(x,y) : (300000 * x - 299999 * y - 100000 >= 0, "
-                              "-300000 * x + 299998 * y + 200000 >= 0)",
-                              &context));
+                              "-300000 * x + 299998 * y + 200000 >= 0)"));
 
   // Cartesian product of a tetrahedron and a 2D cone.
   // The tetrahedron has vertices at
@@ -419,8 +404,7 @@ TEST(IntegerPolyhedronTest, FindSampleTest) {
                           {});
 
   checkSample(true, parsePoly("(x, y, z) : (2 * x - 1 >= 0, x - y - 1 == 0, "
-                              "y - z == 0)",
-                              &context));
+                              "y - z == 0)"));
 
   // Regression tests for the computation of dual coefficients.
   checkSample(false, parsePoly("(x, y, z) : ("
@@ -428,42 +412,34 @@ TEST(IntegerPolyhedronTest, FindSampleTest) {
                                "x + 5*y + z + 5 >= 0,"
                                "-4*x + y + 2*z - 1 >= 0,"
                                "-3*x - 2*y - 7*z - 1 >= 0,"
-                               "-7*x - 5*y - 9*z - 1 >= 0)",
-                               &context));
+                               "-7*x - 5*y - 9*z - 1 >= 0)"));
   checkSample(true, parsePoly("(x, y, z) : ("
                               "3*x + 3*y + 3 >= 0,"
                               "-4*x - 8*y - z + 4 >= 0,"
                               "-7*x - 4*y + z + 1 >= 0,"
                               "2*x - 7*y - 8*z - 7 >= 0,"
-                              "9*x + 8*y - 9*z - 7 >= 0)",
-                              &context));
+                              "9*x + 8*y - 9*z - 7 >= 0)"));
 }
 
 TEST(IntegerPolyhedronTest, IsIntegerEmptyTest) {
-
-  MLIRContext context;
-
   // 1 <= 5x and 5x <= 4 (no solution).
-  EXPECT_TRUE(parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 4 >= 0)", &context)
-                  .isIntegerEmpty());
+  EXPECT_TRUE(
+      parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 4 >= 0)").isIntegerEmpty());
   // 1 <= 5x and 5x <= 9 (solution: x = 1).
-  EXPECT_FALSE(parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 9 >= 0)", &context)
-                   .isIntegerEmpty());
+  EXPECT_FALSE(
+      parsePoly("(x) : (5 * x - 1 >= 0, -5 * x + 9 >= 0)").isIntegerEmpty());
 
   // Unbounded sets.
   EXPECT_TRUE(parsePoly("(x,y,z) : (2 * y - 1 >= 0, -2 * y + 1 >= 0, "
-                        "2 * z - 1 >= 0, 2 * x - 1 == 0)",
-                        &context)
+                        "2 * z - 1 >= 0, 2 * x - 1 == 0)")
                   .isIntegerEmpty());
 
   EXPECT_FALSE(parsePoly("(x,y,z) : (2 * x - 1 >= 0, -3 * x + 3 >= 0, "
-                         "5 * z - 6 >= 0, -7 * z + 17 >= 0, 3 * y - 2 >= 0)",
-                         &context)
+                         "5 * z - 6 >= 0, -7 * z + 17 >= 0, 3 * y - 2 >= 0)")
                    .isIntegerEmpty());
 
   EXPECT_FALSE(
-      parsePoly("(x,y,z) : (2 * x - 1 >= 0, x - y - 1 == 0, y - z == 0)",
-                &context)
+      parsePoly("(x,y,z) : (2 * x - 1 >= 0, x - y - 1 == 0, y - z == 0)")
           .isIntegerEmpty());
 
   // IntegerPolyhedron::isEmpty() does not detect the following sets to be
@@ -472,8 +448,7 @@ TEST(IntegerPolyhedronTest, IsIntegerEmptyTest) {
   // 3x + 7y = 1 and 0 <= x, y <= 10.
   // Since x and y are non-negative, 3x + 7y can never be 1.
   EXPECT_TRUE(parsePoly("(x,y) : (x >= 0, -x + 10 >= 0, y >= 0, -y + 10 >= 0, "
-                        "3 * x + 7 * y - 1 == 0)",
-                        &context)
+                        "3 * x + 7 * y - 1 == 0)")
                   .isIntegerEmpty());
 
   // 2x = 3y and y = x - 1 and x + y = 6z + 2 and 0 <= x, y <= 100.
@@ -481,8 +456,7 @@ TEST(IntegerPolyhedronTest, IsIntegerEmptyTest) {
   // Since x + y = 5 cannot be equal to 6z + 2 for any z, the set is empty.
   EXPECT_TRUE(
       parsePoly("(x,y,z) : (x >= 0, -x + 100 >= 0, y >= 0, -y + 100 >= 0, "
-                "2 * x - 3 * y == 0, x - y - 1 == 0, x + y - 6 * z - 2 == 0)",
-                &context)
+                "2 * x - 3 * y == 0, x - y - 1 == 0, x + y - 6 * z - 2 == 0)")
           .isIntegerEmpty());
 
   // 2x = 3y and y = x - 1 + 6z and x + y = 6q + 2 and 0 <= x, y <= 100.
@@ -493,20 +467,16 @@ TEST(IntegerPolyhedronTest, IsIntegerEmptyTest) {
   EXPECT_TRUE(
       parsePoly(
           "(x,y,z,q) : (x >= 0, -x + 100 >= 0, y >= 0, -y + 100 >= 0, "
-          "2 * x - 3 * y == 0, x - y + 6 * z - 1 == 0, x + y - 6 * q - 2 == 0)",
-          &context)
+          "2 * x - 3 * y == 0, x - y + 6 * z - 1 == 0, x + y - 6 * q - 2 == 0)")
           .isIntegerEmpty());
 
   // Set with symbols.
-  EXPECT_FALSE(parsePoly("(x)[s] : (x + s >= 0, x - s == 0)", &context)
-                   .isIntegerEmpty());
+  EXPECT_FALSE(parsePoly("(x)[s] : (x + s >= 0, x - s == 0)").isIntegerEmpty());
 }
 
 TEST(IntegerPolyhedronTest, removeRedundantConstraintsTest) {
-  MLIRContext context;
-
   IntegerPolyhedron poly =
-      parsePoly("(x) : (x - 2 >= 0, -x + 2 >= 0, x - 2 == 0)", &context);
+      parsePoly("(x) : (x - 2 >= 0, -x + 2 >= 0, x - 2 == 0)");
   poly.removeRedundantConstraints();
 
   // Both inequalities are redundant given the equality. Both have been removed.
@@ -514,7 +484,7 @@ TEST(IntegerPolyhedronTest, removeRedundantConstraintsTest) {
   EXPECT_EQ(poly.getNumEqualities(), 1u);
 
   IntegerPolyhedron poly2 =
-      parsePoly("(x,y) : (x - 3 >= 0, y - 2 >= 0, x - y == 0)", &context);
+      parsePoly("(x,y) : (x - 3 >= 0, y - 2 >= 0, x - y == 0)");
   poly2.removeRedundantConstraints();
 
   // The second inequality is redundant and should have been removed. The
@@ -524,7 +494,7 @@ TEST(IntegerPolyhedronTest, removeRedundantConstraintsTest) {
   EXPECT_EQ(poly2.getNumEqualities(), 1u);
 
   IntegerPolyhedron poly3 =
-      parsePoly("(x,y,z) : (x - y == 0, x - z == 0, y - z == 0)", &context);
+      parsePoly("(x,y,z) : (x - y == 0, x - z == 0, y - z == 0)");
   poly3.removeRedundantConstraints();
 
   // One of the three equalities can be removed.
@@ -569,8 +539,7 @@ TEST(IntegerPolyhedronTest, removeRedundantConstraintsTest) {
                 "-c + 10 >= 0,"
                 "a - 13 >= 0,"
                 "-a + 13 >= 0"
-                ")",
-                &context);
+                ")");
 
   // The above is a large set of constraints without any redundant constraints,
   // as verified by the Fourier-Motzkin based removeRedundantInequalities.
@@ -586,8 +555,7 @@ TEST(IntegerPolyhedronTest, removeRedundantConstraintsTest) {
   EXPECT_EQ(poly4.getNumEqualities(), nEq);
 
   IntegerPolyhedron poly5 = parsePoly(
-      "(x,y) : (128 * x + 127 >= 0, -x + 7 >= 0, -128 * x + y >= 0, y >= 0)",
-      &context);
+      "(x,y) : (128 * x + 127 >= 0, -x + 7 >= 0, -128 * x + y >= 0, y >= 0)");
   // 128x + 127 >= 0  implies that 128x >= 0, since x has to be an integer.
   // (This should be caught by GCDTightenInqualities().)
   // So -128x + y >= 0 and 128x + 127 >= 0 imply y >= 0 since we have
@@ -637,7 +605,6 @@ static void checkDivisionRepresentation(
     IntegerPolyhedron &poly,
     const std::vector> &expectedDividends,
     const SmallVectorImpl &expectedDenominators) {
-
   std::vector> dividends;
   SmallVector denominators;
 
@@ -714,10 +681,8 @@ TEST(IntegerPolyhedronTest, computeLocalReprRecursive) {
 }
 
 TEST(IntegerPolyhedronTest, computeLocalReprTightUpperBound) {
-  MLIRContext context;
-
   {
-    IntegerPolyhedron poly = parsePoly("(i) : (i mod 3 - 1 >= 0)", &context);
+    IntegerPolyhedron poly = parsePoly("(i) : (i mod 3 - 1 >= 0)");
 
     // The set formed by the poly is:
     //        3q - i + 2 >= 0             <-- Division lower bound
@@ -737,8 +702,8 @@ TEST(IntegerPolyhedronTest, computeLocalReprTightUpperBound) {
   }
 
   {
-    IntegerPolyhedron poly = parsePoly(
-        "(i, j, q) : (4*q - i - j + 2 >= 0, -4*q + i + j >= 0)", &context);
+    IntegerPolyhedron poly =
+        parsePoly("(i, j, q) : (4*q - i - j + 2 >= 0, -4*q + i + j >= 0)");
     // Convert `q` to a local variable.
     poly.convertDimToLocal(2, 3);
 
@@ -751,10 +716,8 @@ TEST(IntegerPolyhedronTest, computeLocalReprTightUpperBound) {
 }
 
 TEST(IntegerPolyhedronTest, computeLocalReprFromEquality) {
-  MLIRContext context;
   {
-    IntegerPolyhedron poly =
-        parsePoly("(i, j, q) : (-4*q + i + j == 0)", &context);
+    IntegerPolyhedron poly = parsePoly("(i, j, q) : (-4*q + i + j == 0)");
     // Convert `q` to a local variable.
     poly.convertDimToLocal(2, 3);
 
@@ -764,8 +727,7 @@ TEST(IntegerPolyhedronTest, computeLocalReprFromEquality) {
     checkDivisionRepresentation(poly, divisions, denoms);
   }
   {
-    IntegerPolyhedron poly =
-        parsePoly("(i, j, q) : (4*q - i - j == 0)", &context);
+    IntegerPolyhedron poly = parsePoly("(i, j, q) : (4*q - i - j == 0)");
     // Convert `q` to a local variable.
     poly.convertDimToLocal(2, 3);
 
@@ -775,8 +737,7 @@ TEST(IntegerPolyhedronTest, computeLocalReprFromEquality) {
     checkDivisionRepresentation(poly, divisions, denoms);
   }
   {
-    IntegerPolyhedron poly =
-        parsePoly("(i, j, q) : (3*q + i + j - 2 == 0)", &context);
+    IntegerPolyhedron poly = parsePoly("(i, j, q) : (3*q + i + j - 2 == 0)");
     // Convert `q` to a local variable.
     poly.convertDimToLocal(2, 3);
 
@@ -788,12 +749,10 @@ TEST(IntegerPolyhedronTest, computeLocalReprFromEquality) {
 }
 
 TEST(IntegerPolyhedronTest, computeLocalReprFromEqualityAndInequality) {
-  MLIRContext context;
   {
     IntegerPolyhedron poly =
         parsePoly("(i, j, q, k) : (-3*k + i + j == 0, 4*q - "
-                  "i - j + 2 >= 0, -4*q + i + j >= 0)",
-                  &context);
+                  "i - j + 2 >= 0, -4*q + i + j >= 0)");
     // Convert `q` and `k` to local variables.
     poly.convertDimToLocal(2, 4);
 
@@ -806,9 +765,8 @@ TEST(IntegerPolyhedronTest, computeLocalReprFromEqualityAndInequality) {
 }
 
 TEST(IntegerPolyhedronTest, computeLocalReprNoRepr) {
-  MLIRContext context;
   IntegerPolyhedron poly =
-      parsePoly("(x, q) : (x - 3 * q >= 0, -x + 3 * q + 3 >= 0)", &context);
+      parsePoly("(x, q) : (x - 3 * q >= 0, -x + 3 * q + 3 >= 0)");
   // Convert q to a local variable.
   poly.convertDimToLocal(1, 2);
 
@@ -820,9 +778,8 @@ TEST(IntegerPolyhedronTest, computeLocalReprNoRepr) {
 }
 
 TEST(IntegerPolyhedronTest, computeLocalReprNegConstNormalize) {
-  MLIRContext context;
-  IntegerPolyhedron poly = parsePoly(
-      "(x, q) : (-1 - 3*x - 6 * q >= 0, 6 + 3*x + 6*q >= 0)", &context);
+  IntegerPolyhedron poly =
+      parsePoly("(x, q) : (-1 - 3*x - 6 * q >= 0, 6 + 3*x + 6*q >= 0)");
   // Convert q to a local variable.
   poly.convertDimToLocal(1, 2);
 
@@ -1093,41 +1050,33 @@ void expectNoRationalLexMin(OptimumKind kind, const IntegerPolyhedron &poly) {
 }
 
 TEST(IntegerPolyhedronTest, getRationalLexMin) {
-  MLIRContext context;
   expectRationalLexMin(
-      parsePoly("(x, y, z) : (x + 10 >= 0, y + 40 >= 0, z + 30 >= 0)",
-                &context),
+      parsePoly("(x, y, z) : (x + 10 >= 0, y + 40 >= 0, z + 30 >= 0)"),
       {{-10, 1}, {-40, 1}, {-30, 1}});
   expectRationalLexMin(
       parsePoly(
-          "(x, y, z) : (2*x + 7 >= 0, 3*y - 5 >= 0, 8*z + 10 >= 0, 9*z >= 0)",
-          &context),
+          "(x, y, z) : (2*x + 7 >= 0, 3*y - 5 >= 0, 8*z + 10 >= 0, 9*z >= 0)"),
       {{-7, 2}, {5, 3}, {0, 1}});
-  expectRationalLexMin(
-      parsePoly(
-          "(x, y) : (3*x + 2*y + 10 >= 0, -3*y + 10 >= 0, 4*x - 7*y - 10 >= 0)",
-          &context),
-      {{-50, 29}, {-70, 29}});
+  expectRationalLexMin(parsePoly("(x, y) : (3*x + 2*y + 10 >= 0, -3*y + 10 >= "
+                                 "0, 4*x - 7*y - 10 >= 0)"),
+                       {{-50, 29}, {-70, 29}});
 
   // Test with some locals. This is basically x >= 11, 0 <= x - 2e <= 1.
   // It'll just choose x = 11, e = 5.5 since it's rational lexmin.
   expectRationalLexMin(
       parsePoly(
-          "(x, y) : (x - 2*(x floordiv 2) == 0, y - 2*x >= 0, x - 11 >= 0)",
-          &context),
+          "(x, y) : (x - 2*(x floordiv 2) == 0, y - 2*x >= 0, x - 11 >= 0)"),
       {{11, 1}, {22, 1}});
 
   expectRationalLexMin(parsePoly("(x, y) : (3*x + 2*y + 10 >= 0,"
-                                 "-4*x + 7*y + 10 >= 0, -3*y + 10 >= 0)",
-                                 &context),
+                                 "-4*x + 7*y + 10 >= 0, -3*y + 10 >= 0)"),
                        {{-50, 9}, {10, 3}});
 
   // Cartesian product of above with itself.
   expectRationalLexMin(
       parsePoly("(x, y, z, w) : (3*x + 2*y + 10 >= 0, -4*x + 7*y + 10 >= 0,"
                 "-3*y + 10 >= 0, 3*z + 2*w + 10 >= 0, -4*z + 7*w + 10 >= 0,"
-                "-3*w + 10 >= 0)",
-                &context),
+                "-3*w + 10 >= 0)"),
       {{-50, 9}, {10, 3}, {-50, 9}, {10, 3}});
 
   // Same as above but for the constraints on z and w, we express "10" in terms
@@ -1139,8 +1088,7 @@ TEST(IntegerPolyhedronTest, getRationalLexMin) {
       parsePoly(
           "(x, y, z, w) : (3*x + 2*y + 10 >= 0, -4*x + 7*y + 10 >= 0, "
           "-3*y + 10 >= 0, 3*z + 2*w - 9*x - 12*y >= 0,"
-          "-4*z + 7*w + - 9*x - 9*y - 10 >= 0, -3*w - 9*x - 15*y + 10 >= 0)",
-          &context),
+          "-4*z + 7*w + - 9*x - 9*y - 10 >= 0, -3*w - 9*x - 15*y + 10 >= 0)"),
       {{-50, 9}, {10, 3}, {-50, 9}, {10, 3}});
 
   // Same as above with one constraint removed, making the lexmin unbounded.
@@ -1148,19 +1096,17 @@ TEST(IntegerPolyhedronTest, getRationalLexMin) {
       OptimumKind::Unbounded,
       parsePoly("(x, y, z, w) : (3*x + 2*y + 10 >= 0, -4*x + 7*y + 10 >= 0,"
                 "-3*y + 10 >= 0, 3*z + 2*w - 9*x - 12*y >= 0,"
-                "-4*z + 7*w + - 9*x - 9*y - 10>= 0)",
-                &context));
+                "-4*z + 7*w + - 9*x - 9*y - 10>= 0)"));
 
   // Again, the lexmin is unbounded.
   expectNoRationalLexMin(
       OptimumKind::Unbounded,
       parsePoly("(x, y, z) : (2*x + 5*y + 8*z - 10 >= 0,"
-                "2*x + 10*y + 8*z - 10 >= 0, 2*x + 5*y + 10*z - 10 >= 0)",
-                &context));
+                "2*x + 10*y + 8*z - 10 >= 0, 2*x + 5*y + 10*z - 10 >= 0)"));
 
   // The set is empty.
   expectNoRationalLexMin(OptimumKind::Empty,
-                         parsePoly("(x) : (2*x >= 0, -x - 1 >= 0)", &context));
+                         parsePoly("(x) : (2*x >= 0, -x - 1 >= 0)"));
 }
 
 void expectIntegerLexMin(const IntegerPolyhedron &poly, ArrayRef min) {
@@ -1176,16 +1122,13 @@ void expectNoIntegerLexMin(OptimumKind kind, const IntegerPolyhedron &poly) {
 }
 
 TEST(IntegerPolyhedronTest, getIntegerLexMin) {
-  MLIRContext context;
   expectIntegerLexMin(parsePoly("(x, y, z) : (2*x + 13 >= 0, 4*y - 3*x - 2  >= "
-                                "0, 11*z + 5*y - 3*x + 7 >= 0)",
-                                &context),
+                                "0, 11*z + 5*y - 3*x + 7 >= 0)"),
                       {-6, -4, 0});
   // Similar to above but no lower bound on z.
   expectNoIntegerLexMin(OptimumKind::Unbounded,
                         parsePoly("(x, y, z) : (2*x + 13 >= 0, 4*y - 3*x - 2  "
-                                  ">= 0, -11*z + 5*y - 3*x + 7 >= 0)",
-                                  &context));
+                                  ">= 0, -11*z + 5*y - 3*x + 7 >= 0)"));
 }
 
 static void
@@ -1197,30 +1140,25 @@ expectComputedVolumeIsValidOverapprox(const IntegerPolyhedron &poly,
 }
 
 TEST(IntegerPolyhedronTest, computeVolume) {
-  MLIRContext context;
-
   // 0 <= x <= 3 + 1/3, -5.5 <= y <= 2 + 3/5, 3 <= z <= 1.75.
   // i.e. 0 <= x <= 3, -5 <= y <= 2, 3 <= z <= 3 + 1/4.
   // So volume is 4 * 8 * 1 = 32.
   expectComputedVolumeIsValidOverapprox(
       parsePoly("(x, y, z) : (x >= 0, -3*x + 10 >= 0, 2*y + 11 >= 0,"
-                "-5*y + 13 >= 0, z - 3 >= 0, -4*z + 13 >= 0)",
-                &context),
+                "-5*y + 13 >= 0, z - 3 >= 0, -4*z + 13 >= 0)"),
       /*trueVolume=*/32ull, /*resultBound=*/32ull);
 
   // Same as above but y has bounds 2 + 1/5 <= y <= 2 + 3/5. So the volume is
   // zero.
   expectComputedVolumeIsValidOverapprox(
       parsePoly("(x, y, z) : (x >= 0, -3*x + 10 >= 0, 5*y - 11 >= 0,"
-                "-5*y + 13 >= 0, z - 3 >= 0, -4*z + 13 >= 0)",
-                &context),
+                "-5*y + 13 >= 0, z - 3 >= 0, -4*z + 13 >= 0)"),
       /*trueVolume=*/0ull, /*resultBound=*/0ull);
 
   // Now x is unbounded below but y still has no integer values.
   expectComputedVolumeIsValidOverapprox(
       parsePoly("(x, y, z) : (-3*x + 10 >= 0, 5*y - 11 >= 0,"
-                "-5*y + 13 >= 0, z - 3 >= 0, -4*z + 13 >= 0)",
-                &context),
+                "-5*y + 13 >= 0, z - 3 >= 0, -4*z + 13 >= 0)"),
       /*trueVolume=*/0ull, /*resultBound=*/0ull);
 
   // A diamond shape, 0 <= x + y <= 10, 0 <= x - y <= 10,
@@ -1228,8 +1166,7 @@ TEST(IntegerPolyhedronTest, computeVolume) {
   // x and y can take 11 possible values so result computed is 11*11 = 121.
   expectComputedVolumeIsValidOverapprox(
       parsePoly("(x, y) : (x + y >= 0, -x - y + 10 >= 0, x - y >= 0,"
-                "-x + y + 10 >= 0)",
-                &context),
+                "-x + y + 10 >= 0)"),
       /*trueVolume=*/61ull, /*resultBound=*/121ull);
 
   // Effectively the same diamond as above; constrain the variables to be even
@@ -1240,13 +1177,12 @@ TEST(IntegerPolyhedronTest, computeVolume) {
   expectComputedVolumeIsValidOverapprox(
       parsePoly("(x, y) : (x + y >= 0, -x - y + 20 >= 0, x - y >= 0,"
                 " -x + y + 20 >= 0, x - 2*(x floordiv 2) == 0,"
-                "y - 2*(y floordiv 2) == 0)",
-                &context),
+                "y - 2*(y floordiv 2) == 0)"),
       /*trueVolume=*/61ull, /*resultBound=*/441ull);
 
   // Unbounded polytope.
   expectComputedVolumeIsValidOverapprox(
-      parsePoly("(x, y) : (2*x - y >= 0, y - 3*x >= 0)", &context),
+      parsePoly("(x, y) : (2*x - y >= 0, y - 3*x >= 0)"),
       /*trueVolume=*/{}, /*resultBound=*/{});
 }
 
diff --git a/mlir/unittests/Analysis/Presburger/PWMAFunctionTest.cpp b/mlir/unittests/Analysis/Presburger/PWMAFunctionTest.cpp
index 0a1744db838ac..0d269367493e7 100644
--- a/mlir/unittests/Analysis/Presburger/PWMAFunctionTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/PWMAFunctionTest.cpp
@@ -43,11 +43,9 @@ static PWMAFunction parsePWMAF(
     ArrayRef, 8>>>
         data,
     unsigned numSymbols = 0) {
-  static MLIRContext context;
-
   PWMAFunction result(numInputs - numSymbols, numSymbols, numOutputs);
   for (const auto &pair : data) {
-    IntegerPolyhedron domain = parsePoly(pair.first, &context);
+    IntegerPolyhedron domain = parsePoly(pair.first);
     result.addPiece(
         domain, makeMatrix(numOutputs, domain.getNumIds() + 1, pair.second));
   }
@@ -55,8 +53,6 @@ static PWMAFunction parsePWMAF(
 }
 
 TEST(PWAFunctionTest, isEqual) {
-  MLIRContext context;
-
   // The output expressions are different but it doesn't matter because they are
   // equal in this domain.
   PWMAFunction idAtZeros = parsePWMAF(
diff --git a/mlir/unittests/Analysis/Presburger/PresburgerSetTest.cpp b/mlir/unittests/Analysis/Presburger/PresburgerSetTest.cpp
index bbd0d55301210..14b33f7f0afbb 100644
--- a/mlir/unittests/Analysis/Presburger/PresburgerSetTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/PresburgerSetTest.cpp
@@ -22,17 +22,15 @@
 #include 
 
 namespace mlir {
-
 /// Parse a list of StringRefs to IntegerPolyhedron and combine them into a
 /// PresburgerSet be using the union operation. It is expected that the strings
 /// are all valid IntegerSet representation and that all of them have the same
 /// number of dimensions as is specified by the numDims argument.
-static PresburgerSet parsePresburgerSetFromPolyStrings(unsigned numDims,
-                                                       ArrayRef strs,
-                                                       MLIRContext *context) {
+static PresburgerSet
+parsePresburgerSetFromPolyStrings(unsigned numDims, ArrayRef strs) {
   PresburgerSet set = PresburgerSet::getEmptySet(numDims);
   for (StringRef str : strs)
-    set.unionPolyInPlace(parsePoly(str, context));
+    set.unionPolyInPlace(parsePoly(str));
   return set;
 }
 
@@ -108,12 +106,9 @@ static PresburgerSet makeSetFromPoly(unsigned numDims,
 }
 
 TEST(SetTest, containsPoint) {
-  MLIRContext context;
-
   PresburgerSet setA = parsePresburgerSetFromPolyStrings(
       1,
-      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"},
-      &context);
+      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"});
   for (unsigned x = 0; x <= 21; ++x) {
     if ((2 <= x && x <= 8) || (10 <= x && x <= 20))
       EXPECT_TRUE(setA.containsPoint({x}));
@@ -124,11 +119,9 @@ TEST(SetTest, containsPoint) {
   // A parallelogram with vertices {(3, 1), (10, -6), (24, 8), (17, 15)} union
   // a square with opposite corners (2, 2) and (10, 10).
   PresburgerSet setB = parsePresburgerSetFromPolyStrings(
-      2,
-      {"(x,y) : (x + y - 4 >= 0, -x - y + 32 >= 0, "
-       "x - y - 2 >= 0, -x + y + 16 >= 0)",
-       "(x,y) : (x - 2 >= 0, y - 2 >= 0, -x + 10 >= 0, -y + 10 >= 0)"},
-      &context);
+      2, {"(x,y) : (x + y - 4 >= 0, -x - y + 32 >= 0, "
+          "x - y - 2 >= 0, -x + y + 16 >= 0)",
+          "(x,y) : (x - 2 >= 0, y - 2 >= 0, -x + 10 >= 0, -y + 10 >= 0)"});
 
   for (unsigned x = 1; x <= 25; ++x) {
     for (unsigned y = -6; y <= 16; ++y) {
@@ -143,12 +136,9 @@ TEST(SetTest, containsPoint) {
 }
 
 TEST(SetTest, Union) {
-  MLIRContext context;
-
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
       1,
-      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"},
-      &context);
+      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"});
 
   // Universe union set.
   testUnionAtPoints(PresburgerSet::getUniverse(1), set,
@@ -172,12 +162,9 @@ TEST(SetTest, Union) {
 }
 
 TEST(SetTest, Intersect) {
-  MLIRContext context;
-
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
       1,
-      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"},
-      &context);
+      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"});
 
   // Universe intersection set.
   testIntersectAtPoints(PresburgerSet::getUniverse(1), set,
@@ -201,40 +188,33 @@ TEST(SetTest, Intersect) {
 }
 
 TEST(SetTest, Subtract) {
-  MLIRContext context;
   // The interval [2, 8] minus the interval [10, 20].
-  testSubtractAtPoints(parsePresburgerSetFromPolyStrings(
-                           1, {"(x) : (x - 2 >= 0, -x + 8 >= 0)"}, &context),
-                       parsePresburgerSetFromPolyStrings(
-                           1, {"(x) : (x - 10 >= 0, -x + 20 >= 0)"}, &context),
-                       {{1}, {2}, {8}, {9}, {10}, {20}, {21}});
-
-  // Universe minus [2, 8] U [10, 20]
   testSubtractAtPoints(
-      parsePresburgerSetFromPolyStrings(1, {"(x) : ()"}, &context),
+      parsePresburgerSetFromPolyStrings(1, {"(x) : (x - 2 >= 0, -x + 8 >= 0)"}),
       parsePresburgerSetFromPolyStrings(1,
-                                        {"(x) : (x - 2 >= 0, -x + 8 >= 0)",
-                                         "(x) : (x - 10 >= 0, -x + 20 >= 0)"},
-                                        &context),
+                                        {"(x) : (x - 10 >= 0, -x + 20 >= 0)"}),
       {{1}, {2}, {8}, {9}, {10}, {20}, {21}});
 
+  // Universe minus [2, 8] U [10, 20]
+  testSubtractAtPoints(parsePresburgerSetFromPolyStrings(1, {"(x) : ()"}),
+                       parsePresburgerSetFromPolyStrings(
+                           1, {"(x) : (x - 2 >= 0, -x + 8 >= 0)",
+                               "(x) : (x - 10 >= 0, -x + 20 >= 0)"}),
+                       {{1}, {2}, {8}, {9}, {10}, {20}, {21}});
+
   // ((-infinity, 0] U [3, 4] U [6, 7]) - ([2, 3] U [5, 6])
   testSubtractAtPoints(
-      parsePresburgerSetFromPolyStrings(1,
-                                        {"(x) : (-x >= 0)",
-                                         "(x) : (x - 3 >= 0, -x + 4 >= 0)",
-                                         "(x) : (x - 6 >= 0, -x + 7 >= 0)"},
-                                        &context),
-      parsePresburgerSetFromPolyStrings(1,
-                                        {"(x) : (x - 2 >= 0, -x + 3 >= 0)",
-                                         "(x) : (x - 5 >= 0, -x + 6 >= 0)"},
-                                        &context),
+      parsePresburgerSetFromPolyStrings(1, {"(x) : (-x >= 0)",
+                                            "(x) : (x - 3 >= 0, -x + 4 >= 0)",
+                                            "(x) : (x - 6 >= 0, -x + 7 >= 0)"}),
+      parsePresburgerSetFromPolyStrings(1, {"(x) : (x - 2 >= 0, -x + 3 >= 0)",
+                                            "(x) : (x - 5 >= 0, -x + 6 >= 0)"}),
       {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
 
   // Expected result is {[x, y] : x > y}, i.e., {[x, y] : x >= y + 1}.
   testSubtractAtPoints(
-      parsePresburgerSetFromPolyStrings(2, {"(x, y) : (x - y >= 0)"}, &context),
-      parsePresburgerSetFromPolyStrings(2, {"(x, y) : (x + y >= 0)"}, &context),
+      parsePresburgerSetFromPolyStrings(2, {"(x, y) : (x - y >= 0)"}),
+      parsePresburgerSetFromPolyStrings(2, {"(x, y) : (x + y >= 0)"}),
       {{0, 1}, {1, 1}, {1, 0}, {1, -1}, {0, -1}});
 
   // A rectangle with corners at (2, 2) and (10, 10), minus
@@ -246,14 +226,12 @@ TEST(SetTest, Subtract) {
           2,
           {
               "(x, y) : (x - 2 >= 0, y - 2 >= 0, -x + 10 >= 0, -y + 10 >= 0)",
-          },
-          &context),
+          }),
       parsePresburgerSetFromPolyStrings(
           2,
           {
               "(x, y) : (x - 5 >= 0, y + 10 >= 0, -x + 7 >= 0, -y + 100 >= 0)",
-          },
-          &context),
+          }),
       {{1, 2},  {2, 2},  {4, 2},  {5, 2},  {7, 2},  {8, 2},  {11, 2},
        {1, 1},  {2, 1},  {4, 1},  {5, 1},  {7, 1},  {8, 1},  {11, 1},
        {1, 10}, {2, 10}, {4, 10}, {5, 10}, {7, 10}, {8, 10}, {11, 10},
@@ -265,14 +243,12 @@ TEST(SetTest, Subtract) {
   // resulting set can be represented as a union of four rectangles.
   testSubtractAtPoints(
       parsePresburgerSetFromPolyStrings(
-          2, {"(x, y) : (x - 2 >= 0, y -2 >= 0, -x + 10 >= 0, -y + 10 >= 0)"},
-          &context),
+          2, {"(x, y) : (x - 2 >= 0, y -2 >= 0, -x + 10 >= 0, -y + 10 >= 0)"}),
       parsePresburgerSetFromPolyStrings(
           2,
           {
               "(x, y) : (x - 5 >= 0, y - 4 >= 0, -x + 7 >= 0, -y + 8 >= 0)",
-          },
-          &context),
+          }),
       {{1, 1},
        {2, 2},
        {10, 10},
@@ -288,25 +264,24 @@ TEST(SetTest, Subtract) {
 
   // The second set is a superset of the first one, since on the line x + y = 0,
   // y <= 1 is equivalent to x >= -1. So the result is empty.
-  testSubtractAtPoints(parsePresburgerSetFromPolyStrings(
-                           2, {"(x, y) : (x >= 0, x + y == 0)"}, &context),
-                       parsePresburgerSetFromPolyStrings(
-                           2, {"(x, y) : (-y + 1 >= 0, x + y == 0)"}, &context),
-                       {{0, 0},
-                        {1, -1},
-                        {2, -2},
-                        {-1, 1},
-                        {-2, 2},
-                        {1, 1},
-                        {-1, -1},
-                        {-1, 1},
-                        {1, -1}});
+  testSubtractAtPoints(
+      parsePresburgerSetFromPolyStrings(2, {"(x, y) : (x >= 0, x + y == 0)"}),
+      parsePresburgerSetFromPolyStrings(2,
+                                        {"(x, y) : (-y + 1 >= 0, x + y == 0)"}),
+      {{0, 0},
+       {1, -1},
+       {2, -2},
+       {-1, 1},
+       {-2, 2},
+       {1, 1},
+       {-1, -1},
+       {-1, 1},
+       {1, -1}});
 
   // The result should be {0} U {2}.
   testSubtractAtPoints(
-      parsePresburgerSetFromPolyStrings(1, {"(x) : (x >= 0, -x + 2 >= 0)"},
-                                        &context),
-      parsePresburgerSetFromPolyStrings(1, {"(x) : (x - 1 == 0)"}, &context),
+      parsePresburgerSetFromPolyStrings(1, {"(x) : (x >= 0, -x + 2 >= 0)"}),
+      parsePresburgerSetFromPolyStrings(1, {"(x) : (x - 1 == 0)"}),
       {{-1}, {0}, {1}, {2}, {3}});
 
   // Sets with lots of redundant inequalities to test the redundancy heuristic.
@@ -321,14 +296,11 @@ TEST(SetTest, Subtract) {
           {
               "(x, y) : (x + y - 4 >= 0, -x - y + 32 >= 0, x - y - 2 >= 0, "
               "-x + y + 16 >= 0)",
-          },
-          &context),
+          }),
       parsePresburgerSetFromPolyStrings(
-          2,
-          {"(x, y) : (x - 2 >= 0, y - 2 >= 0, -x + 10 >= 0, "
-           "-y + 10 >= 0, x + y - 2 >= 0, -x - y + 30 >= 0, x - y >= 0, "
-           "-x + y + 10 >= 0)"},
-          &context),
+          2, {"(x, y) : (x - 2 >= 0, y - 2 >= 0, -x + 10 >= 0, "
+              "-y + 10 >= 0, x + y - 2 >= 0, -x - y + 30 >= 0, x - y >= 0, "
+              "-x + y + 10 >= 0)"}),
       {{1, 2},  {2, 2},   {3, 2},   {4, 2},  {1, 1},   {2, 1},   {3, 1},
        {4, 1},  {2, 0},   {3, 0},   {4, 0},  {5, 0},   {10, 2},  {11, 2},
        {10, 1}, {10, 10}, {10, 11}, {10, 9}, {11, 10}, {10, -6}, {11, -6},
@@ -338,19 +310,15 @@ TEST(SetTest, Subtract) {
   // 7])
   testSubtractAtPoints(
       parsePresburgerSetFromPolyStrings(
-          1,
-          {"(x) : (-x - 5 >= 0)", "(x) : (x - 3 == 0)", "(x) : (x - 4 == 0)",
-           "(x) : (x - 5 == 0)"},
-          &context),
+          1, {"(x) : (-x - 5 >= 0)", "(x) : (x - 3 == 0)", "(x) : (x - 4 == 0)",
+              "(x) : (x - 5 == 0)"}),
       parsePresburgerSetFromPolyStrings(
-          1,
-          {"(x) : (-x - 2 >= 0, x - 10 >= 0, -x >= 0, -x + 10 >= 0, "
-           "x - 100 >= 0, x - 50 >= 0)",
-           "(x) : (x - 3 >= 0, -x + 4 >= 0, x + 1 >= 0, "
-           "x + 7 >= 0, -x + 10 >= 0)",
-           "(x) : (x - 6 >= 0, -x + 7 >= 0, x + 1 >= 0, x - 3 >= 0, "
-           "-x + 5 >= 0)"},
-          &context),
+          1, {"(x) : (-x - 2 >= 0, x - 10 >= 0, -x >= 0, -x + 10 >= 0, "
+              "x - 100 >= 0, x - 50 >= 0)",
+              "(x) : (x - 3 >= 0, -x + 4 >= 0, x + 1 >= 0, "
+              "x + 7 >= 0, -x + 10 >= 0)",
+              "(x) : (x - 6 >= 0, -x + 7 >= 0, x + 1 >= 0, x - 3 >= 0, "
+              "-x + 5 >= 0)"}),
       {{-6},
        {-5},
        {-4},
@@ -369,8 +337,6 @@ TEST(SetTest, Subtract) {
 }
 
 TEST(SetTest, Complement) {
-
-  MLIRContext context;
   // Complement of universe.
   testComplementAtPoints(
       PresburgerSet::getUniverse(1),
@@ -382,10 +348,8 @@ TEST(SetTest, Complement) {
       {{-1}, {-2}, {-8}, {1}, {2}, {8}, {9}, {10}, {20}, {21}});
 
   testComplementAtPoints(
-      parsePresburgerSetFromPolyStrings(2,
-                                        {"(x,y) : (x - 2 >= 0, y - 2 >= 0, "
-                                         "-x + 10 >= 0, -y + 10 >= 0)"},
-                                        &context),
+      parsePresburgerSetFromPolyStrings(2, {"(x,y) : (x - 2 >= 0, y - 2 >= 0, "
+                                            "-x + 10 >= 0, -y + 10 >= 0)"}),
       {{1, 1},
        {2, 1},
        {1, 2},
@@ -401,15 +365,12 @@ TEST(SetTest, Complement) {
 }
 
 TEST(SetTest, isEqual) {
-
-  MLIRContext context;
   // set = [2, 8] U [10, 20].
   PresburgerSet universe = PresburgerSet::getUniverse(1);
   PresburgerSet emptySet = PresburgerSet::getEmptySet(1);
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
       1,
-      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"},
-      &context);
+      {"(x) : (x - 2 >= 0, -x + 8 >= 0)", "(x) : (x - 10 >= 0, -x + 20 >= 0)"});
 
   // universe != emptySet.
   EXPECT_FALSE(universe.isEqual(emptySet));
@@ -446,11 +407,9 @@ TEST(SetTest, isEqual) {
 
   // square is one unit taller than rect.
   PresburgerSet square = parsePresburgerSetFromPolyStrings(
-      2, {"(x, y) : (x - 2 >= 0, y - 2 >= 0, -x + 9 >= 0, -y + 9 >= 0)"},
-      &context);
+      2, {"(x, y) : (x - 2 >= 0, y - 2 >= 0, -x + 9 >= 0, -y + 9 >= 0)"});
   PresburgerSet rect = parsePresburgerSetFromPolyStrings(
-      2, {"(x, y) : (x - 2 >= 0, y - 2 >= 0, -x + 9 >= 0, -y + 8 >= 0)"},
-      &context);
+      2, {"(x, y) : (x - 2 >= 0, y - 2 >= 0, -x + 9 >= 0, -y + 8 >= 0)"});
   EXPECT_FALSE(square.isEqual(rect));
   PresburgerSet universeRect = square.unionSet(square.complement());
   PresburgerSet universeSquare = rect.unionSet(rect.complement());
@@ -467,23 +426,17 @@ void expectEqual(const PresburgerSet &s, const PresburgerSet &t) {
 void expectEmpty(const PresburgerSet &s) { EXPECT_TRUE(s.isIntegerEmpty()); }
 
 TEST(SetTest, divisions) {
-  MLIRContext context;
-
   // evens = {x : exists q, x = 2q}.
-  PresburgerSet evens{
-      parsePoly("(x) : (x - 2 * (x floordiv 2) == 0)", &context)};
+  PresburgerSet evens{parsePoly("(x) : (x - 2 * (x floordiv 2) == 0)")};
 
   //  odds = {x : exists q, x = 2q + 1}.
-  PresburgerSet odds{
-      parsePoly("(x) : (x - 2 * (x floordiv 2) - 1 == 0)", &context)};
+  PresburgerSet odds{parsePoly("(x) : (x - 2 * (x floordiv 2) - 1 == 0)")};
 
   // multiples3 = {x : exists q, x = 3q}.
-  PresburgerSet multiples3{
-      parsePoly("(x) : (x - 3 * (x floordiv 3) == 0)", &context)};
+  PresburgerSet multiples3{parsePoly("(x) : (x - 3 * (x floordiv 3) == 0)")};
 
   // multiples6 = {x : exists q, x = 6q}.
-  PresburgerSet multiples6{
-      parsePoly("(x) : (x - 6 * (x floordiv 6) == 0)", &context)};
+  PresburgerSet multiples6{parsePoly("(x) : (x - 6 * (x floordiv 6) == 0)")};
 
   // evens /\ odds = empty.
   expectEmpty(PresburgerSet(evens).intersect(PresburgerSet(odds)));
@@ -494,8 +447,8 @@ TEST(SetTest, divisions) {
   // even multiples of 3 = multiples of 6.
   expectEqual(multiples3.intersect(evens), multiples6);
 
-  PresburgerSet setA{parsePoly("(x) : (-x >= 0)", &context)};
-  PresburgerSet setB{parsePoly("(x) : (x floordiv 2 - 4 >= 0)", &context)};
+  PresburgerSet setA{parsePoly("(x) : (-x >= 0)")};
+  PresburgerSet setB{parsePoly("(x) : (x floordiv 2 - 4 >= 0)")};
   EXPECT_TRUE(setA.subtract(setB).isEqual(setA));
 }
 
@@ -514,185 +467,143 @@ TEST(SetTest, coalesceNoPoly) {
 }
 
 TEST(SetTest, coalesceContainedOneDim) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : (x >= 0, -x + 4 >= 0)", "(x) : (x - 1 >= 0, -x + 2 >= 0)"},
-      &context);
+      1, {"(x) : (x >= 0, -x + 4 >= 0)", "(x) : (x - 1 >= 0, -x + 2 >= 0)"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceFirstEmpty) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : ( x >= 0, -x - 1 >= 0)", "(x) : ( x - 1 >= 0, -x + 2 >= 0)"},
-      &context);
+      1, {"(x) : ( x >= 0, -x - 1 >= 0)", "(x) : ( x - 1 >= 0, -x + 2 >= 0)"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceSecondEmpty) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : (x - 1 >= 0, -x + 2 >= 0)", "(x) : (x >= 0, -x - 1 >= 0)"},
-      &context);
+      1, {"(x) : (x - 1 >= 0, -x + 2 >= 0)", "(x) : (x >= 0, -x - 1 >= 0)"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceBothEmpty) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : (x - 3 >= 0, -x - 1 >= 0)", "(x) : (x >= 0, -x - 1 >= 0)"},
-      &context);
+      1, {"(x) : (x - 3 >= 0, -x - 1 >= 0)", "(x) : (x >= 0, -x - 1 >= 0)"});
   expectCoalesce(0, set);
 }
 
 TEST(SetTest, coalesceFirstUniv) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : ()", "(x) : ( x >= 0, -x + 1 >= 0)"}, &context);
+      1, {"(x) : ()", "(x) : ( x >= 0, -x + 1 >= 0)"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceSecondUniv) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : ( x >= 0, -x + 1 >= 0)", "(x) : ()"}, &context);
+      1, {"(x) : ( x >= 0, -x + 1 >= 0)", "(x) : ()"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceBothUniv) {
-  MLIRContext context;
   PresburgerSet set =
-      parsePresburgerSetFromPolyStrings(1, {"(x) : ()", "(x) : ()"}, &context);
+      parsePresburgerSetFromPolyStrings(1, {"(x) : ()", "(x) : ()"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceFirstUnivSecondEmpty) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : ()", "(x) : ( x >= 0, -x - 1 >= 0)"}, &context);
+      1, {"(x) : ()", "(x) : ( x >= 0, -x - 1 >= 0)"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceFirstEmptySecondUniv) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : ( x >= 0, -x - 1 >= 0)", "(x) : ()"}, &context);
+      1, {"(x) : ( x >= 0, -x - 1 >= 0)", "(x) : ()"});
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceCutOneDim) {
-  MLIRContext context;
-  PresburgerSet set =
-      parsePresburgerSetFromPolyStrings(1,
-                                        {
-                                            "(x) : ( x >= 0, -x + 3 >= 0)",
-                                            "(x) : ( x - 2 >= 0, -x + 4 >= 0)",
-                                        },
-                                        &context);
+  PresburgerSet set = parsePresburgerSetFromPolyStrings(
+      1, {
+             "(x) : ( x >= 0, -x + 3 >= 0)",
+             "(x) : ( x - 2 >= 0, -x + 4 >= 0)",
+         });
   expectCoalesce(2, set);
 }
 
 TEST(SetTest, coalesceSeparateOneDim) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      1, {"(x) : ( x >= 0, -x + 2 >= 0)", "(x) : ( x - 3 >= 0, -x + 4 >= 0)"},
-      &context);
+      1, {"(x) : ( x >= 0, -x + 2 >= 0)", "(x) : ( x - 3 >= 0, -x + 4 >= 0)"});
   expectCoalesce(2, set);
 }
 
 TEST(SetTest, coalesceContainedTwoDim) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x >= 0, -x + 3 >= 0, y >= 0, -y + 3 >= 0)",
-          "(x,y) : (x >= 0, -x + 3 >= 0, y - 2 >= 0, -y + 3 >= 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x >= 0, -x + 3 >= 0, y >= 0, -y + 3 >= 0)",
+             "(x,y) : (x >= 0, -x + 3 >= 0, y - 2 >= 0, -y + 3 >= 0)",
+         });
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceCutTwoDim) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x >= 0, -x + 3 >= 0, y >= 0, -y + 2 >= 0)",
-          "(x,y) : (x >= 0, -x + 3 >= 0, y - 1 >= 0, -y + 3 >= 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x >= 0, -x + 3 >= 0, y >= 0, -y + 2 >= 0)",
+             "(x,y) : (x >= 0, -x + 3 >= 0, y - 1 >= 0, -y + 3 >= 0)",
+         });
   expectCoalesce(2, set);
 }
 
 TEST(SetTest, coalesceSeparateTwoDim) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x >= 0, -x + 3 >= 0, y >= 0, -y + 1 >= 0)",
-          "(x,y) : (x >= 0, -x + 3 >= 0, y - 2 >= 0, -y + 3 >= 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x >= 0, -x + 3 >= 0, y >= 0, -y + 1 >= 0)",
+             "(x,y) : (x >= 0, -x + 3 >= 0, y - 2 >= 0, -y + 3 >= 0)",
+         });
   expectCoalesce(2, set);
 }
 
 TEST(SetTest, coalesceContainedEq) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x >= 0, -x + 3 >= 0, x - y == 0)",
-          "(x,y) : (x - 1 >= 0, -x + 2 >= 0, x - y == 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x >= 0, -x + 3 >= 0, x - y == 0)",
+             "(x,y) : (x - 1 >= 0, -x + 2 >= 0, x - y == 0)",
+         });
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceCuttingEq) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x - 1 >= 0, -x + 3 >= 0, x - y == 0)",
-          "(x,y) : (x >= 0, -x + 2 >= 0, x - y == 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x - 1 >= 0, -x + 3 >= 0, x - y == 0)",
+             "(x,y) : (x >= 0, -x + 2 >= 0, x - y == 0)",
+         });
   expectCoalesce(2, set);
 }
 
 TEST(SetTest, coalesceSeparateEq) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x - 3 >= 0, -x + 4 >= 0, x - y == 0)",
-          "(x,y) : (x >= 0, -x + 1 >= 0, x - y == 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x - 3 >= 0, -x + 4 >= 0, x - y == 0)",
+             "(x,y) : (x >= 0, -x + 1 >= 0, x - y == 0)",
+         });
   expectCoalesce(2, set);
 }
 
 TEST(SetTest, coalesceContainedEqAsIneq) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x >= 0, -x + 3 >= 0, x - y >= 0, -x + y >= 0)",
-          "(x,y) : (x - 1 >= 0, -x + 2 >= 0, x - y == 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x >= 0, -x + 3 >= 0, x - y >= 0, -x + y >= 0)",
+             "(x,y) : (x - 1 >= 0, -x + 2 >= 0, x - y == 0)",
+         });
   expectCoalesce(1, set);
 }
 
 TEST(SetTest, coalesceContainedEqComplex) {
-  MLIRContext context;
   PresburgerSet set = parsePresburgerSetFromPolyStrings(
-      2,
-      {
-          "(x,y) : (x - 2 == 0, x - y == 0)",
-          "(x,y) : (x - 1 >= 0, -x + 2 >= 0, x - y == 0)",
-      },
-      &context);
+      2, {
+             "(x,y) : (x - 2 == 0, x - y == 0)",
+             "(x,y) : (x - 1 >= 0, -x + 2 >= 0, x - y == 0)",
+         });
   expectCoalesce(1, set);
 }
 
@@ -705,12 +616,10 @@ expectComputedVolumeIsValidOverapprox(const PresburgerSet &set,
 }
 
 TEST(SetTest, computeVolume) {
-  MLIRContext context;
   // Diamond with vertices at (0, 0), (5, 5), (5, 5), (10, 0).
   PresburgerSet diamond(
       parsePoly("(x, y) : (x + y >= 0, -x - y + 10 >= 0, x - y >= 0, -x + y + "
-                "10 >= 0)",
-                &context));
+                "10 >= 0)"));
   expectComputedVolumeIsValidOverapprox(diamond,
                                         /*trueVolume=*/61ull,
                                         /*resultBound=*/121ull);
@@ -718,8 +627,7 @@ TEST(SetTest, computeVolume) {
   // Diamond with vertices at (-5, 0), (0, -5), (0, 5), (5, 0).
   PresburgerSet shiftedDiamond(parsePoly(
       "(x, y) : (x + y + 5 >= 0, -x - y + 5 >= 0, x - y + 5 >= 0, -x + y + "
-      "5 >= 0)",
-      &context));
+      "5 >= 0)"));
   expectComputedVolumeIsValidOverapprox(shiftedDiamond,
                                         /*trueVolume=*/61ull,
                                         /*resultBound=*/121ull);
@@ -727,8 +635,7 @@ TEST(SetTest, computeVolume) {
   // Diamond with vertices at (-5, 0), (5, -10), (5, 10), (15, 0).
   PresburgerSet biggerDiamond(parsePoly(
       "(x, y) : (x + y + 5 >= 0, -x - y + 15 >= 0, x - y + 5 >= 0, -x + y + "
-      "15 >= 0)",
-      &context));
+      "15 >= 0)"));
   expectComputedVolumeIsValidOverapprox(biggerDiamond,
                                         /*trueVolume=*/221ull,
                                         /*resultBound=*/441ull);
@@ -745,8 +652,7 @@ TEST(SetTest, computeVolume) {
       /*resultBound=*/683ull);
 
   // Unbounded polytope.
-  PresburgerSet unbounded(
-      parsePoly("(x, y) : (2*x - y >= 0, y - 3*x >= 0)", &context));
+  PresburgerSet unbounded(parsePoly("(x, y) : (2*x - y >= 0, y - 3*x >= 0)"));
   expectComputedVolumeIsValidOverapprox(unbounded, /*trueVolume=*/{},
                                         /*resultBound=*/{});
 
diff --git a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp
index fbe68070f39d6..d5020befb8a6e 100644
--- a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp
@@ -503,12 +503,10 @@ TEST(SimplexTest, isRedundantEquality) {
 }
 
 TEST(SimplexTest, IsRationalSubsetOf) {
-  MLIRContext context;
-  IntegerPolyhedron univ = parsePoly("(x) : ()", &context);
-  IntegerPolyhedron empty =
-      parsePoly("(x) : (x + 0 >= 0, -x - 1 >= 0)", &context);
-  IntegerPolyhedron s1 = parsePoly("(x) : ( x >= 0, -x + 4 >= 0)", &context);
-  IntegerPolyhedron s2 = parsePoly("(x) : (x - 1 >= 0, -x + 3 >= 0)", &context);
+  IntegerPolyhedron univ = parsePoly("(x) : ()");
+  IntegerPolyhedron empty = parsePoly("(x) : (x + 0 >= 0, -x - 1 >= 0)");
+  IntegerPolyhedron s1 = parsePoly("(x) : ( x >= 0, -x + 4 >= 0)");
+  IntegerPolyhedron s2 = parsePoly("(x) : (x - 1 >= 0, -x + 3 >= 0)");
 
   Simplex simUniv(univ);
   Simplex simEmpty(empty);
diff --git a/mlir/unittests/Analysis/Presburger/Utils.h b/mlir/unittests/Analysis/Presburger/Utils.h
index d1cfc91524659..29a855ec1795f 100644
--- a/mlir/unittests/Analysis/Presburger/Utils.h
+++ b/mlir/unittests/Analysis/Presburger/Utils.h
@@ -24,8 +24,9 @@ namespace mlir {
 /// Parses a IntegerPolyhedron from a StringRef. It is expected that the
 /// string represents a valid IntegerSet, otherwise it will violate a gtest
 /// assertion.
-inline IntegerPolyhedron parsePoly(StringRef str, MLIRContext *context) {
-  FailureOr poly = parseIntegerSetToFAC(str, context);
+inline IntegerPolyhedron parsePoly(StringRef str) {
+  MLIRContext context(MLIRContext::Threading::DISABLED);
+  FailureOr poly = parseIntegerSetToFAC(str, &context);
   EXPECT_TRUE(succeeded(poly));
   return *poly;
 }

From b1863d82454b2905db8b492bea0ce8a260362645 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 16:10:29 +0100
Subject: [PATCH 650/748] [Clang][OpenMP] Remove use of getPointerElementType()

This new pointer element type use snuck in via D118632.
---
 clang/lib/CodeGen/CGStmtOpenMP.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 4bdeff4d41f34..5f6ab2769d033 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -6042,12 +6042,12 @@ static void emitOMPAtomicCompareExpr(CodeGenFunction &CGF,
   }
 
   LValue XLVal = CGF.EmitLValue(X);
-  llvm::Value *XPtr = XLVal.getPointer(CGF);
+  Address XAddr = XLVal.getAddress(CGF);
   llvm::Value *EVal = CGF.EmitScalarExpr(E);
   llvm::Value *DVal = D ? CGF.EmitScalarExpr(D) : nullptr;
 
   llvm::OpenMPIRBuilder::AtomicOpValue XOpVal{
-      XPtr, XPtr->getType()->getPointerElementType(),
+      XAddr.getPointer(), XAddr.getElementType(),
       X->getType().isVolatileQualified(),
       X->getType()->hasSignedIntegerRepresentation()};
 

From a7db3c611b1e613ae43ef472c2352f2b81a0b607 Mon Sep 17 00:00:00 2001
From: Emilio Cota 
Date: Tue, 22 Feb 2022 22:27:54 -0500
Subject: [PATCH 651/748] [mlir][NFC] Use options struct in
 ExecutionEngine::create

Its number of optional parameters has grown too large,
which makes adding new optional parameters quite a chore.

Fix this by using an options struct.

Reviewed By: mehdi_amini

Differential Revision: https://reviews.llvm.org/D120380
---
 mlir/examples/toy/Ch6/toyc.cpp                |  5 +-
 mlir/examples/toy/Ch7/toyc.cpp                |  5 +-
 .../mlir/ExecutionEngine/ExecutionEngine.h    | 65 ++++++++++---------
 .../CAPI/ExecutionEngine/ExecutionEngine.cpp  |  8 ++-
 mlir/lib/ExecutionEngine/ExecutionEngine.cpp  | 30 ++++-----
 mlir/lib/ExecutionEngine/JitRunner.cpp        |  9 ++-
 6 files changed, 63 insertions(+), 59 deletions(-)

diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp
index 95b0789a01462..659104032ecc1 100644
--- a/mlir/examples/toy/Ch6/toyc.cpp
+++ b/mlir/examples/toy/Ch6/toyc.cpp
@@ -236,8 +236,9 @@ int runJit(mlir::ModuleOp module) {
 
   // Create an MLIR execution engine. The execution engine eagerly JIT-compiles
   // the module.
-  auto maybeEngine = mlir::ExecutionEngine::create(
-      module, /*llvmModuleBuilder=*/nullptr, optPipeline);
+  mlir::ExecutionEngineOptions engineOptions;
+  engineOptions.transformer = optPipeline;
+  auto maybeEngine = mlir::ExecutionEngine::create(module, engineOptions);
   assert(maybeEngine && "failed to construct an execution engine");
   auto &engine = maybeEngine.get();
 
diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp
index ce8a0f0f432a7..2b976ba97a9e0 100644
--- a/mlir/examples/toy/Ch7/toyc.cpp
+++ b/mlir/examples/toy/Ch7/toyc.cpp
@@ -237,8 +237,9 @@ int runJit(mlir::ModuleOp module) {
 
   // Create an MLIR execution engine. The execution engine eagerly JIT-compiles
   // the module.
-  auto maybeEngine = mlir::ExecutionEngine::create(
-      module, /*llvmModuleBuilder=*/nullptr, optPipeline);
+  mlir::ExecutionEngineOptions engineOptions;
+  engineOptions.transformer = optPipeline;
+  auto maybeEngine = mlir::ExecutionEngine::create(module, engineOptions);
   assert(maybeEngine && "failed to construct an execution engine");
   auto &engine = maybeEngine.get();
 
diff --git a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
index 058c55efaaddf..8a2a870aec422 100644
--- a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
+++ b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
@@ -48,6 +48,39 @@ class SimpleObjectCache : public llvm::ObjectCache {
   llvm::StringMap> cachedObjects;
 };
 
+struct ExecutionEngineOptions {
+  /// If `llvmModuleBuilder` is provided, it will be used to create LLVM module
+  /// from the given MLIR module. Otherwise, a default `translateModuleToLLVMIR`
+  /// function will be used to translate MLIR module to LLVM IR.
+  llvm::function_ref(ModuleOp,
+                                                   llvm::LLVMContext &)>
+      llvmModuleBuilder = nullptr;
+
+  /// If `transformer` is provided, it will be called on the LLVM module during
+  /// JIT-compilation and can be used, e.g., for reporting or optimization.
+  llvm::function_ref transformer = {};
+
+  /// `jitCodeGenOptLevel`, when provided, is used as the optimization level for
+  /// target code generation.
+  Optional jitCodeGenOptLevel = llvm::None;
+
+  /// If `sharedLibPaths` are provided, the underlying JIT-compilation will
+  /// open and link the shared libraries for symbol resolution.
+  ArrayRef sharedLibPaths = {};
+
+  /// If `enableObjectCache` is set, the JIT compiler will create one to store
+  /// the object generated for the given module.
+  bool enableObjectCache = true;
+
+  /// If `enableGDBNotificationListener` is set, the JIT compiler will
+  /// notify the llvm's global GDB notification listener.
+  bool enableGDBNotificationListener = true;
+
+  /// If `enablePerfNotificationListener` is set, the JIT compiler will notify
+  /// the llvm's global Perf notification listener.
+  bool enablePerfNotificationListener = true;
+};
+
 /// JIT-backed execution engine for MLIR modules.  Assumes the module can be
 /// converted to LLVM IR.  For each function, creates a wrapper function with
 /// the fixed interface
@@ -64,38 +97,8 @@ class ExecutionEngine {
                   bool enablePerfNotificationListener);
 
   /// Creates an execution engine for the given module.
-  ///
-  /// If `llvmModuleBuilder` is provided, it will be used to create LLVM module
-  /// from the given MLIR module. Otherwise, a default `translateModuleToLLVMIR`
-  /// function will be used to translate MLIR module to LLVM IR.
-  ///
-  /// If `transformer` is provided, it will be called on the LLVM module during
-  /// JIT-compilation and can be used, e.g., for reporting or optimization.
-  ///
-  /// `jitCodeGenOptLevel`, when provided, is used as the optimization level for
-  /// target code generation.
-  ///
-  /// If `sharedLibPaths` are provided, the underlying JIT-compilation will
-  /// open and link the shared libraries for symbol resolution.
-  ///
-  /// If `enableObjectCache` is set, the JIT compiler will create one to store
-  /// the object generated for the given module.
-  ///
-  /// If enable `enableGDBNotificationListener` is set, the JIT compiler will
-  /// notify the llvm's global GDB notification listener.
-  ///
-  /// If `enablePerfNotificationListener` is set, the JIT compiler will notify
-  /// the llvm's global Perf notification listener.
   static llvm::Expected>
-  create(ModuleOp m,
-         llvm::function_ref(ModuleOp,
-                                                          llvm::LLVMContext &)>
-             llvmModuleBuilder = nullptr,
-         llvm::function_ref transformer = {},
-         Optional jitCodeGenOptLevel = llvm::None,
-         ArrayRef sharedLibPaths = {}, bool enableObjectCache = true,
-         bool enableGDBNotificationListener = true,
-         bool enablePerfNotificationListener = true);
+  create(ModuleOp m, const ExecutionEngineOptions &options = {});
 
   /// Looks up a packed-argument function wrapping the function with the given
   /// name and returns a pointer to it. Propagates errors in case of failure.
diff --git a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
index 604cc45221fe4..4c55530870df7 100644
--- a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
+++ b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
@@ -50,9 +50,11 @@ mlirExecutionEngineCreate(MlirModule op, int optLevel, int numPaths,
   auto llvmOptLevel = static_cast(optLevel);
   auto transformer = mlir::makeLLVMPassesTransformer(
       /*passes=*/{}, llvmOptLevel, /*targetMachine=*/tmOrError->get());
-  auto jitOrError =
-      ExecutionEngine::create(unwrap(op), /*llvmModuleBuilder=*/{}, transformer,
-                              llvmOptLevel, libPaths);
+  ExecutionEngineOptions jitOptions;
+  jitOptions.transformer = transformer;
+  jitOptions.jitCodeGenOptLevel = llvmOptLevel;
+  jitOptions.sharedLibPaths = libPaths;
+  auto jitOrError = ExecutionEngine::create(unwrap(op), jitOptions);
   if (!jitOrError) {
     consumeError(jitOrError.takeError());
     return MlirExecutionEngine{nullptr};
diff --git a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
index 00569e1d4242c..d8e778b9efb44 100644
--- a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -227,22 +227,16 @@ ExecutionEngine::ExecutionEngine(bool enableObjectCache,
   }
 }
 
-Expected> ExecutionEngine::create(
-    ModuleOp m,
-    llvm::function_ref(ModuleOp,
-                                                     llvm::LLVMContext &)>
-        llvmModuleBuilder,
-    llvm::function_ref transformer,
-    Optional jitCodeGenOptLevel,
-    ArrayRef sharedLibPaths, bool enableObjectCache,
-    bool enableGDBNotificationListener, bool enablePerfNotificationListener) {
+Expected>
+ExecutionEngine::create(ModuleOp m, const ExecutionEngineOptions &options) {
   auto engine = std::make_unique(
-      enableObjectCache, enableGDBNotificationListener,
-      enablePerfNotificationListener);
+      options.enableObjectCache, options.enableGDBNotificationListener,
+      options.enablePerfNotificationListener);
 
   std::unique_ptr ctx(new llvm::LLVMContext);
-  auto llvmModule = llvmModuleBuilder ? llvmModuleBuilder(m, *ctx)
-                                      : translateModuleToLLVMIR(m, *ctx);
+  auto llvmModule = options.llvmModuleBuilder
+                        ? options.llvmModuleBuilder(m, *ctx)
+                        : translateModuleToLLVMIR(m, *ctx);
   if (!llvmModule)
     return makeStringError("could not convert to LLVM IR");
   // FIXME: the triple should be passed to the translation or dialect conversion
@@ -276,7 +270,7 @@ Expected> ExecutionEngine::create(
     }
 
     // Resolve symbols from shared libraries.
-    for (auto libPath : sharedLibPaths) {
+    for (auto libPath : options.sharedLibPaths) {
       auto mb = llvm::MemoryBuffer::getFile(libPath);
       if (!mb) {
         errs() << "Failed to create MemoryBuffer for: " << libPath
@@ -302,8 +296,8 @@ Expected> ExecutionEngine::create(
   // LLJITWithObjectCache example.
   auto compileFunctionCreator = [&](JITTargetMachineBuilder jtmb)
       -> Expected> {
-    if (jitCodeGenOptLevel)
-      jtmb.setCodeGenOptLevel(jitCodeGenOptLevel.getValue());
+    if (options.jitCodeGenOptLevel)
+      jtmb.setCodeGenOptLevel(options.jitCodeGenOptLevel.getValue());
     auto tm = jtmb.createTargetMachine();
     if (!tm)
       return tm.takeError();
@@ -320,9 +314,9 @@ Expected> ExecutionEngine::create(
 
   // Add a ThreadSafemodule to the engine and return.
   ThreadSafeModule tsm(std::move(llvmModule), std::move(ctx));
-  if (transformer)
+  if (options.transformer)
     cantFail(tsm.withModuleDo(
-        [&](llvm::Module &module) { return transformer(&module); }));
+        [&](llvm::Module &module) { return options.transformer(&module); }));
   cantFail(jit->addIRModule(std::move(tsm)));
   engine->jit = std::move(jit);
 
diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp
index 37e5e93e2b7ea..79468fc6d63de 100644
--- a/mlir/lib/ExecutionEngine/JitRunner.cpp
+++ b/mlir/lib/ExecutionEngine/JitRunner.cpp
@@ -207,9 +207,12 @@ static Error compileAndExecute(Options &options, ModuleOp module,
     return symbolMap;
   };
 
-  auto expectedEngine = mlir::ExecutionEngine::create(
-      module, config.llvmModuleBuilder, config.transformer, jitCodeGenOptLevel,
-      executionEngineLibs);
+  mlir::ExecutionEngineOptions engineOptions;
+  engineOptions.llvmModuleBuilder = config.llvmModuleBuilder;
+  engineOptions.transformer = config.transformer;
+  engineOptions.jitCodeGenOptLevel = jitCodeGenOptLevel;
+  engineOptions.sharedLibPaths = executionEngineLibs;
+  auto expectedEngine = mlir::ExecutionEngine::create(module, engineOptions);
   if (!expectedEngine)
     return expectedEngine.takeError();
 

From 4d37bbc429f61ea0f60233e258ebcb1dfc031513 Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Wed, 23 Feb 2022 16:32:40 +0100
Subject: [PATCH 652/748] [Bitcode] Store function type IDs rather than
 function types

This resolves one of the type ID propagation TODOs.
---
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 0433b8fd215ba..2547046be5365 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -488,7 +488,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
   /// types of a Type*. This is used during upgrades of typed pointer IR in
   /// opaque pointer mode.
   DenseMap> ContainedTypeIDs;
-  DenseMap FunctionTypes;
+  DenseMap FunctionTypeIDs;
   BitcodeReaderValueList ValueList;
   Optional MDLoader;
   std::vector ComdatList;
@@ -3503,7 +3503,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef Record) {
 
   assert(Func->getFunctionType() == FTy &&
          "Incorrect fully specified type provided for function");
-  FunctionTypes[Func] = cast(FTy);
+  FunctionTypeIDs[Func] = FTyID;
 
   Func->setCallingConv(CC);
   bool isProto = Record[2];
@@ -4092,14 +4092,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
   unsigned ModuleMDLoaderSize = MDLoader->size();
 
   // Add all the function arguments to the value table.
-#ifndef NDEBUG
   unsigned ArgNo = 0;
-  FunctionType *FTy = FunctionTypes[F];
-#endif
+  unsigned FTyID = FunctionTypeIDs[F];
   for (Argument &I : F->args()) {
-    assert(I.getType() == FTy->getParamType(ArgNo++) &&
+    unsigned ArgTyID = getContainedTypeID(FTyID, ArgNo + 1);
+    assert(I.getType() == getTypeByID(ArgTyID) &&
            "Incorrect fully specified type for Function Argument");
-    ValueList.push_back(&I, TODOTypeID);
+    ValueList.push_back(&I, ArgTyID);
+    ++ArgNo;
   }
   unsigned NextValueNo = ValueList.size();
   BasicBlock *CurBB = nullptr;

From 22d04531283753758f7feae9769527c0e72c9d24 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Wed, 23 Feb 2022 15:43:34 +0000
Subject: [PATCH 653/748] [X86] combineX86ShuffleChainWithExtract - don't
 bother widening inputs after peeking through ISD::EXTRACT_SUBVECTOR nodes

combineX86ShuffleChain no longer has to assume that the shuffle inputs are the right size, so don't create unnecessary nodes messing up oneuse limits as detailed on Issue #45319

Removing widening from combineX86ShufflesRecursively will be the next step, followed by removing combineX86ShuffleChainWithExtract entirely
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  30 +-
 .../vector-interleaved-load-i16-stride-3.ll   |   2 +-
 .../vector-interleaved-load-i32-stride-4.ll   | 310 +++++++++---------
 .../CodeGen/X86/vector-shuffle-256-v32.ll     |  10 +-
 4 files changed, 172 insertions(+), 180 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9666d71288a34..ebe728b9c08ed 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37239,9 +37239,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root,
   unsigned NumRootElts = RootVT.getVectorNumElements();
 
   // Canonicalize shuffle input op to the requested type.
-  // TODO: Support cases where Op is smaller than VT.
   auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
-    if (VT.getSizeInBits() < Op.getValueSizeInBits())
+    if (VT.getSizeInBits() > Op.getValueSizeInBits())
+      Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
+    else if (VT.getSizeInBits() < Op.getValueSizeInBits())
       Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
     return DAG.getBitcast(VT, Op);
   };
@@ -37255,8 +37256,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root,
 
   MVT VT1 = V1.getSimpleValueType();
   MVT VT2 = V2.getSimpleValueType();
-  assert(VT1.getSizeInBits() == RootSizeInBits &&
-         VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
+  assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
+         (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
 
   SDValue Res;
 
@@ -37978,6 +37979,12 @@ static SDValue combineX86ShuffleChainWithExtract(
   unsigned RootSizeInBits = RootVT.getSizeInBits();
   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
 
+  // Bail if we have any smaller inputs.
+  if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
+        return Input.getValueSizeInBits() < RootSizeInBits;
+      }))
+    return SDValue();
+
   SmallVector WideInputs(Inputs.begin(), Inputs.end());
   SmallVector Offsets(NumInputs, 0);
 
@@ -38020,16 +38027,6 @@ static SDValue combineX86ShuffleChainWithExtract(
       }))
     return SDValue();
 
-  for (SDValue &NewInput : WideInputs) {
-    assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
-           "Shuffle vector size mismatch");
-    if (WideSizeInBits > NewInput.getValueSizeInBits())
-      NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
-                                SDLoc(NewInput), WideSizeInBits);
-    assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
-           "Unexpected subvector extraction");
-  }
-
   // Create new mask for larger type.
   for (unsigned i = 1; i != NumInputs; ++i)
     Offsets[i] += i * Scale * NumMaskElts;
@@ -38054,7 +38051,10 @@ static SDValue combineX86ShuffleChainWithExtract(
 
   // Attempt to combine wider chain.
   // TODO: Can we use a better Root?
-  SDValue WideRoot = WideInputs[0];
+  SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
+                             WideInputs.back().getValueSizeInBits()
+                         ? WideInputs.front()
+                         : WideInputs.back();
   if (SDValue WideShuffle =
           combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
                                  HasVariableMask, AllowVariableCrossLaneMask,
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 6755edfbe8a7e..857f0489597f3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -850,7 +850,7 @@ define void @vf32(<96 x i16>* %in.vec, <32 x i16>* %out.vec0, <32 x i16>* %out.v
 ; AVX1-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = 
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm13[2,1,2,3]
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 8fb6a5296f695..7179d3a4d2daf 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -274,8 +274,7 @@ define void @load_i32_stride4_vf8(<32 x i32>* %in.vec, <8 x i32>* %out.vec0, <8
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7]
 ; AVX1-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm3[3,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm3[2,0,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovaps %ymm11, (%rsi)
 ; AVX1-NEXT:    vmovaps %ymm12, (%rdx)
@@ -478,135 +477,132 @@ define void @load_i32_stride4_vf16(<64 x i32>* %in.vec, <16 x i32>* %out.vec0, <
 ;
 ; AVX1-LABEL: load_i32_stride4_vf16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    subq $328, %rsp # imm = 0x148
+; AVX1-NEXT:    subq $312, %rsp # imm = 0x138
 ; AVX1-NEXT:    vmovaps 64(%rdi), %ymm5
-; AVX1-NEXT:    vmovaps 96(%rdi), %ymm4
-; AVX1-NEXT:    vmovaps 192(%rdi), %ymm3
-; AVX1-NEXT:    vmovaps 224(%rdi), %ymm14
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm14[2,3,0,1]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm11[0],ymm14[2],ymm11[2]
+; AVX1-NEXT:    vmovaps 96(%rdi), %ymm8
+; AVX1-NEXT:    vmovaps 192(%rdi), %ymm2
+; AVX1-NEXT:    vmovaps 224(%rdi), %ymm15
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm15[2,3,0,1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm11[0],ymm15[2],ymm11[2]
 ; AVX1-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1]
-; AVX1-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5]
-; AVX1-NEXT:    vmovaps %ymm3, %ymm12
-; AVX1-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[4],ymm2[4],ymm7[5],ymm2[5]
+; AVX1-NEXT:    vmovaps %ymm2, %ymm4
+; AVX1-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
 ; AVX1-NEXT:    vmovaps 160(%rdi), %xmm1
 ; AVX1-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovaps 176(%rdi), %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
 ; AVX1-NEXT:    vmovaps 144(%rdi), %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovaps 128(%rdi), %xmm2
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX1-NEXT:    vmovaps %xmm0, %xmm13
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,0]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1]
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
-; AVX1-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX1-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[4],ymm5[4],ymm9[5],ymm5[5]
-; AVX1-NEXT:    vmovups %ymm9, (%rsp) # 32-byte Spill
-; AVX1-NEXT:    vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-NEXT:    vmovaps 32(%rdi), %xmm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1]
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1]
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
+; AVX1-NEXT:    vmovaps 32(%rdi), %xmm1
+; AVX1-NEXT:    vmovaps 48(%rdi), %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovaps 48(%rdi), %xmm6
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vmovaps %xmm1, %xmm14
+; AVX1-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps (%rdi), %xmm12
+; AVX1-NEXT:    vmovaps 16(%rdi), %xmm6
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm6[0],xmm12[1],xmm6[1]
 ; AVX1-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovaps (%rdi), %xmm10
-; AVX1-NEXT:    vmovaps 16(%rdi), %xmm5
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm15 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,0]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm15 = ymm3[1,0],ymm7[1,0],ymm3[5,4],ymm7[5,4]
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[4],ymm15[4],ymm11[5],ymm15[5]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm7[1,0],ymm4[5,4],ymm7[5,4]
 ; AVX1-NEXT:    vmovaps %ymm7, %ymm11
 ; AVX1-NEXT:    vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm0[2,3],ymm15[6,4],ymm0[6,7]
-; AVX1-NEXT:    vmovaps %xmm13, %xmm1
-; AVX1-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[1],xmm13[1],zero,zero
+; AVX1-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[1],xmm4[1],zero,zero
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5]
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT:    vshufps {{.*#+}} ymm13 = ymm2[1,0],ymm9[1,0],ymm2[5,4],ymm9[5,4]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm0[2,3],ymm13[6,4],ymm0[6,7]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm10[1],xmm5[1],zero,zero
-; AVX1-NEXT:    vmovaps %xmm5, %xmm9
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5]
+; AVX1-NEXT:    vmovaps %ymm5, %ymm2
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-NEXT:    vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[1,0],ymm5[5,4],ymm1[5,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm12[1],xmm6[1],zero,zero
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
-; AVX1-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7]
-; AVX1-NEXT:    vmovaps %ymm12, %ymm13
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm13[1],ymm15[3],ymm13[3]
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm14[2],ymm11[3],ymm14[3],ymm11[6],ymm14[6],ymm11[7],ymm14[7]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,0],ymm6[4,5],ymm0[6,4]
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX1-NEXT:    vmovaps %xmm7, %xmm5
-; AVX1-NEXT:    vmovaps %xmm8, %xmm1
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm8[2],xmm7[2]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX1-NEXT:    vmovaps %xmm9, %xmm10
+; AVX1-NEXT:    vmovaps %xmm7, %xmm4
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm12[1],ymm8[3],ymm12[3]
-; AVX1-NEXT:    vmovups (%rsp), %ymm11 # 32-byte Reload
-; AVX1-NEXT:    vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX1-NEXT:    vmovaps %ymm2, %ymm9
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm4[2],xmm2[2]
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm1[2],xmm5[2]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm3[2],xmm12[3],xmm3[3]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
-; AVX1-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7]
-; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX1-NEXT:    # ymm3 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm6[2,3],ymm3[6,4],ymm6[6,7]
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
-; AVX1-NEXT:    # xmm6 = xmm1[3,0],mem[3,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX1-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[6],ymm8[6],ymm12[7],ymm8[7]
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT:    vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm11[3,0],ymm1[7,4],ymm11[7,4]
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7]
+; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
+; AVX1-NEXT:    # ymm2 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm6[2,3],ymm2[6,4],ymm6[6,7]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm11[3,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[2,3]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7]
+; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
+; AVX1-NEXT:    # ymm6 = ymm9[3,0],mem[3,0],ymm9[7,4],mem[7,4]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[2,3],ymm6[6,4],ymm4[6,7]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[3,0],xmm10[3,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,0,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,0],xmm12[3,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT:    vmovaps %ymm2, 32(%rsi)
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT:    vmovaps %ymm2, (%rsi)
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT:    vmovaps %ymm2, 32(%rdx)
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT:    vmovaps %ymm2, (%rdx)
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT:    vmovaps %ymm2, 32(%rcx)
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm3, 32(%rsi)
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm3, (%rsi)
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm3, 32(%rdx)
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm3, (%rdx)
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vmovaps %ymm3, 32(%rcx)
 ; AVX1-NEXT:    vmovaps %ymm0, (%rcx)
-; AVX1-NEXT:    vmovaps %ymm3, 32(%r8)
+; AVX1-NEXT:    vmovaps %ymm2, 32(%r8)
 ; AVX1-NEXT:    vmovaps %ymm1, (%r8)
-; AVX1-NEXT:    addq $328, %rsp # imm = 0x148
+; AVX1-NEXT:    addq $312, %rsp # imm = 0x138
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1046,7 +1042,7 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT:    vmovaps 400(%rdi), %xmm5
-; AVX1-NEXT:    vmovaps %xmm5, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovaps 384(%rdi), %xmm2
 ; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
@@ -1078,22 +1074,24 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-NEXT:    vmovaps 192(%rdi), %ymm0
-; AVX1-NEXT:    vmovaps 224(%rdi), %ymm13
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm13[2,3,0,1]
-; AVX1-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[2],ymm2[2]
-; AVX1-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm0[2,3,0,1]
-; AVX1-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
-; AVX1-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovaps %ymm0, %ymm14
+; AVX1-NEXT:    vmovaps 224(%rdi), %ymm1
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
+; AVX1-NEXT:    vmovaps %ymm3, %ymm13
+; AVX1-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vmovaps %ymm1, %ymm12
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
+; AVX1-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vmovaps %ymm0, %ymm15
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
 ; AVX1-NEXT:    vmovaps 160(%rdi), %xmm0
-; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovaps 176(%rdi), %xmm9
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm9[0],xmm0[0]
-; AVX1-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovaps 176(%rdi), %xmm1
+; AVX1-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovaps 144(%rdi), %xmm4
 ; AVX1-NEXT:    vmovaps 128(%rdi), %xmm8
 ; AVX1-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
@@ -1113,17 +1111,17 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
-; AVX1-NEXT:    vmovaps 32(%rdi), %xmm2
-; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovaps 48(%rdi), %xmm0
+; AVX1-NEXT:    vmovaps 32(%rdi), %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    vmovaps 48(%rdi), %xmm2
+; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; AVX1-NEXT:    vmovaps (%rdi), %xmm2
 ; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovaps 16(%rdi), %xmm3
 ; AVX1-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0]
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -1140,25 +1138,25 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm12[1,0],ymm14[5,4],ymm12[5,4]
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[1,0],ymm9[1,0],ymm15[5,4],ymm9[5,4]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm8[1],xmm4[1],zero,zero
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; AVX1-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm7 = xmm2[0],mem[0],xmm2[1],mem[1]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5]
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[4],ymm2[4],ymm14[5],ymm2[5]
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm12[1,0],ymm3[5,4],ymm12[5,4]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm8[1],xmm1[1],zero,zero
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
@@ -1173,9 +1171,9 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[1,0],ymm1[5,4],ymm13[5,4]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX1-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT:    # xmm1 = xmm14[0],mem[0],xmm14[1],mem[1]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
 ; AVX1-NEXT:    # xmm7 = mem[0],xmm7[1],zero,zero
@@ -1202,35 +1200,35 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    # ymm1 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
-; AVX1-NEXT:    # xmm1 = xmm11[2],mem[2],xmm11[3],mem[3]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3]
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT:    vmovaps (%rsp), %xmm5 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm5[2],xmm6[2]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm15[1],ymm2[3],ymm15[3]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm14[1],ymm2[3],ymm14[3]
 ; AVX1-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[6],ymm3[6],ymm12[7],ymm3[7]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-NEXT:    vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX1-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm2[2],xmm8[3],xmm2[3]
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm4[2],xmm9[2]
 ; AVX1-NEXT:    vmovaps %xmm9, %xmm3
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm9[1],ymm15[3],ymm9[3]
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX1-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm12[1],ymm14[3],ymm12[3]
+; AVX1-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
+; AVX1-NEXT:    # ymm1 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = zero,zero,xmm14[2],xmm13[2]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm15[2]
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm14[2],xmm8[3],xmm14[3]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -1240,10 +1238,9 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
 ; AVX1-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm11[3,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
+; AVX1-NEXT:    # xmm4 = xmm11[3,0],mem[3,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX1-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -1256,8 +1253,7 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
 ; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
 ; AVX1-NEXT:    # xmm5 = xmm2[3,0],mem[3,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX1-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
@@ -1272,18 +1268,16 @@ define void @load_i32_stride4_vf32(<128 x i32>* %in.vec, <32 x i32>* %out.vec0,
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
 ; AVX1-NEXT:    # xmm6 = xmm2[3,0],mem[3,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm6[2,0],xmm5[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX1-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[6],ymm15[6],ymm9[7],ymm15[7]
-; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX1-NEXT:    # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4]
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm2 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7]
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX1-NEXT:    # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4]
 ; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7]
-; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm14[3,0],xmm8[3,0]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm9[3,0],xmm8[3,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX1-NEXT:    vmovaps %ymm3, (%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 73d94b208b6ea..933e8c60dcd2c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -5048,14 +5048,12 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
 ;
 ; XOPAVX1-LABEL: PR28136:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,8,u,10,u,12,u,14,u,9,u,11,u,13,u,15]
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm2 = xmm0[8],xmm2[1],xmm0[10],xmm2[3],xmm0[12],xmm2[5],xmm0[14],xmm2[7],xmm0[9],xmm2[9],xmm0[11],xmm2[11],xmm0[13],xmm2[13],xmm0[15],xmm2[15]
-; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,0,u,2,u,4,u,6,u,1,u,3,u,5,u,7]
-; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7],xmm0[1],xmm1[9],xmm0[3],xmm1[11],xmm0[5],xmm1[13],xmm0[7],xmm1[15]
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14],xmm0[9],xmm1[9],xmm0[11],xmm1[11],xmm0[13],xmm1[13],xmm0[15],xmm1[15]
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[1],xmm1[1],xmm0[3],xmm1[3],xmm0[5],xmm1[5],xmm0[7],xmm1[7]
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;

From 9f1c6fbf11f5385834f25fd253acce23f21d7bf6 Mon Sep 17 00:00:00 2001
From: Malhar Jajoo 
Date: Wed, 2 Feb 2022 15:52:13 +0000
Subject: [PATCH 654/748] [LAA] Add remarks for unbounded array access

Adds new optimization remarks when loop vectorization fails due to
the compiler being unable to find the bound of an array access inside
a loop.

Differential Revision: https://reviews.llvm.org/D115873
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 23 +++++---
 .../X86/vectorization-remarks-missed.ll       |  4 +-
 .../LoopVectorize/memory-dep-remarks.ll       | 55 +++++++++++++++++++
 3 files changed, 72 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a62c46a05c47..68b80e0028b82 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -558,7 +558,7 @@ class AccessAnalysis {
   /// (i.e. the pointers have computable bounds).
   bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
                        Loop *TheLoop, const ValueToValueMap &Strides,
-                       bool ShouldCheckWrap = false);
+                       Value *&UncomputablePtr, bool ShouldCheckWrap = false);
 
   /// Goes over all memory accesses, checks whether a RT check is needed
   /// and builds sets of dependent accesses.
@@ -732,7 +732,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
 bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                      ScalarEvolution *SE, Loop *TheLoop,
                                      const ValueToValueMap &StridesMap,
-                                     bool ShouldCheckWrap) {
+                                     Value *&UncomputablePtr, bool ShouldCheckWrap) {
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   bool CanDoRT = true;
@@ -824,6 +824,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                     DepSetId, TheLoop, RunningDepId, ASId,
                                     ShouldCheckWrap, /*Assume=*/true)) {
             CanDoAliasSetRT = false;
+            UncomputablePtr = Access.getPointer();
             break;
           }
         }
@@ -2080,10 +2081,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
 
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
-  bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(),
-                                                  TheLoop, SymbolicStrides);
+  Value *UncomputablePtr = nullptr;
+  bool CanDoRTIfNeeded =
+      Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
+                               SymbolicStrides, UncomputablePtr, false);
   if (!CanDoRTIfNeeded) {
-    recordAnalysis("CantIdentifyArrayBounds") << "cannot identify array bounds";
+    auto *I = dyn_cast_or_null(UncomputablePtr);
+    recordAnalysis("CantIdentifyArrayBounds", I) 
+        << "cannot identify array bounds";
     LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
                       << "the array bounds.\n");
     CanVecMem = false;
@@ -2110,12 +2115,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       PtrRtChecking->Need = true;
 
       auto *SE = PSE->getSE();
-      CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, SE, TheLoop,
-                                                 SymbolicStrides, true);
+      UncomputablePtr = nullptr;
+      CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
+          *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
 
       // Check that we found the bounds for the pointer.
       if (!CanDoRTIfNeeded) {
-        recordAnalysis("CantCheckMemDepsAtRunTime")
+        auto *I = dyn_cast_or_null(UncomputablePtr);
+        recordAnalysis("CantCheckMemDepsAtRunTime", I)
             << "cannot check memory dependencies at runtime";
         LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
         CanVecMem = false;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index c96e84b3d7321..9c771411792e5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -31,7 +31,7 @@
 ;   for (int i = 0; i < Length; i++)
 ;     A[i] = A[B[i]];
 ; }
-; CHECK: remark: source.cpp:18:8: loop not vectorized: cannot identify array bounds
+; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds
 ; CHECK: remark: source.cpp:18:8: loop not vectorized (Force=true)
 ; CHECK: warning: source.cpp:18:8: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
 
@@ -76,7 +76,7 @@
 ; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass:            ''
 ; YAML-NEXT: Name:            CantIdentifyArrayBounds
-; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 18, Column: 8 }
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
 ; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'loop not vectorized: '
diff --git a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
index 883cbfea8bee0..31ee01ce4f196 100644
--- a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
+++ b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
@@ -4,6 +4,39 @@
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
+; // Loop has an array element B[i] (%arrayidx in IR) being used as index to
+; // another array (A), and since the value of B[i] is unknown,
+; // the bound for array A is unknown.
+; void test_unknown_bounds(int n, int* A, int* B) {
+;     for(int i = 0; i < n ; ++i)
+;         A[i] = A[B[i]] + 1;
+; }
+
+; CHECK: remark: source.c:4:16: loop not vectorized: cannot identify array bounds
+
+define void @test_unknown_bounds(i64 %n, i32* nocapture %A, i32* nocapture readonly %B) !dbg !13 {
+entry:
+  %cmp10 = icmp sgt i64 %n, 0
+  br i1 %cmp10, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64, !dbg !35
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !35
+  %add = add nsw i32 %1, 1
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !dbg !28
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+}
+
 ; // a) Dependence::NoDep
 ; // Loop containing only reads (here of the array A) does not hinder vectorization
 ; void test_nodep(int n, int* A, int* B) {
@@ -258,6 +291,23 @@ for.body:                                         ; preds = %entry, %for.body
     ret void
 }
 
+; YAML:      --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            CantIdentifyArrayBounds
+; YAML-NEXT: DebugLoc:        { File: source.c, Line: 4, Column: 16 }
+; YAML-NEXT: Function:        test_unknown_bounds
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          cannot identify array bounds
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: DebugLoc:        { File: source.c, Line: 3, Column: 5 }
+; YAML-NEXT: Function:        test_unknown_bounds
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT: ...
 ; YAML:      --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
 ; YAML-NEXT: Name:            UnsafeDep
@@ -347,6 +397,11 @@ for.body:                                         ; preds = %entry, %for.body
 !1 = !DIFile(filename: "source.c", directory: "")
 !2 = !{}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = distinct !DISubprogram(name: "test_unknown_bounds", scope: !1, file: !1, line: 2, type: !45, scopeLine: 2, unit: !0, retainedNodes: !2)
+!23 = distinct !DILexicalBlock(scope: !13, file: !1, line: 3, column: 5)
+!27 = distinct !DILexicalBlock(scope: !23, file: !1, line: 3, column: 5)
+!28 = !DILocation(line: 3, column: 5, scope: !23)
+!35 = !DILocation(line: 4, column: 16, scope: !27)
 !44 = distinct !DISubprogram(name: "test_nodep", scope: !1, file: !1, line: 14, type: !45, scopeLine: 14, unit: !0, retainedNodes: !2)
 !45 = !DISubroutineType(types: !46)
 !46 = !{null, !18, !16, !16}

From 2f906683ed14668ceed1f85922789abffd8429fd Mon Sep 17 00:00:00 2001
From: Philipp Stephani 
Date: Wed, 23 Feb 2022 17:00:04 +0100
Subject: [PATCH 655/748] clang-format.el: Make clang-format work in indirect
 buffers.

In an indirect buffer, buffer-file-name is nil, so check the base buffer
instead.  This works fine in direct buffers where buffer-base-buffer returns
nil.

Reviewed By: sammccall

Differential Revision: https://reviews.llvm.org/D120408
---
 clang/tools/clang-format/clang-format.el | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/tools/clang-format/clang-format.el b/clang/tools/clang-format/clang-format.el
index 768acb3a5d0dd..4e6daa82d4ade 100644
--- a/clang/tools/clang-format/clang-format.el
+++ b/clang/tools/clang-format/clang-format.el
@@ -147,7 +147,7 @@ uses the function `buffer-file-name'."
     (setq style clang-format-style))
 
   (unless assume-file-name
-    (setq assume-file-name buffer-file-name))
+    (setq assume-file-name (buffer-file-name (buffer-base-buffer))))
 
   (let ((file-start (clang-format--bufferpos-to-filepos start 'approximate
                                                         'utf-8-unix))

From 6adf4b039e095224edbbecda5972e5e3353b53b6 Mon Sep 17 00:00:00 2001
From: Philip Reames 
Date: Tue, 22 Feb 2022 20:00:43 -0800
Subject: [PATCH 656/748] [SLP] Remove cap on schedule window size

This cap was first added in 848c1aa45 (back in 2015).  Per the original commit message, the purpose was to avoid a compile time explosion in long basic blocks.  The algorithmic problem in scheduling has now been fixed in 0539a26d.

In the meantime, the code has rotten fairly badly.  Some intermediate refactoring caused the size to only be incremented if *both* iterators advance in the window search.  This causes the size to be badly undercounted when near one end of a basic block.  We no longer have any test which exercises the logic in an intentional way; there's one test which differs with this change, but the changes appear fairly orthogonal to the purpose of the test file.

Unfortunately, we no longer have the original motivating example, so it's possible that it also hits some other issue.  I tested locally with a large example, but even at its worst, that one doesn't demonstrate anything too extreme even without the algorithmic fix.  It's clearly faster with, but only by ~20% which doesn't seem in line with the original commit message.   If regressions with this patch are seen, please file a bug and I'll try to fix any other algorithmic problems which fall out.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 72 ++++---------------
 .../SLPVectorizer/AArch64/gather-root.ll      | 71 ++++--------------
 .../X86/crash_exceed_scheduling.ll            |  2 +-
 .../X86/extract-shuffle-inseltpoison.ll       |  2 +-
 .../SLPVectorizer/X86/extract-shuffle.ll      |  2 +-
 .../SLPVectorizer/X86/schedule_budget.ll      |  8 +--
 6 files changed, 36 insertions(+), 121 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4ea8a77583cc9..3e87b3b6ff322 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -138,14 +138,6 @@ static cl::opt
 MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
     cl::desc("Maximum depth of the lookup for consecutive stores."));
 
-/// Limits the size of scheduling regions in a block.
-/// It avoid long compile times for _very_ large blocks where vector
-/// instructions are spread over a wide range.
-/// This limit is way higher than needed by real-world functions.
-static cl::opt
-ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
-    cl::desc("Limit the size of the SLP scheduling region per block"));
-
 static cl::opt MinVectorRegSizeOption(
     "slp-min-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
@@ -177,10 +169,6 @@ static const unsigned AliasedCheckLimit = 10;
 // This limit is useful for very large basic blocks.
 static const unsigned MaxMemDepDistance = 160;
 
-/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
-/// regions to be handled.
-static const int MinScheduleRegionSize = 16;
-
 /// Predicate for the element types that the SLP vectorizer supports.
 ///
 /// The most important thing to filter here are types which are invalid in LLVM
@@ -2627,13 +2615,6 @@ class BoUpSLP {
       FirstLoadStoreInRegion = nullptr;
       LastLoadStoreInRegion = nullptr;
 
-      // Reduce the maximum schedule region size by the size of the
-      // previous scheduling run.
-      ScheduleRegionSizeLimit -= ScheduleRegionSize;
-      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
-        ScheduleRegionSizeLimit = MinScheduleRegionSize;
-      ScheduleRegionSize = 0;
-
       // Make a new scheduling region, i.e. all existing ScheduleData is not
       // in the new region yet.
       ++SchedulingRegionID;
@@ -2814,7 +2795,7 @@ class BoUpSLP {
 
     /// Extends the scheduling region so that V is inside the region.
     /// \returns true if the region size is within the limit.
-    bool extendSchedulingRegion(Value *V, const InstructionsState &S);
+    void extendSchedulingRegion(Value *V, const InstructionsState &S);
 
     /// Initialize the ScheduleData structures for new instructions in the
     /// scheduling region.
@@ -2868,12 +2849,6 @@ class BoUpSLP {
     /// (can be null).
     ScheduleData *LastLoadStoreInRegion = nullptr;
 
-    /// The current size of the scheduling region.
-    int ScheduleRegionSize = 0;
-
-    /// The maximum size allowed for the scheduling region.
-    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
-
     /// The ID of the scheduling region. For a new vectorization iteration this
     /// is incremented which "removes" all ScheduleData from the region.
     /// Make sure that the initial SchedulingRegionID is greater than the
@@ -7517,11 +7492,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP,
         doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
       ReSchedule = true;
     }
-    if (Bundle) {
-      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
-                        << " in block " << BB->getName() << "\n");
-      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
-    }
+    LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
+                      << " in block " << BB->getName() << "\n");
+    calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
 
     if (ReSchedule) {
       resetSchedule();
@@ -7532,8 +7505,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP,
     // dependencies. As soon as the bundle is "ready" it means that there are no
     // cyclic dependencies and we can schedule it. Note that's important that we
     // don't "schedule" the bundle yet (see cancelScheduling).
-    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
-           !ReadyInsts.empty()) {
+    while (!Bundle->isReady() && !ReadyInsts.empty()) {
       ScheduleData *Picked = ReadyInsts.pop_back_val();
       assert(Picked->isSchedulingEntity() && Picked->isReady() &&
              "must be ready to schedule");
@@ -7543,18 +7515,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP,
 
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
-  for (Value *V : VL) {
-    if (!extendSchedulingRegion(V, S)) {
-      // If the scheduling region got new instructions at the lower end (or it
-      // is a new region for the first bundle). This makes it necessary to
-      // recalculate all dependencies.
-      // Otherwise the compiler may crash trying to incorrectly calculate
-      // dependencies and emit instruction in the wrong order at the actual
-      // scheduling.
-      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
-      return None;
-    }
-  }
+  for (Value *V : VL)
+    extendSchedulingRegion(V, S);
 
   bool ReSchedule = false;
   for (Value *V : VL) {
@@ -7624,10 +7586,11 @@ BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
   return &(ScheduleDataChunks.back()[ChunkPos++]);
 }
 
-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
-                                                      const InstructionsState &S) {
+void
+BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+                                                 const InstructionsState &S) {
   if (getScheduleData(V, isOneOf(S, V)))
-    return true;
+    return;
   Instruction *I = dyn_cast(V);
   assert(I && "bundle member must be an instruction");
   assert(!isa(I) && !isVectorLikeInstWithConstOps(I) &&
@@ -7646,7 +7609,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     return true;
   };
   if (CheckSheduleForI(I))
-    return true;
+    return;
   if (!ScheduleStart) {
     // It's the first instruction in the new region.
     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
@@ -7656,7 +7619,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
       CheckSheduleForI(I);
     assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
-    return true;
+    return;
   }
   // Search up and down at the same time, because we don't know if the new
   // instruction is above or below the existing scheduling region.
@@ -7667,11 +7630,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
   BasicBlock::iterator LowerEnd = BB->end();
   while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
          &*DownIter != I) {
-    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
-      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
-      return false;
-    }
-
     ++UpIter;
     ++DownIter;
   }
@@ -7684,7 +7642,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
       CheckSheduleForI(I);
     LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                       << "\n");
-    return true;
+    return;
   }
   assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
          "Expected to reach top of the basic block or instruction down the "
@@ -7698,7 +7656,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     CheckSheduleForI(I);
   assert(ScheduleEnd && "tried to vectorize a terminator?");
   LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
-  return true;
+  return;
 }
 
 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index e9c502b6982cd..1faadaba2bd72 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
-; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
-; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
+; RUN: opt < %s -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
+; RUN: opt < %s -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -35,41 +35,14 @@ define void @PR28330(i32 %n) {
 ;
 ; MAX-COST-LABEL: @PR28330(
 ; MAX-COST-NEXT:  entry:
-; MAX-COST-NEXT:    [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
-; MAX-COST-NEXT:    [[P1:%.*]] = icmp eq i8 [[P0]], 0
-; MAX-COST-NEXT:    [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
-; MAX-COST-NEXT:    [[P3:%.*]] = icmp eq i8 [[P2]], 0
-; MAX-COST-NEXT:    [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT:    [[P5:%.*]] = icmp eq i8 [[P4]], 0
-; MAX-COST-NEXT:    [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; MAX-COST-NEXT:    [[P7:%.*]] = icmp eq i8 [[P6]], 0
-; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
-; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; MAX-COST-NEXT:    [[P11:%.*]] = icmp eq i8 [[P10]], 0
-; MAX-COST-NEXT:    [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; MAX-COST-NEXT:    [[P13:%.*]] = icmp eq i8 [[P12]], 0
-; MAX-COST-NEXT:    [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; MAX-COST-NEXT:    [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
-; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P20:%.*]] = add i32 [[P17]], [[P19]]
-; MAX-COST-NEXT:    [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P22:%.*]] = add i32 [[P20]], [[P21]]
-; MAX-COST-NEXT:    [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P24:%.*]] = add i32 [[P22]], [[P23]]
-; MAX-COST-NEXT:    [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P26:%.*]] = add i32 [[P24]], [[P25]]
-; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P28:%.*]] = add i32 [[P26]], [[P27]]
-; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P30:%.*]] = add i32 [[P28]], [[P29]]
-; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[P30]], [[P31]]
-; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> 
+; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; MAX-COST-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
 ; MAX-COST-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
@@ -139,30 +112,14 @@ define void @PR32038(i32 %n) {
 ;
 ; MAX-COST-LABEL: @PR32038(
 ; MAX-COST-NEXT:  entry:
-; MAX-COST-NEXT:    [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1
-; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
-; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
-; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; MAX-COST-NEXT:    [[P11:%.*]] = icmp eq i8 [[P10]], 0
-; MAX-COST-NEXT:    [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; MAX-COST-NEXT:    [[P13:%.*]] = icmp eq i8 [[P12]], 0
-; MAX-COST-NEXT:    [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; MAX-COST-NEXT:    [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
-; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> 
-; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; MAX-COST-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], [[P27]]
-; MAX-COST-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[P29]]
-; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP5]], -5
-; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
-; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> 
+; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; MAX-COST-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], -5
 ; MAX-COST-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
index 7b6e6ca3c61af..56f6b7b5d3588 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 
 define void @exceed(double %0, double %1) {
 ; CHECK-LABEL: @exceed(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll
index 293dcc0b1ef9e..a1b5f293602bd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle-inseltpoison.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -slp-schedule-budget=1 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
 
 define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @g(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll
index 61f25dd713775..ecffc1adb793c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -slp-schedule-budget=1 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
 
 define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @g(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
index 3e4cfe6e05157..fa5534732b7a3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -S  -slp-schedule-budget=16 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -S  -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
@@ -15,6 +15,9 @@ define void @test(float * %a, float * %b, float * %c, float * %d) {
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
 ; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
+; CHECK-NEXT:    [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    call void @unknown()
@@ -45,9 +48,6 @@ define void @test(float * %a, float * %b, float * %c, float * %d) {
 ; CHECK-NEXT:    call void @unknown()
 ; CHECK-NEXT:    call void @unknown()
 ; CHECK-NEXT:    call void @unknown()
-; CHECK-NEXT:    [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
-; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1

From 70ff6fbeb9b5acb4995dc42286954b762d0937fd Mon Sep 17 00:00:00 2001
From: Augie Fackler 
Date: Fri, 11 Feb 2022 18:32:38 -0500
Subject: [PATCH 657/748] AttributorAttributes: avoid a crashing on bad
 alignments

Prior to this change, LLVM would attempt to optimize an
aligned_alloc(33, ...) call to the stack. This flunked an assertion when
trying to emit the alloca, which crashed LLVM. Avoid that with extra
checks.

Differential Revision: https://reviews.llvm.org/D119604
---
 llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 12 +++++++++++-
 llvm/test/Transforms/Attributor/heap_to_stack.ll | 14 ++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index c94f38687b219..ba986a65e0201 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -38,12 +38,14 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -6356,7 +6358,8 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
       continue;
 
     if (Value *Align = getAllocAlignment(AI.CB, TLI)) {
-      if (!getAPInt(A, *this, *Align)) {
+      Optional APAlign = getAPInt(A, *this, *Align);
+      if (!APAlign) {
         // Can't generate an alloca which respects the required alignment
         // on the allocation.
         LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB
@@ -6364,6 +6367,13 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
         AI.Status = AllocationInfo::INVALID;
         Changed = ChangeStatus::CHANGED;
         continue;
+      } else {
+        if (APAlign->ugt(llvm::Value::MaximumAlignment) || !APAlign->isPowerOf2()) {
+          LLVM_DEBUG(dbgs() << "[H2S] Invalid allocation alignment: " << APAlign << "\n");
+          AI.Status = AllocationInfo::INVALID;
+          Changed = ChangeStatus::CHANGED;
+          continue;
+        }
       }
     }
 
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll
index 73f6887204647..9741df2034ce9 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll
@@ -218,6 +218,20 @@ define void @test3c(i64 %alignment) {
   ret void
 }
 
+; leave alone a constant-but-invalid alignment
+define void @test3d(i8* %p) {
+; CHECK-LABEL: define {{[^@]+}}@test3d
+; CHECK-SAME; (i8* nocapture [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call noalias i8* @aligned_alloc(i64 allocalign noundef 33, i64 noundef 128)
+; CHECK:    tail call void @free(i8* noalias nocapture [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = tail call noalias i8* @aligned_alloc(i64 allocalign 33, i64 128)
+  tail call void @nofree_arg_only(i8* %1, i8* %p)
+  tail call void @free(i8* %1)
+  ret void
+}
+
 declare noalias i8* @calloc(i64, i64)
 
 define void @test0() {

From 7c1ee5e95f3159e13edef644db0509a7d49921c3 Mon Sep 17 00:00:00 2001
From: Sam McCall 
Date: Mon, 7 Feb 2022 19:11:16 +0100
Subject: [PATCH 658/748] [Pseudo] Token/TokenStream, PP directive parser.

The TokenStream class is the representation of the source code that will
be fed into the GLR parser.

This patch allows a "raw" TokenStream to be built by reading source code.
It also supports scanning a TokenStream to find the directive structure.

Next steps (with placeholders in the code): heuristically choosing a
path through #ifs, preprocessing the code by stripping directives and comments.
These will produce a suitable stream to feed into the parser proper.

Differential Revision: https://reviews.llvm.org/D119162
---
 clang/include/clang/Basic/TokenKinds.h        |   3 +
 .../clang/Tooling/Syntax/Pseudo/Preprocess.h  | 148 +++++++++++++
 .../clang/Tooling/Syntax/Pseudo/Token.h       | 202 +++++++++++++++++
 clang/lib/Basic/TokenKinds.cpp                |   9 +
 .../lib/Tooling/Syntax/Pseudo/CMakeLists.txt  |   3 +
 clang/lib/Tooling/Syntax/Pseudo/Lex.cpp       | 114 ++++++++++
 .../lib/Tooling/Syntax/Pseudo/Preprocess.cpp  | 206 ++++++++++++++++++
 clang/lib/Tooling/Syntax/Pseudo/Token.cpp     |  98 +++++++++
 clang/test/Syntax/lex.c                       |  52 +++++
 clang/tools/clang-pseudo/ClangPseudo.cpp      |  27 ++-
 .../Tooling/Syntax/Pseudo/CMakeLists.txt      |   2 +
 .../Tooling/Syntax/Pseudo/PreprocessTest.cpp  | 152 +++++++++++++
 .../Tooling/Syntax/Pseudo/TokenTest.cpp       | 178 +++++++++++++++
 13 files changed, 1192 insertions(+), 2 deletions(-)
 create mode 100644 clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
 create mode 100644 clang/include/clang/Tooling/Syntax/Pseudo/Token.h
 create mode 100644 clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
 create mode 100644 clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
 create mode 100644 clang/lib/Tooling/Syntax/Pseudo/Token.cpp
 create mode 100644 clang/test/Syntax/lex.c
 create mode 100644 clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
 create mode 100644 clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp

diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index 4e66aa1c8c2d8..6b7006651f4eb 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -68,6 +68,9 @@ const char *getPunctuatorSpelling(TokenKind Kind) LLVM_READNONE;
 /// tokens like 'int' and 'dynamic_cast'. Returns NULL for other token kinds.
 const char *getKeywordSpelling(TokenKind Kind) LLVM_READNONE;
 
+/// Returns the spelling of preprocessor keywords, such as "else".
+const char *getPPKeywordSpelling(PPKeywordKind Kind) LLVM_READNONE;
+
 /// Return true if this is a raw identifier or an identifier kind.
 inline bool isAnyIdentifier(TokenKind K) {
   return (K == tok::identifier) || (K == tok::raw_identifier);
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
new file mode 100644
index 0000000000000..11a92042e7496
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
@@ -0,0 +1,148 @@
+//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The pseudoparser tries to match a token stream to the C++ grammar.
+// Preprocessor #defines and other directives are not part of this grammar, and
+// should be removed before the file can be parsed.
+//
+// Conditional blocks like #if...#else...#endif are particularly tricky, as
+// simply stripping the directives may not produce a grammatical result:
+//
+//   return
+//     #ifndef DEBUG
+//       1
+//     #else
+//       0
+//     #endif
+//       ;
+//
+// This header supports analyzing and removing the directives in a source file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include 
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// Describes the structure of a source file, as seen by the preprocessor.
+///
+/// The structure is a tree, whose leaves are plain source code and directives,
+/// and whose internal nodes are #if...#endif sections.
+///
+/// (root)
+/// |-+ Directive                    #include 
+/// |-+ Code                         int main() {
+/// | `                                printf("hello, ");
+/// |-+ Conditional -+ Directive     #ifndef NDEBUG
+/// | |-+ Code                         printf("debug\n");
+/// | |-+ Directive                  #else
+/// | |-+ Code                         printf("production\n");
+/// | `-+ Directive                  #endif
+/// |-+ Code                           return 0;
+///   `                              }
+///
+/// Unlike the clang preprocessor, we model the full tree explicitly.
+/// This class does not recognize macro usage, only directives.
+struct PPStructure {
+  /// A range of code (and possibly comments) containing no directives.
+  struct Code {
+    Token::Range Tokens;
+  };
+  /// A preprocessor directive.
+  struct Directive {
+    /// Raw tokens making up the directive, starting with `#`.
+    Token::Range Tokens;
+    clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
+  };
+  /// A preprocessor conditional section.
+  ///
+  /// This starts with an #if, #ifdef, #ifndef etc directive.
+  /// It covers all #else branches, and spans until the matching #endif.
+  struct Conditional {
+    /// The sequence of directives that introduce top-level alternative parses.
+    ///
+    /// The first branch will have an #if type directive.
+    /// Subsequent branches will have #else type directives.
+    std::vector> Branches;
+    /// The directive terminating the conditional, should be #endif.
+    Directive End;
+  };
+
+  /// Some piece of the file. {One of Code, Directive, Conditional}.
+  class Chunk; // Defined below.
+  std::vector Chunks;
+
+  /// Extract preprocessor structure by examining the raw tokens.
+  static PPStructure parse(const TokenStream &);
+
+  // FIXME: add heuristic selection of conditional branches.
+  // FIXME: allow deriving a preprocessed stream
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Chunk &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Code &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &,
+                              const PPStructure::Directive &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &,
+                              const PPStructure::Conditional &);
+
+// FIXME: This approximates std::variant.
+//         Switch once we can use C++17.
+class PPStructure::Chunk {
+public:
+  enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
+  Kind kind() const {
+    return CodeVariant          ? K_Code
+           : DirectiveVariant   ? K_Directive
+           : ConditionalVariant ? K_Conditional
+                                : K_Empty;
+  }
+
+  Chunk() = delete;
+  Chunk(const Chunk &) = delete;
+  Chunk(Chunk &&) = default;
+  Chunk &operator=(const Chunk &) = delete;
+  Chunk &operator=(Chunk &&) = default;
+  ~Chunk() = default;
+
+  // T => Chunk constructor.
+  Chunk(Code C) : CodeVariant(std::move(C)) {}
+  Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
+  Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}
+
+  // Chunk => T& and const T& conversions.
+#define CONVERSION(CONST, V)                                                   \
+  explicit operator CONST V &() CONST { return *V##Variant; }
+  CONVERSION(const, Code);
+  CONVERSION(, Code);
+  CONVERSION(const, Directive);
+  CONVERSION(, Directive);
+  CONVERSION(const, Conditional);
+  CONVERSION(, Conditional);
+#undef CONVERSION
+
+private:
+  // Wasteful, a union variant would be better!
+  llvm::Optional<Code> CodeVariant;
+  llvm::Optional<Directive> DirectiveVariant;
+  llvm::Optional<Conditional> ConditionalVariant;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Token.h b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h
new file mode 100644
index 0000000000000..7a73a85eae94d
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h
@@ -0,0 +1,202 @@
+//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tokens are the first level of abstraction above bytes used in pseudoparsing.
+// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
+// The tokens is wrapped into pseudo::Token, along with line/indent info.
+//
+// Unlike clang, we make multiple passes over the whole file, out-of-order.
+// Therefore we retain the whole token sequence in memory. (This is feasible as
+// we process one file at a time). pseudo::TokenStream holds such a stream.
+// The initial stream holds the raw tokens read from the file, later passes
+// operate on derived TokenStreams (e.g. with directives stripped).
+//
+// Similar facilities from clang that are *not* used:
+//  - SourceManager: designed around multiple files and precise macro expansion.
+//  - clang::Token: coupled to SourceManager, doesn't retain layout info.
+//                  (pseudo::Token is similar, but without SourceLocations).
+//  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
+//                  (pseudo::TokenStream is similar, but a flat token list).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// A single C++ or preprocessor token.
+///
+/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
+/// SourceManager - we are not dealing with multiple files.
+struct Token {
+  /// An Index identifies a token within a stream.
+  using Index = uint32_t;
+  /// A sentinel Index indicating no token.
+  constexpr static Index Invalid = std::numeric_limits<Index>::max();
+  struct Range;
+
+  /// The token text.
+  ///
+  /// Typically from the original source file, but may have been synthesized.
+  StringRef text() const { return StringRef(Data, Length); }
+  const char *Data = nullptr;
+  uint32_t Length = 0;
+
+  /// Zero-based line number for the start of the token.
+  /// This refers to the original source file as written.
+  uint32_t Line = 0;
+  /// Width of whitespace before the first token on this line.
+  uint8_t Indent = 0;
+  /// Flags have some meaning defined by the function that produced this stream.
+  uint8_t Flags = 0;
+  // Helpers to get/set Flags based on `enum class`.
+  template <class T> bool flag(T Mask) const {
+    return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+  }
+  template <class T> void setFlag(T Mask) {
+    Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+  }
+
+  /// The type of token as determined by clang's lexer.
+  clang::tok::TokenKind Kind = clang::tok::unknown;
+};
+static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
+
+/// A half-open range of tokens within a stream.
+struct Token::Range {
+  Index Begin = 0;
+  Index End = 0;
+
+  uint32_t size() const { return End - Begin; }
+  static Range emptyAt(Index Index) { return Range{Index, Index}; }
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
+
+/// A complete sequence of Tokens representing a source file.
+///
+/// This may match a raw file from disk, or be derived from a previous stream.
+/// For example, stripping comments from a TokenStream results in a new stream.
+///
+/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
+///       int      main   (        )        ;
+///   eof kw_int   ident  l_paren  r_paren  semi   eof
+///       front()                           back()
+///       0        1      2        3        4      5
+class TokenStream {
+public:
+  /// Create an empty stream.
+  ///
+  /// Initially, the stream is appendable and not finalized.
+  /// The token sequence may only be accessed after finalize() is called.
+  ///
+  /// Payload is an opaque object which will be owned by the stream.
+  /// e.g. an allocator to hold backing storage for synthesized token text.
+  explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
+
+  /// Append a token to the stream, which must not be finalized.
+  void push(Token T) {
+    assert(!isFinalized());
+    Storage.push_back(std::move(T));
+  }
+
+  /// Finalize the token stream, allowing tokens to be accessed.
+  /// Tokens may no longer be appended.
+  void finalize();
+  bool isFinalized() const;
+
+  /// Returns the index of T within the stream.
+  ///
+  /// T must be within the stream or the end sentinel (not the start sentinel).
+  Token::Index index(const Token &T) const {
+    assert(isFinalized());
+    assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
+    assert(&T != Storage.data() && "start sentinel");
+    return &T - Tokens.data();
+  }
+
+  ArrayRef<Token> tokens() const {
+    assert(isFinalized());
+    return Tokens;
+  }
+  ArrayRef<Token> tokens(Token::Range R) const {
+    return tokens().slice(R.Begin, R.End - R.Begin);
+  }
+
+  /// May return the end sentinel if the stream is empty.
+  const Token &front() const {
+    assert(isFinalized());
+    return Storage[1];
+  }
+
+  /// Print the tokens in this stream to the output stream.
+  ///
+  /// The presence of newlines/spaces is preserved, but not the quantity.
+  void print(llvm::raw_ostream &) const;
+
+private:
+  std::shared_ptr<void> Payload;
+
+  MutableArrayRef<Token> Tokens;
+  std::vector<Token> Storage; // eof + Tokens + eof
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
+
+/// Extracts a raw token stream from the source code.
+///
+/// All tokens will reference the data of the provided string.
+/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
+TokenStream lex(const std::string &, const clang::LangOptions &);
+enum class LexFlags : uint8_t {
+  /// Marks the token at the start of a logical preprocessor line.
+  /// This is a position where a directive might start.
+  ///
+  /// Here, the first # is StartsPPLine, but second is not (same logical line).
+  ///   #define X(error) \
+  ///   #error // not a directive!
+  ///
+  /// Careful, the directive may not start exactly on the StartsPPLine token:
+  ///   /*comment*/ #include <foo.h>
+  StartsPPLine = 1 << 0,
+  /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
+  /// The text() of such tokens will contain the raw trigraph.
+  NeedsCleaning = 1 << 1,
+};
+
+/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
+///
+/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
+/// their backing data is owned by the returned stream.
+/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
+///
+/// The StartsPPLine flag is preserved.
+///
+/// Formally the identifier correctly happens before preprocessing, while we
+/// should only cook raw_identifiers that survive preprocessing.
+/// However, ignoring the Token::Kind of tokens in directives achieves the same.
+/// (And having cooked token kinds in PP-disabled sections is useful for us).
+TokenStream cook(const TokenStream &, const clang::LangOptions &);
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
diff --git a/clang/lib/Basic/TokenKinds.cpp b/clang/lib/Basic/TokenKinds.cpp
index d55e176c72c4c..c300175ce90ba 100644
--- a/clang/lib/Basic/TokenKinds.cpp
+++ b/clang/lib/Basic/TokenKinds.cpp
@@ -46,6 +46,15 @@ const char *tok::getKeywordSpelling(TokenKind Kind) {
   return nullptr;
 }
 
+const char *tok::getPPKeywordSpelling(tok::PPKeywordKind Kind) {
+  switch (Kind) {
+#define PPKEYWORD(x) case tok::pp_##x: return #x;
+#include "clang/Basic/TokenKinds.def"
+  default: break;
+  }
+  return nullptr;
+}
+
 bool tok::isAnnotation(TokenKind Kind) {
   switch (Kind) {
 #define ANNOTATION(X) case annot_ ## X: return true;
diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
index 8afe7f73f3085..be75138e60c60 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,9 +3,12 @@ set(LLVM_LINK_COMPONENTS Support)
 add_clang_library(clangToolingSyntaxPseudo
   Grammar.cpp
   GrammarBNF.cpp
+  Lex.cpp
   LRGraph.cpp
   LRTable.cpp
   LRTableBuild.cpp
+  Preprocess.cpp
+  Token.cpp
 
   LINK_LIBS
   clangBasic
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
new file mode 100644
index 0000000000000..ac0120cb9e473
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
@@ -0,0 +1,114 @@
+//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralSupport.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
+  clang::SourceLocation Start;
+  // Tokenize using clang's lexer in raw mode.
+  // std::string guarantees null-termination, which the lexer needs.
+  clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
+                     Code.data() + Code.size());
+  Lexer.SetCommentRetentionState(true);
+
+  TokenStream Result;
+  clang::Token CT;
+  unsigned LastOffset = 0;
+  unsigned Line = 0;
+  unsigned Indent = 0;
+  for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
+       Lexer.LexFromRawLexer(CT)) {
+    unsigned Offset =
+        CT.getLocation().getRawEncoding() - Start.getRawEncoding();
+
+    Token Tok;
+    Tok.Data = &Code[Offset];
+    Tok.Length = CT.getLength();
+    Tok.Kind = CT.getKind();
+
+    // Update current line number and indentation from raw source code.
+    unsigned NewLineStart = 0;
+    for (unsigned i = LastOffset; i < Offset; ++i) {
+      if (Code[i] == '\n') {
+        NewLineStart = i + 1;
+        ++Line;
+      }
+    }
+    if (NewLineStart || !LastOffset) {
+      Indent = 0;
+      for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
+        if (c == ' ')
+          ++Indent;
+        else if (c == '\t')
+          Indent += 8;
+        else
+          break;
+      }
+    }
+    Tok.Indent = Indent;
+    Tok.Line = Line;
+
+    if (CT.isAtStartOfLine())
+      Tok.setFlag(LexFlags::StartsPPLine);
+    if (CT.needsCleaning() || CT.hasUCN())
+      Tok.setFlag(LexFlags::NeedsCleaning);
+
+    Result.push(Tok);
+    LastOffset = Offset;
+  }
+  Result.finalize();
+  return Result;
+}
+
+TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
+  auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
+  clang::IdentifierTable Identifiers(LangOpts);
+  TokenStream Result(CleanedStorage);
+
+  for (auto Tok : Code.tokens()) {
+    if (Tok.flag(LexFlags::NeedsCleaning)) {
+      // Remove escaped newlines and trigraphs.
+      llvm::SmallString<64> CleanBuffer;
+      const char *Pos = Tok.text().begin();
+      while (Pos < Tok.text().end()) {
+        unsigned CharSize = 0;
+        CleanBuffer.push_back(
+            clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
+        assert(CharSize != 0 && "no progress!");
+        Pos += CharSize;
+      }
+      // Remove universal character names (UCN).
+      llvm::SmallString<64> UCNBuffer;
+      clang::expandUCNs(UCNBuffer, CleanBuffer);
+
+      llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
+      Tok.Data = Text.data();
+      Tok.Length = Text.size();
+      Tok.Flags &= ~static_cast<uint8_t>(LexFlags::NeedsCleaning);
+    }
+    // Cook raw_identifiers into identifier, keyword, etc.
+    if (Tok.Kind == tok::raw_identifier)
+      Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
+    Result.push(std::move(Tok));
+  }
+
+  Result.finalize();
+  return Result;
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
new file mode 100644
index 0000000000000..3a6403a147c91
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
@@ -0,0 +1,206 @@
+//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+class PPParser {
+public:
+  explicit PPParser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
+  void parse(PPStructure *Result) { parse(Result, /*TopLevel=*/true); }
+
+private:
+  // Roles that a directive might take within a conditional block.
+  enum class Cond { None, If, Else, End };
+  static Cond classifyDirective(tok::PPKeywordKind K) {
+    switch (K) {
+    case clang::tok::pp_if:
+    case clang::tok::pp_ifdef:
+    case clang::tok::pp_ifndef:
+      return Cond::If;
+    case clang::tok::pp_elif:
+    case clang::tok::pp_elifdef:
+    case clang::tok::pp_elifndef:
+    case clang::tok::pp_else:
+      return Cond::Else;
+    case clang::tok::pp_endif:
+      return Cond::End;
+    default:
+      return Cond::None;
+    }
+  }
+
+  // Parses tokens starting at Tok into PP.
+  // If we reach an End or Else directive that ends PP, returns it.
+  // If TopLevel is true, then we do not expect End and always return None.
+  llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
+    auto StartsDirective =
+        [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
+          if (Tok->flag(LexFlags::StartsPPLine)) {
+            // If we considered a comment at the start of a PP-line, it doesn't
+            // start a directive but the directive can still start after it.
+            if (Tok->Kind == tok::comment)
+              AllowDirectiveAt = Tok + 1;
+            return Tok->Kind == tok::hash;
+          }
+          return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
+        };
+    // Each iteration adds one chunk (or returns, if we see #endif).
+    while (Tok->Kind != tok::eof) {
+      // If there's no directive here, we have a code chunk.
+      if (!StartsDirective()) {
+        const Token *Start = Tok;
+        do
+          ++Tok;
+        while (Tok->Kind != tok::eof && !StartsDirective());
+        PP->Chunks.push_back(PPStructure::Code{
+            Token::Range{Code.index(*Start), Code.index(*Tok)}});
+        continue;
+      }
+
+      // We have some kind of directive.
+      PPStructure::Directive Directive;
+      parseDirective(&Directive);
+      Cond Kind = classifyDirective(Directive.Kind);
+      if (Kind == Cond::If) {
+        // #if or similar, starting a nested conditional block.
+        PPStructure::Conditional Conditional;
+        Conditional.Branches.emplace_back();
+        Conditional.Branches.back().first = std::move(Directive);
+        parseConditional(&Conditional);
+        PP->Chunks.push_back(std::move(Conditional));
+      } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
+        // #endif or similar, ending this PPStructure scope.
+        // (#endif is unexpected at the top level, treat as simple directive).
+        return std::move(Directive);
+      } else {
+        // #define or similar, a simple directive at the current scope.
+        PP->Chunks.push_back(std::move(Directive));
+      }
+    }
+    return None;
+  }
+
+  // Parse the rest of a conditional section, after seeing the If directive.
+  // Returns after consuming the End directive.
+  void parseConditional(PPStructure::Conditional *C) {
+    assert(C->Branches.size() == 1 &&
+           C->Branches.front().second.Chunks.empty() &&
+           "Should be ready to parse first branch body");
+    while (Tok->Kind != tok::eof) {
+      auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
+      if (!Terminator) {
+        assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
+        C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
+        return;
+      }
+      if (classifyDirective(Terminator->Kind) == Cond::End) {
+        C->End = std::move(*Terminator);
+        return;
+      }
+      assert(classifyDirective(Terminator->Kind) == Cond::Else &&
+             "ended branch unexpectedly");
+      C->Branches.emplace_back();
+      C->Branches.back().first = std::move(*Terminator);
+    }
+  }
+
+  // Parse a directive. Tok is the hash.
+  void parseDirective(PPStructure::Directive *D) {
+    assert(Tok->Kind == tok::hash);
+
+    // Directive spans from the hash until the end of line or file.
+    const Token *Begin = Tok++;
+    while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
+      ++Tok;
+    ArrayRef<Token> Tokens{Begin, Tok};
+    D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
+
+    // Directive name is the first non-comment token after the hash.
+    Tokens = Tokens.drop_front().drop_while(
+        [](const Token &T) { return T.Kind == tok::comment; });
+    if (!Tokens.empty())
+      D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
+  }
+
+  const TokenStream &Code;
+  const Token *Tok;
+  clang::IdentifierTable PPKeywords;
+};
+
+} // namespace
+
+PPStructure PPStructure::parse(const TokenStream &Code) {
+  PPStructure Result;
+  PPParser(Code).parse(&Result);
+  return Result;
+}
+
+static void dump(llvm::raw_ostream &OS, const PPStructure &, unsigned Indent);
+static void dump(llvm::raw_ostream &OS, const PPStructure::Directive &Directive,
+                 unsigned Indent) {
+  OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
+                                     tok::getPPKeywordSpelling(Directive.Kind),
+                                     Directive.Tokens.size());
+}
+static void dump(llvm::raw_ostream &OS, const PPStructure::Code &Code,
+                 unsigned Indent) {
+  OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
+}
+static void dump(llvm::raw_ostream &OS,
+                 const PPStructure::Conditional &Conditional, unsigned Indent) {
+  for (const auto &Branch : Conditional.Branches) {
+    dump(OS, Branch.first, Indent);
+    dump(OS, Branch.second, Indent + 2);
+  }
+  dump(OS, Conditional.End, Indent);
+}
+
+static void dump(llvm::raw_ostream &OS, const PPStructure::Chunk &Chunk,
+                 unsigned Indent) {
+  switch (Chunk.kind()) {
+  case PPStructure::Chunk::K_Empty:
+    llvm_unreachable("invalid chunk");
+  case PPStructure::Chunk::K_Code:
+    return dump(OS, (const PPStructure::Code &)Chunk, Indent);
+  case PPStructure::Chunk::K_Directive:
+    return dump(OS, (const PPStructure::Directive &)Chunk, Indent);
+  case PPStructure::Chunk::K_Conditional:
+    return dump(OS, (const PPStructure::Conditional &)Chunk, Indent);
+  }
+}
+
+static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
+                 unsigned Indent) {
+  for (const auto &Chunk : PP.Chunks)
+    dump(OS, Chunk, Indent);
+}
+
+// Define operator<< in terms of dump() functions above.
+#define OSTREAM_DUMP(Type)                                                     \
+  llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) {        \
+    dump(OS, T, 0);                                                            \
+    return OS;                                                                 \
+  }
+OSTREAM_DUMP(PPStructure)
+OSTREAM_DUMP(PPStructure::Chunk)
+OSTREAM_DUMP(PPStructure::Directive)
+OSTREAM_DUMP(PPStructure::Conditional)
+OSTREAM_DUMP(PPStructure::Code)
+#undef OSTREAM_DUMP
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Token.cpp b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp
new file mode 100644
index 0000000000000..070bda4c50031
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp
@@ -0,0 +1,98 @@
+//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+  OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
+                      T.Indent);
+  OS << '"';
+  llvm::printEscapedString(T.text(), OS);
+  OS << '"';
+  if (T.Flags)
+    OS << llvm::format(" flags=%x", T.Flags);
+  return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
+  OS << "Index               Kind    Line  Text\n";
+  for (const auto &T : TS.tokens()) {
+    OS << llvm::format("%5d:  %16s %4d:%-2d  ", TS.index(T),
+                       clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
+    OS << '"';
+    llvm::printEscapedString(T.text(), OS);
+    OS << '"';
+    if (T.Flags)
+      OS << llvm::format("  flags=%x", T.Flags);
+    OS << '\n';
+  }
+  return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
+  OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
+  return OS;
+}
+
+TokenStream::TokenStream(std::shared_ptr<void> Payload)
+    : Payload(std::move(Payload)) {
+  Storage.emplace_back();
+  Storage.back().Kind = clang::tok::eof;
+}
+
+void TokenStream::finalize() {
+  assert(!isFinalized());
+  unsigned LastLine = Storage.back().Line;
+  Storage.emplace_back();
+  Storage.back().Kind = tok::eof;
+  Storage.back().Line = LastLine + 1;
+
+  Tokens = Storage;
+  Tokens = Tokens.drop_front().drop_back();
+}
+
+bool TokenStream::isFinalized() const {
+  assert(!Storage.empty() && Storage.front().Kind == tok::eof);
+  if (Storage.size() == 1)
+    return false;
+  return Storage.back().Kind == tok::eof;
+}
+
+void TokenStream::print(llvm::raw_ostream &OS) const {
+  bool FirstToken = true;
+  unsigned LastLine = -1;
+  StringRef LastText;
+  for (const auto &T : tokens()) {
+    StringRef Text = T.text();
+    if (FirstToken) {
+      FirstToken = false;
+    } else if (T.Line == LastLine) {
+      if (LastText.data() + LastText.size() != Text.data())
+        OS << ' ';
+    } else {
+      OS << '\n';
+      OS.indent(T.Indent);
+    }
+    OS << Text;
+    LastLine = T.Line;
+    LastText = Text;
+  }
+  if (!FirstToken)
+    OS << '\n';
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/test/Syntax/lex.c b/clang/test/Syntax/lex.c
new file mode 100644
index 0000000000000..7ec015417a177
--- /dev/null
+++ b/clang/test/Syntax/lex.c
@@ -0,0 +1,52 @@
+int is_debug() {
+#ifndef NDEBUG
+  return 1; // in debug mode
+#else
+  return 0;
+#endif
+}
+
+/* This comment gets lexed along with the input above! We just don't CHECK it.
+
+RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
+     SOURCE: int is_debug() {
+SOURCE-NEXT: #ifndef NDEBUG
+SOURCE-NEXT:   return 1; // in debug mode
+SOURCE-NEXT: #else
+SOURCE-NEXT:   return 0;
+SOURCE-NEXT: #endif
+SOURCE-NEXT: }
+
+RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
+     TOKEN:   0: raw_identifier   0:0 "int" flags=1
+TOKEN-NEXT: raw_identifier   0:0 "is_debug"
+TOKEN-NEXT: l_paren          0:0 "("
+TOKEN-NEXT: r_paren          0:0 ")"
+TOKEN-NEXT: l_brace          0:0 "{"
+TOKEN-NEXT: hash             1:0 "#" flags=1
+TOKEN-NEXT: raw_identifier   1:0 "ifndef"
+TOKEN-NEXT: raw_identifier   1:0 "NDEBUG"
+TOKEN-NEXT: raw_identifier   2:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 2:2 "1"
+TOKEN-NEXT: semi             2:2 ";"
+TOKEN-NEXT: comment          2:2 "// in debug mode"
+TOKEN-NEXT: hash             3:0 "#" flags=1
+TOKEN-NEXT: raw_identifier   3:0 "else"
+TOKEN-NEXT: raw_identifier   4:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 4:2 "0"
+TOKEN-NEXT: semi             4:2 ";"
+TOKEN-NEXT: hash             5:0 "#" flags=1
+TOKEN-NEXT: raw_identifier   5:0 "endif"
+TOKEN-NEXT: r_brace          6:0 "}" flags=1
+
+RUN: clang-pseudo -source %s -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
+     PPS: code (5 tokens)
+PPS-NEXT: #ifndef (3 tokens)
+PPS-NEXT:   code (4 tokens)
+PPS-NEXT: #else (2 tokens)
+PPS-NEXT:   code (3 tokens)
+PPS-NEXT: #endif (2 tokens)
+PPS-NEXT: code (2 tokens)
+                ^ including this block comment
+
+*******************************************************************************/
diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp
index 449b9181f3ee0..2d6fbdb83944c 100644
--- a/clang/tools/clang-pseudo/ClangPseudo.cpp
+++ b/clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -6,9 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Basic/LangOptions.h"
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
 #include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
 #include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -25,13 +28,19 @@ static opt<bool> PrintGraph("print-graph",
                             desc("Print the LR graph for the grammar"));
 static opt<bool> PrintTable("print-table",
                             desc("Print the LR table for the grammar"));
+static opt<std::string> Source("source", desc("Source file"));
+static opt<bool> PrintSource("print-source", desc("Print token stream"));
+static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
+static opt<bool>
+    PrintPPStructure("print-pp-structure",
+                     desc("Print directive structure of source code"));
 
 static std::string readOrDie(llvm::StringRef Path) {
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
       llvm::MemoryBuffer::getFile(Path);
   if (std::error_code EC = Text.getError()) {
-    llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message()
-                 << "\n";
+    llvm::errs() << "Error: can't read grammar file '" << Path
+                 << "': " << EC.message() << "\n";
     ::exit(1);
   }
   return Text.get()->getBuffer().str();
@@ -60,5 +69,19 @@ int main(int argc, char *argv[]) {
     return 0;
   }
 
+  if (Source.getNumOccurrences()) {
+    std::string Text = readOrDie(Source);
+    clang::LangOptions LangOpts; // FIXME: use real options.
+    auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
+    auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
+
+    if (PrintPPStructure)
+      llvm::outs() << Structure;
+    if (PrintSource)
+      Stream.print(llvm::outs());
+    if (PrintTokens)
+      llvm::outs() << Stream;
+  }
+
   return 0;
 }
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
index 509e9e4a1598b..658ad9d926b96 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -5,6 +5,8 @@ set(LLVM_LINK_COMPONENTS
 add_clang_unittest(ClangPseudoTests
   GrammarTest.cpp
   LRTableTest.cpp
+  PreprocessTest.cpp
+  TokenTest.cpp
 )
 
 clang_target_link_libraries(ClangPseudoTests
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
new file mode 100644
index 0000000000000..b6ff47d7fc8dc
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
@@ -0,0 +1,152 @@
+//===--- PreprocessTest.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::_;
+using testing::ElementsAre;
+using testing::Matcher;
+using testing::Pair;
+using testing::StrEq;
+using Chunk = PPStructure::Chunk;
+
+MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
+  std::vector<llvm::StringRef> Texts;
+  for (const Token &Tok : TS.tokens(arg.Tokens))
+    Texts.push_back(Tok.text());
+  return Matcher<std::string>(StrEq(Tokens))
+      .MatchAndExplain(llvm::join(Texts, " "), result_listener);
+}
+
+MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
+
+TEST(PPStructure, Parse) {
+  LangOptions Opts;
+  std::string Code = R"cpp(
+  #include <foo.h>
+
+  int main() {
+  #ifdef HAS_FOO
+  #if HAS_BAR
+    foo(bar);
+  #else
+    foo(0)
+  #endif
+  #elif NEEDS_FOO
+    #error missing_foo
+  #endif
+  }
+  )cpp";
+
+  TokenStream S = cook(lex(Code, Opts), Opts);
+  PPStructure PP = PPStructure::parse(S);
+
+  ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
+                                     chunkKind(Chunk::K_Code),
+                                     chunkKind(Chunk::K_Conditional),
+                                     chunkKind(Chunk::K_Code)));
+
+  EXPECT_THAT((const PPStructure::Directive &)PP.Chunks[0],
+              tokensAre(S, "# include < foo . h >"));
+  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[1],
+              tokensAre(S, "int main ( ) {"));
+  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[3], tokensAre(S, "}"));
+
+  const PPStructure::Conditional &Ifdef(PP.Chunks[2]);
+  EXPECT_THAT(Ifdef.Branches,
+              ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
+                          Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
+  EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));
+
+  const PPStructure &HasFoo(Ifdef.Branches[0].second);
+  const PPStructure &NeedsFoo(Ifdef.Branches[1].second);
+
+  EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
+  const PPStructure::Conditional &If(HasFoo.Chunks[0]);
+  EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
+                                       Pair(tokensAre(S, "# else"), _)));
+  EXPECT_THAT(If.Branches[0].second.Chunks,
+              ElementsAre(chunkKind(Chunk::K_Code)));
+  EXPECT_THAT(If.Branches[1].second.Chunks,
+              ElementsAre(chunkKind(Chunk::K_Code)));
+
+  EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
+  const PPStructure::Directive &Error(NeedsFoo.Chunks[0]);
+  EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
+  EXPECT_EQ(Error.Kind, tok::pp_error);
+}
+
+TEST(PPStructure, ParseUgly) {
+  LangOptions Opts;
+  std::string Code = R"cpp(
+  /*A*/ # /*B*/ \
+   /*C*/ \
+define \
+BAR /*D*/
+/*E*/
+)cpp";
+  TokenStream S = cook(lex(Code, Opts), Opts);
+  PPStructure PP = PPStructure::parse(S);
+
+  ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
+                                     chunkKind(Chunk::K_Directive),
+                                     chunkKind(Chunk::K_Code)));
+  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "/*A*/"));
+  const PPStructure::Directive &Define(PP.Chunks[1]);
+  EXPECT_EQ(Define.Kind, tok::pp_define);
+  EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/"));
+  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[2], tokensAre(S, "/*E*/"));
+}
+
+TEST(PPStructure, ParseBroken) {
+  LangOptions Opts;
+  std::string Code = R"cpp(
+  a
+  #endif // mismatched
+  #if X
+  b
+)cpp";
+  TokenStream S = cook(lex(Code, Opts), Opts);
+  PPStructure PP = PPStructure::parse(S);
+
+  ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
+                                     chunkKind(Chunk::K_Directive),
+                                     chunkKind(Chunk::K_Conditional)));
+  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "a"));
+  const PPStructure::Directive &Endif(PP.Chunks[1]);
+  EXPECT_EQ(Endif.Kind, tok::pp_endif);
+  EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
+
+  const PPStructure::Conditional &X(PP.Chunks[2]);
+  EXPECT_EQ(1u, X.Branches.size());
+  // The (only) branch of the broken conditional section runs until eof.
+  EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
+  EXPECT_THAT(X.Branches.front().second.Chunks,
+              ElementsAre(chunkKind(Chunk::K_Code)));
+  // The missing terminating directive is marked as pp_not_keyword.
+  EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
+  EXPECT_EQ(0u, X.End.Tokens.size());
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp
new file mode 100644
index 0000000000000..f790e65245741
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp
@@ -0,0 +1,178 @@
+//===--- TokenTest.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TokenKinds.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::Not;
+
+MATCHER_P2(token, Text, Kind, "") {
+  return arg.Kind == Kind && arg.text() == Text;
+}
+
+MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
+
+MATCHER_P2(lineIndent, Line, Indent, "") {
+  return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
+}
+
+TEST(TokenTest, Lex) {
+  LangOptions Opts;
+  std::string Code = R"cpp(
+    #include <stdio.h>
+    int main() {
+      return 42; // the answer
+    }
+  )cpp";
+  TokenStream Raw = lex(Code, Opts);
+  ASSERT_TRUE(Raw.isFinalized());
+  EXPECT_THAT(Raw.tokens(),
+              ElementsAreArray({
+                  // Lexing of directives is weird, especially <angle> strings.
+                  token("#", tok::hash),
+                  token("include", tok::raw_identifier),
+                  token("<", tok::less),
+                  token("stdio", tok::raw_identifier),
+                  token(".", tok::period),
+                  token("h", tok::raw_identifier),
+                  token(">", tok::greater),
+
+                  token("int", tok::raw_identifier),
+                  token("main", tok::raw_identifier),
+                  token("(", tok::l_paren),
+                  token(")", tok::r_paren),
+                  token("{", tok::l_brace),
+                  token("return", tok::raw_identifier),
+                  token("42", tok::numeric_constant),
+                  token(";", tok::semi),
+                  token("// the answer", tok::comment),
+                  token("}", tok::r_brace),
+              }));
+
+  TokenStream Cooked = cook(Raw, Opts);
+  ASSERT_TRUE(Cooked.isFinalized());
+  EXPECT_THAT(Cooked.tokens(),
+              ElementsAreArray({
+                  // Cooked identifier types in directives are not meaningful.
+                  token("#", tok::hash),
+                  token("include", tok::identifier),
+                  token("<", tok::less),
+                  token("stdio", tok::identifier),
+                  token(".", tok::period),
+                  token("h", tok::identifier),
+                  token(">", tok::greater),
+
+                  token("int", tok::kw_int),
+                  token("main", tok::identifier),
+                  token("(", tok::l_paren),
+                  token(")", tok::r_paren),
+                  token("{", tok::l_brace),
+                  token("return", tok::kw_return),
+                  token("42", tok::numeric_constant),
+                  token(";", tok::semi),
+                  token("// the answer", tok::comment),
+                  token("}", tok::r_brace),
+              }));
+  // Check raw tokens point back into original source code.
+  EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
+}
+
+TEST(TokenTest, LineContinuation) {
+  LangOptions Opts;
+  std::string Code = R"cpp(
+one_\
+token
+two \
+tokens
+  )cpp";
+  TokenStream Raw = lex(Code, Opts);
+  EXPECT_THAT(
+      Raw.tokens(),
+      ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
+                        hasFlag(LexFlags::StartsPPLine),
+                        hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0)),
+                  AllOf(token("two", tok::raw_identifier),
+                        hasFlag(LexFlags::StartsPPLine),
+                        Not(hasFlag(LexFlags::NeedsCleaning))),
+                  AllOf(token("\\\ntokens", tok::raw_identifier),
+                        Not(hasFlag(LexFlags::StartsPPLine)),
+                        hasFlag(LexFlags::NeedsCleaning))));
+
+  TokenStream Cooked = cook(Raw, Opts);
+  EXPECT_THAT(
+      Cooked.tokens(),
+      ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0)),
+                  token("two", tok::identifier),
+                  token("tokens", tok::identifier)));
+}
+
+TEST(TokenTest, EncodedCharacters) {
+  LangOptions Opts;
+  Opts.Trigraphs = true;
+  Opts.Digraphs = true;
+  Opts.C99 = true; // UCNs
+  Opts.CXXOperatorNames = true;
+  std::string Code = R"(and <: ??! '??=' \u00E9)";
+  TokenStream Raw = lex(Code, Opts);
+  EXPECT_THAT(
+      Raw.tokens(),
+      ElementsAre( // and is not recognized as && until cook().
+          AllOf(token("and", tok::raw_identifier),
+                Not(hasFlag(LexFlags::NeedsCleaning))),
+          // Digraphs are just different spellings of tokens.
+          AllOf(token("<:", tok::l_square),
+                Not(hasFlag(LexFlags::NeedsCleaning))),
+          // Trigraphs are interpreted, still need text cleaning.
+          AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
+          // Trigraphs must be substituted inside constants too.
+          AllOf(token(R"('??=')", tok::char_constant),
+                hasFlag(LexFlags::NeedsCleaning)),
+          // UCNs need substitution.
+          AllOf(token(R"(\u00E9)", tok::raw_identifier),
+                hasFlag(LexFlags::NeedsCleaning))));
+
+  TokenStream Cooked = cook(Raw, Opts);
+  EXPECT_THAT(
+      Cooked.tokens(),
+      ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
+                  token("<:", tok::l_square),
+                  token("|", tok::pipe),            // trigraph substituted
+                  token("'#'", tok::char_constant), // trigraph substituted
+                  token("é", tok::identifier)));    // UCN substituted
+}
+
+TEST(TokenTest, Indentation) {
+  LangOptions Opts;
+  std::string Code = R"cpp(   hello world
+no_indent \
+  line_was_continued
+)cpp";
+  TokenStream Raw = lex(Code, Opts);
+  EXPECT_THAT(Raw.tokens(), ElementsAreArray({
+                                lineIndent(0, 3), // hello
+                                lineIndent(0, 3), // world
+                                lineIndent(1, 0), // no_indent
+                                lineIndent(2, 2), // line_was_continued
+                            }));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang

From e9d2f173913da17f4e63b68fcee6d7f30d9e2a56 Mon Sep 17 00:00:00 2001
From: Valentin Clement 
Date: Wed, 23 Feb 2022 18:01:58 +0100
Subject: [PATCH 659/748] [flang] Lower complex constant

Add ability to lower complex constant.

This patch is part of the upstreaming effort from fir-dev branch.

Reviewed By: PeteSteinfeld

Differential Revision: https://reviews.llvm.org/D120402

Co-authored-by: Kiran Chandramohan 
Co-authored-by: Eric Schweitz 
Co-authored-by: Jean Perier 
---
 flang/lib/Lower/ConvertExpr.cpp | 14 ++++++++++++--
 flang/test/Lower/assignment.f90 | 14 ++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp
index 013adb797da93..0a939dbf6aaad 100644
--- a/flang/lib/Lower/ConvertExpr.cpp
+++ b/flang/lib/Lower/ConvertExpr.cpp
@@ -19,6 +19,7 @@
 #include "flang/Lower/IntrinsicCall.h"
 #include "flang/Lower/SymbolMap.h"
 #include "flang/Lower/Todo.h"
+#include "flang/Optimizer/Builder/Complex.h"
 #include "flang/Semantics/expression.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/tools.h"
@@ -277,7 +278,9 @@ class ScalarExprLowering {
 
   template <int KIND>
   ExtValue genval(const Fortran::evaluate::ComplexConstructor<KIND> &op) {
-    TODO(getLoc(), "genval ComplexConstructor");
+    mlir::Value realPartValue = genunbox(op.left());
+    return fir::factory::Complex{builder, getLoc()}.createComplex(
+        KIND, realPartValue, genunbox(op.right()));
   }
 
   template <int KIND>
@@ -381,7 +384,14 @@ class ScalarExprLowering {
         return genRealConstant(builder.getContext(), floatVal);
       }
     } else if constexpr (TC == Fortran::common::TypeCategory::Complex) {
-      TODO(getLoc(), "genval complex constant");
+      using TR =
+          Fortran::evaluate::Type<Fortran::common::TypeCategory::Real, KIND>;
+      Fortran::evaluate::ComplexConstructor<KIND> ctor(
+          Fortran::evaluate::Expr<TR>{
+              Fortran::evaluate::Constant<TR>{value.REAL()}},
+          Fortran::evaluate::Expr<TR>{
+              Fortran::evaluate::Constant<TR>{value.AIMAG()}});
+      return genunbox(ctor);
     } else /*constexpr*/ {
       llvm_unreachable("unhandled constant");
     }
diff --git a/flang/test/Lower/assignment.f90 b/flang/test/Lower/assignment.f90
index 26aa33631d0e4..f2f81c3b41248 100644
--- a/flang/test/Lower/assignment.f90
+++ b/flang/test/Lower/assignment.f90
@@ -284,3 +284,17 @@ subroutine real_constant()
 ! CHECK: fir.store %[[C10]] to %[[D]] : !fir.ref
 ! CHECK: %[[C16:.*]] = arith.constant 1.600000e+01 : f128
 ! CHECK: fir.store %[[C16]] to %[[E]] : !fir.ref<f128>
+
+subroutine complex_constant()
+  complex(4) :: a
+  a = (0, 1)
+end
+
+! CHECK-LABEL: func @_QPcomplex_constant()
+! CHECK:         %[[A:.*]] = fir.alloca !fir.complex<4> {bindc_name = "a", uniq_name = "_QFcomplex_constantEa"}
+! CHECK:         %[[C0:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:         %[[C1:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:         %[[UNDEF:.*]] = fir.undefined !fir.complex<4>
+! CHECK:         %[[INS0:.*]] = fir.insert_value %[[UNDEF]], %[[C0]], [0 : index] : (!fir.complex<4>, f32) -> !fir.complex<4>
+! CHECK:         %[[INS1:.*]] = fir.insert_value %[[INS0]], %[[C1]], [1 : index] : (!fir.complex<4>, f32) -> !fir.complex<4>
+! CHECK:         fir.store %[[INS1]] to %[[A]] : !fir.ref<!fir.complex<4>>

From e7f4ea8abeed5063ea21e508d85ef104be8e4505 Mon Sep 17 00:00:00 2001
From: Daniel Resnick 
Date: Tue, 22 Feb 2022 10:41:48 -0700
Subject: [PATCH 660/748] [MLIR][Pass] Have PassRegistryEntry own pass strings

This eliminates the requirement that pass-related strings outlive pass
instances, which will facilitate future work enabling dynamic passes
written in other languages.

Differential Revision: https://reviews.llvm.org/D120341
---
 mlir/include/mlir/Pass/PassRegistry.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Pass/PassRegistry.h b/mlir/include/mlir/Pass/PassRegistry.h
index 9167751ebbcee..496bac77476a3 100644
--- a/mlir/include/mlir/Pass/PassRegistry.h
+++ b/mlir/include/mlir/Pass/PassRegistry.h
@@ -81,10 +81,10 @@ class PassRegistryEntry {
 
 private:
   /// The argument with which to invoke the pass via mlir-opt.
-  StringRef arg;
+  std::string arg;
 
   /// Description of the pass.
-  StringRef description;
+  std::string description;
 
   /// Function to register this entry to a pass manager pipeline.
   PassRegistryFunction builder;

From 307ccf4c0defc06a9c511ed9b859c26c7c07b214 Mon Sep 17 00:00:00 2001
From: Valentin Clement 
Date: Wed, 23 Feb 2022 18:04:15 +0100
Subject: [PATCH 661/748] [flang][NFC] Clean up ConvertType

This patch removes unused or obsolete code in
the ConvertType.h and ConvertType.cpp files. These
files were landed together with the initial flang
upstreaming. This cleanup will help future upstreaming
efforts from fir-dev and keeps only used code.

Reviewed By: PeteSteinfeld

Differential Revision: https://reviews.llvm.org/D120405
---
 flang/include/flang/Lower/ConvertType.h |  35 +---
 flang/lib/Lower/ConvertType.cpp         | 268 +-----------------------
 2 files changed, 17 insertions(+), 286 deletions(-)

diff --git a/flang/include/flang/Lower/ConvertType.h b/flang/include/flang/Lower/ConvertType.h
index ea931e28cb3fb..fccddc7dbf0ff 100644
--- a/flang/include/flang/Lower/ConvertType.h
+++ b/flang/include/flang/Lower/ConvertType.h
@@ -4,7 +4,11 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-//----------------------------------------------------------------------------//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
 ///
 /// Conversion of front-end TYPE, KIND, ATTRIBUTE (TKA) information to FIR/MLIR.
 /// This is meant to be the single point of truth (SPOT) for all type
@@ -12,15 +16,13 @@
 /// tree TKA to the FIR type system. If one is converting front-end types and
 /// not using one of the routines provided here, it's being done wrong.
 ///
-/// [Coding style](https://llvm.org/docs/CodingStandards.html)
-///
-//----------------------------------------------------------------------------//
+//===----------------------------------------------------------------------===//
 
 #ifndef FORTRAN_LOWER_CONVERT_TYPE_H
 #define FORTRAN_LOWER_CONVERT_TYPE_H
 
 #include "flang/Common/Fortran.h"
-#include "mlir/IR/Types.h"
+#include "mlir/IR/BuiltinTypes.h"
 
 namespace mlir {
 class Location;
@@ -30,22 +32,14 @@ class Type;
 
 namespace Fortran {
 namespace common {
-class IntrinsicTypeDefaultKinds;
 template <typename>
 class Reference;
 } // namespace common
 
 namespace evaluate {
-struct DataRef;
-template <typename>
-class Designator;
 template <typename>
 class Expr;
-template <common::TypeCategory>
-struct SomeKind;
 struct SomeType;
-template <common::TypeCategory, int>
-class Type;
 } // namespace evaluate
 
 namespace semantics {
@@ -68,14 +62,6 @@ using LenParameterTy = std::int64_t;
 mlir::Type getFIRType(mlir::MLIRContext *ctxt, common::TypeCategory tc,
                       int kind);
 
-/// Get a FIR type based on a category.
-mlir::Type getFIRType(Fortran::lower::AbstractConverter &,
-                      common::TypeCategory tc);
-
-/// Translate a Fortran::evaluate::DataRef to an mlir::Type.
-mlir::Type translateDataRefToFIRType(Fortran::lower::AbstractConverter &,
-                                     const evaluate::DataRef &dataRef);
-
 /// Translate a SomeExpr to an mlir::Type.
 mlir::Type translateSomeExprToFIRType(Fortran::lower::AbstractConverter &,
                                       const SomeExpr &expr);
@@ -91,13 +77,6 @@ mlir::Type translateVariableToFIRType(Fortran::lower::AbstractConverter &,
 /// Translate a REAL of KIND to the mlir::Type.
 mlir::Type convertReal(mlir::MLIRContext *ctxt, int KIND);
 
-// Given a ReferenceType of a base type, returns the ReferenceType to
-// the SequenceType of this base type.
-// The created SequenceType has one dimension of unknown extent.
-// This is useful to do pointer arithmetic using fir::CoordinateOp that requires
-// a memory reference to a sequence type.
-mlir::Type getSequenceRefType(mlir::Type referenceType);
-
 } // namespace lower
 } // namespace Fortran
 
diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp
index 429fae81e25cc..19556fc3afb27 100644
--- a/flang/lib/Lower/ConvertType.cpp
+++ b/flang/lib/Lower/ConvertType.cpp
@@ -101,54 +101,18 @@ static mlir::Type genFIRType(mlir::MLIRContext *context,
   llvm_unreachable("unhandled type category");
 }
 
-template <typename A>
-bool isConstant(const Fortran::evaluate::Expr<A> &e) {
-  return Fortran::evaluate::IsConstantExpr(Fortran::lower::SomeExpr{e});
-}
-
-template <typename A>
-int64_t toConstant(const Fortran::evaluate::Expr<A> &e) {
-  auto opt = Fortran::evaluate::ToInt64(e);
-  assert(opt.has_value() && "expression didn't resolve to a constant");
-  return opt.value();
-}
-
-// one argument template, must be specialized
-template <Fortran::common::TypeCategory TC>
-mlir::Type genFIRType(mlir::MLIRContext *, int) {
-  return {};
-}
-
-// two argument template
-template <Fortran::common::TypeCategory TC, int KIND>
-mlir::Type genFIRType(mlir::MLIRContext *context) {
-  if constexpr (TC == Fortran::common::TypeCategory::Integer) {
-    auto bits{Fortran::evaluate::Type<TC, KIND>::Scalar::bits};
-    return mlir::IntegerType::get(context, bits);
-  } else if constexpr (TC == Fortran::common::TypeCategory::Logical ||
-                       TC == Fortran::common::TypeCategory::Character ||
-                       TC == Fortran::common::TypeCategory::Complex) {
-    return genFIRType<TC>(context, KIND);
-  } else {
-    return {};
-  }
-}
-
-template <>
-mlir::Type
-genFIRType<Fortran::common::TypeCategory::Character>(mlir::MLIRContext *context,
-                                                     int KIND) {
-  if (Fortran::evaluate::IsValidKindOfIntrinsicType(
-          Fortran::common::TypeCategory::Character, KIND))
-    return fir::CharacterType::get(context, KIND, 1);
-  return {};
-}
+//===--------------------------------------------------------------------===//
+// Symbol and expression type translation
+//===--------------------------------------------------------------------===//
 
+/// TypeBuilder translates expression and symbol type taking into account
+/// their shape and length parameters. For symbols, attributes such as
+/// ALLOCATABLE or POINTER are reflected in the fir type.
+/// It uses evaluate::DynamicType and evaluate::Shape when possible to
+/// avoid re-implementing type/shape analysis here.
+/// Do not use the FirOpBuilder from the AbstractConverter to get fir/mlir types
+/// since it is not guaranteed to exist yet when we lower types.
 namespace {
-
-/// Discover the type of an Fortran::evaluate::Expr and convert it to an
-/// mlir::Type. The type returned may be an MLIR standard or FIR type.
 class TypeBuilder {
 public:
   TypeBuilder(Fortran::lower::AbstractConverter &converter)
@@ -282,203 +246,11 @@ class TypeBuilder {
     return ty;
   }
 
-  //===--------------------------------------------------------------------===//
-  // Generate type entry points
-  //===--------------------------------------------------------------------===//
-
-  template